## Mallet LDA

In [2]:
!pip install little_mallet_wrapper

Defaulting to user installation because normal site-packages is not writeable
Collecting little_mallet_wrapper
  Downloading little_mallet_wrapper-0.5.0-py3-none-any.whl.metadata (13 kB)
Downloading little_mallet_wrapper-0.5.0-py3-none-any.whl (19 kB)
Installing collected packages: little_mallet_wrapper
Successfully installed little_mallet_wrapper-0.5.0


In [None]:
import little_mallet_wrapper as lmw
import pandas as pd
import pdb
import argparse

In [None]:
PATH_TO_MALLET = "path_to_mallet"

file_path = 'path_to_reddit_tsv'
df = pd.read_csv(file_path, sep='\t')
df = df.dropna()

dataset = df.sample(n=10000, random_state=42)

training_data = [lmw.process_string(t) for t in dataset['summary'].tolist()]
training_data = [d.replace("NUM", "") for d in training_data if d.strip()]

In [22]:
topic_keys, topic_distributions = lmw.quick_train_topic_model(PATH_TO_MALLET, "./", 200, training_data)

Importing data...
Complete
Training topic model...


Mallet LDA: 200 topics, 8 topic bits, 11111111 topic mask
Data loaded.
max tokens: 49
total tokens: 232656
<10> LL/token: -9.25307
<20> LL/token: -8.73781
<30> LL/token: -8.49904
<40> LL/token: -8.35538

0	0.025	financial due wants money income expenses struggling security financially new helping ask struggles future impact low torn goals supporting saving 
1	0.025	dilemma moral creating show empathy prioritizing asked honesty leading kindness showing personal faced appreciation accommodating wants understanding support respecting choose 
2	0.025	roommate living person shared roommates space must decide room hygiene causing others lives personal apartment clean laundry shares comfort one 
3	0.025	food dietary needs preferences dinner guests vegan host eating hosting conflict meal meat person accommodating non specific meals restrictions decide 
4	0.025	student school classmate high faced despite situation feeling teacher leaving class senior laundry guilty discovering showing complete 

Complete


In [57]:
import numpy as np
from sklearn.cluster import KMeans

input_file = "mallet.topic_distributions.200"
output_file = "RedditMallet200.txt"

data = []
kmeans_data = []

with open(input_file, "r") as infile:
    for line in infile:
        # Extract columns 3 onward as a vector
        values = list(map(float, line.strip().split("\t")[2:]))
        kmeans_data.append(values)
        
        max_index = np.argmax(values)
        data.append(max_index)

dataset['topic'] = data

In [58]:
dataset['topic'].describe()

count    10000.000000
mean       102.169200
std         56.648752
min          0.000000
25%         60.000000
50%         98.000000
75%        155.000000
max        199.000000
Name: topic, dtype: float64

In [None]:
import pandas as pd
import numpy as np

if len(dataset['topic'].unique()) < 200:
    raise ValueError("The DataFrame does not contain all 200 topics.")

distinct_rows = dataset.groupby('topic').apply(lambda group: group.sample(1)).reset_index(drop=True)

remaining_count = 200 - len(distinct_rows)
additional_rows = dataset.sample(remaining_count)

final_sample = pd.concat([distinct_rows, additional_rows]).sample(frac=1).reset_index(drop=True)

  distinct_rows = dataset.groupby('topic').apply(lambda group: group.sample(1)).reset_index(drop=True)


In [None]:
final_sample.to_csv('Mallet_distinct_summaries.csv', index=False)

In [62]:
num_clusters = 200

kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(kmeans_data)

labels = kmeans.labels_



In [None]:
closest_points_per_cluster = []
for i in range(num_clusters):
    distances_to_centroid = kmeans.transform(kmeans_data)[:, i]
    closest_point_index = np.argmin(distances_to_centroid)
    closest_points_per_cluster.append(closest_point_index)

In [82]:
closest_points_per_cluster

[1939,
 8229,
 4764,
 7961,
 4639,
 2414,
 7247,
 7402,
 8664,
 9530,
 2580,
 2334,
 9424,
 1194,
 8626,
 7461,
 6450,
 8915,
 2992,
 736,
 8023,
 2128,
 6766,
 4352,
 2128,
 2396,
 8879,
 8449,
 5685,
 6000,
 2937,
 1355,
 7230,
 5302,
 5878,
 5979,
 9260,
 6534,
 7482,
 6916,
 9720,
 4876,
 7847,
 1363,
 1339,
 8894,
 6243,
 3200,
 9835,
 6585,
 1955,
 298,
 9902,
 8374,
 2329,
 5186,
 3360,
 3474,
 3172,
 1314,
 7246,
 8611,
 9281,
 4187,
 7892,
 2517,
 1098,
 2271,
 5881,
 6781,
 8973,
 1413,
 6753,
 789,
 3778,
 6023,
 48,
 2363,
 3394,
 9767,
 4760,
 96,
 9163,
 6561,
 7910,
 4350,
 1556,
 8250,
 7429,
 1704,
 4383,
 9210,
 1155,
 5296,
 7783,
 6888,
 545,
 1032,
 298,
 2911,
 8797,
 5005,
 6733,
 3095,
 2516,
 2086,
 7707,
 297,
 9028,
 7940,
 1793,
 1291,
 6461,
 8679,
 4697,
 9579,
 2679,
 8075,
 3769,
 63,
 178,
 2144,
 3242,
 7650,
 6047,
 6193,
 2949,
 5286,
 2715,
 8713,
 96,
 7598,
 4510,
 3480,
 8535,
 8932,
 934,
 3166,
 1341,
 3665,
 6015,
 5159,
 633,
 2927,
 4904,
 4

In [None]:
distinct_indices = np.argsort(np.min(kmeans.transform(kmeans_data), axis=1))[:num_clusters]
distinct_summaries = dataset.iloc[distinct_indices]

In [83]:
distinct_summaries = dataset.iloc[closest_points_per_cluster]

In [84]:
distinct_summaries

Unnamed: 0,id,summary,option1,option2,topic
245712,9wqdte,A person is faced with the dilemma of whether ...,Ask their father to intervene and request that...,Avoid addressing the issue and risk embarrassi...,73
59777,bl5ntw,A person must decide whether to announce their...,Announce the engagement plans during the visit...,Wait until after the visit to share the engage...,53
107638,k5pjix,A person's mother deliberately broke their bel...,Accept the mother's offer to replace the broke...,Refuse the mother's offer and tell her that sh...,188
150301,du7d8d,A person is deciding whether to inform their e...,Tell the mother about the decision to return t...,Withhold the information from the mother to av...,6
96027,r96ovq,A college student is considering moving out of...,Move out of the current living situation to pr...,Stay in the current living situation to mainta...,130
...,...,...,...,...,...
169814,j0oc83,An employee is faced with a moral dilemma afte...,Report the incident and prioritize one's own c...,Do not report the incident and tolerate the ha...,32
144897,a49t22,A person is in a relationship with someone who...,Put aside personal feelings and be friendly wi...,Prioritize personal feelings and values by mai...,55
68138,awxhb1,A person is in a marriage where their partner'...,End the marriage due to the perceived lack of ...,Stay in the marriage and try to address the is...,147
22028,jhoh0f,"A 21-year-old woman, who has been living with ...",Move out of her grandparents' house to pursue ...,Stay with her grandparents and put her own des...,138


In [85]:
final_sample.to_csv('Mallet_distinct_summaries_Kmeans.csv', index=False)