In [59]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary


In [60]:
# Load the preprocessed frame used for subreddit-level clustering.
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
processed_df = pd.read_pickle('../data/preprocessed_df.pkl')

In [61]:
# Quick look at the loaded frame (pandas truncates the display to the first and last rows).
processed_df

Unnamed: 0,text
TheRedPill,google fire ph.d biologist/engineer claiming a...
BlackPeopleTwitter,wildest trend imo corjay lmao_shippuden inco l...
WhitePeopleTwitter,solution obvious shooting foot secular talk vv...
politics,megathread joe biden projected defeat presiden...
law,law school leaked draft dobbs opinion justice ...
...,...
Offensivejokes,vegan view vegan think animal die wild mac che...
climateskeptics,hypocrisy michael bloomberg plane ae ae _-heth...
LockdownCriticalLeft,worker uniting solidarity authoritarian govern...
FightingFakeNews,le heckin reddit moment reddit year reddit wee...


In [62]:
# Turn the 'text' column into TF-IDF features.
# min_df / max_df are given as floats, i.e. document-frequency proportions:
# terms found in fewer than 10% or in more than 90% of the documents are dropped.
tfidf_vectorizer = TfidfVectorizer(min_df=0.1, max_df=0.9)
text_transformed = tfidf_vectorizer.fit_transform(processed_df['text'])

In [63]:
# Show the sparse TF-IDF matrix repr (shape, dtype and number of stored values).
text_transformed

<193x5339 sparse matrix of type '<class 'numpy.float64'>'
	with 284410 stored elements in Compressed Sparse Row format>

In [64]:
# Baseline ("dummy") model: partition the TF-IDF vectors into 5 k-means clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in 1.4 (and emits a FutureWarning about it in 1.2/1.3), so spelling it out keeps
# the clustering the same regardless of the installed version.
# random_state fixes the centroid initialisation so the labels are reproducible.
k_clusterer = KMeans(n_clusters=5, n_init=10, random_state=305)

k_clusterer.fit(text_transformed)

KMeans(n_clusters=5, random_state=305)

In [65]:
# Attach the fitted k-means cluster id of every row as a new column.
# This mutates processed_df in place; re-running simply overwrites the column.

processed_df['k_means_labels'] = k_clusterer.labels_

In [66]:
# Check how many rows were assigned to each k-means cluster (sorted by size, largest first)
processed_df['k_means_labels'].value_counts()

0    60
1    55
3    38
4    33
2     7
Name: k_means_labels, dtype: int64

In [67]:
# Inspection cell: list the subreddits that k-means placed in one cluster.
# Change the cluster id on the right-hand side to look at a different group.
processed_df[
    processed_df['k_means_labels'] == 3
]

Unnamed: 0,text,k_means_labels
BlackPeopleTwitter,wildest trend imo corjay lmao_shippuden inco l...,3
WhitePeopleTwitter,solution obvious shooting foot secular talk vv...,3
dataisbeautiful,oc trending google search state 2018 2020 anal...,3
MurderedByWords,sudden law order apply uh ike yvar- qevatevere...,3
wallstreetbets,time square right upvote everyone see support ...,3
shitposting,death sentence dollar dollar bill y'all russi...,3
PoliticalHumor,try potus challenge holding book read front pl...,3
rareinsults,wrong pharrell listed beverly hill mansion 17 ...,3
awfuleverything,poor guy binkybrain 10h award wife cancer took...,3
insanepeoplefacebook,accidentally left wing tweet bernie sander ber...,3


In [68]:
# Better model: agglomerative (hierarchical) clustering.
# The number of clusters is not fixed up front (n_clusters=None); instead the
# full merge tree is built and cut at a linkage distance of 1.65.
agg_clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.65,
    compute_full_tree=True,
)
# The TF-IDF matrix is sparse, so it is densified before fitting.
agg_clusterer.fit(text_transformed.toarray())

AgglomerativeClustering(compute_full_tree=True, distance_threshold=1.65,
                        n_clusters=None)

In [69]:
# Attach the agglomerative cluster id of every row as a new column (mutates processed_df in place).
processed_df['agg_labels'] = agg_clusterer.labels_

In [70]:
# Check how many rows were assigned to each agglomerative cluster (sorted by size, largest first)
processed_df['agg_labels'].value_counts()

13    27
2     24
3     21
0     16
6     14
7     13
11    13
1     11
14     9
4      8
5      8
15     7
9      6
8      5
10     4
16     3
12     2
17     2
Name: agg_labels, dtype: int64

In [71]:
# Inspection cell: list the subreddits that share one agglomerative cluster.
# Change the cluster id on the right-hand side to look at a different group.
processed_df[
    processed_df['agg_labels'] == 8
]

Unnamed: 0,text,k_means_labels,agg_labels
OutOfTheLoop,going net neutrality ask question hey folk rec...,1,8
TopMindsOfReddit,the_donald quarantined update look top mind ca...,1,8
AgainstHateSubreddits,fucking nazi white nationalist /r/the_donald s...,1,8
ProudMaleFeminists,yikes grocery store today buying soy milk usua...,1,8
IncelsCircleJerk,bro black pill exist imagine life tutorial mod...,4,8


In [72]:
# Persist the labelled frame so it can be handed off for additional feature engineering.
# The saved frame carries both the 'k_means_labels' and 'agg_labels' columns.
processed_df.to_pickle('../data/agg_labels.pkl')

# Clustering on Posts

In [73]:
# Load the post-level frame used for the post clustering below.
# NOTE(review): unpickling can execute arbitrary code - only load pickles produced by this project.
post_df = pd.read_pickle('../data/df_by_post.pkl')

In [74]:
# Post-level TF-IDF features built from the 'finalized_text' column; only terms
# found in more than 90% of the documents are dropped (no min_df here).
# NOTE: this rebinds tfidf_vectorizer and text_transformed, replacing the
# subreddit-level objects created earlier in the notebook. Re-running the earlier
# clustering cells after this one would operate on the post-level matrix instead.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9)
text_transformed = tfidf_vectorizer.fit_transform(post_df['finalized_text'])

In [75]:
# Partition the post-level TF-IDF vectors into 50 k-means clusters.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in 1.4 (and emits a FutureWarning about it in 1.2/1.3), so spelling it out keeps
# the clustering the same regardless of the installed version.
# NOTE: this rebinds k_clusterer, replacing the 5-cluster subreddit-level model.
k_clusterer = KMeans(n_clusters=50, n_init=10, random_state=305)

k_clusterer.fit(text_transformed)

KMeans(n_clusters=50, random_state=305)

In [76]:
# Attach the fitted k-means cluster id of every row as a new column (mutates post_df in place).
post_df['k_means_labels'] = k_clusterer.labels_

In [77]:
# Check how many rows were assigned to each of the 50 k-means clusters (sorted by size, largest first)
post_df['k_means_labels'].value_counts()

0     14035
28     1225
20     1224
1      1136
30      925
23      912
42      870
34      861
37      699
13      605
31      565
6       553
33      547
14      512
16      491
44      445
11      434
27      412
45      404
35      397
32      393
43      386
5       360
24      360
40      358
12      353
10      349
17      347
25      336
26      333
2       326
46      318
8       318
9       313
4       312
36      306
19      246
15      246
22      243
38      237
39      211
7       195
48      192
47      166
21      159
18      158
29      147
3        76
49       73
41       62
Name: k_means_labels, dtype: int64

In [78]:
# Inspection cell: list the posts that k-means placed in one cluster.
# Change the cluster id on the right-hand side to look at a different group.
post_df[
    post_df['k_means_labels'] == 6
]

Unnamed: 0,created_utc,subreddit,subreddit_subscribers,upvote_ratio,num_comments,url,image?,image_text,total_text,tokenned_stopped_text,finalized_text,alignment,k_means_labels
602,1.597330e+09,politics,8062883.0,0.93,7916.0,https://www.independent.co.uk/news/world/ameri...,False,,AOC challenges Trump to release his college tr...,"[aoc, challenge, trump, release, college, tran...",aoc challenge trump release college transcript...,0,6
629,1.608071e+09,politics,8062883.0,0.88,4233.0,https://www.independent.co.uk/news/world/ameri...,False,,Trump must turn over financial documents to Ne...,"[trump, turn, financial, document, new, york, ...",trump turn financial document new york attorne...,0,6
734,1.644417e+09,politics,8062883.0,0.89,4710.0,https://www.rollingstone.com/politics/politics...,False,,Louisiana Senate Candidate Literally Torches a...,"[louisiana, senate, candidate, torch, confeder...",louisiana senate candidate torch confederate f...,0,6
838,1.624548e+09,law,186908.0,0.96,164.0,https://www.cnbc.com/2021/06/24/rudy-giuliani-...,False,,Giuliani suspended from practicing law in New ...,"[giuliani, suspended, practicing, law, new, yo...",giuliani suspended practicing law new york fal...,0,6
848,1.616757e+09,law,186908.0,0.97,124.0,https://newyork.cbslocal.com/2021/03/25/qualif...,False,,New York Becomes First City In U.S. To End Qua...,"[new, york, becomes, first, city, u, end, qual...",new york becomes first city u end qualified im...,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34895,1.640808e+09,HillaryForPrison,54062.0,0.94,2.0,https://i.redd.it/d98xpd58ej881.jpg,True,Tim Young @\n\n@TimRunsHisMouth\nThere's a cri...,Crime Wave Tim Young @\n\n@TimRunsHisMouth\nTh...,"[crime, wave, tim, young, timrunshismouth, cri...",crime wave tim young timrunshismouth crime wav...,-1,6
34957,1.653308e+09,Offensivejokes,204260.0,0.99,55.0,https://i.redd.it/oxfv6l40x7191.jpg,True,| think that the new James Bond should be\na w...,James Bond (she/her) | think that the new Jame...,"[james, bond, think, new, james, bond, woman, ...",james bond think new james bond woman think gr...,-1,6
35032,1.653308e+09,Offensivejokes,204260.0,0.99,55.0,https://i.redd.it/oxfv6l40x7191.jpg,True,| think that the new James Bond should be\na w...,James Bond (she/her) | think that the new Jame...,"[james, bond, think, new, james, bond, woman, ...",james bond think new james bond woman think gr...,-1,6
35067,1.636956e+09,Offensivejokes,204260.0,1.00,37.0,https://i.redd.it/gxs0qdla9pz71.jpg,True,,Happy new year.,"[happy, new, year]",happy new year,-1,6
