In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary


In [2]:
# Load the subreddit-level corpus produced by the preprocessing step.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
processed_df = pd.read_pickle(
    '../data/preprocessed_df.pkl'
)

In [3]:
# Preview the first five rows as a sanity check on the load.
processed_df.head(n=5)

Unnamed: 0,text
TheRedPill,google fire ph.d biologist/engineer claiming a...
ivermectin,evidence whatsoever ivermectin safer treatment...
BlackPeopleTwitter,wildest trend imo gon come world shamble fraud...
WhitePeopleTwitter,solution obvious shooting foot choice clear b...
politics,megathread joe biden projected defeat presiden...


In [4]:
# Turn each row of processed_df['text'] into a TF-IDF vector.
# max_df=0.95 drops terms that occur in more than 95% of the documents,
# i.e. vocabulary that is too common to separate the rows.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95)
text_transformed = tfidf_vectorizer.fit_transform(processed_df['text'])

In [5]:
# Echo the TF-IDF matrix repr to check its shape (documents x vocabulary) and sparsity.
text_transformed

<235x48331 sparse matrix of type '<class 'numpy.float64'>'
	with 286842 stored elements in Compressed Sparse Row format>

In [6]:
# Partition the TF-IDF rows into five K-Means clusters; the fixed random_state
# pins the centroid initialisation so the labels are reproducible across runs.
k_clusterer = KMeans(
    n_clusters=5,
    random_state=305,
)
# Kept as the cell's final expression so the fitted estimator is echoed.
k_clusterer.fit(text_transformed)

KMeans(n_clusters=5, random_state=305)

In [7]:
# Attach each row's K-Means cluster id as a new column. labels_ follows the row
# order of text_transformed, which was built directly from processed_df['text'].
processed_df['k_means_labels'] = k_clusterer.labels_

In [8]:
# Cluster sizes, largest first, to see how balanced the K-Means partition is.
(
    processed_df['k_means_labels']
    .value_counts()
)

0    120
4     47
1     30
2     29
3      9
Name: k_means_labels, dtype: int64

In [9]:
# Display the frame (pandas truncates the middle rows) to eyeball the new k_means_labels column.
processed_df

Unnamed: 0,text,k_means_labels
TheRedPill,google fire ph.d biologist/engineer claiming a...,1
ivermectin,evidence whatsoever ivermectin safer treatment...,0
BlackPeopleTwitter,wildest trend imo gon come world shamble fraud...,0
WhitePeopleTwitter,solution obvious shooting foot choice clear b...,0
politics,megathread joe biden projected defeat presiden...,2
...,...,...
Glocks,watching shit knowing damn well ammo price goi...,0
tacticalgear,snuck basement 👍🏼 going rock trash man ghillie...,0
liberalgunowners,fight net neutrality view gun ownership side g...,2
HillaryForPrison,unelectable vote show google image people sear...,4


In [10]:
# Hierarchical clustering where the number of clusters is not fixed up front:
# the merge tree is cut at distance_threshold instead. scikit-learn requires
# n_clusters=None and compute_full_tree=True when a distance_threshold is given.
agg_clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.75,
    compute_full_tree=True,
)
# AgglomerativeClustering only accepts dense input, hence .toarray().
# Kept as the final expression so the fitted estimator is echoed.
agg_clusterer.fit(text_transformed.toarray())

AgglomerativeClustering(compute_full_tree=True, distance_threshold=1.75,
                        n_clusters=None)

In [11]:
# Attach each row's agglomerative cluster id as a new column, alongside k_means_labels.
processed_df['agg_labels'] = agg_clusterer.labels_

In [12]:
# Cluster sizes, largest first; also shows how many clusters the threshold produced.
(
    processed_df['agg_labels']
    .value_counts()
)

3     39
1     31
15    23
5     23
2     22
7     19
8     15
0     11
9     10
16    10
12    10
4      6
10     6
6      4
13     2
14     2
11     2
Name: agg_labels, dtype: int64

In [13]:
# Inspect the members of agglomerative cluster 12 to judge whether it is topically coherent.
processed_df.loc[processed_df['agg_labels'].eq(12)]

Unnamed: 0,text,k_means_labels,agg_labels
ivermectin,evidence whatsoever ivermectin safer treatment...,0,12
lgbt,parent chose attend daughter wedding happier s...,0,12
traaaaaaannnnnnnnnns,oc art part trans struggle hard picturing feel...,0,12
trans,love punk transwoman cute comic irl_donut twit...,0,12
GenderCynical,gendercritical banned /r/gendercritical edit m...,0,12
asktransgender,compiled single informed consent clinic countr...,1,12
MtF,funny interaction airport security flying home...,1,12
ftm,king order masculine body dream thought guy o....,0,12
NonBinary,pov barista binder want u tell chest flat bee...,0,12
FreeSpeech,black man feel black life matter becoming bull...,0,12


## Trying clustering on posts

In [14]:
# Load the post-level frame (one row per post rather than one per subreddit).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
post_df = pd.read_pickle(
    "../data/df_by_post.pkl"
)

In [16]:
# Preview the first five posts and the available columns.
post_df.head(n=5)

Unnamed: 0,created_utc,num_comments,subreddit,subreddit_subscribers,upvote_ratio,url,image?,total_text,tokenned_stopped_text,finalized_text
0,1502177000.0,840.0,TheRedPill,0.0,0.65,https://www.reddit.com/r/TheRedPill/comments/6...,False,Google Fires Ph.D Biologist/Engineer For Claim...,"[google, fire, ph.d, biologist/engineer, claim...",google fire ph.d biologist/engineer claiming a...
1,1480424000.0,759.0,TheRedPill,0.0,0.82,https://www.reddit.com/r/TheRedPill/comments/5...,False,HOW TO GET LAID LIKE A WARLORD: 37 Rules of Ap...,"[laid, warlord, 37, rule, approaching, model-t...",laid warlord 37 rule approaching model-tier gi...
2,1502101000.0,179.0,TheRedPill,0.0,0.86,https://www.reddit.com/r/TheRedPill/comments/6...,False,I have been practicing Stoicism for 3 years no...,"[practicing, stoicism, 3, year, quality, life,...",practicing stoicism 3 year quality life increa...
3,1433962000.0,954.0,TheRedPill,0.0,0.87,https://www.reddit.com/r/TheRedPill/comments/3...,False,"[META] Reddit rolls out first ban wave of ""Har...","[meta, reddit, roll, first, ban, wave, harassi...",meta reddit roll first ban wave harassing subr...
4,1500270000.0,336.0,TheRedPill,0.0,0.9,https://www.reddit.com/r/TheRedPill/comments/6...,False,"Man gets his ex-GF to pay child support, and R...","[man, get, ex-gf, pay, child, support, reddit,...",man get ex-gf pay child support reddit freak k...


In [17]:
# Re-vectorise at post granularity with a stricter cap than the subreddit-level
# run: terms occurring in more than 80% of posts are dropped.
# NOTE: this rebinds tfidf_vectorizer and text_transformed, which previously held
# the subreddit-level fit; re-running earlier cells after this one would pick up
# the post-level objects instead.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8)
text_transformed = tfidf_vectorizer.fit_transform(post_df['finalized_text'])

In [18]:
# Hierarchical clustering of individual posts, cutting the merge tree at
# distance_threshold instead of fixing the number of clusters. scikit-learn
# requires n_clusters=None and compute_full_tree=True in this mode.
# NOTE(review): 1.75 is carried over from the subreddit-level run; confirm it is
# still a sensible cut for post-level vectors.
agg_clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.75,
    compute_full_tree=True,
)
# AgglomerativeClustering rejects scipy sparse matrices (TypeError asking for
# dense data), so densify first, matching the subreddit-level cell.
# NOTE(review): the dense posts-by-vocabulary array may be large; if memory is a
# problem, reduce dimensionality or sample the posts before clustering.
agg_clusterer.fit(text_transformed.toarray())