In [235]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary


In [236]:
# Load the pickled DataFrame of preprocessed text that the rest of this notebook clusters.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so only
# point this at a file that comes from a trusted source.
PREPROCESSED_PATH = '../data/preprocessed_df.pkl'
processed_df = pd.read_pickle(PREPROCESSED_PATH)

In [237]:
# Peek at the first five rows to sanity-check what was loaded.
processed_df.head(n=5)

Unnamed: 0,text
TheRedPill,google fire ph.d biologist/engineer claiming a...
ivermectin,evidence whatsoever ivermectin safer treatment...
BlackPeopleTwitter,wildest trend imo gon come world shamble fraud...
WhitePeopleTwitter,solution obvious shooting foot choice clear b...
politics,megathread joe biden projected defeat presiden...


In [238]:
# Turn each row's text into a TF-IDF vector. max_df=0.95 tells the vectorizer to
# ignore terms that appear in more than 95% of the documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95)
text_transformed = tfidf_vectorizer.fit_transform(processed_df['text'])

In [239]:
# Display the matrix repr to check its shape and sparsity before clustering.
text_transformed

<191x44206 sparse matrix of type '<class 'numpy.float64'>'
	with 246165 stored elements in Compressed Sparse Row format>

In [240]:
# Partition the TF-IDF vectors into five clusters with k-means.
# random_state fixes the random centroid initialisation so reruns give the same labels.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in 1.4 (and warns about it on 1.2/1.3), so leaving it implicit makes the resulting
# clusters depend on which version happens to be installed.
k_clusterer = KMeans(n_clusters=5, n_init=10, random_state=305)

k_clusterer.fit(text_transformed)

KMeans(n_clusters=5, random_state=305)

In [241]:
# Attach each row's k-means cluster id as a new column.
processed_df = processed_df.assign(k_means_labels=k_clusterer.labels_)

In [242]:
# Cluster sizes: number of rows assigned to each k-means label, largest first.
processed_df['k_means_labels'].value_counts()

1    78
2    62
3    35
0     9
4     7
Name: k_means_labels, dtype: int64

In [243]:
# Inspect the frame with the k_means_labels column alongside the text.
processed_df

Unnamed: 0,text,k_means_labels
TheRedPill,google fire ph.d biologist/engineer claiming a...,1
ivermectin,evidence whatsoever ivermectin safer treatment...,4
BlackPeopleTwitter,wildest trend imo gon come world shamble fraud...,2
WhitePeopleTwitter,solution obvious shooting foot choice clear b...,1
politics,megathread joe biden projected defeat presiden...,3
...,...,...
Anarcho_Capitalism,thought police v angry citizen melbourne austr...,0
ar15,guy found detent pin moved 3 year ago green ca...,2
guns,colt .45 limited batch west point class gradua...,1
nra,touché protest guy making use 2nd amendment di...,2


In [251]:
# Hierarchical clustering without fixing the number of clusters up front:
# with n_clusters=None, clusters are not merged once the linkage distance is at
# or above distance_threshold. scikit-learn requires compute_full_tree=True
# whenever distance_threshold is set.
# NOTE(review): 1.7 looks hand-tuned — confirm how it was chosen.
agg_clusterer = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=1.7,
    compute_full_tree=True,
)
# The estimator expects a dense array, so the sparse TF-IDF matrix is densified here.
agg_clusterer.fit(text_transformed.toarray())

AgglomerativeClustering(compute_full_tree=True, distance_threshold=1.7,
                        n_clusters=None)

In [252]:
# Attach each row's agglomerative cluster id as a new column.
processed_df = processed_df.assign(agg_labels=agg_clusterer.labels_)

In [253]:
# Cluster sizes: number of rows assigned to each agglomerative label, largest first.
processed_df['agg_labels'].value_counts()

0     44
8     23
7     22
5     22
3     18
2     15
6     10
1      9
10     8
11     7
4      5
14     2
13     2
12     2
9      2
Name: agg_labels, dtype: int64

In [268]:
# Show the rows that landed in agglomerative cluster 0.
in_agg_cluster_zero = processed_df['agg_labels'].eq(0)
processed_df[in_agg_cluster_zero]

Unnamed: 0,text,k_means_labels,agg_labels
BlackPeopleTwitter,wildest trend imo gon come world shamble fraud...,2,0
WhitePeopleTwitter,solution obvious shooting foot choice clear b...,1,0
MurderedByWords,sudden law order apply think answer good gas p...,1,0
shitposting,üóø death sentence dollar dollar bill y'all shid...,1,0
linuxmasterrace,title time changing -50m user ltt basically tr...,2,0
memes,short story best ok house big six feel privil...,1,0
gatesopencomeonin,friendly encouragement bikini boy lovely rob z...,1,0
wholesomememes,considerate man wholesome meeting tumblr real ...,1,0
PoliticalCompassMemes,taxation without representation refund gamer w...,3,0
PoliticalHumor,try potus challenge holding book read front pl...,3,0
