In [5]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import joblib
import pickle


In [6]:
# Load the pre-cleaned corpus from a local pickle.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
with open('./dataframes/full_data_clean_df_pickle4.pkl', 'rb') as f:
    df = pickle.load(f)

# Fail fast if the pickle does not hold what the following cells rely on
# (they call df.sample(...) and index the result by column name).
assert isinstance(df, pd.DataFrame), f'expected a DataFrame, got {type(df).__name__}'

## 1% sample of the dataset: TF-IDF + K-means silhouette sweep

In [7]:
# Draw a 1% random sample without replacement. The seed is fixed so that
# Restart & Run All reproduces the same subset (and therefore the same scores).
df_subset = df.sample(frac=0.01, replace=False, random_state=42)

In [8]:
# Text column of the sampled rows that is fed to the vectorizer below.
docs_cleaned = df_subset.loc[:, 'cleaned_bow']

In [9]:
# TF-IDF over 1- to 3-grams, with document-frequency bounds (max_df / min_df)
# and the vocabulary capped at 500 features.
vectorizer = TfidfVectorizer(max_df=0.85,
                             min_df=10,
                             ngram_range=(1,3),
                             max_features=500)
docs_vec = vectorizer.fit_transform(docs_cleaned)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# versions; .tolist() keeps `features` a plain list of str, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [10]:
# Sweep candidate cluster counts (50, 100, ..., 300) and record the
# silhouette score of each K-means fit on the TF-IDF matrix.
ks = list(range(50, 301, 50))
scores = []
for k in ks:
    # K-means initialisation is random; a fixed seed makes the printed
    # scores reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(docs_vec)
    score = silhouette_score(docs_vec, kmeans.labels_)
    scores.append(score)
    print(f'k = {k}, silhouette score = {score}')

k = 50, silhouette score = 0.047709275626406644
k = 100, silhouette score = 0.046926728739201576
k = 150, silhouette score = 0.04210847599565598
k = 200, silhouette score = 0.04077264312069481
k = 250, silhouette score = 0.03699523837303974
k = 300, silhouette score = 0.035426872735838194


## 10% sample of the dataset: TF-IDF + K-means silhouette sweep

In [11]:
# Draw a 10% random sample without replacement. The seed is fixed so that
# Restart & Run All reproduces the same subset (and therefore the same scores).
df_subset = df.sample(frac=0.1, replace=False, random_state=42)

In [12]:
# Text column of the sampled rows that is fed to the vectorizer below.
docs_cleaned = df_subset.loc[:, 'cleaned_bow']

In [13]:
# TF-IDF over 1- to 3-grams, with document-frequency bounds (max_df / min_df)
# and the vocabulary capped at 500 features.
vectorizer = TfidfVectorizer(max_df=0.85,
                             min_df=10,
                             ngram_range=(1,3),
                             max_features=500)
docs_vec = vectorizer.fit_transform(docs_cleaned)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and fall back on older
# versions; .tolist() keeps `features` a plain list of str, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()

In [None]:
# Sweep candidate cluster counts (30, 40, ..., 110) and record the
# silhouette score of each K-means fit on the TF-IDF matrix.
# NOTE(review): silhouette_score is evaluated on every row here; if that is
# too slow on the 10% sample, its sample_size / random_state parameters can
# subsample instead -- that would change the reported values, so confirm first.
ks = list(range(30, 111, 10))
scores = []
for k in ks:
    # K-means initialisation is random; a fixed seed makes the printed
    # scores reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(docs_vec)
    score = silhouette_score(docs_vec, kmeans.labels_)
    scores.append(score)
    print(f'k = {k}, silhouette score = {score}')