In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Load the parsed MediaCloud corona articles and keep only the rows whose
# "Text" column is populated; everything downstream vectorizes that column.
corpus_path = "mediacloud_parsed_corona_df.csv"
df = pd.read_csv(corpus_path).dropna(subset=["Text"])
df.head()

In [None]:
# Display (n_rows, n_columns) of `df` as a quick sanity check on the loaded data.
df.shape

In [None]:
# Build the TF-IDF document-term matrix for the article texts, ignoring
# English stop words.
vec = TfidfVectorizer(stop_words="english")
# fit_transform learns the vocabulary/IDF weights and vectorizes in a single
# pass; the previous fit() + transform() pair tokenized every document twice
# to produce the same matrix.
features = vec.fit_transform(df.Text.values)

## K-means

In [None]:
# K-means configuration: number of clusters and a fixed seed so the
# clustering is reproducible across runs.
num_clusters = 7
random_state = 0

model = KMeans(
    n_clusters=num_clusters,
    init='k-means++',
    n_init=1,
    max_iter=100,
    random_state=random_state,
)
# fit() returns the estimator itself, so `data` refers to the same fitted
# object as `model`.
data = model.fit(features)

In [None]:
# Project the K-means centroids to 2-D with t-SNE and plot them.
centroids = model.cluster_centers_

# t-SNE hyper-parameters.
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000
# NOTE: this rebinds the `random_state` name that the K-means cell also sets.
random_state = 1
# General-purpose t-SNE model (kept with the configured perplexity because
# later cells reuse `model_tsne`).
model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init, perplexity=tsne_perplexity,
         early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)

# There is only one sample per cluster here, and t-SNE requires
# perplexity < n_samples, so use a dedicated instance with a capped perplexity.
centroid_perplexity = min(tsne_perplexity, max(1.0, len(centroids) - 1.0))
centroid_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                     perplexity=centroid_perplexity,
                     early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)

# Bug fix: this used to call `model.fit_transform(centroids)`, i.e. it re-fit
# the KMeans estimator on its own centroids (overwriting the fitted model that
# later cells rely on) and plotted two columns of a cluster-distance matrix
# rather than a t-SNE embedding.
transformed_centroids = centroid_tsne.fit_transform(centroids)
plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
plt.title("t-SNE projection of K-means centroids")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the K-means assignment: each document is labelled with
# the cluster that `model` predicts for it, then scored against the TF-IDF
# features.
kmeans_labels = model.predict(features)
silhouette_score(features, labels=kmeans_labels)

In [None]:
# Print the ten highest-weighted vocabulary terms of every K-means centroid,
# one cluster per line.
print("Top terms per cluster:")
# argsort() is ascending; reversing each row puts the largest weights first.
order_centroids = centroids.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installs.
_feature_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
terms = list(_feature_names())
for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    # The original used Python 2 trailing-comma prints (and a bare `print`),
    # which in Python 3 emit one term per line and no cluster separator.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))

## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters: neighbourhood radius and the minimum number of
# neighbours for a point to count as a core point.
eps = 0.3
min_samples = 10

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
# The former `features_db = features.reshape(-1, 1)` line was removed: it
# flattened the whole sparse matrix into a single column and was never used
# (the model is fitted on `features` itself).
# fit() returns the estimator, so `data_db` is the same object as `dbscan`.
data_db = dbscan.fit(features)
# One cluster label per document; DBSCAN marks noise points with -1.
labels = dbscan.labels_



In [None]:
# Visualise the DBSCAN result: embed the documents in 2-D and colour each
# point by its cluster label.
#
# The previous version ran t-SNE on `labels.reshape(-1, 1)`, i.e. on the 1-D
# label vector itself, which carries no geometry to embed and cannot be
# PCA-initialised to 2 components. NOTE(review): embedding the documents is the
# presumed intent - confirm.
from sklearn.decomposition import TruncatedSVD

# t-SNE's PCA initialisation does not accept sparse input, so first reduce the
# sparse TF-IDF matrix to a small dense representation.
svd_components = min(50, features.shape[1] - 1)
features_reduced = TruncatedSVD(n_components=svd_components,
                                random_state=random_state).fit_transform(features)

transformed = model_tsne.fit_transform(features_reduced)
plt.scatter(transformed[:, 0], transformed[:, 1], c=labels, marker='x')
plt.title("t-SNE projection of documents coloured by DBSCAN label")
plt.show()

In [None]:
# Silhouette score of the DBSCAN clustering.
# DBSCAN has no predict() method (the previous call raised AttributeError);
# the assignments for the fitted data live in `labels_`.
db_labels = dbscan.labels_
# Noise points (label -1) are not a cluster, so leave them out of the score.
clustered = db_labels != -1
n_db_clusters = len(set(db_labels[clustered]))
# silhouette_score needs at least 2 clusters and fewer clusters than samples.
if 2 <= n_db_clusters < int(clustered.sum()):
    dbscan_silhouette = silhouette_score(features[clustered], labels=db_labels[clustered])
else:
    dbscan_silhouette = float("nan")
dbscan_silhouette