In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import ast 
import re
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.feature_selection import chi2
from sklearn.decomposition import TruncatedSVD

In [None]:
# Load the CSV and keep only the rows that have a non-null Text value.
# .copy() detaches the filtered frame from the one that was read, so adding columns to
# `df` in later cells does not raise pandas' SettingWithCopyWarning.
df = pd.read_csv("mediacloud_parsed_corona_df.csv")
df = df[df['Text'].notna()].copy()
df.head()

In [None]:

# The CSV holds each token list as a string; turn it back into a real list.
# Values that are no longer strings are left untouched so the cell can be re-run
# without ast.literal_eval failing on already-parsed lists.
df['tokens'] = df['tokens'].apply(
    lambda tokens: ast.literal_eval(tokens) if isinstance(tokens, str) else tokens
)

In [None]:
def dummy_fun(doc):
    """Identity function: return the given document unchanged.

    It is handed to TfidfVectorizer as both tokenizer and preprocessor so that
    documents which are already lists of tokens pass through as-is.
    """
    return doc

# TF-IDF vectorizer for documents that are already lists of tokens: tokenizer and
# preprocessor are the identity function, and token_pattern is explicitly disabled.
tfidf = TfidfVectorizer(
    analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None,
)

In [None]:
# Learn the vocabulary / IDF weights and vectorize the documents in one pass
# (same result as fit() followed by transform(), without processing the corpus twice).
features = tfidf.fit_transform(df['tokens'])

## K-means

In [None]:
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

# Vocabulary of the fitted vectorizer; it is the same for every k, so look it up once.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() whenever the installed version provides it.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
terms = list(get_feature_names())
assert len(terms) == features.shape[1], 'tfidf vocabulary does not match `features`'

model_results = {'Topics': [],
                 'Silhouette': [],
                 'Top_terms': []
                }
for k in range(3, 10, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=random_state)
    model.fit(features)
    sil = silhouette_score(features, labels=model.predict(features))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Project the k centroids to 2-D with t-SNE. This must go through model_tsne: calling
    # fit_transform on the k-means model would refit the clustering on the centroids.
    # Perplexity has to stay below the number of embedded points, hence the cap at k - 1.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, float(k - 1)),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.title(f't-SNE projection of the {k} cluster centroids')
    plt.show()

    # Term indices of every centroid sorted by descending weight; row i belongs to
    # cluster label i, so top_terms[i] lists the ten heaviest terms of cluster i.
    order_centroids = centroids.argsort()[:, ::-1]
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

    model_results['Topics'].append(k)
    model_results['Silhouette'].append(sil)
    model_results['Top_terms'].append(top_terms)

models_df = pd.DataFrame(model_results)

In [None]:
# Reuse previously saved results when the file exists. The file is written together with
# its index and stores every Top_terms entry as a string, so drop the index column and
# parse the term lists back into Python lists before they are scored.
try:
    models_df = pd.read_csv('mediacloud_kmeans_results.csv', index_col=0,
                            converters={'Top_terms': ast.literal_eval})
except FileNotFoundError:
    # First run: the CSV is only written further down, so keep the in-memory models_df.
    pass

In [None]:
def get_coherence(topics, dct, texts, coherence='c_v'):
    """Score a set of topics with gensim's CoherenceModel.

    Parameters
    ----------
    topics : list of list of str
        The top terms of each topic.
    dct : gensim.corpora.Dictionary
        Dictionary built from `texts`.
    texts : list of list of str
        The tokenized documents the topics were derived from.
    coherence : str, default 'c_v'
        Name of the coherence measure, passed straight through to CoherenceModel.

    Returns
    -------
    The value returned by CoherenceModel.get_coherence().
    """
    cm = CoherenceModel(topics=topics, dictionary=dct, texts=texts, coherence=coherence)
    return cm.get_coherence()

In [None]:
# Build the gensim dictionary from the tokenized documents, score the topics found for
# every k, then save and display the results table.
texts = df['tokens'].tolist()
dct = Dictionary(texts)

models_df['Coherence'] = [
    get_coherence(topic_terms, dct, texts) for topic_terms in models_df['Top_terms']
]
models_df.to_csv('mediacloud_kmeans_results.csv')
models_df

In [None]:
# Coherence as a function of the number of topics.
models_df.plot(x='Topics', y='Coherence', kind='line')

In [None]:
# Top terms stored in the row labelled 2.
models_df.loc[2, 'Top_terms']

## Removing corona-related terms:

In [None]:
# Read the whitespace-separated corona vocabulary. split() without an argument handles
# newlines, carriage returns and repeated spaces without producing empty strings, and a
# set gives constant-time membership tests while every document is filtered.
with open('corona_terms.txt', 'r') as corona_terms_file:
    corona_terms = set(corona_terms_file.read().split())

df['tokens_clean'] = df['tokens'].apply(
    lambda tokens: [token for token in tokens if token not in corona_terms]
)

In [None]:
# Refit the shared vectorizer on the cleaned tokens and vectorize them in one pass.
features_clean = tfidf.fit_transform(df['tokens_clean'])

# Vocabulary of the refitted vectorizer; identical for every k, so look it up once.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() whenever the installed version provides it.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
terms = list(get_feature_names())

texts = df['tokens_clean'].tolist()
dct = Dictionary(texts)


model_nocorona_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms': []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=random_state)
    model.fit(features_clean)
    sil = silhouette_score(features_clean, labels=model.predict(features_clean))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Project the k centroids to 2-D with t-SNE. This must go through model_tsne: calling
    # fit_transform on the k-means model would refit the clustering on the centroids.
    # Perplexity has to stay below the number of embedded points, hence the cap at k - 1.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, float(k - 1)),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.title(f't-SNE projection of the {k} cluster centroids')
    plt.show()

    # Term indices of every centroid sorted by descending weight; row i belongs to
    # cluster label i, so top_terms[i] lists the ten heaviest terms of cluster i.
    order_centroids = centroids.argsort()[:, ::-1]
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

    coherence = get_coherence(top_terms, dct, texts)

    model_nocorona_results['Topics'].append(k)
    model_nocorona_results['Silhouette'].append(sil)
    model_nocorona_results['Top_terms'].append(top_terms)
    model_nocorona_results['Coherence'].append(coherence)

model_nocorona_results_df = pd.DataFrame(model_nocorona_results)

In [None]:
# Write the no-corona sweep to disk, then display it.
model_nocorona_results_df.to_csv(path_or_buf='mediacloud_kmeans_nocorona_results.csv')
model_nocorona_results_df

In [None]:
# Coherence as a function of the number of topics (corona terms removed).
model_nocorona_results_df.plot(x='Topics', y='Coherence', kind='line')

In [None]:
# Top terms stored in the row labelled 3.
model_nocorona_results_df.loc[3, 'Top_terms']

## Creating bigrams and trigrams

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(df['tokens_clean'], min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[df['tokens_clean']], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example. Positional access is used because the frame was filtered without
# resetting its index, so a row labelled 0 is not guaranteed to exist.
print(trigram_mod[bigram_mod[df['tokens_clean'].iloc[0]]])

In [None]:
def make_bigrams(texts):
    """Apply the bigram phraser to every tokenized document in `texts`.

    Returns a list holding one transformed token sequence per input document.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every document in `texts`.

    Returns a list holding one transformed token sequence per input document.
    """
    phrased_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        phrased_docs.append(trigram_mod[with_bigrams])
    return phrased_docs

data_words_bigrams = make_bigrams(df['tokens_clean'])

# Preview a few documents only; echoing the whole corpus floods the notebook output.
data_words_bigrams[:3]

In [None]:
# Refit the shared vectorizer on the bigram documents and vectorize them in one pass
# (same result as fit() followed by transform(), without processing the corpus twice).
features_bigrams = tfidf.fit_transform(data_words_bigrams)

# gensim dictionary over the same documents, used for the coherence scores.
dct = Dictionary(data_words_bigrams)

In [None]:
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

# Vocabulary of the fitted vectorizer; it is the same for every k, so look it up once.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() whenever the installed version provides it.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
terms = list(get_feature_names())
assert len(terms) == features_bigrams.shape[1], 'tfidf vocabulary does not match `features_bigrams`'

model_bigram_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms': []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=random_state)
    model.fit(features_bigrams)
    sil = silhouette_score(features_bigrams, labels=model.predict(features_bigrams))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Project the k centroids to 2-D with t-SNE. This must go through model_tsne: calling
    # fit_transform on the k-means model would refit the clustering on the centroids.
    # Perplexity has to stay below the number of embedded points, hence the cap at k - 1.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, float(k - 1)),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.title(f't-SNE projection of the {k} cluster centroids')
    plt.show()

    # Term indices of every centroid sorted by descending weight; row i belongs to
    # cluster label i, so top_terms[i] lists the ten heaviest terms of cluster i.
    order_centroids = centroids.argsort()[:, ::-1]
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]
    print('Terms are extracted')
    coherence = get_coherence(top_terms, dct, data_words_bigrams)
    print('Coherence score is calculated')

    model_bigram_results['Topics'].append(k)
    model_bigram_results['Silhouette'].append(sil)
    model_bigram_results['Top_terms'].append(top_terms)
    model_bigram_results['Coherence'].append(coherence)
    print('Result is appended')

# Build the results frame directly; the previous .append() call referenced a frame that
# did not exist yet and threw its return value away.
model_bigram_results_df = pd.DataFrame(model_bigram_results)

In [None]:
# Turn the collected bigram sweep into a frame, write it to disk and display it.
model_bigram_results_df = pd.DataFrame(data=model_bigram_results)
model_bigram_results_df.to_csv(path_or_buf='mediacloud_kmeans_bigrams_nocorona_results.csv')
model_bigram_results_df

In [None]:
# Reload the bigram results from the file the previous cell writes
# ('mediacloud_kmeans_bigrams_nocorona_results.csv'). That file is saved together with
# its index and stores Top_terms as strings, so drop the index column and parse the term
# lists back into Python lists.
model_bigram_results_df = pd.read_csv('mediacloud_kmeans_bigrams_nocorona_results.csv', index_col=0,
                                      converters={'Top_terms': ast.literal_eval})
model_bigram_results_df

In [None]:
# Coherence as a function of the number of topics (bigram documents).
model_bigram_results_df.plot(x='Topics', y='Coherence', kind='line')

In [None]:
# Top terms stored in the row labelled 7.
model_bigram_results_df.loc[7, 'Top_terms']

In [None]:
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

# Refit the shared vectorizer on the trigram documents and vectorize them in one pass.
data_words_trigrams = make_trigrams(df['tokens_clean'])
features_trigrams = tfidf.fit_transform(data_words_trigrams)

# Vocabulary of the refitted vectorizer; identical for every k, so look it up once.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() whenever the installed version provides it.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
terms = list(get_feature_names())

dct = Dictionary(data_words_trigrams)

model_trigram_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms': []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=random_state)
    model.fit(features_trigrams)
    sil = silhouette_score(features_trigrams, labels=model.predict(features_trigrams))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Project the k centroids to 2-D with t-SNE. This must go through model_tsne: calling
    # fit_transform on the k-means model would refit the clustering on the centroids.
    # Perplexity has to stay below the number of embedded points, hence the cap at k - 1.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, float(k - 1)),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.title(f't-SNE projection of the {k} cluster centroids')
    plt.show()

    # Term indices of every centroid sorted by descending weight; row i belongs to
    # cluster label i, so top_terms[i] lists the ten heaviest terms of cluster i.
    order_centroids = centroids.argsort()[:, ::-1]
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

    coherence = get_coherence(top_terms, dct, data_words_trigrams)

    model_trigram_results['Topics'].append(k)
    model_trigram_results['Silhouette'].append(sil)
    model_trigram_results['Top_terms'].append(top_terms)
    model_trigram_results['Coherence'].append(coherence)

model_trigram_results_df = pd.DataFrame(model_trigram_results)
model_trigram_results_df.to_csv('mediacloud_kmeans_trigrams_results.csv')

In [None]:
# Reload the saved trigram results. The file is written together with its index and
# stores Top_terms as strings, so drop the index column and parse the term lists back
# into Python lists instead of leaving them as text.
model_trigram_results_df = pd.read_csv('mediacloud_kmeans_trigrams_results.csv', index_col=0,
                                       converters={'Top_terms': ast.literal_eval})
model_trigram_results_df

In [None]:
# Coherence as a function of the number of topics (trigram documents).
model_trigram_results_df.plot(x='Topics', y='Coherence', kind='line')

In [None]:
# Top terms stored in the row labelled 4.
model_trigram_results_df.loc[4, 'Top_terms']

## Hierarchical clustering
Applied only to the clusters that seem to relate to stories.

In [None]:
# Fit a 5-cluster k-means on `features` and keep the per-document cluster labels.
model = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=random_state,
)
data = model.fit(features)  # fit() returns the estimator itself, so `data` aliases `model`
labels = data.labels_

In [None]:
centroids = model.cluster_centers_

# The shared `tfidf` vectorizer has been refit on other token variants since `features`
# was built, so its vocabulary no longer lines up with the centroid columns. Refit it on
# the original tokens to recover the vocabulary that `features` was built with.
tfidf.fit(df['tokens'])
# get_feature_names() was removed in scikit-learn 1.2, so prefer its replacement
# get_feature_names_out() whenever the installed version provides it.
get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
terms = list(get_feature_names())
assert len(terms) == centroids.shape[1], 'tfidf vocabulary does not match the clustered features'

# Term indices of every centroid sorted by descending weight; row i belongs to cluster
# label i, so top_terms[i] lists the ten heaviest terms of the cluster labelled i.
order_centroids = centroids.argsort()[:, ::-1]
top_terms = [[terms[ind] for ind in order_centroids[i, :10]]
             for i in range(len(order_centroids))]

top_terms

In [None]:
# Select only the documents of the first general topic (rows whose cluster label is 1),
# then refit the vectorizer on that subset and turn it into TF-IDF vectors.
selected_df = df.loc[labels == 1]
selected_features = tfidf.fit_transform(selected_df['tokens'])

In [None]:
# Dimensions of the TF-IDF matrix built for the selected documents.
selected_features.shape

In [None]:
# Reduce the selected TF-IDF matrix to 100 components with truncated SVD and show the
# shape of the reduced matrix.
svd = TruncatedSVD(
    random_state=random_state,
    n_components=100,
)
features_red = svd.fit_transform(selected_features)
features_red.shape

In [None]:
# Ward linkage over the SVD-reduced features; consumed by the dendrogram below.
linkage_matrix = ward(features_red)

In [None]:
fig, ax = plt.subplots(figsize=(15, 20))  # set size
# Draw onto the axes created above and keep the returned plot data under its own name
# instead of overwriting `ax` with it.
dendrogram_data = dendrogram(linkage_matrix, orientation="right",
                             labels=selected_df['stories_id'].tolist(), ax=ax)

# matplotlib expects real booleans here; the legacy 'off' strings are truthy and would
# leave the ticks switched on.
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # labels along the bottom edge are off

fig.tight_layout()

In [None]:
# AgglomerativeClustering is already imported in the notebook's import cell.
n_clusters = 8
# Ward linkage only works with Euclidean distances, which is also the default, so the
# `affinity` argument (deprecated in scikit-learn 1.2 and removed later) is left out.
model_hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
model_hierarchical.fit_predict(features_red)


In [None]:
# For every hierarchical cluster, print its label and display the titles of its stories.
for label in range(model_hierarchical.n_clusters_):
    print(label)
    display(selected_df.loc[model_hierarchical.labels_ == label, 'title'])

In [None]:
# Number of documents that received a hierarchical cluster label.
model_hierarchical.labels_.shape[0]

In [None]:
from yellowbrick.text import TSNEVisualizer

# Fit yellowbrick's t-SNE visualizer on the SVD-reduced features, passing the
# hierarchical cluster labels as the second argument, then render the plot.
tsne = TSNEVisualizer()
tsne.fit(features_red, model_hierarchical.labels_)
tsne.show()