In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# from sklearn.metrics import silhouette_score
import ast 
# import re
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import ward, dendrogram
# from sklearn.feature_selection import chi2
from sklearn.decomposition import TruncatedSVD
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
import os.path
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn import metrics

In [None]:
# Sanity check: (rows, columns) of the full parsed September dataframe.
pd.read_csv('preprocessed_results/mediacloud_parsed_corona_df_sep.csv').shape

Reading the dataframes with parsed text and meta-information from the CSV files for <b>February, May and September</b>, drawing a sample of 5000 articles for each period and storing the samples for later use.

In [None]:
# feb = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_feb.csv")
# may = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_may.csv")
# sep = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_sep.csv")

def sample_and_save(path, n=5000, random_state=None):
    """Draw a random sample of articles from a CSV file and save it next to the source.

    Rows whose ``Text`` is missing are discarded, the leftover ``Unnamed: 0`` /
    ``Unnamed: 0.1`` columns are dropped when they are present, and the sample is
    written to ``<path without extension>_sample.csv``.

    Args:
        path: Path of the source CSV file; it must contain a ``Text`` column.
        n: Maximum number of rows to sample (default 5000). When fewer rows are
            available, all of them are kept instead of raising an error.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so that the
            sample can be reproduced. ``None`` keeps the draw random.

    Returns:
        None. The sample is written to disk and a confirmation is printed.
    """
    df = pd.read_csv(path)
    # reset_index() without drop=True keeps the original row position in an
    # ``index`` column of the saved sample.
    df = df[df['Text'].notnull()].reset_index()
    # errors='ignore': not every input file carries both helper columns.
    df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
    n_rows = min(n, len(df))
    sample_path = os.path.splitext(path)[0] + '_sample.csv'
    df.sample(n=n_rows, random_state=random_state).to_csv(sample_path)
    print(f'Sample of {n_rows} is saved to {sample_path}')
    
# Create and store the article sample for each of the three periods.
for _month in ('feb', 'may', 'sep'):
    sample_and_save(f"preprocessed_results/mediacloud_parsed_corona_df_{_month}.csv")

Lemmatization is applied before vectorization. Lemmatization converts a word to its meaningful base form and is considered a better method than stemming for document clustering (<a href="https://www.researchgate.net/publication/221615320_Stemming_and_lemmatization_in_the_clustering_of_Finnish_text_documents#:~:text=In%20comparison%20with%20stemming%2C%20lemmatization,are%20clustered%20for%20information%20retrieval.&text=and%20Retrieval%20%E2%80%93%20clustering">here</a>). The NLTK WordNet lemmatizer, which is based on the WordNet database, is used here. Note that the code below lemmatizes every token as a verb (<code>pos="v"</code>).

Reading the saved samples for each time period and transforming each article into a TF-IDF vector. The `BIGRAMS` parameter specifies whether unigrams or bigrams are used.

In [None]:
BIGRAMS = True # True: merge frequent word pairs into bigram tokens before vectorising; False: use unigrams only

def lemmatize(tokens):
    """Lemmatize every token as a verb with the NLTK WordNet lemmatizer.

    Args:
        tokens: Iterable of token strings.

    Returns:
        list: One lemma per input token, in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return lemmas

def dummy_fun(doc):
    """Identity function: hand the document back untouched.

    Passed to the vectorizer as both tokenizer and preprocessor so that
    documents are used exactly as they are given.
    """
    return doc

# Term counter for documents that are already lists of tokens: the tokenizer
# and the preprocessor are both the identity function, so every document is
# used exactly as given. token_pattern=None because the regex pattern is not
# used when a custom tokenizer is supplied.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

def make_bigrams(bigram_mod, texts):
    """Apply a phrase model to every document.

    Args:
        bigram_mod: Object that maps a token list to its phrase-merged version
            through ``bigram_mod[doc]``.
        texts: Iterable of token lists.

    Returns:
        list: ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def read_and_vectorize(path, cv, bigrams):
    """Load a sampled CSV, lemmatize its tokens and build a TF-IDF matrix.

    Args:
        path: CSV file with a ``tokens`` column that holds the string form of a
            Python list of tokens.
        cv: Count vectorizer; it is fitted on the documents of this file, so a
            shared instance is refitted on every call.
        bigrams: Truthy to merge frequent word pairs into bigram tokens (stored
            in a ``bigrams`` column) before vectorising; falsy to vectorise the
            lemmatized unigrams in ``norm_tokens``.

    Returns:
        tuple: ``(df, tfidf_matrix, terms)`` - the dataframe with the added
        token columns, the TF-IDF matrix and the vocabulary in column order.
    """
    df = pd.read_csv(path)
    df['tokens'] = df['tokens'].apply(ast.literal_eval) #transforming string of tokens to list
    df['norm_tokens'] = df['tokens'].apply(lemmatize)
    if bigrams: #specify if bigrams or unigrams are used for future clustering
        phrases = gensim.models.Phrases(df['norm_tokens'], min_count=3, threshold=50) # higher threshold fewer phrases.
        bigram_mod = gensim.models.phrases.Phraser(phrases)
        df['bigrams'] = make_bigrams(bigram_mod, df['norm_tokens'])
        print('Bigrams are created.')
        data = cv.fit_transform(df['bigrams'])
    else:
        data = cv.fit_transform(df['norm_tokens'])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        terms = list(cv.get_feature_names_out())
    else:
        terms = cv.get_feature_names()
    print(f'Len of terms: {len(terms)}')
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(data)
    print(f'Tfidf matrix is generated of shape {tfidf_matrix.shape}')
    return df, tfidf_matrix, terms

# Vectorise the three monthly samples. The same `cv` instance is passed every
# time and is refitted inside read_and_vectorize, so each month gets its own
# vocabulary (terms_feb / terms_may / terms_sep).
df_feb, tfidf_matrix_feb, terms_feb = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_feb_sample.csv', cv, BIGRAMS)
df_may, tfidf_matrix_may, terms_may = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_may_sample.csv', cv, BIGRAMS)
df_sep, tfidf_matrix_sep, terms_sep = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_sep_sample.csv', cv, BIGRAMS)

## K-means
Transforming the sparse TF-IDF matrix into a dense NumPy array. Running K-means for a range of k and recording the coherence and silhouette scores for each k, so that the best k can be selected afterwards.

In [None]:
def transform(tfidf_matrix):
    """Convert a sparse TF-IDF matrix into a dense 2-D NumPy array.

    Args:
        tfidf_matrix: SciPy sparse matrix with one row per document.

    Returns:
        numpy.ndarray: Dense array with the same shape as ``tfidf_matrix``
        (also for a matrix with zero rows).
    """
    # A single toarray() call replaces the former row-by-row Python loop.
    dense = np.asarray(tfidf_matrix.toarray())
    print(f'Matrix is tranformed into array of len {dense.shape[0]}')
    return dense

def get_coherence(topics, dct, texts):
    """Return the ``c_v`` coherence of a set of topics.

    Args:
        topics: List of topics, each a list of terms.
        dct: gensim ``Dictionary`` passed to the coherence model.
        texts: Tokenised documents passed to the coherence model.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    return CoherenceModel(
        topics=topics,
        dictionary=dct,
        texts=texts,
        coherence='c_v',
    ).get_coherence()

def kmeans_loop(tfidf_matrix, k_start, k_finish, terms, df, month):
    """Fit K-means for every k in ``range(k_start, k_finish)`` and score each model.

    Args:
        tfidf_matrix: Sparse TF-IDF matrix, one row per document.
        k_start: First number of clusters to try (inclusive).
        k_finish: End of the k range (exclusive).
        terms: Vocabulary in the column order of ``tfidf_matrix``.
        df: Dataframe holding the tokenised documents; the ``bigrams`` column is
            used when it exists, otherwise ``norm_tokens``.
        month: Label used in the name of the CSV file the results are saved to.

    Returns:
        pandas.DataFrame: One row per k with the columns ``Num_Topics``,
        ``Coherence``, ``Top_terms`` (entry i holds the 10 highest-weighted
        centroid terms of cluster i), ``Count_Clusters`` and ``Silhouette``.
        The same table is written to
        ``preprocessed_results/models_df_lemm_{month}.csv``.
    """
    transformed_tokens = transform(tfidf_matrix)
    random_state = 20

    # The reference corpus does not depend on k, so build it once. Fall back to
    # the unigram column when the data was vectorised without bigrams.
    text_column = 'bigrams' if 'bigrams' in df.columns else 'norm_tokens'
    texts = df[text_column].tolist()
    dct = Dictionary(texts)

    model_results = {'Num_Topics': [],
                     'Coherence': [],
                     'Top_terms' : [],
                     'Count_Clusters': [],
                     'Silhouette':[]
                    }
    for k in range(k_start, k_finish, 1):
        model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = random_state)
        clusters = model.fit_predict(transformed_tokens)
        print(f'Num of topics: {k}')
        # Column indices of every centroid, sorted from highest to lowest weight.
        order_centroids = model.cluster_centers_.argsort()[:, ::-1]
        # Row i belongs to cluster i; indexing with i-1 would rotate the list
        # so that entry 0 described the last cluster.
        top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

        coherence = get_coherence(top_terms, dct, texts)
        sil = metrics.silhouette_score(transformed_tokens, clusters, metric = 'euclidean')

        model_results['Num_Topics'].append(k)
        model_results['Top_terms'].append(top_terms)
        model_results['Coherence'].append(coherence)
        model_results['Count_Clusters'].append(Counter(clusters))
        model_results['Silhouette'].append(sil)

    models_df = pd.DataFrame(model_results)
    models_df.to_csv(f'preprocessed_results/models_df_lemm_{month}.csv')
    print('Model df is saved')
    return models_df
    
# Run the sweep over k = 3..29 for every month. February is included because
# the model-selection step needs models_df_feb as well.
models_df_feb = kmeans_loop(tfidf_matrix_feb, 3, 30, terms_feb, df_feb, 'feb')
models_df_may = kmeans_loop(tfidf_matrix_may, 3, 30, terms_may, df_may, 'may')
models_df_sep = kmeans_loop(tfidf_matrix_sep, 3, 30, terms_sep, df_sep, 'sep')

In [None]:
# Choosing the best model by silhouette score

def select_k(models_df, month):
    """Plot the silhouette score against k and return the best-scoring row.

    Args:
        models_df: Dataframe with at least the columns ``Num_Topics`` and
            ``Silhouette``.
        month: Label used as plot title and in the printed message.

    Returns:
        pandas.Series: The row of ``models_df`` with the highest silhouette score.
    """
    display(models_df.plot.line(x='Num_Topics', y='Silhouette', title=month, figsize=(6, 4)))
    # idxmax() returns an index *label*, so it has to be looked up with .loc;
    # .iloc only gave the right row for a default 0..n-1 index.
    best_model = models_df.loc[models_df['Silhouette'].idxmax()]
    best_k = best_model['Num_Topics']
    print(f'Max Silhouette score is k = {best_k} for month {month}')
    return best_model

# Pick the best k for each month. Requires models_df_feb, models_df_may and
# models_df_sep from the k-means sweep.
best_model_feb = select_k(models_df_feb, 'February')
best_model_may = select_k(models_df_may, 'May')
best_model_sep = select_k(models_df_sep, 'September')

Getting the information about the best model: how many data points are in each cluster, a t-SNE visualisation and the top terms for each cluster.

In [None]:
def plot_tsne_pca(tsne, labels, month):
    """Scatter-plot a 2-D embedding with one colour per cluster label.

    Args:
        tsne: Array whose first two columns are the x and y coordinates.
        labels: Non-negative integer cluster label of every point.
        month: Label inserted into the plot title.
    """
    # Divide by the number of clusters rather than by the largest label: the
    # hsv colormap is cyclic, so 0.0 and 1.0 are the same red and the first
    # and last cluster would be indistinguishable. This also avoids a division
    # by zero when there is only one cluster.
    n_clusters = max(labels) + 1
    point_colors = [cm.hsv(label / n_clusters) for label in labels]
    plt.figure(figsize=(10, 6))
    plt.scatter(tsne[:, 0], tsne[:, 1], c=point_colors, s=1)
    plt.title(f'TSNE Cluster Plot for {month}')
    
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the highest-weighted terms of every cluster.

    The rows of ``data`` are averaged per cluster; for each cluster the
    ``n_terms`` columns with the largest mean are printed in ascending order
    of weight, so the strongest term comes last.

    Args:
        data: 2-D array-like, one row per document and one column per term.
        clusters: Cluster label of every row of ``data``.
        labels: Term names in the column order of ``data``.
        n_terms: Number of terms to print per cluster.
    """
    cluster_means = pd.DataFrame(data).groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        top_columns = np.argsort(mean_weights)[-n_terms:]
        print(f'\nCluster {cluster_id}')
        print(','.join(labels[col] for col in top_columns))
    
def best_model_kmeans(best_model, month, tfidf_matrix, terms):
    """Refit the selected K-means model, plot its t-SNE embedding and print top terms.

    Args:
        best_model: Row of the sweep results; ``Num_Topics`` gives the number
            of clusters and ``Count_Clusters`` is printed as the cluster sizes.
        month: Label used in the plot title and in the name of the cached
            embedding file ``preprocessed_results/tsne_lemm_{month}.csv``.
        tfidf_matrix: Sparse TF-IDF matrix, one row per document.
        terms: Vocabulary in the column order of ``tfidf_matrix``.
    """
    random_state = 20
    transformed_tokens = transform(tfidf_matrix)

    model = KMeans(n_clusters=int(best_model['Num_Topics']), init='k-means++', max_iter=100, n_init=1, random_state = random_state)
    clusters = model.fit_predict(transformed_tokens)
    print('Total number of points in each cluster:', best_model['Count_Clusters'])

    tsne_path = f"preprocessed_results/tsne_lemm_{month}.csv"
    tsne = None
    if os.path.isfile(tsne_path):
        cached = np.genfromtxt(tsne_path, delimiter=',')
        # Only reuse a cached embedding that has one row per current document;
        # a file left over from a different sample would not line up with
        # `clusters`. (A same-sized but different sample cannot be detected.)
        if cached.ndim == 2 and cached.shape[0] == transformed_tokens.shape[0]:
            tsne = cached
    if tsne is None:
        # Seeded so that a recomputed embedding is reproducible.
        tsne = TSNE(random_state=random_state).fit_transform(transformed_tokens)
        np.savetxt(tsne_path, tsne, delimiter=",")
    plot_tsne_pca(tsne, clusters, month)
    get_top_keywords(transformed_tokens, clusters, terms, 10)
    
# Refit, plot and summarise the best February model.
best_model_kmeans(best_model_feb, 'february', tfidf_matrix_feb, terms_feb)

In [None]:
# Refit, plot and summarise the best May model.
best_model_kmeans(best_model_may, 'may', tfidf_matrix_may, terms_may)

In [None]:
# Refit, plot and summarise the best September model.
best_model_kmeans(best_model_sep, 'september', tfidf_matrix_sep, terms_sep)

Applying kmeans only on the last cluster:

In [None]:
# Rebuild the February inputs explicitly: the dense matrix and the cluster
# labels only exist as local variables inside best_model_kmeans. The K-means
# settings (including the seed) mirror the ones used there.
transformed_tokens_feb = transform(tfidf_matrix_feb)
clusters = KMeans(n_clusters=int(best_model_feb['Num_Topics']), init='k-means++', max_iter=100,
                  n_init=1, random_state=20).fit_predict(transformed_tokens_feb)

# NOTE(review): the text above says "last cluster" but label 1 is selected - confirm.
selected_features = transformed_tokens_feb[clusters == 1]
len(selected_features)

In [None]:
# Cluster the selected documents again.
# NOTE(review): `k` and `random_state` are not assigned at top level by any
# earlier cell - set them before running this cell.
model_sel = KMeans(n_clusters=k, random_state=random_state)
# Fit the new model (model_sel), not the unrelated `model` variable.
clusters_sel = model_sel.fit_predict(selected_features)
Counter(clusters_sel)

In [None]:
# Top 5 terms of every sub-cluster. get_top_keywords is already defined earlier
# in the notebook, so it is reused here rather than defined a second time.
get_top_keywords(selected_features, clusters_sel, terms_feb, 5)

## Removing corona-related terms:

In [None]:
# NOTE(review): `df` is not created by any visible cell (only df_feb, df_may
# and df_sep are) - confirm which dataframe is meant.
with open('corona_terms.txt', 'r') as corona_terms:
    # split() separates on any whitespace and never yields empty strings; a
    # set makes the membership test below O(1) per token.
    terms = set(corona_terms.read().split())

df['tokens_clean'] = df['tokens'].apply(lambda x: [token for token in x if token not in terms])

In [None]:
# NOTE(review): `tfidf` (presumably a TF-IDF vectorizer over pre-tokenised
# documents) is not created in any visible cell - confirm it exists before
# running this cell.

# Settings of the sweep. Same values as in the bigram/trigram cells further
# down, repeated here because this cell runs before them.
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

tfidf.fit(df['tokens_clean']) #fit and transform to vectors
features_clean = tfidf.transform(df['tokens_clean'])

texts = df['tokens_clean'].tolist()
dct = Dictionary(texts)

# The vocabulary does not depend on k. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement when available.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()

model_nocorona_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms' : []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = random_state)
    model.fit(features_clean)
    # silhouette_score is only reachable through the imported `metrics` module.
    sil = metrics.silhouette_score(features_clean, labels=model.predict(features_clean))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Only k centroids are embedded and t-SNE needs perplexity < n_samples.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, k - 1),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)

    # Embed with the t-SNE model; calling fit_transform on `model` would refit
    # K-means on its own centroids instead.
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.show()

    order_centroids = centroids.argsort()[:, ::-1]
    # Row i belongs to cluster i (i-1 would shift every cluster by one).
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

    coherence = get_coherence(top_terms, dct, texts)

    model_nocorona_results['Topics'].append(k)
    model_nocorona_results['Silhouette'].append(sil)
    model_nocorona_results['Top_terms'].append(top_terms)
    model_nocorona_results['Coherence'].append(coherence)

model_nocorona_results_df = pd.DataFrame(model_nocorona_results)

In [None]:
# Save the results of the sweep without corona terms and display the table.
model_nocorona_results_df.to_csv('mediacloud_kmeans_nocorona_results.csv')
model_nocorona_results_df

In [None]:
# Coherence score as a function of the number of clusters.
model_nocorona_results_df.plot.line(x='Topics', y='Coherence')

In [None]:
# Top terms of the model in row 3, i.e. k = 6 (the sweep starts at k = 3).
model_nocorona_results_df['Top_terms'][3]

## Creating bigrams and trigrams

In [None]:
# Build the bigram and trigram models on the cleaned tokens. The trigram model
# is trained on the documents after the bigram model has been applied to them.
bigram = gensim.models.Phrases(df['tokens_clean'], min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[df['tokens_clean']], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: the first document after applying both models
print(trigram_mod[bigram_mod[df['tokens_clean'][0]]])

In [None]:
def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every document.

    Uses the module-level ``bigram_mod`` and ``trigram_mod`` phrase models.

    Args:
        texts: Iterable of token lists.

    Returns:
        list: The phrase-merged token sequence of every document, in input order.
    """
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

# Reuse the two-argument make_bigrams(bigram_mod, texts) helper defined with
# the vectorisation code. Redefining it here with a different signature would
# break read_and_vectorize whenever that cell is run again.
data_words_bigrams = make_bigrams(bigram_mod, df['tokens_clean'])

# Preview a few documents instead of dumping the whole corpus into the output.
data_words_bigrams[:3]

In [None]:
# Refit the shared `tfidf` object on the bigram documents and vectorise them.
# NOTE(review): `tfidf` is not created in any visible cell - confirm it is defined.
tfidf.fit(data_words_bigrams) #fit and transform to vectors
features_bigrams = tfidf.transform(data_words_bigrams)

# gensim dictionary over the same documents; used for the coherence score.
dct = Dictionary(data_words_bigrams)

In [None]:
# Settings of the sweep.
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

# The vocabulary does not depend on k. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement when available.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()

model_bigram_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms' : []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = random_state)
    model.fit(features_bigrams)
    # silhouette_score is only reachable through the imported `metrics` module.
    sil = metrics.silhouette_score(features_bigrams, labels=model.predict(features_bigrams))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Only k centroids are embedded and t-SNE needs perplexity < n_samples.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, k - 1),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)

    # Embed with the t-SNE model; calling fit_transform on `model` would refit
    # K-means on its own centroids instead.
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.show()

    order_centroids = centroids.argsort()[:, ::-1]
    # Row i belongs to cluster i (i-1 would shift every cluster by one).
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]
    print('Terms are extracted')
    coherence = get_coherence(top_terms, dct, data_words_bigrams)
    print('Coherence score is calculated')

    model_bigram_results['Topics'].append(k)
    model_bigram_results['Silhouette'].append(sil)
    model_bigram_results['Top_terms'].append(top_terms)
    model_bigram_results['Coherence'].append(coherence)
    print('Result is appended')

# Build the results table directly: the former .append() call ran on a
# dataframe that did not exist yet and threw its result away.
model_bigram_results_df = pd.DataFrame(model_bigram_results)

In [None]:
# Collect the bigram sweep results into a dataframe, save them and display the table.
model_bigram_results_df = pd.DataFrame(model_bigram_results)
model_bigram_results_df.to_csv('mediacloud_kmeans_bigrams_nocorona_results.csv')
model_bigram_results_df

In [None]:
# Load previously saved bigram results.
# NOTE(review): this reads 'mediacloud_kmeans_bigrams_results.csv', while the
# cell above writes 'mediacloud_kmeans_bigrams_nocorona_results.csv' - confirm
# which results file is intended.
# index_col=0 restores the saved index instead of adding an 'Unnamed: 0'
# column; literal_eval turns the stored Top_terms strings back into lists.
model_bigram_results_df = pd.read_csv('mediacloud_kmeans_bigrams_results.csv', index_col=0,
                                      converters={'Top_terms': ast.literal_eval})
model_bigram_results_df

In [None]:
# Coherence score as a function of the number of clusters (bigram features).
model_bigram_results_df.plot.line(x='Topics', y='Coherence')

In [None]:
# Top terms stored in row 7 of the results table.
model_bigram_results_df['Top_terms'][7]

In [None]:
# Settings of the sweep.
random_state = 0
tsne_init = 'pca'  # could also be 'random'
tsne_perplexity = 20.0
tsne_early_exaggeration = 4.0
tsne_learning_rate = 1000

data_words_trigrams = make_trigrams(df['tokens_clean'])
tfidf.fit(data_words_trigrams) #fit and transform to vectors
features_trigrams = tfidf.transform(data_words_trigrams)

dct = Dictionary(data_words_trigrams)

# The vocabulary does not depend on k. get_feature_names() was removed in
# scikit-learn 1.2, so prefer its replacement when available.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()

model_trigram_results = {'Topics': [],
                 'Coherence': [],
                 'Silhouette': [],
                 'Top_terms' : []
                }
for k in range(3, 15, 1):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state = random_state)
    model.fit(features_trigrams)
    # silhouette_score is only reachable through the imported `metrics` module.
    sil = metrics.silhouette_score(features_trigrams, labels=model.predict(features_trigrams))
    print(f'Num of topics: {k}')
    centroids = model.cluster_centers_

    # Only k centroids are embedded and t-SNE needs perplexity < n_samples.
    model_tsne = TSNE(n_components=2, random_state=random_state, init=tsne_init,
                      perplexity=min(tsne_perplexity, k - 1),
                      early_exaggeration=tsne_early_exaggeration, learning_rate=tsne_learning_rate)

    # Embed with the t-SNE model; calling fit_transform on `model` would refit
    # K-means on its own centroids instead.
    transformed_centroids = model_tsne.fit_transform(centroids)
    plt.scatter(transformed_centroids[:, 0], transformed_centroids[:, 1], marker='x')
    plt.show()

    order_centroids = centroids.argsort()[:, ::-1]
    # Row i belongs to cluster i (i-1 would shift every cluster by one).
    top_terms = [[terms[ind] for ind in order_centroids[i, :10]] for i in range(k)]

    coherence = get_coherence(top_terms, dct, data_words_trigrams)

    model_trigram_results['Topics'].append(k)
    model_trigram_results['Silhouette'].append(sil)
    model_trigram_results['Top_terms'].append(top_terms)
    model_trigram_results['Coherence'].append(coherence)

model_trigram_results_df = pd.DataFrame(model_trigram_results)
model_trigram_results_df.to_csv('mediacloud_kmeans_trigrams_results.csv')

In [None]:
# Reload the saved trigram results. index_col=0 restores the saved index
# instead of adding an 'Unnamed: 0' column; literal_eval turns the stored
# Top_terms strings back into lists of lists.
model_trigram_results_df = pd.read_csv('mediacloud_kmeans_trigrams_results.csv', index_col=0,
                                       converters={'Top_terms': ast.literal_eval})
model_trigram_results_df

In [None]:
# Coherence score as a function of the number of clusters (trigram features).
model_trigram_results_df.plot.line(x='Topics', y='Coherence')

In [None]:
# Top terms stored in row 4 of the results table, i.e. k = 7 (the sweep starts at k = 3).
model_trigram_results_df['Top_terms'][4]

## Hierarchical clustering
Applying only on those clusters that seem to relate to stories

In [None]:
# Fit a 5-cluster K-means model and keep the cluster label of every document.
# NOTE(review): `features` is not defined in any visible cell (only
# features_clean, features_bigrams and features_trigrams are) - confirm which
# matrix is meant. `random_state` is whatever an earlier cell left in the kernel.
model = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1, random_state = random_state)
data = model.fit(features)
labels = model.labels_

In [None]:
centroids = model.cluster_centers_

# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement when available.
if hasattr(tfidf, 'get_feature_names_out'):
    terms = list(tfidf.get_feature_names_out())
else:
    terms = tfidf.get_feature_names()
order_centroids = centroids.argsort()[:, ::-1]

# Entry i holds the 10 highest-weighted terms of cluster i, so the position in
# the list matches the values in `labels`. (Indexing with i-1 shifted every
# cluster by one and put the last cluster first.)
top_terms = [[terms[ind] for ind in order_centroids[i, :10]]
             for i in range(centroids.shape[0])]

top_terms

In [None]:
# Keep only the documents assigned to K-means cluster label 1 and vectorise them again.
# NOTE(review): the original comment says "first general topic", but the filter
# uses label 1 - confirm the intended cluster.
# NOTE: fit_transform refits the shared `tfidf` object on this subset (using
# the 'tokens' column, not 'tokens_clean').
selected_df = df[labels==1]
selected_features = tfidf.fit_transform(selected_df['tokens']) #fit and transform to vectors

In [None]:
# (documents, terms) of the selected cluster's matrix.
selected_features.shape

In [None]:
# Reduce the selected documents to 100 SVD components before hierarchical clustering.
svd = TruncatedSVD(n_components=100, random_state=random_state)
features_red = svd.fit_transform(selected_features)
features_red.shape

In [None]:
# Ward linkage on the reduced features; input for the dendrogram.
linkage_matrix = ward(features_red)

In [None]:
fig, ax = plt.subplots(figsize=(15, 20)) # set size
# Draw on the axes created above and keep the `ax` handle intact:
# dendrogram() returns a dict of plot data, not an Axes.
dendrogram_data = dendrogram(linkage_matrix, orientation="right",
                             labels=selected_df['stories_id'].tolist(), ax=ax)

# tick_params expects booleans; the string 'off' is truthy and would leave the
# ticks switched on.
ax.tick_params(
    axis='x',          # changes apply to the x-axis
    which='both',      # both major and minor ticks are affected
    bottom=False,      # ticks along the bottom edge are off
    top=False,         # ticks along the top edge are off
    labelbottom=False) # labels along the bottom edge are off

plt.tight_layout()

In [None]:
# AgglomerativeClustering is already imported in the import cell at the top.
n_clusters = 8
# Ward linkage works on Euclidean distances, which is the default, so the
# distance argument is left out. Its old name `affinity` was renamed to
# `metric` in scikit-learn 1.2 and removed in 1.4.
model_hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
model_hierarchical.fit_predict(features_red)


In [None]:
# Show the article titles that fall into each hierarchical cluster.
for cluster_id in range(model_hierarchical.n_clusters_):
    print(cluster_id)
    in_cluster = model_hierarchical.labels_ == cluster_id
    display(selected_df[in_cluster]['title'])

In [None]:
# Number of documents that received a hierarchical cluster label.
len(model_hierarchical.labels_)

In [None]:
# t-SNE view of the reduced features; the hierarchical cluster labels are passed as classes.
# NOTE(review): yellowbrick is imported here rather than in the import cell at the top.
from yellowbrick.text import TSNEVisualizer

tsne = TSNEVisualizer()
tsne.fit(features_red, model_hierarchical.labels_)
tsne.show()