In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import ast 
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
import gensim
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.decomposition import TruncatedSVD
from collections import Counter
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import ward, dendrogram, fcluster, single, complete
from sklearn.metrics import silhouette_score

In [None]:
# Feature-granularity switch handed to read_and_vectorize():
# True  -> tokens are first merged into gensim phrases (bigrams) before counting,
# False -> the plain unigram tokens are counted.
BIGRAMS = True

def dummy_fun(doc):
    """Identity callable: hand back ``doc`` untouched.

    Used as a stand-in tokenizer/preprocessor for a vectorizer whose input
    documents are already lists of tokens.
    """
    return doc

# Count vectorizer for documents that are already tokenized: the identity function is
# plugged in as both tokenizer and preprocessor, so every document is counted exactly
# as the token list it arrives as. token_pattern is set to None alongside the custom
# tokenizer (presumably to silence scikit-learn's "token_pattern unused" warning).
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

def make_bigrams(bigram_mod, texts):
    """Apply a phrase model to every document and collect the results.

    Parameters
    ----------
    bigram_mod : object supporting ``bigram_mod[doc]``
        The phrase model; indexing it with a document yields the transformed document.
    texts : iterable
        The documents to transform, in order.

    Returns
    -------
    list
        One transformed document per input document, in the same order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def read_and_vectorize(path, cv, bigrams):
    """Load a pre-tokenized corpus from CSV and build its TF-IDF matrix.

    Parameters
    ----------
    path : str or file-like
        CSV with a ``tokens`` column whose cells are string representations of
        Python lists (they are parsed with ``ast.literal_eval``).
    cv : CountVectorizer
        Vectorizer used to count terms. It is (re-)fitted in place on this corpus.
    bigrams : bool
        When truthy, a gensim phrase model is trained on the tokens and the
        phrase-merged documents (stored in a new ``bigrams`` column) are
        vectorized; otherwise the raw ``tokens`` column is vectorized.

    Returns
    -------
    tuple
        ``(df, tfidf_matrix, terms)``: the loaded frame, the TF-IDF matrix
        produced by ``TfidfTransformer`` and the vocabulary as a list.
    """
    df = pd.read_csv(path)
    # The token lists were serialized as strings; turn them back into lists.
    df['tokens'] = df['tokens'].apply(ast.literal_eval)

    if bigrams:
        # A higher threshold yields fewer phrases.
        bigram = gensim.models.Phrases(df['tokens'], min_count=3, threshold=50)
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        df['bigrams'] = make_bigrams(bigram_mod, df['tokens'])
        print('Bigrams are created.')
        data = cv.fit_transform(df['bigrams'])
    else:
        data = cv.fit_transform(df['tokens'])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back only on old installations. Always hand back a plain list.
    if hasattr(cv, 'get_feature_names_out'):
        terms = list(cv.get_feature_names_out())
    else:
        terms = cv.get_feature_names()
    print(f'Len of terms: {len(terms)}')

    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(data)
    print(f'Tfidf matrix is generated of shape {tfidf_matrix.shape}')
    return df, tfidf_matrix, terms

# Load and vectorize the February, May and September samples.
# The same `cv` instance is re-fitted inside every call, so each month ends up with
# its own vocabulary (terms_*) and its own TF-IDF matrix.
df_feb, tfidf_matrix_feb, terms_feb = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_feb_sample.csv', cv, BIGRAMS)
df_may, tfidf_matrix_may, terms_may = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_may_sample.csv', cv, BIGRAMS)
df_sep, tfidf_matrix_sep, terms_sep = read_and_vectorize('preprocessed_results/mediacloud_parsed_corona_df_sep_sample.csv', cv, BIGRAMS)

In [None]:
def read_best_kmeans_model(path):
    """Load a table of evaluated models and pick the one with the best coherence.

    Parameters
    ----------
    path : str or file-like
        CSV containing at least a ``Coherence`` column, one row per model.

    Returns
    -------
    tuple
        ``(best_model, models_df)``: the row (as a Series) with the highest
        ``Coherence`` value and the full table.
    """
    models_df = pd.read_csv(path)
    # idxmax() returns an index *label*, so the row must be fetched with .loc;
    # .iloc only happens to work while the index is the default 0..n-1 range.
    best_model = models_df.loc[models_df['Coherence'].idxmax()]
    return best_model, models_df

# For every month, load the saved model table and keep the row with the highest
# 'Coherence' value.
best_model_feb, models_df_feb = read_best_kmeans_model('preprocessed_results/models_df_feb.csv')
best_model_may, models_df_may = read_best_kmeans_model('preprocessed_results/models_df_may.csv')
best_model_sep, models_df_sep = read_best_kmeans_model('preprocessed_results/models_df_sep.csv')

In [None]:
def transform(tfidf_matrix):
    """Convert a (sparse) TF-IDF matrix into a dense 2-D NumPy array.

    Parameters
    ----------
    tfidf_matrix : scipy sparse matrix or array-like
        Matrix with one row per document.

    Returns
    -------
    numpy.ndarray
        Dense array with the same shape as ``tfidf_matrix``; an input without
        rows keeps its column dimension.
    """
    # One vectorized conversion instead of densifying row by row in Python.
    if hasattr(tfidf_matrix, 'toarray'):
        dense = tfidf_matrix.toarray()
    else:
        dense = np.asarray(tfidf_matrix)
    print(f'Matrix is tranformed into array of len {len(dense)}')
    return dense

def plot_linkage(linkage_matrix, clusters):
    """Draw a right-oriented dendrogram for a linkage matrix.

    Parameters
    ----------
    linkage_matrix : ndarray
        Linkage matrix as produced by ``scipy.cluster.hierarchy`` functions.
    clusters : sequence or None
        Leaf labels. SciPy expects one label per original observation of the
        linkage (``linkage_matrix.shape[0] + 1`` entries); pass ``None`` for
        the default numeric labels.
    """
    fig, ax = plt.subplots(figsize=(15, 20))  # set size

    leaf_labels = None if clusters is None else list(clusters)
    # Draw on the axes created above instead of rebinding `ax` to dendrogram's
    # return value (a dict describing the plot).
    dendrogram(linkage_matrix, orientation="right", labels=leaf_labels, ax=ax)

    # tick_params expects booleans; the strings 'off' are truthy and therefore
    # never switched the ticks off.
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    fig.tight_layout()

def hierarchical_clustering(best_model, tfidf_matrix, cluster):
    """Re-run k-means, then build a Ward hierarchy inside one k-means cluster.

    Parameters
    ----------
    best_model : mapping
        Must provide ``best_model['Num_Topics']``, the number of k-means clusters.
    tfidf_matrix : sparse matrix
        TF-IDF matrix, one row per document.
    cluster : int
        Id of the k-means cluster whose documents are clustered hierarchically.

    Returns
    -------
    tuple
        ``(features, transformed_tokens, linkage_matrix, clusters)``: the
        SVD-reduced features of the selected documents, the dense TF-IDF array,
        the Ward linkage matrix and the k-means label of every document.

    Raises
    ------
    ValueError
        If no document was assigned to ``cluster``.
    """
    random_state = 20
    transformed_tokens = transform(tfidf_matrix)

    # A row pulled out of a DataFrame can carry Num_Topics as a float; KMeans
    # needs a real integer.
    n_topics = int(best_model['Num_Topics'])
    model = KMeans(n_clusters=n_topics, init='k-means++', max_iter=100, n_init=1, random_state=random_state)
    clusters = model.fit_predict(transformed_tokens)

    # Row positions of the documents that landed in the requested cluster.
    member_rows = np.flatnonzero(clusters == cluster)
    if member_rows.size == 0:
        raise ValueError(f'k-means cluster {cluster} contains no documents')
    selected_features = transformed_tokens[member_rows]

    svd = TruncatedSVD(n_components=100, random_state=random_state)
    features = svd.fit_transform(selected_features)
    print(features.shape)

    linkage_matrix = ward(features)
    # The dendrogram has one leaf per *selected* document, so label the leaves
    # with those documents' row positions rather than with the k-means label
    # array of the whole corpus (whose length does not match the linkage).
    plot_linkage(linkage_matrix, member_rows.tolist())
    return features, transformed_tokens, linkage_matrix, clusters

# February: build the Ward hierarchy for the documents in k-means cluster 6.
features_hierarchical_feb, transformed_tokens_feb, linkage_matrix_feb, clusters_feb = hierarchical_clustering(best_model_feb, tfidf_matrix_feb, 6)

In [None]:
def agglomerative_clustering(n_clusters, features, df, cluster, best_model, transformed_tokens, clusters):
    """Split one k-means cluster into Ward sub-clusters and show their titles.

    Parameters
    ----------
    n_clusters : int
        Number of agglomerative sub-clusters to form.
    features : array-like
        Feature rows of the documents belonging to ``cluster``; row order must
        match ``df[clusters == cluster]``.
    df : pandas.DataFrame
        Full document table with a ``title`` column.
    cluster : int
        Id of the k-means cluster being subdivided.
    best_model, transformed_tokens
        Unused; kept so existing calls keep working.
    clusters : numpy.ndarray
        k-means label of every row of ``df``.
    """
    # Ward linkage always works on Euclidean distances, which is the default, so
    # the `affinity` argument (deprecated in scikit-learn 1.2, removed in 1.4)
    # is simply omitted.
    model_hierarchical = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    sub_labels = model_hierarchical.fit_predict(features)

    cluster_df = df[clusters == cluster]

    for label in range(model_hierarchical.n_clusters_):
        print(label)
        display(cluster_df[sub_labels == label]['title'])
        
# February: split k-means cluster 6 into 23 Ward sub-clusters and list the titles of each.
agglomerative_clustering(23, features_hierarchical_feb, df_feb, 6, best_model_feb, transformed_tokens_feb, clusters_feb)

In [None]:
def silhouette_k(distance_matrix, linkage_matrix, max_k=20):
    """Score flat cuts of a hierarchy with the silhouette coefficient.

    For every k from 2 to ``max_k`` (inclusive) the linkage is cut into at most
    k flat clusters, the silhouette score is computed on the precomputed
    distance matrix, printed, and finally all scores are plotted against k.

    Parameters
    ----------
    distance_matrix : ndarray
        Precomputed pairwise distances between the observations.
    linkage_matrix : ndarray
        Linkage matrix over the same observations.
    max_k : int, default 20
        Largest number of clusters to evaluate.

    Returns
    -------
    list
        The silhouette score for each k, in increasing order of k.
    """
    scores = []
    for k in range(2, max_k + 1):
        flat_labels = fcluster(linkage_matrix, k, criterion='maxclust')
        k_score = silhouette_score(distance_matrix, flat_labels, metric='precomputed')
        print(f"Silhouette score with {k} clusters:", k_score)
        scores.append(k_score)

    evaluated_ks = np.arange(2, max_k + 1)
    plt.title("Silhouette score vs. number of clusters")
    plt.xlabel("# of clusters")
    plt.ylabel("Score (higher is better)")
    plt.plot(evaluated_ks, scores)
    plt.show()
    return scores

def elbow_method(tfidf_matrix, linkage_matrix, max_k=30):
    """Scan cluster counts for a hierarchy using cosine-distance silhouettes.

    Despite the name, this runs a silhouette scan (see ``silhouette_k``) rather
    than an inertia-based elbow plot.

    Parameters
    ----------
    tfidf_matrix : sparse matrix or array-like
        TF-IDF rows of the documents covered by ``linkage_matrix``.
    linkage_matrix : ndarray
        Linkage matrix built over the same documents.
    max_k : int, default 30
        Largest number of clusters to evaluate. It is capped at
        ``n_documents - 1``, the most labels a silhouette score accepts.
    """
    dist = 1 - cosine_similarity(tfidf_matrix)
    # Floating-point error can leave tiny negative distances. Clip them instead
    # of shifting the whole matrix by its minimum, which would alter every
    # distance and leave a non-zero diagonal.
    np.clip(dist, 0, None, out=dist)
    # A precomputed distance matrix must have an exactly zero diagonal.
    np.fill_diagonal(dist, 0)

    max_k = min(max_k, dist.shape[0] - 1)
    silhouette_k(dist, linkage_matrix, max_k=max_k)
    
# February: silhouette scan over the hierarchy of the documents in k-means cluster 6.
elbow_method(tfidf_matrix_feb[clusters_feb==6], linkage_matrix_feb)

## May

In [None]:
# May: build the Ward hierarchy for the documents in k-means cluster 2.
features_hierarchical_may, transformed_tokens_may, linkage_matrix_may, clusters_may = hierarchical_clustering(best_model_may, tfidf_matrix_may, 2)

In [None]:
# May: split k-means cluster 2 into 6 Ward sub-clusters and list the titles of each.
agglomerative_clustering(6, features_hierarchical_may, df_may, 2, best_model_may, transformed_tokens_may, clusters_may)

In [None]:
# May: silhouette scan over the hierarchy of the documents in k-means cluster 2.
elbow_method(tfidf_matrix_may[clusters_may==2], linkage_matrix_may)

## September

In [None]:
# September: build the Ward hierarchy for the documents in k-means cluster 10.
features_hierarchical_sep, transformed_tokens_sep, linkage_matrix_sep, clusters_sep = hierarchical_clustering(best_model_sep, tfidf_matrix_sep, 10)

In [None]:
# September: split k-means cluster 10 into 2 Ward sub-clusters and list the titles of each.
agglomerative_clustering(2, features_hierarchical_sep, df_sep, 10, best_model_sep, transformed_tokens_sep, clusters_sep)