In [None]:
import pandas as pd
import ast 
from collections import Counter
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from factor_analyzer.factor_analyzer import calculate_kmo
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from advanced_pca import CustomPCA
import gensim
import scipy
import seaborn as sns; sns.set()
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.manifold import TSNE
import matplotlib.cm as cm
from sklearn.cluster import DBSCAN

The data comes from the 'Mediacloud_Analysis.ipynb' notebook (<a href='Mediacloud_Analysis.ipynb'>link</a>). It already contains the preprocessed and tokenized text of each article, as well as a column with the corona-specific terms and their frequencies. 

In [None]:
# Read the pre-processed, tokenized articles for the three collection periods.
df = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_feb.csv")
temp = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_may.csv")
temp_2 = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df_sep.csv")

# ignore_index=True: every file is read with its own 0-based row labels, so a
# plain concat leaves duplicated labels and label-based access further down
# (df.at[i, ...]) would no longer address a single row.
df = pd.concat([df, temp, temp_2], ignore_index=True)

# Remove rows with no text, then renumber so that label i is also position i;
# the tf-idf matrices built from df are addressed by row position.
df = df[~df.Text.isnull()].reset_index(drop=True)

# The tokens column is stored as the string repr of a list; turn it back into a list.
df['tokens'] = df['tokens'].apply(ast.literal_eval)
df.head()

In [None]:
# sample = df.sample(n=1)

# Spot-check one randomly drawn story from the story-URL export.
temp = pd.read_csv('ncov-or-cov-19-or-covid-or-all-story-urls-20201012133126.csv')
sample = temp.sample(n=1)
print(sample['url'].values[0])
print(sample['title'].values[0])
# Print the scalar value like the two lines above, not the whole one-row Series
# (which also prints the row label, name and dtype).
print(sample['publish_date'].values[0])

For the following steps we use the 500 most frequent tokens, which are then manually reviewed: all names, countries and dates, as well as words that do not carry any strong meaning, are excluded. The most frequent tokens are saved to the 'most_frequent_tokens.csv' file.

In [None]:
# Pool the tokens of every article into one flat list, count them and keep
# the 500 most common (token, count) pairs.
flatten_tokens = []
for article_tokens in df['tokens'].tolist():
    flatten_tokens.extend(article_tokens)
counter_tokens = Counter(flatten_tokens)
most_frequent = counter_tokens.most_common(500)

In [None]:
# Save the (token, count) pairs to a CSV file, one pair per row.
# newline="" is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate line endings get an extra blank row
# after every record.
with open('most_frequent_tokens.csv', "w", newline="") as the_file:
    csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)

In [None]:

#finding 500 most frequent tokens for SEPTEMBER
# NOTE(review): df['bigrams'] is only created further down, in the "Bigrams"
# section, so this cell cannot run on a fresh top-to-bottom execution.
# NOTE(review): 85298 is presumably the position of the first September row
# in the concatenated frame; confirm and derive it from the data if possible.
flatten_tokens = [token for sublist in df['bigrams'][85298:].tolist() for token in sublist]
counter_tokens = Counter(flatten_tokens)
most_frequent = counter_tokens.most_common(500)

#saving them to csv file
# newline="" is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate line endings get an extra blank row
# after every record.
with open('most_frequent_bigrams_SEP.csv', "w", newline="") as the_file:
    csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)

## Unigrams

Reading file with reviewed tokens (<a href="most_frequent_tokens_cleaned_v2.csv">file link</a>)

In [None]:
# Load the manually reviewed token list. The file has no header row, so the
# two columns are named explicitly.
tokens = pd.read_csv('most_frequent_tokens_cleaned_v2.csv', header=None, names=['token', 'frequency'])
#tokens['tfidf'] = 0

First, the original tokenized texts are converted to tf-idf scores; the result is a sparse tf-idf matrix. Then, for each row, only the tf-idf scores of the frequent tokens are kept (for each sparse vector we match the id of the tf-idf value with the dictionary token and check whether this token is in the cleaned list). As a result, each row of the dataframe gets a vector of length n (the number of cleaned frequent tokens) with tf-idf values.

In [None]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed to CountVectorizer as both tokenizer and preprocessor so that
    documents which are already lists of tokens are used as they are.
    """
    return doc

# Count term occurrences per article. Each document is already a list of
# tokens, so tokenizer and preprocessor are the identity function and the
# regex token pattern is switched off.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['tokens'])

# Re-weight the raw counts into tf-idf scores.
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)

In [None]:
# Vocabulary of the fitted vectorizer: column j of tfidf_matrix belongs to
# tfidf_dict[j]. get_feature_names() no longer exists in recent scikit-learn
# releases, so prefer get_feature_names_out() when the installed version has it.
_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
tfidf_dict = list(_feature_names())  #all tokens there are in the original texts

# Positions of the reviewed tokens inside the vocabulary, in vocabulary order.
# A set makes each membership test constant-time.
frequent_tokens = set(tokens['token'])
keep_cols = [j for j, term in enumerate(tfidf_dict) if term in frequent_tokens]

# Slice the sparse matrix once instead of densifying every row for every
# vocabulary entry; only the kept columns are converted to a dense array.
tfidf_kept = tfidf_matrix[:, keep_cols].toarray()

# Whole-column assignment is positional, so it does not depend on the row
# labels of df (df.at[i, ...] is label-based and breaks on non-unique or
# non-consecutive labels).
df['transformed_tokens'] = tfidf_kept.tolist()
            

In [None]:
# Turn the per-article score lists into one 2-D array (articles x kept tokens).
temp = list(map(np.array, df['transformed_tokens'].tolist()))

tfidf_frequent = np.array(temp)
tfidf_frequent.shape

In [None]:
# Persist the filtered tf-idf vectors, one article per CSV row.
with open("tfidf_transformed_tokens.csv", "w", newline="") as f:
    writer = csv.writer(f)
    for score_row in temp:
        writer.writerow(score_row)

KMO score is calculated (according to the <a href="https://www.tandfonline.com/doi/full/10.1080/1369183X.2017.1282813">paper</a>). KMO is a measure for sampling adequacy applied in factor analysis. It informs about the general strength of the relationship among items and thus indicates whether an item (i.e. a word) should be included in a factor analysis or not. Following Backhaus et al. (2006), terms with a KMO value below .50 were subsequently excluded.

In [None]:
# Per the factor_analyzer documentation, calculate_kmo returns the KMO value
# of each variable (column) and the overall KMO of the data set.
kmo_all,kmo_model=calculate_kmo(tfidf_frequent)

In [None]:
# Display the second value returned by calculate_kmo.
kmo_model

In [None]:
# Keep only the terms whose KMO value exceeds 0.5.
# kmo_all holds one value per column (term) of tfidf_frequent, so the filter
# has to select columns. Indexing rows with the term position would copy only
# the first len(kmo_all) articles and leave every other article all-zero.
# Terms below the threshold stay as all-zero columns so that column positions
# still line up with the term list used to label the PCA loadings.
kmo_keep = np.asarray(kmo_all) > 0.5
features_pca = np.zeros((tfidf_frequent.shape[0], len(kmo_all)))
features_pca[:, kmo_keep] = tfidf_frequent[:, kmo_keep]

print(len(features_pca), tfidf_frequent.shape)

Running PCA on the filtered tokens. PCA is applied using <a href="https://pypi.org/project/advanced-pca/"> advanced PCA package</a>. For each number of components factor loadings are calculated (for each term) based on the <a href="https://www.r-bloggers.com/p-is-for-principal-components-analysis-pca/">tutorial here</a>. Only significant terms are taken (with a threshold of 0.1)

In [None]:
scaler = StandardScaler()
features_pca_scaled = scaler.fit_transform(features_pca)

# Labels for the feature columns. The columns were produced by walking the
# vectorizer vocabulary (tfidf_dict) in order and keeping the reviewed tokens,
# so the labels are derived the same way. Masking tokens.token directly would
# only be correct if the reviewed CSV happened to be in vocabulary order.
frequent_tokens = set(tokens['token'])
feature_terms = np.array([term for term in tfidf_dict if term in frequent_tokens], dtype=object)
assert len(feature_terms) == features_pca_scaled.shape[1], \
    "term labels do not line up with the feature columns"

pca_results = {'Num_of_components': [],
               'Explained_variance': [],
               'Sum_Explained_variance': [],
               'Terms': []
               }
# Fit one model per number of components and record, for every component,
# the terms whose rounded loading is above 0.1 in absolute value.
for n in range(3, 21):
    pca_model = CustomPCA(n_components=n).fit(features_pca_scaled)
    pca_results['Num_of_components'].append(n)
    pca_results['Explained_variance'].append(pca_model.explained_variance_ratio_)
    pca_results['Sum_Explained_variance'].append(sum(pca_model.explained_variance_ratio_))
    all_terms = []
    for i in range(n):
        loadings = pca_model.components_[i].round(1)
        significant = (loadings > 0.1) | (loadings < -0.1)
        # The same mask selects the scores and their labels, so the pairs stay aligned.
        scores = list(loadings[significant])
        terms = feature_terms[significant]
        all_terms.append(list(zip(terms, scores)))
    pca_results['Terms'].append(all_terms)

pca_results_df = pd.DataFrame(pca_results)

Example with a custom PCA with 5 components, printing the variance ratio for each component and the factor loadings of the second component:

In [None]:
# Fit a single 5-component model, print the variance share of each component
# and show the loadings of the component at index 1.
pca_model = CustomPCA(n_components=5).fit(features_pca_scaled)
print(pca_model.explained_variance_ratio_)
pca_model.components_[1]

In [None]:
# Terms of the first fitted model (row 0 is the run with 3 components).
pca_results_df['Terms'][0]

Saving results of the PCA to the csv file 'results/mediacloud_pca_results_shortlist.csv'. Plot the sum of explained variance based on the number of components:

In [None]:
# Write the results table to disk and plot the summed explained variance
# against the number of components. The 'results' directory must already exist.
pca_results_df.to_csv('results/mediacloud_pca_results_shortlist.csv')
pca_results_df.plot.line(x='Num_of_components', y='Sum_Explained_variance')

Save the 'significant' terms for all components (each n of components) with corresponding factor loadings to csv file 'results/pca_terms.csv':

In [None]:
# Save the Terms column on its own, then print the entry at row 5
# (the run with 8 components, since the runs start at 3).
pca_results_df['Terms'].to_csv('results/pca_terms.csv')
print(pca_results_df['Terms'][5])

A plot that shows cumulative explained variance and explained variance of each component (with max 20):

In [None]:
# Fit a plain scikit-learn PCA with 20 components to compare the per-component
# and the cumulative explained variance.
cummulative_pca = PCA(n_components=20).fit(features_pca_scaled)

fig, ax = plt.subplots(figsize=(8,6))
x_values = range(1, cummulative_pca.n_components_+1)
ax.plot(x_values, cummulative_pca.explained_variance_ratio_, lw=2, label='explained variance')
ax.plot(x_values, np.cumsum(cummulative_pca.explained_variance_ratio_), lw=2, label='cumulative explained variance')
ax.set_title('PCA on filtered tokens : explained variance of components')
ax.set_xlabel('principal component')
ax.set_ylabel('explained variance')
# The label= arguments above only show up once a legend is drawn.
ax.legend()
plt.show()

## Bigrams

Creating bigrams from the original texts. The most frequent bigrams are saved to the file 'most_frequent_tokens_bigrams.csv' (<a href='most_frequent_tokens_bigrams.csv'>link</a>) and reviewed in that file the same way as the unigrams. The final list contains 87 terms.

In [None]:
# Learn phrase (bigram) statistics from the tokenized articles, then wrap the
# trained model in a Phraser for applying it to documents.
bigram = gensim.models.Phrases(df['tokens'], min_count=3, threshold=50) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phraser to every document.

    ``texts`` is an iterable of tokenized documents; the result is a list
    with one transformed document per input document, in the same order.
    """
    phrased_docs = []
    for doc in texts:
        phrased_docs.append(bigram_mod[doc])
    return phrased_docs

# Store the phrase-merged token list of each article in a new column and
# display that column.
df['bigrams'] = make_bigrams(df['tokens'])
df['bigrams']

In [None]:
# Pool the bigram-merged tokens of every article, count them and keep the
# 500 most common (token, count) pairs.
flatten_bigrams = []
for article_bigrams in df['bigrams'].tolist():
    flatten_bigrams.extend(article_bigrams)
counter_bigrams = Counter(flatten_bigrams)
most_frequent = counter_bigrams.most_common(500)

In [None]:
#saving them to csv file
# NOTE(review): the text above says the reviewed bigram list lives in this same
# file, so re-running this cell overwrites the manual review - confirm.
# newline="" is what the csv module expects for files handed to csv.writer;
# without it, platforms that translate line endings get an extra blank row
# after every record.
with open('most_frequent_tokens_bigrams.csv', "w", newline="") as the_file:
    csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)

In [None]:
# NOTE(review): the read of tokens_bigrams below is commented out, yet
# tokens_bigrams is used by later cells in this section.
# tokens_bigrams = pd.read_csv('most_frequent_tokens_bigrams.csv', header=None, names=['token', 'frequency'])
# Same vectorizer setup as for the unigrams, fitted on the bigram-merged
# documents. This rebinds cv, data, tfidf_transformer and tfidf_matrix.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['bigrams'])

tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)

In [None]:
# Display the repr of the bigram tf-idf matrix.
tfidf_matrix

In [None]:
# Reviewed bigram list. The notebook only contains this read as a commented-out
# line, which leaves tokens_bigrams undefined on a fresh kernel.
# NOTE(review): the same file name is written by the "saving them to csv file"
# cell earlier in this section - confirm the file on disk is the reviewed list.
tokens_bigrams = pd.read_csv('most_frequent_tokens_bigrams.csv', header=None, names=['token', 'frequency'])

# Vocabulary of the fitted vectorizer: column j of tfidf_matrix belongs to
# tfidf_dict_bigrams[j]. get_feature_names() no longer exists in recent
# scikit-learn releases, so prefer get_feature_names_out() when available.
_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
tfidf_dict_bigrams = list(_feature_names())  #all tokens there are in the original texts

# Positions of the reviewed bigrams inside the vocabulary, in vocabulary order.
reviewed_bigrams = set(tokens_bigrams['token'])
keep_cols = [j for j, term in enumerate(tfidf_dict_bigrams) if term in reviewed_bigrams]

# Slice the sparse matrix once instead of densifying every row for every
# vocabulary entry. Whole-column assignment is positional, so it does not
# depend on the row labels of df (df.at[i, ...] is label-based).
df['transformed_tokens_bigrams'] = tfidf_matrix[:, keep_cols].toarray().tolist()

In [None]:
# Persist the filtered bigram tf-idf vectors, one article per CSV row.
with open("tfidf_transformed_bigrams.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(df['transformed_tokens_bigrams'].tolist())

In [None]:
# Turn the per-article score lists into one 2-D array (articles x kept bigrams).
temp = list(map(np.array, df['transformed_tokens_bigrams'].tolist()))

tfidf_frequent_bigrams = np.array(temp)
tfidf_frequent_bigrams.shape

In [None]:
# KMO for the bigram features: per the factor_analyzer documentation the first
# value is per variable (column), the second is the overall score shown below.
kmo_all_bi,kmo_model_bi=calculate_kmo(np.array(tfidf_frequent_bigrams))
kmo_model_bi

In [None]:
# Keep only the bigrams whose KMO value exceeds 0.5.
# kmo_all_bi holds one value per column (term) of tfidf_frequent_bigrams, so
# the filter has to select columns. Indexing rows with the term position would
# copy only the first len(kmo_all_bi) articles and leave the rest all-zero.
# Terms below the threshold stay as all-zero columns so that column positions
# still line up with the term list used to label the PCA loadings.
kmo_keep_bi = np.asarray(kmo_all_bi) > 0.5
features_bigrams = np.zeros((tfidf_frequent_bigrams.shape[0], len(kmo_all_bi)))
features_bigrams[:, kmo_keep_bi] = tfidf_frequent_bigrams[:, kmo_keep_bi]

print(len(features_bigrams), tfidf_frequent_bigrams.shape)

In [None]:
scaler = StandardScaler()
features_bi_scaled = scaler.fit_transform(features_bigrams)

# Labels for the feature columns. The columns were produced by walking the
# vectorizer vocabulary (tfidf_dict_bigrams) in order and keeping the reviewed
# bigrams, so the labels are derived the same way. Masking tokens_bigrams.token
# directly would only be correct if the reviewed CSV were in vocabulary order.
reviewed_bigrams = set(tokens_bigrams['token'])
feature_terms_bi = np.array(
    [term for term in tfidf_dict_bigrams if term in reviewed_bigrams], dtype=object)
assert len(feature_terms_bi) == features_bi_scaled.shape[1], \
    "term labels do not line up with the feature columns"

pca_results_bi = {'Num_of_components': [],
                  'Explained_variance': [],
                  'Terms': []
                  }
# Fit one model per number of components and record, for every component,
# the terms whose rounded loading is above 0.1 (positive loadings only).
for n in range(3, 21):
    pca_model = CustomPCA(n_components=n).fit(features_bi_scaled)
    pca_results_bi['Num_of_components'].append(n)
    pca_results_bi['Explained_variance'].append(sum(pca_model.explained_variance_ratio_))
    all_terms = []
    for i in range(n):
        loadings = pca_model.components_[i].round(1)
        significant = loadings > 0.1
        # The same mask selects the scores and their labels, so the pairs stay aligned.
        scores = list(loadings[significant])
        terms = feature_terms_bi[significant]
        all_terms.append(list(zip(terms, scores)))
    pca_results_bi['Terms'].append(all_terms)

pca_results_bi_df = pd.DataFrame(pca_results_bi)

In [None]:
# Fit a single 3-component model on the bigram features, print the variance
# share of each component and show the loadings of the component at index 1.
pca_model = CustomPCA(n_components=3).fit(features_bi_scaled)
print(pca_model.explained_variance_ratio_)
pca_model.components_[1]

In [None]:
# Terms of the first fitted bigram model (row 0 is the run with 3 components).
pca_results_bi_df['Terms'][0]

In [None]:
# Build a term x component table from the model stored at row 17 of the
# results (the last run of the 3..20 loop): each cell holds the recorded score
# of the term in that component, or 0 when the term was not recorded for it.
temp = tokens_bigrams['token'].tolist()
pca_dict = {}
for token in temp:
    scores_per_topic = []
    for topic in pca_results_bi_df['Terms'][17]:
        # First recorded score for this token in the topic, 0 if absent.
        found = next((score for term, score in topic if term == token), 0)
        scores_per_topic.append(found)
    pca_dict[token] = scores_per_topic

pca_df = pd.DataFrame(pca_dict).transpose()

In [None]:
# Show the terms with a non-zero entry in column 5 of the table.
pca_df[pca_df[5]!=0]

In [None]:
# Write the bigram results table to disk and plot the summed explained variance
# against the number of components. The 'results' directory must already exist.
pca_results_bi_df.to_csv('results/mediacloud_pca_bigrams_results_shortlist.csv')
pca_results_bi_df.plot.line(x='Num_of_components', y='Explained_variance')

In [None]:
# Fit a plain scikit-learn PCA with 20 components on the bigram features to
# compare the per-component and the cumulative explained variance.
cummulative_pca = PCA(n_components=20).fit(features_bi_scaled)

fig, ax = plt.subplots(figsize=(8,6))
x_values = range(1, cummulative_pca.n_components_+1)
ax.plot(x_values, cummulative_pca.explained_variance_ratio_, lw=2, label='explained variance')
ax.plot(x_values, np.cumsum(cummulative_pca.explained_variance_ratio_), lw=2, label='cumulative explained variance')
ax.set_title('PCA on filtered tokens : explained variance of components')
ax.set_xlabel('principal component')
ax.set_ylabel('explained variance')
# The label= arguments above only show up once a legend is drawn.
ax.legend()
plt.show()

## Toy example

The perfect curated list is created, that contains 39 words for 4 frames: economic, medical, travel and restrictions/prevention. The list is available <a href="most_frequent_tokens_toy.csv">here</a>

In [None]:
# Load the curated toy list (no header row in the file) and keep its terms as
# a sorted plain list.
tokens_toy = pd.read_csv('most_frequent_tokens_toy.csv', header=None, names=['token', 'frequency'])
toy = tokens_toy['token'].sort_values().tolist()

In [None]:
# Load the September bigram list (no header row in the file) and keep its
# terms as a sorted plain list.
bigrams_sep = pd.read_csv('most_frequent_bigrams_SEP.csv', header=None, names=['token', 'frequency'])
tokens_sep = bigrams_sep['token'].sort_values().tolist()

In [None]:
# tokens_bigrams = pd.read_csv('most_frequent_tokens_bigrams.csv', header=None, names=['token', 'frequency'])
# NOTE(review): duplicate of the dummy_fun defined in the "Unigrams" section;
# this definition silently replaces the earlier, identical one.
def dummy_fun(doc):
    """Return ``doc`` unchanged (identity tokenizer/preprocessor)."""
    return doc

# Fit the vectorizer on the rows from position 85298 onwards only.
# NOTE(review): 85298 is presumably the first September row - confirm.
# This rebinds cv, data, tfidf_transformer and tfidf_matrix.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['bigrams'][85298:])

tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)

In [None]:
# Move the current index into a regular column and drop the two 'Unnamed'
# columns. Both calls modify df in place, so this cell is not re-runnable:
# a second run fails because the dropped columns no longer exist.
df.reset_index(inplace=True)
df.drop(['Unnamed: 0','Unnamed: 0.1'],axis=1,inplace=True)

In [None]:
# Number of rows (documents) in the current tf-idf matrix.
tfidf_matrix.shape[0]

In [None]:
# Vocabulary of the vectorizer fitted on the September documents: column j of
# tfidf_matrix belongs to tfidf_dict_bigrams[j]. get_feature_names() no longer
# exists in recent scikit-learn releases, so prefer get_feature_names_out()
# when the installed version has it.
_feature_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
tfidf_dict_bigrams = list(_feature_names())

# Positions of the reviewed September terms inside the vocabulary, in
# vocabulary order.
sep_terms = set(tokens_sep)
keep_cols = [j for j, term in enumerate(tfidf_dict_bigrams) if term in sep_terms]

# One sparse column slice replaces the per-row, per-term loop, and the result
# has as many columns as there are matching terms, so the hard-coded width of
# 86 is no longer needed.
transformed_tokens_sep = tfidf_matrix[:, keep_cols].toarray()

In [None]:
# NOTE(review): no cell in this notebook, as shown, creates the column
# 'transformed_tokens_toy2', so this write depends on state from elsewhere.
# Rows from position 1136 onwards are written, one per CSV row.
with open("tfidf_transformed_toy_sep.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(df['transformed_tokens_toy2'][1136:].tolist())

In [None]:
# Read the saved vectors back; csv.reader yields every field as a string.
# This rebinds `data`, which previously held the count matrix.
with open("tfidf_transformed_toy_sep.csv", newline='') as csvfile:
    data = list(csv.reader(csvfile))

In [None]:
# Stack the rows read from the CSV into one array. The fields are still
# strings at this point; no numeric conversion happens here.
temp = data
temp = [np.array(x) for x in temp]

tfidf_frequent_toy = np.array(temp)
tfidf_frequent_toy.shape #= [np.array(token_list) for token_list in tokens_transformed]

In [None]:
# Take the first 1136 rows of the September matrix as floats.
# np.float was only an alias of the builtin float and has been removed from
# NumPy, so the builtin is used directly.
tfidf_frequent_sep = transformed_tokens_sep[:1136].astype(float)

In [None]:
# KMO for the September features: per the factor_analyzer documentation the
# first value is per variable (column), the second is the overall score.
kmo_all_toy,kmo_model_toy=calculate_kmo(tfidf_frequent_sep)
kmo_model_toy

In [None]:
# Keep only the terms whose KMO value exceeds 0.5.
# kmo_all_toy holds one value per column (term) of tfidf_frequent_sep, so the
# filter has to select columns. Indexing rows with the term position would
# copy only the first len(kmo_all_toy) documents and leave the rest all-zero.
# Terms below the threshold stay as all-zero columns so that column positions
# still line up with the term list used to label clusters and loadings.
kmo_keep_sep = np.asarray(kmo_all_toy) > 0.5
features_sep = np.zeros((tfidf_frequent_sep.shape[0], len(kmo_all_toy)))
features_sep[:, kmo_keep_sep] = tfidf_frequent_sep[:, kmo_keep_sep]

print(len(features_sep), tfidf_frequent_sep.shape)

Kmeans clustering. For each number of k model is created and fitted on above features (consisting of 36 manually chosen words). Number of texts assigned to each cluster is printed below. Then top words are presented and a tsne graph of them in 2d

In [None]:
# Fixed seed so the clustering is reproducible; k is the number of clusters.
random_state = 20
k = 3

# Fit KMeans on the September features and get one cluster id per row.
model = KMeans(n_clusters=k, random_state=random_state)
clusters = model.fit_predict(features_sep)
# NOTE(review): the t-SNE embedding below is commented out, but a later cell
# plots a variable named `tsne`.
# tsne = TSNE().fit_transform(features_sep)
# Number of rows assigned to each cluster.
Counter(clusters)
# max_items = np.random.choice(range(features_toy.shape[0]), size=10000, replace=False)


In [None]:
#FEBRUARY
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the highest-scoring terms of every cluster.

    data     -- 2-D array-like of feature scores, one row per document
    clusters -- cluster id of every row of ``data``
    labels   -- sequence mapping a column position to its term
    n_terms  -- how many terms to print per cluster

    For each cluster the column means are computed and the terms of the
    ``n_terms`` largest means are printed, comma separated, in ascending
    order of mean score. Nothing is returned.
    """
    cluster_means = pd.DataFrame(data).groupby(clusters).mean()

    for cluster_id, mean_scores in cluster_means.iterrows():
        print('\nCluster {}'.format(cluster_id))
        # argsort is ascending, so the tail holds the largest means.
        top_positions = np.argsort(mean_scores)[-n_terms:]
        top_terms = [labels[position] for position in top_positions]
        print(','.join(top_terms))
            
# NOTE(review): features_toy is not defined anywhere in this notebook as shown,
# and tokens_toy is a DataFrame while get_top_keywords indexes labels by
# integer position (the sorted list `toy` looks like the intended argument) -
# confirm before running.
get_top_keywords(features_toy, clusters, tokens_toy, 5)

In [None]:
#SEPTEMBER
# Print the 5 top terms per cluster for the September features, labelled with
# the sorted September term list.
get_top_keywords(features_sep, clusters, tokens_sep, 5)

In [None]:
#kmeans and dbscan, 3 to 5 k

def plot_tsne_pca(tsne, labels):
    """Scatter-plot 2-D points coloured by their cluster label.

    tsne   -- array whose first two columns are the x and y coordinates
    labels -- one numeric cluster label per point

    Each label is divided by the largest label and mapped through the
    ``hsv`` colormap. Draws on the current matplotlib figure and returns
    nothing.
    """
    max_label = max(labels)
    # When the largest label is 0 (e.g. a single cluster labelled 0) dividing
    # by it fails or yields NaN colours; fall back to a divisor of 1.
    divisor = max_label if max_label != 0 else 1

    label_subset = [cm.hsv(i / divisor) for i in labels]

    plt.scatter(tsne[:, 0], tsne[:, 1], c=label_subset)
    plt.title('TSNE Cluster Plot')
    plt.title('TSNE Cluster Plot')
    
# NOTE(review): `tsne` is only assigned in a commented-out line of the KMeans
# cell, so that embedding has to be computed before this call can run.
# Points assigned to cluster 0 are left out of the plot.
plot_tsne_pca(tsne[clusters!=0], clusters[clusters!=0])
# plot_tsne_pca(tsne, clusters)

DBSCAN.

In [None]:
# These two initial values are never used: both names are rebound by the
# loop variables below.
eps = 3
min_samples = 3

# Collected grid-search results, one entry per (eps, min_samples) pair.
dbscan = {
    'eps':[],
    'min_samples':[],
    'labels':[]
}

# Fit DBSCAN for every combination of eps and min_samples and record the
# resulting label assignment together with the label counts.
# NOTE(review): features_toy is not defined anywhere in this notebook as shown.
for eps in np.arange(0.01,0.05, 0.01):
    for min_samples in range (3, 10, 1):
        db1 = DBSCAN(eps=eps, min_samples=min_samples).fit(features_toy)
        labels1 = db1.labels_
        print(f"eps: {eps}, min samples: {min_samples}")
        print(Counter(labels1))
        dbscan['eps'].append(eps)
        dbscan['min_samples'].append(min_samples)
        dbscan['labels'].append(labels1)


PCA. Number of components ranging from 3 to 5, printing explained variance ratio, factor loading matrix and significant terms for each component.

In [None]:
# Standardize the September features (despite the *_toy names, the input is
# features_sep) and fit a 4-component model on them.
scaler = StandardScaler()
features_toy_scaled = scaler.fit_transform(features_sep)
pca_model_toy = (CustomPCA(n_components=4)
                    .fit(features_toy_scaled))
print(pca_model_toy.explained_variance_ratio_)
# Loadings of the component at index 1.
pca_model_toy.components_[1]

In [None]:
# Labels for the feature columns. The September columns were produced by
# walking the vectorizer vocabulary (tfidf_dict_bigrams) in order and keeping
# the terms of tokens_sep, so the labels are derived the same way. Masking
# bigrams_sep.token directly would only be correct if the CSV were in
# vocabulary order.
sep_terms = set(tokens_sep)
feature_terms_sep = np.array(
    [term for term in tfidf_dict_bigrams if term in sep_terms], dtype=object)
assert len(feature_terms_sep) == pca_model_toy.components_.shape[1], \
    "term labels do not line up with the feature columns"

# For each of the 4 components keep the terms whose rounded loading is >= 0.2.
all_terms = []
for i in range(4):
    loadings = pca_model_toy.components_[i].round(2)
    significant = loadings >= 0.2
    # The same mask selects the scores and their labels, so the pairs stay aligned.
    scores = list(loadings[significant])
    print(scores)
    terms = feature_terms_sep[significant]
    all_terms.append(list(zip(terms, scores)))

all_terms

In [None]:
# Loadings of the last (index 3) of the 4 fitted components.
pca_model_toy.components_[3]