In [None]:
import pandas as pd
import ast 
from collections import Counter
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from factor_analyzer.factor_analyzer import calculate_kmo
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from advanced_pca import CustomPCA
import gensim

In [None]:
# Load the pre-processed articles. The 'tokens' column is stored in the CSV as
# the string representation of a list.
df = pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df.csv")
# Keep only the rows whose 'Text' value is present.
df = df[df['Text'].notnull()]
# Turn every stringified token list back into a real Python list.
df['tokens'] = df['tokens'].apply(lambda raw: ast.literal_eval(raw))
df.head()

In [None]:
# Pool the tokens of all documents into one flat list, count them, and keep the
# 500 most frequent (token, count) pairs.
flatten_tokens = [
    tok
    for doc_token_list in df['tokens'].tolist()
    for tok in doc_token_list
]
counter_tokens = Counter()
counter_tokens.update(flatten_tokens)
most_frequent = counter_tokens.most_common(500)

In [None]:
# Save the (token, count) pairs to a CSV file.
# The "custom" dialect is registered as before; registration is global to the
# csv module, so it does not need to happen inside the `with` block.
csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
# newline="" is what the csv module asks for on files it writes to (otherwise
# extra blank rows can appear on platforms that translate line endings), and an
# explicit utf-8 encoding keeps non-ASCII tokens from depending on the locale.
with open('most_frequent_tokens.csv', "w", newline="", encoding="utf-8") as the_file:
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)

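The list is then cleaned manually: all names of persons, locations, and organisations, as well as all dates and times, are removed. The cleaned list is saved as `most_frequent_tokens_cleaned_v2.csv`, which is loaded in the next cell.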

In [None]:
# Load the hand-cleaned shortlist (the file has no header row) and add a
# 'tfidf' column initialised to 0; a later cell fills it in.
tokens = pd.read_csv(
    'most_frequent_tokens_cleaned_v2.csv',
    names=['token', 'frequency'],
    header=None,
)
tokens.loc[:, 'tfidf'] = 0

In [None]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Passed as both ``tokenizer`` and ``preprocessor`` to the vectorizers in this
    notebook, which are fed documents that are already lists of tokens.
    """
    return doc

# Bag-of-words counts over the already-tokenised documents: the identity
# tokenizer/preprocessor make the vectorizer take each token list as it is.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['tokens'])

tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back to the old name on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dictionary from each vocabulary word to its weight in tfidf_transformer.idf_
# (note: these are the fitted IDF weights, one per vocabulary word).
word2tfidf = dict(zip(feature_names, tfidf_transformer.idf_))


In [None]:
# Look up the weight of every shortlisted token in word2tfidf; tokens that are
# missing from the dictionary get 0.0. One pass over the shortlist replaces the
# loop over the whole vocabulary (which rebuilt the token list on every
# iteration) and creates the column as floats instead of writing floats into an
# integer column.
tokens['tfidf'] = [word2tfidf.get(token, 0.0) for token in tokens['token']]

tokens

In [None]:
#keep only the frequent tokens for each document
def filter_tokens(all_tokens):
    """Return the tokens of one document that appear in the shortlist.

    Args:
        all_tokens: iterable with the (hashable) tokens of a single document.

    Returns:
        np.ndarray holding the tokens that are present in ``tokens['token']``,
        in their original order and with repetitions preserved.
    """
    # A set gives constant-time membership tests; the previous version rebuilt
    # the full shortlist as a list and ran a DataFrame lookup for every token,
    # only to get the same token back.
    shortlist = set(tokens['token'])
    return np.array([token for token in all_tokens if token in shortlist])

tokens_transformed = df['tokens'].apply(filter_tokens)

In [None]:
# Document-by-token matrix: cell (i, j) holds the weight of shortlist token j
# when that token occurs in document i, and 0 otherwise.
tfidf_frequent = np.zeros((len(df['tokens']),len(tokens)))
shortlist_tokens = tokens['token'].tolist()
shortlist_weights = tokens['tfidf'].tolist()
# Walk the documents by position. tokens_transformed inherits df's index, which
# has gaps after the null-Text rows were filtered out, so the label lookup
# tokens_transformed[i] raises KeyError as soon as a label is missing.
for i, doc_tokens in enumerate(tokens_transformed):
    present = set(doc_tokens)
    for j, (token, weight) in enumerate(zip(shortlist_tokens, shortlist_weights)):
        if token in present:
            tfidf_frequent[i, j] = weight

tfidf_frequent.shape

In [None]:
# TF-IDF vectorizer for documents that are already lists of tokens (identity
# tokenizer/preprocessor, no token pattern).
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

# No cell in this notebook assigns df['filtered_tokens'], so indexing that column
# fails on a fresh kernel. The per-document filtered tokens live in
# tokens_transformed, so fit on that instead. The result gets its own name so the
# tokens_transformed Series is not replaced by a sparse matrix.
tfidf_filtered_matrix = tfidf.fit_transform(tokens_transformed) #fit and transform to vectors

In [None]:
# Peek at the first row (first document) of the document-by-token weight matrix.
tfidf_frequent[0]

In [None]:
# Show the shape obtained when tokens_transformed is converted to a NumPy array.
np.array(tokens_transformed).shape #= [np.array(token_list) for token_list in tokens_transformed]

In [None]:
# Run calculate_kmo on the document-by-token weight matrix; it returns two
# values, unpacked here as kmo_all and kmo_model.
kmo_all, kmo_model = calculate_kmo(
    np.array(tfidf_frequent)
)

In [None]:
# Display the second value returned by calculate_kmo
# (presumably the overall KMO score -- confirm against the factor_analyzer docs).
kmo_model

In [None]:
# Keep the columns (tokens) whose KMO score is above 0.5 and leave the others at
# zero, so the matrix keeps one column per entry of kmo_all.
# kmo_all has one score per column, so the filter has to select columns. Copying
# row i for a passing score i mixed documents up with tokens and left every row
# beyond len(kmo_all) filled with zeros.
features_pca = np.zeros((tfidf_frequent.shape[0], len(kmo_all)))
keep_columns = np.asarray(kmo_all) > 0.5  #keeping only those that have kmo over 0.5
features_pca[:, keep_columns] = tfidf_frequent[:, keep_columns]

print(len(features_pca), tfidf_frequent.shape)

Running PCA on the filtered tokens

In [None]:
# Standardise the features, then fit one PCA per component count from 3 to 20,
# recording the summed explained-variance ratio and, for every component, the
# (term, loading) pairs whose rounded loading is outside [-0.1, 0.1].
scaler = StandardScaler()
features_pca_scaled = scaler.fit_transform(features_pca)

pca_results = {
    'Num_of_components': [],
    'Explained_variance': [],
    'Terms': [],
}
for n in range(3, 21):
    pca_model = CustomPCA(n_components=n).fit(features_pca_scaled)
    pca_results['Num_of_components'].append(n)
    pca_results['Explained_variance'].append(sum(pca_model.explained_variance_ratio_))

    all_terms = []
    for i in range(n):
        # Loadings are rounded to one decimal before thresholding.
        loadings = pca_model.components_[i].round(1)
        strong = (loadings > 0.1) | (loadings < -0.1)
        scores = list(loadings[strong])
        terms = tokens.token[strong]
        all_terms.append(list(zip(terms, scores)))
    pca_results['Terms'].append(all_terms)

pca_results_df = pd.DataFrame(pca_results)

In [None]:
# Terms stored at row label 5 of the sweep, i.e. the run with n = 8 components.
pca_results_df['Terms'][5]

In [None]:
# Write the sweep results to disk and plot the explained variance against the
# number of components.
pca_results_df.to_csv(
    'results/mediacloud_pca_results_shortlist.csv'
)
pca_results_df.plot.line(
    x='Num_of_components',
    y='Explained_variance',
)

In [None]:
# Export only the 'Terms' column, then print the entry at row label 17
# (the run with n = 20 components).
pca_results_df['Terms'].to_csv('pca_terms.csv')
print(pca_results_df.loc[17, 'Terms'])

In [None]:
# Fit a PCA with the default number of components and plot, per component, the
# explained-variance ratio together with its running sum.
cummulative_pca = PCA().fit(features_pca_scaled)

fig, ax = plt.subplots(figsize=(8,6))
x_values = range(1, cummulative_pca.n_components_+1)
ax.plot(x_values, cummulative_pca.explained_variance_ratio_, lw=2, label='explained variance')
ax.plot(x_values, np.cumsum(cummulative_pca.explained_variance_ratio_), lw=2, label='cumulative explained variance')
ax.set_title('PCA on filtered tokens : explained variance of components')
ax.set_xlabel('principal component')
ax.set_ylabel('explained variance')
# Both curves carry a label, but labels are only drawn once a legend is added.
ax.legend()
plt.show()

Creating bigrams

In [None]:
# Train a gensim Phrases model on the token lists (a higher threshold gives
# fewer phrases), then wrap it in a Phraser, which is what gets applied to the
# documents below.
bigram = gensim.models.Phrases(
    df['tokens'],
    min_count=3,
    threshold=50,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
def make_bigrams(texts):
    """Apply the trained phrase model ``bigram_mod`` to every document.

    Returns a list with one transformed token sequence per document in ``texts``.
    """
    transformed = []
    for doc in texts:
        transformed.append(bigram_mod[doc])
    return transformed

df['bigrams'] = make_bigrams(df['tokens'])
df['bigrams']

In [None]:
# Pool the phrase-merged tokens of all documents, count them, and keep the 500
# most frequent (token, count) pairs.
flatten_bigrams = [
    tok
    for doc_token_list in df['bigrams'].tolist()
    for tok in doc_token_list
]
counter_bigrams = Counter()
counter_bigrams.update(flatten_bigrams)
most_frequent = counter_bigrams.most_common(500)

In [None]:
# Save the (token, count) pairs to a CSV file.
# The "custom" dialect is registered as before; registration is global to the
# csv module, so it does not need to happen inside the `with` block.
csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
# newline="" is what the csv module asks for on files it writes to (otherwise
# extra blank rows can appear on platforms that translate line endings). The
# file is read back with pandas, whose default encoding is utf-8, so it is
# written as utf-8 explicitly instead of in the locale's encoding.
with open('most_frequent_tokens_bigrams.csv', "w", newline="", encoding="utf-8") as the_file:
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)

In [None]:
# Load the 500 most frequent phrase-merged tokens (the file has no header row)
# and add a 'tfidf' column initialised to 0; a later cell fills it in.
tokens_bigrams = pd.read_csv('most_frequent_tokens_bigrams.csv', header=None, names=['token', 'frequency'])
tokens_bigrams['tfidf'] = 0

# Bag-of-words counts over the already-tokenised documents: the identity
# tokenizer/preprocessor make the vectorizer take each token list as it is.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['bigrams'])

tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement when available and fall back to the old name on older installs.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Dictionary from each vocabulary word to its weight in tfidf_transformer.idf_
# (note: these are the fitted IDF weights, one per vocabulary word).
word2tfidf = dict(zip(feature_names, tfidf_transformer.idf_))

In [None]:
# Look up the weight of every listed token in word2tfidf; tokens that are
# missing from the dictionary get 0.0. One pass over the list replaces the loop
# over the whole vocabulary (which rebuilt the token list on every iteration)
# and creates the column as floats instead of writing floats into an integer
# column.
tokens_bigrams['tfidf'] = [word2tfidf.get(token, 0.0) for token in tokens_bigrams['token']]

tokens_bigrams

In [None]:
def filter_tokens_bigrams(all_tokens):
    """Return the tokens of one document that appear in the bigram shortlist.

    Args:
        all_tokens: iterable with the (hashable) tokens of a single document.

    Returns:
        np.ndarray holding the tokens that are present in
        ``tokens_bigrams['token']``, in their original order and with
        repetitions preserved.
    """
    # A set gives constant-time membership tests; the previous version rebuilt
    # the full shortlist as a list and ran a DataFrame lookup for every token,
    # only to get the same token back.
    shortlist = set(tokens_bigrams['token'])
    return np.array([token for token in all_tokens if token in shortlist])

tokens_transformed = df['bigrams'].apply(filter_tokens_bigrams)

In [None]:
# Document-by-token matrix: cell (i, j) holds the weight of listed token j when
# that token occurs in document i, and 0 otherwise.
tfidf_frequent_bigrams = np.zeros((len(df['bigrams']),len(tokens_bigrams)))
shortlist_tokens = tokens_bigrams['token'].tolist()
shortlist_weights = tokens_bigrams['tfidf'].tolist()
# Walk the documents by position. tokens_transformed inherits df's index, which
# has gaps after the null-Text rows were filtered out, so the label lookup
# tokens_transformed[i] raises KeyError as soon as a label is missing.
for i, doc_tokens in enumerate(tokens_transformed):
    present = set(doc_tokens)
    for j, (token, weight) in enumerate(zip(shortlist_tokens, shortlist_weights)):
        if token in present:
            tfidf_frequent_bigrams[i, j] = weight

tfidf_frequent_bigrams.shape

In [None]:
# Run calculate_kmo on the bigram weight matrix; it returns two values, unpacked
# here as kmo_all_bi and kmo_model_bi. The second one is displayed.
kmo_all_bi, kmo_model_bi = calculate_kmo(
    np.array(tfidf_frequent_bigrams)
)
kmo_model_bi

In [None]:
# Keep the columns (tokens) whose KMO score is above 0.5 and leave the others at
# zero, so the matrix keeps one column per entry of kmo_all_bi.
# kmo_all_bi has one score per column, so the filter has to select columns.
# Copying row i for a passing score i mixed documents up with tokens and left
# every row beyond len(kmo_all_bi) filled with zeros.
features_bigrams = np.zeros((tfidf_frequent_bigrams.shape[0], len(kmo_all_bi)))
keep_columns = np.asarray(kmo_all_bi) > 0.5  #keeping only those that have kmo over 0.5
features_bigrams[:, keep_columns] = tfidf_frequent_bigrams[:, keep_columns]

print(len(features_bigrams), tfidf_frequent_bigrams.shape)

In [None]:
# Standardise the bigram features, then fit one PCA per component count from 3
# to 20, recording the summed explained-variance ratio and, for every component,
# the (term, loading) pairs whose rounded loading is outside [-0.1, 0.1].
scaler = StandardScaler()
features_bi_scaled = scaler.fit_transform(features_bigrams)

pca_results_bi = {
    'Num_of_components': [],
    'Explained_variance': [],
    'Terms': [],
}
for n in range(3, 21):
    pca_model = CustomPCA(n_components=n).fit(features_bi_scaled)
    pca_results_bi['Num_of_components'].append(n)
    pca_results_bi['Explained_variance'].append(sum(pca_model.explained_variance_ratio_))

    all_terms = []
    for i in range(n):
        # Loadings are rounded to one decimal before thresholding.
        loadings = pca_model.components_[i].round(1)
        strong = (loadings > 0.1) | (loadings < -0.1)
        scores = list(loadings[strong])
        terms = tokens_bigrams.token[strong]
        all_terms.append(list(zip(terms, scores)))
    pca_results_bi['Terms'].append(all_terms)

pca_results_bi_df = pd.DataFrame(pca_results_bi)

In [None]:
# Terms stored at row label 5 of the bigram sweep, i.e. the run with n = 8 components.
pca_results_bi_df['Terms'][5]

In [None]:
# Write the bigram sweep results to disk and plot the explained variance against
# the number of components.
pca_results_bi_df.to_csv(
    'results/mediacloud_pca_bigrams_results_shortlist.csv'
)
pca_results_bi_df.plot.line(
    x='Num_of_components',
    y='Explained_variance',
)

In [None]:
# Fit a PCA with the default number of components on the scaled bigram features
# and plot, per component, the explained-variance ratio together with its
# running sum.
cummulative_pca = PCA().fit(features_bi_scaled)

fig, ax = plt.subplots(figsize=(8,6))
x_values = range(1, cummulative_pca.n_components_+1)
ax.plot(x_values, cummulative_pca.explained_variance_ratio_, lw=2, label='explained variance')
ax.plot(x_values, np.cumsum(cummulative_pca.explained_variance_ratio_), lw=2, label='cumulative explained variance')
ax.set_title('PCA on filtered tokens : explained variance of components')
ax.set_xlabel('principal component')
ax.set_ylabel('explained variance')
# Both curves carry a label, but labels are only drawn once a legend is added.
ax.legend()
plt.show()