In [None]:
import pandas as pd
import ast 
from collections import Counter
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from factor_analyzer.factor_analyzer import calculate_kmo
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from advanced_pca import CustomPCA

In [None]:
# Load the corpus with pre-processed tokens and drop articles without text.
# Filtering and the column conversion are done in a single chain so that `df`
# is a fresh frame rather than a slice of the raw one (assigning a column on a
# filtered slice can trigger pandas' SettingWithCopyWarning).
df = (
    pd.read_csv("preprocessed_results/mediacloud_parsed_corona_df.csv")
    .dropna(subset=['Text'])
    # the CSV stores each token list as a string; parse it back into a list
    .assign(tokens=lambda frame: frame['tokens'].apply(ast.literal_eval))
)
df.head()

In [None]:
# Collect every token of every document into one flat list, count the
# occurrences, and keep the 500 most common (token, count) pairs.
flatten_tokens = []
for doc_tokens in df['tokens'].tolist():
    flatten_tokens.extend(doc_tokens)
counter_tokens = Counter(flatten_tokens)
most_frequent = counter_tokens.most_common(500)

In [None]:
# Save the (token, count) pairs to a CSV file so the list can be cleaned by hand.
# newline='' is what the csv module requires for files it writes to (otherwise
# line endings get doubled on Windows); the explicit encoding keeps non-ASCII
# tokens independent of the platform default.
with open('most_frequent_tokens.csv', "w", newline='', encoding='utf-8') as the_file:
    csv.register_dialect("custom", delimiter=",", skipinitialspace=True)
    writer = csv.writer(the_file, dialect="custom")
    writer.writerows(most_frequent)  # one row per (token, count) tuple

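Manual step: all names of persons, locations, and organisations, as well as all dates and times, are removed by hand from `most_frequent_tokens.csv`. The cleaned list is saved as `preprocessed_results/most_frequent_tokens_cleaned.csv`, which the next cell loads.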

In [None]:
# Load the hand-cleaned frequency list (the file has no header row) and add a
# 'tfidf' column initialised to 0; it is filled in further down.
tokens = pd.read_csv(
    'preprocessed_results/most_frequent_tokens_cleaned.csv',
    names=['token', 'frequency'],
    header=None,
).assign(tfidf=0)

In [None]:
def dummy_fun(doc):
    """Return the document unchanged.

    The documents are already lists of tokens, so this identity function is
    passed to the vectorizers as both preprocessor and tokenizer to stop them
    from processing the input again.
    """
    return doc

# Raw term counts, built directly from the pre-tokenized documents.
cv = CountVectorizer(analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)
data = cv.fit_transform(df['tokens'])

tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(data)
# Dictionary mapping each vocabulary term to its IDF weight (note: `idf_` is
# the inverse document frequency of the term, not a per-document tf-idf score).
# `vocabulary_` maps term -> column index, which is exactly the index into
# `idf_`; it is used instead of `get_feature_names()`, which was removed in
# scikit-learn 1.2.
word2tfidf = {term: tfidf_transformer.idf_[column]
              for term, column in cv.vocabulary_.items()}


In [None]:
# Copy the weight of every frequent token into the 'tfidf' column with a single
# vectorized lookup. Tokens that are missing from `word2tfidf` get 0.
# (Looping over the whole vocabulary and rebuilding the token list on each
# iteration is quadratic, and writing floats into the integer column cell by
# cell is deprecated in recent pandas.)
tokens['tfidf'] = tokens['token'].map(word2tfidf).fillna(0)

tokens

In [None]:
# Keep only the frequent tokens for each document.
# The lookup set is built once here; rebuilding a list from the DataFrame
# column for every single token makes the filter needlessly slow.
frequent_tokens = frozenset(tokens['token'])

def filter_tokens(all_tokens):
    """Return the tokens of one document that are in the frequent-token list.

    Order and duplicates of `all_tokens` are preserved.
    """
    return [token for token in all_tokens if token in frequent_tokens]

df['filtered_tokens'] = df['tokens'].apply(filter_tokens)

In [None]:
# TF-IDF vectorizer for documents that are already lists of tokens:
# preprocessing and tokenization are both replaced by the identity function.
tfidf = TfidfVectorizer(
    token_pattern=None,
    preprocessor=dummy_fun,
    tokenizer=dummy_fun,
    analyzer='word',
)

# Learn the vocabulary from the filtered documents and vectorize them.
tokens_transformed = tfidf.fit_transform(df['filtered_tokens'])

In [None]:
# Sanity check: show the tf-idf vector of the first document
# (presumably a SciPy sparse row, so only non-zero entries are printed).
print(tokens_transformed[0])

In [None]:
# Kaiser-Meyer-Olkin test on the dense tf-idf matrix.
# Per the factor_analyzer docs, the first return value holds one KMO score per
# column (token) and the second is the overall KMO score - confirm against the
# installed version.
kmo_all,kmo_model=calculate_kmo(tokens_transformed.toarray())

In [None]:
# Display the second value returned by calculate_kmo
# (presumably the overall KMO score of the matrix).
kmo_model

In [None]:
# Keep only the features (columns) whose KMO score is above the threshold.
# `kmo_all` has one entry per column of the tf-idf matrix, so the mask has to
# be applied along the column axis; using the feature index as a row index
# would copy documents instead of features (and fail when there are fewer
# documents than features).
# Rejected columns are left as zeros so the matrix keeps its original shape
# and its columns stay aligned with the vectorizer's vocabulary.
KMO_THRESHOLD = 0.5
keep_feature = np.asarray(kmo_all) > KMO_THRESHOLD
features_pca = np.where(keep_feature, tokens_transformed.toarray(), 0.0)

print(len(features_pca), tokens_transformed.shape)

Running PCA on the filtered tokens

In [None]:
# Standardize the features before PCA.
scaler = StandardScaler()
features_pca_scaled = scaler.fit_transform(features_pca)

# Terms in the column order of the tf-idf matrix. The vectorizer orders its
# columns by its own vocabulary, which is not the row order of the `tokens`
# DataFrame, so loadings must be labelled from `tfidf.vocabulary_`.
feature_names = np.array(sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get))
LOADING_THRESHOLD = 0.1  # minimum |loading| (after rounding to 1 decimal) to list a term

pca_results = {'Num_of_components': [],
               'Explained_variance': [],
               'Terms': []
               }
# Fit one PCA per number of components and record, for each run, the total
# explained variance and the terms that load on every component.
for n in range(3, 21):
    pca_model = CustomPCA(n_components=n).fit(features_pca_scaled)
    pca_results['Num_of_components'].append(n)
    pca_results['Explained_variance'].append(sum(pca_model.explained_variance_ratio_))
    terms = []
    for i in range(n):
        loadings = pca_model.components_[i].round(1)
        # same selection as (loading > 0.1) | (loading < -0.1)
        terms.append(feature_names[np.abs(loadings) > LOADING_THRESHOLD].tolist())
    pca_results['Terms'].append(terms)

pca_results_df = pd.DataFrame(pca_results)

In [None]:
# Persist the PCA sweep and plot total explained variance against the
# number of components.
pca_results_df.to_csv('results/mediacloud_pca_results.csv')
pca_results_df.plot(kind='line', x='Num_of_components', y='Explained_variance')

In [None]:
# Terms per component for the row labelled 17. Rows are appended for
# n = 3..20 in order, so with the default index this is the 20-component run.
pca_results_df['Terms'][17]

In [None]:
# Full PCA (all components) to see how the explained variance accumulates.
cummulative_pca = PCA().fit(features_pca_scaled)

fig, ax = plt.subplots(figsize=(8,6))
x_values = range(1, cummulative_pca.n_components_+1)
ax.plot(x_values, cummulative_pca.explained_variance_ratio_, lw=2, label='explained variance')
ax.plot(x_values, np.cumsum(cummulative_pca.explained_variance_ratio_), lw=2, label='cumulative explained variance')
ax.set_title('PCA on filtered tokens : explained variance of components')
ax.set_xlabel('principal component')
ax.set_ylabel('explained variance')
ax.legend()  # without this the two labelled curves cannot be told apart
plt.show()

In [None]:
# Scratch example on the UCI iris data. It is loaded under its own name so it
# does not overwrite the corpus DataFrame `df` used by the cells above.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
# load dataset into Pandas DataFrame (the file has no header row)
iris_df = pd.read_csv(url, names=features + ['target'])
# Separating out the features
x = iris_df.loc[:, features].values
# Separating out the target
y = iris_df.loc[:,['target']].values
# Standardizing the features (StandardScaler is already imported in the first cell)
x = StandardScaler().fit_transform(x)
x

In [None]:
# Number of documents (rows) in the tf-idf matrix; read from the matrix shape
# directly instead of converting the whole matrix to a dense array first.
tokens_transformed.shape[0]