In [None]:
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = stopwords.words('english')

In [None]:
# Load the exported story-URL table into `df`.
df = pd.read_csv("ncov-or-cov-19-or-covid-o-all-story-urls-20200628121924.csv")
# Report (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Selecting only rows where files exist

# A story id is the name of a .txt file without its extension. Directory
# entries that are not .txt files are skipped so they cannot yield bogus ids.
files = [file[:-4] for file in os.listdir('Mediacloud_parsed/') if file.endswith('.txt')]

# Compare ids as strings, since the ids taken from file names are strings.
df['stories_id'] = df['stories_id'].astype('str')
# .copy() detaches the filtered frame from the original, so creating the
# 'Text' column below does not trigger pandas' SettingWithCopyWarning.
df = df[df['stories_id'].isin(files)].copy()
# Placeholder column; it is filled with the article text by the reading step.
df['Text'] = None

In [None]:
#Reading files and adding text to the dataframe

for file in os.listdir('Mediacloud_parsed/'):
    if file.endswith('txt'):
        # NOTE(review): the files are opened with the platform default encoding;
        # confirm how they were written and pass `encoding=` explicitly if needed.
        with open(os.path.join('Mediacloud_parsed', file), 'r') as text:
            # Newlines become spaces (not empty strings) so the last word of a
            # line is not glued to the first word of the next line, which would
            # distort both the word count and the tokens.
            data = text.read().replace('\n', ' ')
        # The file name without its 4-character extension is the story id.
        df.loc[df['stories_id'] == file[:-4], 'Text'] = data

In [None]:
# Dimensions of the frame after attaching the article text.
# TODO (original note: "separate english"): presumably the English-language
# stories still need to be split out — confirm.
df.shape

In [None]:
# Word count per article, summary statistics, and a histogram of the counts.

def _count_words(text):
    """Return the number of whitespace-separated words, or NaN for falsy text."""
    if not text:
        return np.nan
    return len(str(text).split())


df['text_len'] = df['Text'].apply(_count_words)
print(df['text_len'].describe())
df.hist(
    column='text_len',
    bins=15,
    grid=False,
    figsize=(12,8),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
)

In [None]:
# Checkpoint: write the frame (including Text and text_len) to disk.
# to_csv also writes the index by default, so reading this file back yields
# an extra unnamed first column.
df.to_csv('mediacloud_text_df.csv')

In [None]:
# Two length buckets for manual inspection:
#   short  -> at most 100 words
#   medium -> more than 100 and at most 300 words
word_counts = df['text_len']
short_mask = word_counts.le(100)
medium_mask = word_counts.gt(100) & word_counts.le(300)

df_short = df[short_mask].reset_index()
print(df_short.shape)
df_medium = df[medium_mask].reset_index()
print(df_medium.shape)

In [None]:
# Export up to 50 randomly chosen medium-length texts for manual review.
# random_state makes the sample reproducible across runs; min() avoids the
# ValueError that sample() raises when fewer than 50 rows are available.
#df_short.sample(n=50)['Text'].to_csv('mc_short_texts')
df_medium.sample(n=min(50, len(df_medium)), random_state=42)['Text'].to_csv('mc_medium_texts')

In [None]:
# Keep only the articles whose text is longer than 100 words.
# TODO (from the original note): take a sample of articles above and below
# 100 words and manually annotate whether they make sense.

longer_than_100 = df['text_len'].gt(100)
df = df[longer_than_100].reset_index()
df.shape

In [None]:
# Reload the saved frame and keep only the rows that actually have text.
# NOTE(review): the CSV is written before the ">100 words" filter is applied,
# so rebinding `df` here brings the shorter articles back — confirm whether
# that filter is meant to carry over into the token processing below.
df = pd.read_csv('mediacloud_text_df.csv').dropna(subset=['Text'])
df.head()

<b>Stemming vs. Lemmatization: </b> <br/>
Stemming was found to reduce model fit, negligibly affect topic coherence, and negligibly or negatively affect model consistency across random initializations (Schofield and Mimno, 2016). In light of these results, the authors recommended refraining from stemming the corpus as a pre-processing step and instead stemming the top-m word lists as a post-processing step, as needed. <br/>
TO-DO: <br/>
Try models with lemmatized tokens, with stemmed tokens, and with neither.

In [None]:
#Processing the text and returning tokens

def remove(tokens):
    """Strip digits and typographic punctuation from tokens and drop tiny ones.

    Each token has every ASCII digit and each of the characters ’ “ ” –
    deleted. Tokens left with fewer than two characters are discarded.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    digit_re = re.compile('[0-9]')
    special_chars = str.maketrans('', '', '’“”–')

    kept = []
    for raw in tokens:
        candidate = digit_re.sub('', raw).translate(special_chars)
        if len(candidate) > 1:  # skip empty and single-character leftovers
            kept.append(candidate)
    return kept


def text_processing(input_str):
    """Turn the raw text of one article into a list of cleaned tokens.

    Steps, in order: lowercase; replace every ASCII punctuation character
    with a space; collapse whitespace; tokenize with NLTK; drop English stop
    words; lemmatize with WordNet; finally pass the result through ``remove``.

    Parameters
    ----------
    input_str : str
        Raw article text.

    Returns
    -------
    list of str
        The processed tokens.
    """
    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    # split() + join collapses whitespace runs of any length and trims the
    # ends; chained .replace() calls for 4/3/2 spaces miss longer runs.
    normalized = ' '.join(input_str.lower().translate(punctuation_to_space).split())
    input_tokens = nltk.word_tokenize(normalized)
    # Built per call so later changes to `stop_words` are still honoured;
    # set membership avoids scanning the whole stop-word list for every token.
    stop_set = set(stop_words)
    content_tokens = [tok for tok in input_tokens if tok not in stop_set]
    lemmatizer = WordNetLemmatizer()  # lemmatization rather than stemming
    lemmas = [lemmatizer.lemmatize(tok) for tok in content_tokens]
    return remove(lemmas)
    
# Tokenize every article; the function is passed directly, without a lambda wrapper.
df['tokens'] = df['Text'].apply(text_processing)

In [None]:
# Spot-check: show the token list of one randomly chosen row.
df['tokens'].sample(n=1).values

In [None]:
#count the frequency of words and return freq of corona-related terms

def wordListToFreqDict(wordlist, terms):
    """Count how often each of ``terms`` occurs in ``wordlist``.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of one document.
    terms : iterable of str
        Terms of interest.

    Returns
    -------
    list of (str, int)
        One ``(term, count)`` pair for every entry of ``terms`` that occurs in
        ``wordlist``, in the order of ``terms``. Terms that do not occur are
        left out.
    """
    # One pass over the tokens, instead of calling list.count() once per
    # token, which is quadratic in the document length.
    counts = Counter(wordlist)
    return [(term, counts[term]) for term in terms if term in counts]

with open('corona_terms.txt', 'r') as corona_terms:
    # split() with no argument splits on any whitespace and never yields empty
    # strings. dict.fromkeys drops repeated terms while keeping their order, so
    # a term listed twice in the file is not counted twice.
    terms = list(dict.fromkeys(corona_terms.read().split()))

# Per article: the (term, count) pairs for the terms that occur in its tokens.
df['corona_terms'] = df['tokens'].apply(lambda x: wordListToFreqDict(x, terms))
# Per article: the total number of occurrences over all matched terms.
df['corona_freq'] = df['corona_terms'].apply(lambda pairs: sum(count for _, count in pairs))

In [None]:
# Summary statistics of the per-article total count of corona-related terms.
df['corona_freq'].describe()

In [None]:
# Keep the articles in which corona-related terms occur at least 3 times in total.
has_enough_mentions = df['corona_freq'].ge(3)
df_corona = df[has_enough_mentions]
df_corona.shape

In [None]:
# Inspect the per-article (term, count) pairs.
df['corona_terms']

In [None]:
# Save the corona-filtered subset; to_csv also writes the index by default.
df_corona.to_csv('mediacloud_parsed_corona_df.csv')