In [1]:
import spacy

from textblob import TextBlob

import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.metrics import BigramAssocMeasures
import string

import pandas as pd
import re
import datetime
from time import process_time

**Setup:**

In [3]:
#source identifiers; each one names the per-source CSV files read and written below
sources = [
    'nytimes',
    'CNN',
    'bbcworld',
    'theeconomist',
    'reuters',
    'WSJ',
    'TIME',
    'ABC',
    'washingtonpost',
    'AP',
]

In [8]:
#converts a timestamp string back into a datetime object
def to_dt(x):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime.datetime.

    Raises ValueError (from strptime) when x does not match that format.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    return datetime.datetime.strptime(x, timestamp_format)

In [7]:
#turns a string of a list of tuples back into a list of tuples ('loft')
def str_to_loft(loft):
    """Rebuild a list of tuples ('loft') from its string representation.

    The expected input is the text produced by str() on a list of
    (ENTITY TEXT, ENTITY FREQ, ENTITY LABEL) tuples, e.g.
    "[('Trump', 2, 'PERSON'), ('Italy', 1, 'GPE')]".

    The string is parsed as a Python literal rather than split on commas, so
    entity text that itself contains commas, quotes, brackets or parentheses
    (e.g. 'Washington, D.C.' or "O'Brien") is returned intact.

    Parameters:
        loft: string form of a list of tuples.

    Returns:
        A list of tuples; an empty list for '[]' or a blank string.

    Raises:
        ValueError / SyntaxError: if loft is not a valid Python literal.
    """
    import ast  #local import keeps this cell independent of the imports cell

    #a blank string is treated like an empty list instead of a parse error
    if not loft.strip():
        return []

    return [tuple(entry) for entry in ast.literal_eval(loft)]

In [3]:
#standard cleaning steps (removing text that isn't actually part of the tweet)
def clean_text(text):
    """Strip ellipsis characters and a leading retweet prefix from a tweet.

    Every '…' character is removed. If the result starts with 'RT @',
    everything up to and including the first ':' is dropped (which leaves an
    empty string when there is no ':' at all). Surrounding whitespace is
    trimmed from the returned text.
    """
    without_ellipsis = text.replace('…', '')

    if not without_ellipsis.startswith('RT @'):
        return without_ellipsis.strip()

    #keep only what follows the first colon of the retweet prefix
    _, _, body = without_ellipsis.partition(':')
    return body.strip()

In [5]:
#from github
def remove_emoji(string):
    """Return `string` with every run of characters in the ranges below removed.

    The character class is broad: besides the emoji blocks it also contains
    wide ranges such as U+24C2..U+1F251 and U+10000..U+10FFFF, so any
    character inside those ranges is stripped as well.
    """
    pattern_text = ("["
                    u"\U0001F600-\U0001F64F"  # emoticons
                    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                    u"\U00002500-\U00002BEF"  # U+2500..U+2BEF
                    u"\U00002702-\U000027B0"
                    u"\U00002702-\U000027B0"  # same range listed twice (harmless)
                    u"\U000024C2-\U0001F251"
                    u"\U0001f926-\U0001f937"
                    u"\U00010000-\U0010ffff"  # all supplementary-plane code points
                    u"\u2640-\u2642"
                    u"\u2600-\u2B55"
                    u"\u200d"  # zero width joiner
                    u"\u23cf"
                    u"\u23e9"
                    u"\u231a"
                    u"\ufe0f"  # variation selector-16
                    u"\u3030"
                    "]+")

    return re.sub(pattern_text, r'', string, flags = re.UNICODE)

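**Applying NER:** The NER cell below reads the `_SENT.csv` files written by the **Applying Sentiment Analysis** section further down, so SA must be applied before NER: run that section first.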

In [184]:
#function that returns the detected entities in a text
#output is a list of tuples with format: (ENTITY TEXT, ENTITY FREQ, ENTITY LABEL)
def get_ents(text, nlp):
    """Count the named entities that `nlp` detects in `text`.

    Parameters:
        text: the string to analyse.
        nlp: a callable that takes `text` and returns an object whose `.ents`
            items expose `.text`, `.start_char` and `.label_`.

    Returns:
        A list of (ENTITY TEXT, ENTITY FREQ, ENTITY LABEL) tuples, in order of
        first appearance. Entities with the same text are merged and keep the
        label of their first occurrence.

    Entities are skipped when they
      * contain '@' or '#',
      * are one of the excluded terms (case-insensitive), or
      * are immediately preceded by '@' or '#' in `text`, i.e. they are the
        body of a mention or hashtag.
    """
    exclude = ('coronavirus', 'covid', 'covid-19')
    doc = nlp(text)

    ent_dict = {}
    for ent in doc.ents:
        if '@' in ent.text or '#' in ent.text or ent.text.lower() in exclude:
            continue

        #look at the character right before the entity; the test is
        #`start_char > 0` so that an entity starting at index 1 (e.g. the
        #'Trump' in a tweet beginning with '#Trump') is checked as well
        if ent.start_char > 0 and text[ent.start_char - 1] in ('@', '#'):
            continue

        if ent.text in ent_dict:
            ent_dict[ent.text][0] += 1
        else:
            ent_dict[ent.text] = [1, ent.label_]

    return [(name, count, label) for name, (count, label) in ent_dict.items()]

In [186]:
#applying NER to each data set
#reads <source>_SENT.csv, adds a 'Named_Entities' column and writes <source>_FINAL.csv
nlp = spacy.load('en_core_web_sm')

for s in sources:
    data = pd.read_csv(f'DATA_W_ANAlYSIS/{s}_SENT.csv', index_col = 'ID')

    #missing tweets become empty strings, then emoji and retweet prefixes are removed
    app_col = data['Tweet_Text'].fillna('')
    app_col = app_col.apply(remove_emoji)
    app_col = app_col.apply(clean_text)

    data['Named_Entities'] = app_col.apply(lambda x: get_ents(x, nlp))
    data.to_csv(f'DATA_W_ANAlYSIS/{s}_FINAL.csv')

    #progress marker: one line per finished source
    print(s)

nytimes
CNN
bbcworld
theeconomist
reuters
WSJ
TIME
ABC
washingtonpost
AP


In [15]:
#checking some things for NER application
for s in sources:
    test = pd.read_csv(f'DATA_W_ANAlYSIS/{s}_FINAL.csv', index_col = 'ID')
    other = pd.read_csv(f'CUT_DATA/{s}.csv')

    #restore the column types that the CSV round trip turned into strings
    test['Date_Time'] = test['Date_Time'].apply(to_dt)
    test['Named_Entities'] = test['Named_Entities'].apply(str_to_loft)

    print(f'{s}:')
    #lengths of data sets match -> should be True
    print(len(other) == len(test))
    #all elements of the NEs column are lists -> should be True
    print(test['Named_Entities'].apply(lambda x: type(x) == list).all())
    #there are no null values in NE column -> should be 0
    print(test['Named_Entities'].isnull().sum())
    #there are no null values in SA column -> should be 0
    print(test['Sentiment'].isnull().sum())
    print()

nytimes:
True
True
0
0

CNN:
True
True
0
0

bbcworld:
True
True
0
0

theeconomist:
True
True
0
0

reuters:
True
True
0
0

WSJ:
True
True
0
0

TIME:
True
True
0
0

ABC:
True
True
0
0

washingtonpost:
True
True
0
0

AP:
True
True
0
0



**Applying Sentiment Analysis:** 

In [180]:
#applying sentiment analysis to each data set
#reads CUT_DATA/<source>.csv, adds a 'Sentiment' column and writes <source>_SENT.csv
for s in sources:
    data = pd.read_csv(f'CUT_DATA/{s}.csv', index_col = 'ID')

    #missing tweets become empty strings before cleaning
    app_col = data['Tweet_Text'].fillna('').apply(clean_text)

    #TextBlob polarity of each cleaned tweet, rounded to 3 decimal places
    data['Sentiment'] = app_col.apply(lambda x: round(TextBlob(x).sentiment.polarity, 3))

    data.to_csv(f'DATA_W_ANAlYSIS/{s}_SENT.csv')

In [11]:
#checking the application of SA
for s in sources:
    data = pd.read_csv(f'DATA_W_ANAlYSIS/{s}_SENT.csv', index_col = 'ID')

    #number of null values in the SA column -> should be 0 for every source
    null_count = data['Sentiment'].isnull().sum()
    print(s, null_count)

nytimes 0
CNN 0
bbcworld 0
theeconomist 0
reuters 0
WSJ 0
TIME 0
ABC 0
washingtonpost 0
AP 0


**Establishing a Word Collocations Pipeline:** This isn't used in the paper but could be a good method for future analysis.

In [5]:
data = pd.read_csv('CUT_DATA/bbcworld.csv', index_col = 'ID')

#cleaning and putting in list (normalizing with '.lower()')
#missing tweets are filled BEFORE cleaning, because clean_text only accepts strings
data['Clean_Text'] = data['Tweet_Text'].fillna('').apply(clean_text)
list_of_vals = [x.lower() for x in data['Clean_Text']]

#joining the tweets together and applying SWT
#' ~ ' is the separator, so the last word of one tweet is never adjacent to the first of the next
words = ' ~ '.join(list_of_vals)
tok_words = word_tokenize(words)

#removing punctuation, which could stop contiguous words from being associated
#NOT including ',.:;~?!', these are end-of-line characters and should keep tokens separate
extra_tokens = ['’', "'s", '``', '“', '”', '—', "n't"]
punct = [p for p in string.punctuation if p not in ',.:;~?!'] + extra_tokens
punct_set = set(punct) #set for fast membership tests on every token
no_punct = [w for w in tok_words if w not in punct_set]

finder = BigramCollocationFinder.from_words(no_punct)

#taking out low frequency bigrams
#if not using a word filter, would be good to bump up freq filter
finder.apply_freq_filter(5)

#taking out bigrams with stops
#the stopwords fileid is lowercase; 'English' only resolves on case-insensitive filesystems
stops = stopwords.words('english') + [p for p in string.punctuation] + extra_tokens
stop_set = set(stops)
stops_fil = lambda w: w in stop_set
finder.apply_word_filter(stops_fil)

#looking for bigrams containing a specific word
#the filter REMOVES every bigram for which it returns True, i.e. those without 'coronavirus'
word_fil = lambda *w: 'coronavirus' not in w
finder.apply_ngram_filter(word_fil)

#ranking by PMI
#MAYBE use several association measures here!
print(finder.nbest(BigramAssocMeasures.pmi, 20))

[('coronavirus', 'update'), ('coronavirus', 'outbreak'), ('coronavirus', 'crisis'), ('coronavirus', 'fears'), ('coronavirus', 'pandemic'), ('coronavirus', 'cases'), ('coronavirus', 'deaths'), ('global', 'coronavirus'), ('coronavirus', 'death'), ('latest', 'coronavirus'), ('amid', 'coronavirus'), ('coronavirus', 'lockdown'), ('italy', 'coronavirus')]
