# Preprocessing

In [39]:
# import packages
import pandas as pd
import numpy as np
import datetime

# visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud

# NLP packages
import spacy, re
import en_core_web_sm
from spacy.lang.en import English
from spacy import displacy
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer

# optimize viewing
pd.set_option('max_colwidth', 150)

In [2]:
# read in cleaned dataframe
df = pd.read_csv('./data/clean_df.csv')

In [6]:
df = df.drop(labels=['Unnamed: 0'], axis=1)

In [7]:
df.shape

(77259, 7)

In [8]:
# convert date to datetime object
df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

# convert datetime to string without timestamp
df['date'] = df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

# convert date back to datetime
df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

## VADER Sentiment Classification

In [9]:
# use vader compound (or composite) score to evaluate the tweet sentiment
vader = SentimentIntensityAnalyzer()

df['vader_score'] = df['tweet'].apply(lambda x: vader.polarity_scores(x)['compound'])

In [13]:
conditions = [
    df['vader_score'] >= .05,
    df['vader_score'] <= -.05
]

classes = ['positive', 'negative']

df['sentiment'] = np.select(conditions, classes, 'neutral')

In [14]:
df.sentiment.value_counts()

positive    31095
negative    25703
neutral     20461
Name: sentiment, dtype: int64

In [15]:
df.sentiment.value_counts(normalize=True)

positive    0.402477
negative    0.332686
neutral     0.264836
Name: sentiment, dtype: float64

In [16]:
df = df.drop(labels='vader_score', axis=1)

In [17]:
df.head()

Unnamed: 0,date,tweet,username,link,nlikes,nreplies,nretweets,sentiment
0,2020-02-28,"For everyone comparing COVID-19 to the flu, just a reminder that even with vaccines the flu kills tens of thousands a year. Now imagine a more dea...",MonaIbrahim,https://twitter.com/MonaIbrahim/status/1233542386669641728,3,0,0,negative
1,2020-02-28,"“Perhaps more than any other type of medicine, the success of a #vaccine depends on the cooperation of everyone. To protect those who can’t have a...",Moratel_Entles,https://twitter.com/Moratel_Entles/status/1233542196193824769,1,0,0,positive
2,2020-02-28,Provide pharmaceutical companies with the opportunity to develop this vaccine and drug as soon as possible and to reach mass production. As you kn...,help_4_Iranian,https://twitter.com/help_4_Iranian/status/1233542052161507328,0,0,0,positive
3,2020-02-28,From John's Hopkins: 1) Immediate risk of COVID-19 remains low in the U.S. The best way to protect yourself is by practicing the same precaution...,TruthFreedom17,https://twitter.com/TruthFreedom17/status/1233541922108694528,1,1,0,positive
4,2020-02-28,If the #stockmarkets keep falling - how confident will you be about a #vaccine for #COVID-19 and funds being available for their development?,notarywise,https://twitter.com/notarywise/status/1233541049903177728,0,0,0,positive


In [18]:
df.tail()

Unnamed: 0,date,tweet,username,link,nlikes,nreplies,nretweets,sentiment
77254,2020-12-12,"Although it’s great news about the #Covid vaccine, it will take time to roll it out across the country. So it's as important as ever to follow the...",coventrycc,https://twitter.com/coventrycc/status/1337751621304987648,0,0,0,positive
77255,2020-12-12,"""FDA approves Pfizer’s COVID-19 vaccine for emergency distribution in the US"" https://t.co/aTAzNLRBcw",gloriahunterlov,https://twitter.com/gloriahunterlov/status/1337751614032064519,0,0,0,neutral
77256,2020-12-12,"Walmart is prepping its 5,000-plus pharmacies to receive doses of the COVID-19 vaccine once it is released in the U.S, The retailer is also imple...",O_Laborne,https://twitter.com/O_Laborne/status/1337751606142652419,1,0,0,neutral
77257,2020-12-12,#GAHaveYouHeard FACT: There is no COVID-19 virus in the vaccine. The vaccine imitates the infection so that our bodies create the antibody defense...,GaDPH,https://twitter.com/GaDPH/status/1337751601658933251,11,0,6,negative
77258,2020-12-12,A COVID-19 response that prioritizes health care workers includes: 💉 Ensuring they are 1st in line for treatment and a future vaccine 🏋️‍♀‍ Provid...,myrighttochoice,https://twitter.com/myrighttochoice/status/1337751601109491714,1,0,0,positive


## SpaCy NLP Preprocessing

#### Tokenizing

In [29]:
# while the sentiment in this tweet is properly classified as 'negative',
# this person is likely in favor of taking a vaccine, and the negative 
# sentiment is more broadly directed at the context of COVID-19
print(df.loc[0]['tweet'],'\n')
print(df.loc[0]['sentiment'])

For everyone comparing COVID-19 to the flu, just a reminder that even with vaccines the flu kills tens of thousands a year. Now imagine a more deadly virus with no vaccine or assurance that any vaccine developed will be affordable and hi! Welcome to the pandemic! 

negative


In [25]:
# instantiate spaCy English object
nlp = English()

text = df.loc[0]['tweet']

my_doc = nlp(text)

token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['For', 'everyone', 'comparing', 'COVID-19', 'to', 'the', 'flu', ',', 'just', 'a', 'reminder', 'that', 'even', 'with', 'vaccines', 'the', 'flu', 'kills', 'tens', 'of', 'thousands', 'a', 'year', '.', 'Now', 'imagine', 'a', 'more', 'deadly', 'virus', 'with', 'no', 'vaccine', 'or', 'assurance', 'that', 'any', 'vaccine', 'developed', 'will', 'be', 'affordable', 'and', 'hi', '!', 'Welcome', 'to', 'the', 'pandemic', '!']


#### Stopwords

In [31]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

filtered_sent = []

doc = nlp(text)

for word in doc:
    if word.is_stop == False:
        filtered_sent.append(word)
print(filtered_sent)

[comparing, COVID-19, flu, ,, reminder, vaccines, flu, kills, tens, thousands, year, ., imagine, deadly, virus, vaccine, assurance, vaccine, developed, affordable, hi, !, Welcome, pandemic, !]


#### Lemmatization

In [35]:
for word in filtered_sent:
    print(word.text, word.lemma_)

comparing compare
COVID-19 COVID-19
flu flu
, ,
reminder reminder
vaccines vaccine
flu flu
kills kill
tens ten
thousands thousand
year year
. .
imagine imagine
deadly deadly
virus virus
vaccine vaccine
assurance assurance
vaccine vaccine
developed develope
affordable affordable
hi hello
! !
Welcome Welcome
pandemic pandemic
! !


In [38]:
nlp = en_core_web_sm.load()



for word in my_doc:
    print(word.text, word.pos_)

For 
everyone 
comparing 
COVID-19 
to 
the 
flu 
, 
just 
a 
reminder 
that 
even 
with 
vaccines 
the 
flu 
kills 
tens 
of 
thousands 
a 
year 
. 
Now 
imagine 
a 
more 
deadly 
virus 
with 
no 
vaccine 
or 
assurance 
that 
any 
vaccine 
developed 
will 
be 
affordable 
and 
hi 
! 
Welcome 
to 
the 
pandemic 
! 


## [Stackoverflow](https://stackoverflow.com/questions/62139308/preprocessing-tweets-remove-and-eliminate-stop-words-and-remove-user-from)

In [None]:
nlp = spacy.load('en')

In [None]:
stop_words = [w.lower() for w in stopwords.words()]