# Preprocessing

In [51]:
# import packages
import pandas as pd
import numpy as np
import datetime

# visualization packages
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud

# NLP packages
import string, re
import spacy, re
import en_core_web_sm
from spacy.lang.en import English
from spacy import displacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# scikitlearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# optimize viewing
pd.set_option('max_colwidth', 150)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidbruce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/davidbruce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davidbruce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# read in cleaned dataframe
df = pd.read_csv('./data/clean_df.csv')

In [3]:
df = df.drop(labels=['Unnamed: 0'], axis=1)

In [4]:
df.shape

(77259, 7)

In [5]:
# convert date to datetime object
df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

# convert datetime to string without timestamp
df['date'] = df['date'].apply(lambda x: x.strftime('%Y-%m-%d'))

# convert date back to datetime
df['date'] = df['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

## VADER Sentiment Classification

In [6]:
# use vader compound (or composite) score to evaluate the tweet sentiment
vader = SentimentIntensityAnalyzer()

df['vader_score'] = df['tweet'].apply(lambda x: vader.polarity_scores(x)['compound'])

In [7]:
conditions = [
    df['vader_score'] >= .05,
    df['vader_score'] <= -.05
]

classes = ['positive', 'negative']

df['sentiment'] = np.select(conditions, classes, 'neutral')

In [8]:
df.sentiment.value_counts()

positive    31095
negative    25703
neutral     20461
Name: sentiment, dtype: int64

In [9]:
df.sentiment.value_counts(normalize=True)

positive    0.402477
negative    0.332686
neutral     0.264836
Name: sentiment, dtype: float64

In [10]:
df = df.drop(labels='vader_score', axis=1)

In [11]:
# df.head()

In [12]:
# df.tail()

## Sklearn 

In [16]:
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

In [18]:
tf = TfidfVectorizer(lowercase=True, tokenizer=token.tokenize)
text_tf = tf.fit_transform(df['tweet'])
text_tf.data

array([0.13545827, 0.2371028 , 0.24071474, ..., 0.03735544, 0.02551469,
       0.0518535 ])

In [19]:
len(text_tf.data)

2169940

In [21]:
def tfidf(docs, xColNames=None):
    tf = TfidfVectorizer(tokenizer=token.tokenize)
    x1 = tf.fit_transform(docs)
    
    df = pd.DataFrame(x1.toarray().transpose(), index=tf.get_feature_names())
    
    if xColNames is not None:
        df.columns = xColNames
    
    return df

In [23]:
sparse = tfidf(df['tweet']).transpose()

In [24]:
type(sparse)

pandas.core.frame.DataFrame

In [25]:
sparse.shape

(77259, 122638)

In [26]:
sparse.iloc[:,100000:100050]

Unnamed: 0,st0szsenxb,st1,st2c9gbymp,st2fta0kyx,st3dyclmot,st3u6o4zgh,st3v3nparker,st4hptjjcu,st57mmn5g5,st8vritmdz,...,stacked,stacking,stacy,stacyherbert,stacymangia15,stacymcgregor14,stacyontheright,stacysadar,stacyshafer6,stacystranghour
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
77257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
##### text classification from Intro_to_NLP #####

In [35]:
data = df['tweet']
target = df['sentiment']

In [40]:
stopwords = stopwords.words('english')

In [41]:
stopwords += list(string.punctuation)

In [42]:
stopwords = set(stopwords)

In [44]:
def process_tweet(tweet):
    tokens = nltk.word_tokenize(tweet)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords]
    return stopwords_removed

In [45]:
processed_data = list(map(process_tweet, data))

In [49]:
total_vocab = set()
for tweet in processed_data:
    total_vocab.update(tweet)
len(total_vocab)

139512

In [52]:
lemmatizer = WordNetLemmatizer()

In [53]:
lemmatized_output = []

for listy in processed_data:
    lemmed = ' '.join([lemmatizer.lemmatize(w) for w in listy])
    lemmatized_output.append(lemmed)

In [54]:
X_lem = lemmatized_output
y_lem = target

In [63]:
X_train_lem, X_test_lem, y_train_lem, y_test_lem = train_test_split(X_lem, y_lem, random_state=88)
tfidf = TfidfVectorizer()

In [64]:
tfidf_data_train_lem = tfidf.fit_transform(X_train_lem)

In [65]:
tfidf_data_test_lem = tfidf.transform(X_test_lem)

In [66]:
tfidf_data_train_lem

<57944x97613 sparse matrix of type '<class 'numpy.float64'>'
	with 1050101 stored elements in Compressed Sparse Row format>

In [68]:
non_zero_cols = tfidf_data_train_lem.nnz / float(tfidf_data_train_lem.shape[0])
print('Average # of Non-Zero elements in Vectorized Tweets: ', non_zero_cols)

Average # of Non-Zero elements in Vectorized Tweets:  18.12268742233881


In [69]:
percent_sparse = 1 - (non_zero_cols / float(tfidf_data_train_lem.shape[1]))
print('Percentage of cols containing ZERO: ', percent_sparse)

Percentage of cols containing ZERO:  0.9998143414563394


In [70]:
df_freq_pos = df[df['sentiment'] == 'positive']
df_freq_neu = df[df['sentiment'] == 'neutral']
df_freq_neg = df[df['sentiment'] == 'negative']

In [71]:
data_pos = df_freq_pos['tweet']
data_neu = df_freq_neu['tweet']
data_neg = df_freq_neg['tweet']

In [73]:
data_pos.head()

1    “Perhaps more than any other type of medicine, the success of a #vaccine depends on the cooperation of everyone. To protect those who can’t have a...
2    Provide pharmaceutical companies with the opportunity to develop this vaccine and drug as soon as possible and to reach mass production. As you kn...
3    From John's Hopkins: 1)  Immediate risk of COVID-19 remains low in the U.S.  The best way to protect yourself is by practicing the same precaution...
4            If the #stockmarkets keep falling - how confident will you be about a #vaccine for #COVID-19 and funds being available for their development?
5    The Jefferson County Department of Health (@jcdhtweets)  says they're working with local schools and hospitals to prepare for the spread of COVID-...
Name: tweet, dtype: object

In [72]:
pros_pos = list(map(process_tweet, data_pos))
pros_neu = list(map(process_tweet, data_neu))
pros_neg = list(map(process_tweet, data_neg))

TypeError: argument of type 'WordListCorpusReader' is not iterable

## SpaCy NLP Preprocessing

#### Tokenizing

In [27]:
# while the sentiment in this tweet is properly classified as 'negative',
# this person is likely in favor of taking a vaccine, and the negative 
# sentiment is more broadly directed at the context of COVID-19
print(df.loc[0]['tweet'],'\n')
print(df.loc[0]['sentiment'])

For everyone comparing COVID-19 to the flu, just a reminder that even with vaccines the flu kills tens of thousands a year. Now imagine a more deadly virus with no vaccine or assurance that any vaccine developed will be affordable and hi! Welcome to the pandemic! 

negative


In [28]:
# instantiate spaCy English object
nlp = English()

text = df.loc[0]['tweet']

my_doc = nlp(text)

token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['For', 'everyone', 'comparing', 'COVID-19', 'to', 'the', 'flu', ',', 'just', 'a', 'reminder', 'that', 'even', 'with', 'vaccines', 'the', 'flu', 'kills', 'tens', 'of', 'thousands', 'a', 'year', '.', 'Now', 'imagine', 'a', 'more', 'deadly', 'virus', 'with', 'no', 'vaccine', 'or', 'assurance', 'that', 'any', 'vaccine', 'developed', 'will', 'be', 'affordable', 'and', 'hi', '!', 'Welcome', 'to', 'the', 'pandemic', '!']


#### Stopwords

In [29]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

filtered_sent = []

doc = nlp(text)

for word in doc:
    if word.is_stop == False:
        filtered_sent.append(word)
print(filtered_sent)

[comparing, COVID-19, flu, ,, reminder, vaccines, flu, kills, tens, thousands, year, ., imagine, deadly, virus, vaccine, assurance, vaccine, developed, affordable, hi, !, Welcome, pandemic, !]


#### Lemmatization

In [30]:
for word in filtered_sent:
    print(word.text, word.lemma_)

comparing compare
COVID-19 COVID-19
flu flu
, ,
reminder reminder
vaccines vaccine
flu flu
kills kill
tens ten
thousands thousand
year year
. .
imagine imagine
deadly deadly
virus virus
vaccine vaccine
assurance assurance
vaccine vaccine
developed develope
affordable affordable
hi hello
! !
Welcome Welcome
pandemic pandemic
! !


In [31]:
nlp = en_core_web_sm.load()



for word in my_doc:
    print(word.text, word.pos_)

For 
everyone 
comparing 
COVID-19 
to 
the 
flu 
, 
just 
a 
reminder 
that 
even 
with 
vaccines 
the 
flu 
kills 
tens 
of 
thousands 
a 
year 
. 
Now 
imagine 
a 
more 
deadly 
virus 
with 
no 
vaccine 
or 
assurance 
that 
any 
vaccine 
developed 
will 
be 
affordable 
and 
hi 
! 
Welcome 
to 
the 
pandemic 
! 


## [Stackoverflow](https://stackoverflow.com/questions/62139308/preprocessing-tweets-remove-and-eliminate-stop-words-and-remove-user-from)

In [32]:
nlp = spacy.load('en')

OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [33]:
stop_words = [w.lower() for w in stopwords.words()]

AttributeError: 'set' object has no attribute 'words'