# TODO: 
    1) Neural Net / Transformer of some kind (do this last if at all really?)
    2) General cleanup/commenting/efficiency
    3) Wordcloud visuals?
    4) More EDA (geographic visuals)


In [158]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import re 
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [159]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/cam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Let's try a bunch of classification methods.

    1) Logistic Regression
    2) Naive Bayes
    3) SVM
    4) RandomForest
    5) RNN (BERT? GPT-3?)
    
    TFIDF vs CountVectorizer? Other pipeline methods?

In [188]:
# Load in training data
df = pd.read_csv('Corona_NLP_train.csv', encoding = 'latin1')
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [189]:
# Load in test data
test_df = pd.read_csv('Corona_NLP_test.csv', encoding = 'latin1')
test_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [190]:
# Check for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41157 entries, 0 to 41156
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   UserName       41157 non-null  int64 
 1   ScreenName     41157 non-null  int64 
 2   Location       32567 non-null  object
 3   TweetAt        41157 non-null  object
 4   OriginalTweet  41157 non-null  object
 5   Sentiment      41157 non-null  object
dtypes: int64(2), object(4)
memory usage: 1.9+ MB


In [192]:
# Replace na with 'None'
df['Location'].fillna('None', inplace = True)

In [193]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [165]:
# Join stopwords together and set them for use in cleaning function.
", ".join(stopwords.words('english'))
stops = set(stopwords.words('english'))

def clean_tweet(tweet):
    # Remove hyperlinks.
    tweet= re.sub(r'https?://\S+|www\.\S+','',tweet)
    # Remove html
    tweet = re.sub(r'<.*?>','',tweet)
    # Remove numbers (Do we want to remove numbers? Death toll?)
    tweet = re.sub(r'\d+','',tweet)
    # Remove mentions
    tweet = re.sub(r'@\w+','',tweet)
    # Remove punctuation
    tweet = re.sub(r'[^\w\s\d]','',tweet)
    # Remove whitespace
    tweet = re.sub(r'\s+',' ',tweet).strip()
    # Remove stopwords
    tweet = " ".join([word for word in str(tweet).split() if word not in stops])
    
    return tweet.lower()

In [166]:
# Check function
example2 = df['OriginalTweet'][1]
#example="@username these, are words!  a the COvid# 19 https://www.kaggle.com/" 
clean_tweet(example2)

'advice talk neighbours family exchange phone numbers create contact list phone numbers neighbours schools employer chemist gp set online shopping accounts poss adequate supplies regular meds order'

In [167]:
example2

'advice Talk to your neighbours family to exchange phone numbers create contact list with phone numbers of neighbours schools employer chemist GP set up online shopping accounts if poss adequate supplies of regular meds but not over order'

In [194]:
# View dataframe
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [195]:
# Create more useful labels for classification.
# We will take the original 5 possibilites and
# reduce them to 3, removing the "extremelys".
def make_label(sentiment):
    
    label = ''
    if 'Positive' in sentiment: 
        label = 1
    if 'Negative' in sentiment:
        label = -1
    if 'Neutral' in sentiment:
        label = 0
    return label

In [196]:
# Sanity check
make_label('Extremely Negative')

-1

In [198]:
# Apply make_label funtion to training and test dataframes.
df['label'] = df['Sentiment'].apply(lambda x: make_label(x))
test_df['label'] = test_df['Sentiment'].apply(lambda x: make_label(x))

In [199]:
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,-1


In [200]:
# Apply text cleaning function to training dataframe.
df['newTweet'] = df['OriginalTweet'].apply(lambda x: clean_tweet(x))
df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label,newTweet
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,0,
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,1,advice talk neighbours family exchange phone n...
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,1,coronavirus australia woolworths give elderly ...
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,1,my food stock one empty please dont panic ther...
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,-1,me ready go supermarket covid outbreak not im ...


In [203]:
# Create SVC model using a pipeline with tfidf.
svc = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf', LinearSVC()),
                    ])
# Set X and y to proper data.
X = df['newTweet']
y = df['label']

# Split data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Fit data to training sets.
svc.fit(X_train, y_train)
# Make predicitions on test set. 
predictions = svc.predict(X_test)


print(classification_report(y_test,predictions))
print('Accuracy: ',accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.80      0.81      0.81      5053
           0       0.72      0.66      0.69      2518
           1       0.84      0.86      0.85      6011

    accuracy                           0.80     13582
   macro avg       0.79      0.78      0.78     13582
weighted avg       0.80      0.80      0.80     13582

Accuracy:  0.8034899131203063


In [175]:
logistic = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf', LogisticRegression(max_iter = 5000)),
                    ])
logistic.fit(X_train, y_train)
predictions = logistic.predict(X_test)

print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.77      0.80      0.79      5053
           0       0.73      0.57      0.64      2518
           1       0.80      0.85      0.83      6011

    accuracy                           0.78     13582
   macro avg       0.77      0.74      0.75     13582
weighted avg       0.78      0.78      0.78     13582

0.7810337211014579


In [176]:
nb = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf', MultinomialNB()),
                    ])
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.73      0.63      0.68      5053
           0       0.86      0.02      0.03      2518
           1       0.59      0.91      0.72      6011

    accuracy                           0.64     13582
   macro avg       0.73      0.52      0.48     13582
weighted avg       0.69      0.64      0.58     13582

0.6402591665439552


In [177]:
rf = Pipeline([('tfidf',TfidfVectorizer()),
                    ('clf', RandomForestClassifier()),
                    ])
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)

print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.75      0.70      0.72      5053
           0       0.64      0.62      0.63      2518
           1       0.75      0.80      0.78      6011

    accuracy                           0.73     13582
   macro avg       0.71      0.71      0.71     13582
weighted avg       0.73      0.73      0.73     13582

0.731188337505522


In [178]:
test_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,-1
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,1
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive,1
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,-1
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,0


In [205]:
test_df['newTweet'] = test_df['OriginalTweet'].apply(lambda x: clean_tweet(x))

In [206]:
test_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,label,newTweet
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,-1,trending new yorkers encounter empty supermark...
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive,1,when i couldnt find hand sanitizer fred meyer ...
2,3,44955,,02-03-2020,Find out how you can protect yourself and love...,Extremely Positive,1,find protect loved ones coronavirus
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative,-1,panic buying hits newyork city anxious shopper...
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,0,toiletpaper dunnypaper coronavirus coronavirus...


In [207]:
preds = svc.predict(test_df['newTweet'])
print(accuracy_score(test_df['label'],preds))

0.7790942601369142


In [208]:
preds = logistic.predict(test_df['newTweet'])
print(accuracy_score(test_df['label'],preds))

0.7640863612427593


In [209]:
preds = nb.predict(test_df['newTweet'])
print(accuracy_score(test_df['label'],preds))

0.6316482359136387


In [210]:
preds = rf.predict(test_df['newTweet'])
print(accuracy_score(test_df['label'],preds))

0.70115850447604


In [211]:
svc_cnt = Pipeline([('count',CountVectorizer()),
                    ('clf', LinearSVC()),
                    ])

X = df['newTweet']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
svc_cnt.fit(X_train, y_train)
predictions = svc_cnt.predict(X_test)

print(classification_report(y_test,predictions))
print(accuracy_score(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.81      0.78      0.79      5053
           0       0.65      0.72      0.68      2518
           1       0.84      0.82      0.83      6011

    accuracy                           0.79     13582
   macro avg       0.77      0.77      0.77     13582
weighted avg       0.79      0.79      0.79     13582

0.7881755264320424


# So SVC with a CountVecotrizer does worse than tfidf...interesting.