In [1]:
import pandas as pd

import numpy as np
np.random.seed(2021)

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet') #download if not present already

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
#reading in data
tweet_label = pd.read_csv("../Data/vader_sample_label.csv")
tweet_label = tweet_label.drop(["Jakob", "Jamie", "Darren", "Agree w/ Vader", "Heather"], axis=1)
tweet_label

Unnamed: 0,text,positive,negative,assigned,Final Decision
0,Weve heard that false information about the CO...,0.00,1.00,Negative,p
1,Completely agree. Be smart and get the vaccine...,0.64,0.36,Positive,p
2,You know whats even crazier than indiscriminat...,0.73,0.27,Positive,a
3,Not my problem. Ive not been seeing anyone sin...,0.79,0.21,Positive,a
4,If those wanting a vaccine got the vaccine why...,0.70,0.30,Positive,a
...,...,...,...,...,...
995,The David Hookstead Show: 4th Of July Weekend ...,0.00,1.00,Negative,u
996,Um. The whole justification was to vaccinate i...,0.63,0.37,Positive,u
997,UWM supported by &amp; in collab with #NSS &am...,0.61,0.39,Positive,u
998,Ooops.An error: health department says Sydney ...,0.00,1.00,Negative,u


In [3]:
#replacing labels with numbers
mapping_sentiment = {"Positive": 1, "Negative": 0}
mapping_vaccine = {"p": 1, "a": 0, "u": 2}
tweet_label = tweet_label.replace({"assigned": mapping_sentiment, "Final Decision": mapping_vaccine})

In [4]:
#dropping 2's (unknowns) and reindexing
tweet_label = tweet_label[tweet_label["Final Decision"] != 2]
tweet_label = tweet_label.reset_index(drop = True)

In [5]:
#1 if vader assigned label matched manually assigned label
vader_match = [1 if tweet_label["assigned"][i] == tweet_label["Final Decision"][i] else 0 for i in range(len(tweet_label))]

#vader_match = []
#for i in range(len(tweet_label)):
#    if tweet_label["assigned"][i] == tweet_label["Final Decision"][i]:
#        vader_match.append(1)
#    else:
#        vader_match.append(0)
    
tweet_label["vader_match"] = vader_match

In [6]:
#cleaning functions

def cleaning(text): #to remove mentions and links, taken partially from juejue's and Jamie's notebooks
    text = re.sub(r'[^\w\s]', ' ', text) #remove punctuation
    text = re.sub('\S*@\S*\s?', '',text) #remove emails
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text) #remove links
    text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text) #also removes links?
    text = re.sub(r'\d+', '', text) #remove numbers
    return text

def clean_stem(tweets, rawtweets = True): #cleaning and stemming
    if rawtweets == True:
        #raw tweets pulled straight from Twitter
        #clears the 'b and the ' at the start and end of the tweet
        tweets = tweets.str[2:] #dropping first two characters apostrophe and b
        tweets = tweets.str[:-1] #dropping last character apostrophe
    
    tweets = tweets.apply(cleaning)

    return tweets

In [7]:
tweet_label["clean_text"] = clean_stem(tweet_label["text"], rawtweets = False)
tweet_label

Unnamed: 0,text,positive,negative,assigned,Final Decision,vader_match,clean_text
0,Weve heard that false information about the CO...,0.00,1.00,0,1,0,Weve heard that false information about the CO...
1,Completely agree. Be smart and get the vaccine...,0.64,0.36,1,1,1,Completely agree Be smart and get the vaccine...
2,You know whats even crazier than indiscriminat...,0.73,0.27,1,0,0,You know whats even crazier than indiscriminat...
3,Not my problem. Ive not been seeing anyone sin...,0.79,0.21,1,0,0,Not my problem Ive not been seeing anyone sin...
4,If those wanting a vaccine got the vaccine why...,0.70,0.30,1,0,0,If those wanting a vaccine got the vaccine why...
...,...,...,...,...,...,...,...
699,Thank you for passing the amendment protecting...,0.58,0.42,1,0,0,Thank you for passing the amendment protecting...
700,"The data, from Imperial College London, sugges...",0.31,0.69,0,1,0,The data from Imperial College London sugges...
701,Forcing college students to receive Covid vacc...,0.00,1.00,0,0,1,Forcing college students to receive Covid vacc...
702,The funny part is i was supposed to get my vac...,0.20,0.80,0,1,0,The funny part is i was supposed to get my vac...


In [8]:
stemmer = SnowballStemmer("english")
text = tweet_label["clean_text"].to_list()
process_tweets = []
for sentence in text:
    #iterate through each word in a tweet/sentence, if not part of stopwords list then keep in tweet
    process_tweets.append(" ".join([stemmer.stem(i) for i in sentence.split() if i not in gensim.parsing.preprocessing.STOPWORDS]))

process_tweets

['weve heard fals inform covid vaccin women hesit get jab the royal colleg midwiv the royal colleg obstetrician gynaecologist evid suggest covidvaccin',
 'complet agre be smart vaccin your take obnoxi team colleg basebal',
 'you know what crazier indiscrimin mandat healthi colleg kid covid vaccin do not mandat staff one actual risk make sens',
 'not problem ive see covid happen i shop colleg area there case im good im healthi i need vaccin',
 'if want vaccin got vaccin care let peopl worri i bet mask peopl issu i critiqu save kid colleg are eat healthi do text drive do read kid',
 'nc state fault want play vaccin polit best chanc win colleg world seri team vaccin best shot caus covid boot lol dumb player coach',
 'i understand jab vaccin experiment canada colleg physician amp politician ban use ivermectin treatment covid peru mexico india provid citizen decim covid seem charg murder',
 'uhhhhh overheard mom sister get vaccin theyr gonna stop pay colleg haha fuck',
 'state gop lawmak mo

In [9]:
#creating frequency matrix
vectorizer = CountVectorizer(min_df=2)

In [10]:
#fitting random forest classifier, using all data as training set right now
clf = RandomForestClassifier()

In [11]:
#kfold random forest validation
def kfold_forest_validation(features, labels, k = 4, shuf = False):
    k_fold = KFold(n_splits = k, shuffle=shuf)
    rfc_list = []
    
    for k, (train, test) in enumerate(k_fold.split(features)):
        clf.fit(features[train], labels[train]) #fitting training data
        label_pred = clf.predict(features[test]) #using trained data to fit to using only X features from test dataset
        rfc_val = (label_pred==labels[test]).sum() / test.size #proportion of correctly predicted points
        rfc_list.append(rfc_val)
    
    return np.mean(rfc_list)

In [12]:
#only tokens (words), no sentiment
features = vectorizer.fit_transform(process_tweets).toarray() #document term matrix w/ frequencies, our "features", the words
manlabels = np.array(tweet_label["vader_match"]) #our manual labels

#only tokens, no sentiment
kfold_forest_validation(features, manlabels)

0.6676136363636364

In [13]:
#including vader sentiment label as a feature
#creating matrix of features
reshaped_values = np.reshape(tweet_label["assigned"].to_numpy(), (len(tweet_label["assigned"]), 1)) #vader sentiment decisions
features_with_sentiment = np.append(features, reshaped_values, 1)

#validation
kfold_forest_validation(features_with_sentiment, manlabels)

0.7258522727272727

In [14]:
#including vader sentiment labels AND values

#creating matrix of features
#first including positive sentiment
reshaped_vader_positive = np.reshape(tweet_label["positive"].to_numpy(), (len(tweet_label["positive"]), 1)) #vader positive sentiment
features_with_sentiment_vals = np.append(features_with_sentiment, reshaped_vader_positive, 1)

#and negative sentiment
reshaped_vader_negative = np.reshape(tweet_label["negative"].to_numpy(), (len(tweet_label["negative"]), 1)) #vader positive sentiment
features_with_sentiment_vals = np.append(features_with_sentiment_vals, reshaped_vader_negative, 1)

#validation
kfold_forest_validation(features_with_sentiment_vals, manlabels)

0.7556818181818181

In [15]:
#k nearest neighbor test with only sentiment values

vader_positive = np.reshape(tweet_label["positive"].to_numpy(), (len(tweet_label["positive"]), 1)) #vader positive sentiment
vader_negative = np.reshape(tweet_label["negative"].to_numpy(), (len(tweet_label["negative"]), 1)) #vader positive sentiment
vader_sentiments = np.append(vader_positive, vader_negative, 1)


knn = neighbors.KNeighborsClassifier(n_neighbors = 5) #arbitarily chose 5
k_fold = KFold(n_splits = 4, shuffle=False)

knn_list = []

for k, (train, test) in enumerate(k_fold.split(vader_sentiments)):
    #k-nearest neighbors
    knn.fit(vader_sentiments[train], manlabels[train]) #fitting NN with X features and y classes
    knn_pred = knn.predict(vader_sentiments[test]) #using trained data to fit to using only X features from test dataset
    knn_val = (knn_pred==manlabels[test]).sum() / test.size #proportion of correctly predicted labels
    knn_list.append(knn_val)

np.mean(knn_list)

0.6732954545454546

In [16]:
#random forest with just the sentiment values
clf.fit(features_with_sentiment_vals, manlabels)
rfc_list = []

for k, (train, test) in enumerate(k_fold.split(vader_sentiments)):
    clf.fit(vader_sentiments[train], manlabels[train]) #fitting training data
    label_pred = clf.predict(vader_sentiments[test]) #using trained data to fit to using only X features from test dataset
    rfc_val = (label_pred==manlabels[test]).sum() / test.size #proportion of correctly predicted points
    rfc_list.append(rfc_val)
    
np.mean(rfc_list)

0.6661931818181818