In [1]:
# packages
import pandas as pd

import numpy as np
np.random.seed(2021)

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet') #download if not present already

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# reading in data
tweet_label = pd.read_csv("../Data/vader_sample_label.csv")
tweet_label = tweet_label.drop(["Jakob", "Jamie", "Darren", "Agree w/ Vader", "Heather"], axis=1)
tweet_label

Unnamed: 0,text,positive,negative,assigned,Final Decision
0,Weve heard that false information about the CO...,0.00,1.00,Negative,p
1,Completely agree. Be smart and get the vaccine...,0.64,0.36,Positive,p
2,You know whats even crazier than indiscriminat...,0.73,0.27,Positive,a
3,Not my problem. Ive not been seeing anyone sin...,0.79,0.21,Positive,a
4,If those wanting a vaccine got the vaccine why...,0.70,0.30,Positive,a
...,...,...,...,...,...
995,The David Hookstead Show: 4th Of July Weekend ...,0.00,1.00,Negative,u
996,Um. The whole justification was to vaccinate i...,0.63,0.37,Positive,u
997,UWM supported by &amp; in collab with #NSS &am...,0.61,0.39,Positive,u
998,Ooops.An error: health department says Sydney ...,0.00,1.00,Negative,u


In [3]:
# replacing labels with numbers
mapping_sentiment = {"Positive": 1, "Negative": 0}
mapping_vaccine = {"p": 1, "a": 0, "u": 2}
tweet_label = tweet_label.replace({"assigned": mapping_sentiment, "Final Decision": mapping_vaccine})

In [4]:
# dropping 2's (unknowns) and reindexing
tweet_label = tweet_label[tweet_label["Final Decision"] != 2]
tweet_label = tweet_label.reset_index(drop = True)

In [5]:
# 1 if vader assigned label matched manually assigned label
vader_match = [1 if tweet_label["assigned"][i] == tweet_label["Final Decision"][i] else 0 for i in range(len(tweet_label))]

#vader_match = []
#for i in range(len(tweet_label)):
#    if tweet_label["assigned"][i] == tweet_label["Final Decision"][i]:
#        vader_match.append(1)
#    else:
#        vader_match.append(0)
    
tweet_label["vader_match"] = vader_match

In [6]:
# # cleaning functions, old

# def cleaning(text): # to remove mentions and links, taken partially from juejue's and jamie's notebooks
#     text = text.replace("\\n", " ") # dropping \n
#     text = re.sub(r'[^\w\s]', ' ', text) # remove punctuation
#     text = re.sub('\S*@\S*\s?', '',text) # remove emails
#     text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text) # remove links
#     text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text) # also removes links?
#     text = re.sub(r'\d+', '', text) # remove numbers
#     return text

# def clean_stem(tweets, rawtweets = True): # cleaning and stemming
#     if rawtweets == True:
#         # raw tweets pulled straight from Twitter
#         # clears the 'b and the ' at the start and end of the tweet
#         tweets = tweets.str[2:] # dropping first two characters apostrophe and b
#         tweets = tweets.str[:-1] # dropping last character apostrophe
    
#     tweets = tweets.apply(cleaning)

#     return tweets

In [7]:
# cleaning functions, better?

def cleaning(text): # to remove mentions and links, taken partially from juejue's and jamie's notebooks
    text = text.replace("\\n", " ") # dropping \n
    text = re.sub(r'[^\w\s]', ' ', text) # remove punctuation
    text = re.sub('\S*@\S*\s?', '',text) # remove emails
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text) # remove links
    text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text) # also removes links?
    text = re.sub(r'\d+', '', text) # remove numbers
    
    return text


def clean_stem(tweets, rawtweets = True): # cleaning and stemming
    
    if rawtweets == True:
        # raw tweets pulled straight from Twitter
        # clears the 'b and the ' at the start and end of the tweet
        tweets = tweets.str[2:] # dropping first two characters apostrophe and b
        tweets = tweets.str[:-1] # dropping last character apostrophe
    
    # removing punctuation, \\n, links, etc.
    tweets = tweets.apply(cleaning)
    
    # stemming the words
    stemmer = SnowballStemmer("english")
    text = tweets.to_list()
    process_tweets = []
    for sentence in text:
        # iterate through each word in a tweet/sentence, if not part of stopwords list then keep in tweet
        process_tweets.append(" ".join([stemmer.stem(i) for i in sentence.split() if i not in gensim.parsing.preprocessing.STOPWORDS]))
    
    #return list of cleaned tweets
    return process_tweets
   

In [8]:
tweet_label["clean_text"] = clean_stem(tweet_label["text"], rawtweets = False)
tweet_label

Unnamed: 0,text,positive,negative,assigned,Final Decision,vader_match,clean_text
0,Weve heard that false information about the CO...,0.00,1.00,0,1,0,weve heard fals inform covid vaccin women hesi...
1,Completely agree. Be smart and get the vaccine...,0.64,0.36,1,1,1,complet agre be smart vaccin your take obnoxi ...
2,You know whats even crazier than indiscriminat...,0.73,0.27,1,0,0,you know what crazier indiscrimin mandat healt...
3,Not my problem. Ive not been seeing anyone sin...,0.79,0.21,1,0,0,not problem ive see covid happen i shop colleg...
4,If those wanting a vaccine got the vaccine why...,0.70,0.30,1,0,0,if want vaccin got vaccin care let peopl worri...
...,...,...,...,...,...,...,...
699,Thank you for passing the amendment protecting...,0.58,0.42,1,0,0,thank pass amend protect k colleg student mand...
700,"The data, from Imperial College London, sugges...",0.31,0.69,0,1,0,the data imperi colleg london suggest technolo...
701,Forcing college students to receive Covid vacc...,0.00,1.00,0,0,1,forc colleg student receiv covid vaccin order ...
702,The funny part is i was supposed to get my vac...,0.20,0.80,0,1,0,the funni suppos vaccin second dose kal ig the...


In [9]:
# stemmer = SnowballStemmer("english")
# text = tweet_label["clean_text"].to_list()
# process_tweets = []
# for sentence in text:
#     # iterate through each word in a tweet/sentence, if not part of stopwords list then keep in tweet
#     process_tweets.append(" ".join([stemmer.stem(i) for i in sentence.split() if i not in gensim.parsing.preprocessing.STOPWORDS]))

In [10]:
# creating frequency matrix
vectorizer = CountVectorizer(min_df=5) # min_df is minimum document frequency to include token/word

In [11]:
# fitting random forest classifier, using all data as training set right now
rfc = RandomForestClassifier()

In [12]:
# kfold random forest validation
def kfold_forest_validation(features, labels, k = 10, shuf = False):
    k_fold = KFold(n_splits = k, shuffle=shuf)
    
    rfc = RandomForestClassifier()
    rfc_list = []
    
    for k, (train, test) in enumerate(k_fold.split(features)):
        rfc.fit(features[train], labels[train]) # fitting training data
        label_pred = rfc.predict(features[test]) # predicting using trained data and features from test dataset
        rfc_acc = (label_pred == labels[test]).sum() / test.size # proportion of correctly predicted labels, accuracy
        rfc_list.append(rfc_acc)
        
    #print(rfc_list)
    
    return np.mean(rfc_list) # return average of all kfold proportions

In [13]:
#vectorizer.get_feature_names()

In [14]:
# only tokens (words), no sentiment
features = vectorizer.fit_transform(tweet_label["clean_text"]).toarray() # document term matrix w/ frequencies, our "features", the words
manlabels = np.array(tweet_label["vader_match"]) # our manual labels

# validation
kfold_forest_validation(features, manlabels)

0.6495171026156943

In [15]:
# including vader sentiment label as a feature
# creating matrix of features
reshaped_values = np.reshape(tweet_label["assigned"].to_numpy(), (len(tweet_label["assigned"]), 1)) # vader sentiment decisions
features_with_sentiment = np.append(features, reshaped_values, 1)

# validation
kfold_forest_validation(features_with_sentiment, manlabels)

0.7530583501006036

In [16]:
# including vader sentiment labels AND values

# creating matrix of features
# first including positive sentiment
reshaped_vader_positive = np.reshape(tweet_label["positive"].to_numpy(), (len(tweet_label["positive"]), 1)) # vader positive sentiment
features_with_sentiment_vals = np.append(features_with_sentiment, reshaped_vader_positive, 1)

# and negative sentiment
reshaped_vader_negative = np.reshape(tweet_label["negative"].to_numpy(), (len(tweet_label["negative"]), 1)) # vader positive sentiment
features_with_sentiment_vals = np.append(features_with_sentiment_vals, reshaped_vader_negative, 1)

# validation
kfold_forest_validation(features_with_sentiment_vals, manlabels)

0.7614889336016096

In [17]:
# random forest with just the sentiment values and nothing else
vader_positive = np.reshape(tweet_label["positive"].to_numpy(), (len(tweet_label["positive"]), 1)) # vader positive sentiment
vader_negative = np.reshape(tweet_label["negative"].to_numpy(), (len(tweet_label["negative"]), 1)) # vader positive sentiment
vader_sentiments = np.append(vader_positive, vader_negative, 1)

kfold_forest_validation(vader_sentiments, manlabels)

0.6520925553319921

In [18]:
#csv_output = np.append(features_with_sentiment_vals, manlabels, 1)
manlabels = np.array(tweet_label["vader_match"])
csv_output = np.append(features_with_sentiment_vals, np.reshape(manlabels, (-1, 1)), 1)
csv_output # array with the words, sentiment label (negative = 0, positive = 1), sentiment values, and pro/anti vaccine labels 

array([[0.  , 0.  , 0.  , ..., 0.  , 1.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.64, 0.36, 1.  ],
       [0.  , 0.  , 0.  , ..., 0.73, 0.27, 0.  ],
       ...,
       [0.  , 0.  , 0.  , ..., 0.  , 1.  , 1.  ],
       [0.  , 0.  , 0.  , ..., 0.2 , 0.8 , 0.  ],
       [0.  , 0.  , 0.  , ..., 1.  , 0.  , 0.  ]])

In [19]:
# save frequency matrix
np.savetxt("../Data/term_matrix_sentiment_va.csv", csv_output, delimiter=",")

In [20]:
# only tokens (words), no sentiment
features = vectorizer.fit_transform(tweet_label["clean_text"]).toarray() # document term matrix w/ frequencies, our "features", the words
manlabels = np.array(tweet_label["vader_match"]) # our manual labels

# validation for k=5 and k=10 comparison
print("k=5 Validation: ", kfold_forest_validation(features, manlabels, k=5))
print("k=10 Validation: ", kfold_forest_validation(features, manlabels, k=10))

k=5 Validation:  0.6307396149949341
k=10 Validation:  0.643702213279678


In [21]:
# # Random testing, k nearest neighbor test with only sentiment values, can ignore
# vader_positive = np.reshape(tweet_label["positive"].to_numpy(), (len(tweet_label["positive"]), 1)) # vader positive sentiment
# vader_negative = np.reshape(tweet_label["negative"].to_numpy(), (len(tweet_label["negative"]), 1)) # vader positive sentiment
# vader_sentiments = np.append(vader_positive, vader_negative, 1)

# knn = neighbors.KNeighborsClassifier(n_neighbors = 5) #arbitarily chose 5
# k_fold = KFold(n_splits = 4, shuffle=False)

# knn_list = []

# for k, (train, test) in enumerate(k_fold.split(vader_sentiments)):
#     # k-nearest neighbors
#     knn.fit(vader_sentiments[train], manlabels[train]) # fitting NN with features and labels
#     knn_pred = knn.predict(vader_sentiments[test]) # using trained data and features to fit test data
#     knn_acc = (knn_pred==manlabels[test]).sum() / test.size # proportion of correctly predicted labels, accuracy
#     knn_list.append(knn_acc)

# np.mean(knn_list)

In [22]:
# reading in data
alltweet = pd.read_csv("../Data/all_tweet_data.csv")
alltweet = alltweet.drop_duplicates(subset=["text"], keep='first') # remove duplicate tweets
alltweet = alltweet[~alltweet["text"].str.contains("Pin Code")] # remove tweets that have "Pin Code"
alltweet

Unnamed: 0,date,text,retweet_count,favorite_count,reply
1,2021-06-23 23:16:54,b'@RobSchneider If a college forces the vaccin...,0,0,1
2,2021-06-23 23:14:58,b'@senatenj My Deans List Sophomore unwelcome ...,0,0,0
3,2021-06-23 23:12:56,b'@NYGovCuomo are you paying out if your perso...,0,0,0
4,2021-06-23 23:09:29,b'@tedcruz please help my daughter in #NJ (@Go...,0,0,0
5,2021-06-23 23:00:52,b'Deathsantis say vaccine cards are wrong.. Bu...,0,0,0
...,...,...,...,...,...
9206,2021-07-06 18:46:36,b'@COS_Conservatve @BlushingBelles @Boo1573391...,0,1,1
9250,2021-07-06 15:33:34,b'The fear factor will never end. \nhttps://t....,0,1,0
9275,2021-07-06 13:52:40,b'How College COVID Vaccine Mandates Put Stude...,0,0,0
9726,2021-07-06 00:29:54,b'How College CVD Vaccine Mandates Put Student...,1,1,0


In [23]:
alltweet["clean"] = clean_stem(alltweet["text"])
alltweet

Unnamed: 0,date,text,retweet_count,favorite_count,reply,clean
1,2021-06-23 23:16:54,b'@RobSchneider If a college forces the vaccin...,0,0,1,robschneid if colleg forc vaccin su exist wrec...
2,2021-06-23 23:14:58,b'@senatenj My Deans List Sophomore unwelcome ...,0,0,0,senatenj my dean list sophomor unwelcom colleg...
3,2021-06-23 23:12:56,b'@NYGovCuomo are you paying out if your perso...,0,0,0,nygovcuomo pay person pocket colleg scholarshi...
4,2021-06-23 23:09:29,b'@tedcruz please help my daughter in #NJ (@Go...,0,0,0,tedcruz help daughter nj govmurphi return coll...
5,2021-06-23 23:00:52,b'Deathsantis say vaccine cards are wrong.. Bu...,0,0,0,deathsanti vaccin card wrong but mandat parti ...
...,...,...,...,...,...,...
9206,2021-07-06 18:46:36,b'@COS_Conservatve @BlushingBelles @Boo1573391...,0,1,1,cosconservatv blushingbell boo mani physician ...
9250,2021-07-06 15:33:34,b'The fear factor will never end. \nhttps://t....,0,1,0,the fear factor end https t ljalcvwdh
9275,2021-07-06 13:52:40,b'How College COVID Vaccine Mandates Put Stude...,0,0,0,how colleg covid vaccin mandat put student in ...
9726,2021-07-06 00:29:54,b'How College CVD Vaccine Mandates Put Student...,1,1,0,how colleg cvd vaccin mandat put student in da...


In [24]:
tweet_label

Unnamed: 0,text,positive,negative,assigned,Final Decision,vader_match,clean_text
0,Weve heard that false information about the CO...,0.00,1.00,0,1,0,weve heard fals inform covid vaccin women hesi...
1,Completely agree. Be smart and get the vaccine...,0.64,0.36,1,1,1,complet agre be smart vaccin your take obnoxi ...
2,You know whats even crazier than indiscriminat...,0.73,0.27,1,0,0,you know what crazier indiscrimin mandat healt...
3,Not my problem. Ive not been seeing anyone sin...,0.79,0.21,1,0,0,not problem ive see covid happen i shop colleg...
4,If those wanting a vaccine got the vaccine why...,0.70,0.30,1,0,0,if want vaccin got vaccin care let peopl worri...
...,...,...,...,...,...,...,...
699,Thank you for passing the amendment protecting...,0.58,0.42,1,0,0,thank pass amend protect k colleg student mand...
700,"The data, from Imperial College London, sugges...",0.31,0.69,0,1,0,the data imperi colleg london suggest technolo...
701,Forcing college students to receive Covid vacc...,0.00,1.00,0,0,1,forc colleg student receiv covid vaccin order ...
702,The funny part is i was supposed to get my vac...,0.20,0.80,0,1,0,the funni suppos vaccin second dose kal ig the...


In [25]:
trained_set = tweet_label[["clean_text", "assigned"]]

In [26]:
trained_set

Unnamed: 0,clean_text,assigned
0,weve heard fals inform covid vaccin women hesi...,0
1,complet agre be smart vaccin your take obnoxi ...,1
2,you know what crazier indiscrimin mandat healt...,1
3,not problem ive see covid happen i shop colleg...,1
4,if want vaccin got vaccin care let peopl worri...,1
...,...,...
699,thank pass amend protect k colleg student mand...,1
700,the data imperi colleg london suggest technolo...,0
701,forc colleg student receiv covid vaccin order ...,0
702,the funni suppos vaccin second dose kal ig the...,0
