In [1]:
# packages
import pandas as pd

import numpy as np
np.random.seed(2021)

# install if not installed already
#!conda install gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
#nltk.download('wordnet') #download if not present already

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn import datasets
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# reading in data
# first our hand labeled data
training_data = pd.read_csv("../Data/vader_sample_label.csv")
training_data = training_data.drop(["Jakob", "Jamie", "Darren", "Agree w/ Vader", "Heather"], axis=1)

# our data that needs to be hand labaled
data_to_label = pd.read_csv("../Data/vader_result_final.csv")
data_to_label = data_to_label.drop(["Unnamed: 0"], axis=1)

In [3]:
# replacing labels with numbers
mapping_sentiment = {"Positive": 1, "Negative": 0}
mapping_vaccine = {"p": 1, "a": 0, "u": 2}

training_data = training_data.replace({"assigned": mapping_sentiment, "Final Decision": mapping_vaccine})

data_to_label = data_to_label.replace({"assigned": mapping_sentiment})


In [4]:
# dropping 2's (unknowns) and reindexing
training_data = training_data[training_data["Final Decision"] != 2]
training_data = training_data.reset_index(drop = True)

# unknowns should already be removed from data_to_label

In [5]:
# cleaning functions

def cleaning(text): # to remove mentions and links, taken partially from juejue's and jamie's notebooks
    text = text.replace("\\n", " ") # dropping \n
    text = re.sub(r'[^\w\s]', ' ', text) # remove punctuation
    text = re.sub('\S*@\S*\s?', '',text) # remove emails
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ', text) # remove links
    text = re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", '', text) # also removes links?
    text = re.sub(r'\d+', '', text) # remove numbers
    
    return text


def clean_stem(tweets, rawtweets = False): # cleaning and stemming
    
    if rawtweets == True:
        # raw tweets pulled straight from Twitter
        # clears the 'b and the ' at the start and end of the tweet
        tweets = tweets.str[2:] # dropping first two characters apostrophe and b
        tweets = tweets.str[:-1] # dropping last character apostrophe
    
    # removing punctuation, \\n, links, etc.
    tweets = tweets.apply(cleaning)
    
    # stemming the words, also remove stop words
    stemmer = SnowballStemmer("english")
    text = tweets.to_list()
    process_tweets = []
    
    # adding common words to set of stopwords
    new_stopwords = gensim.parsing.preprocessing.STOPWORDS.union(set(["covid", "vaccine", "college"]))

    for sentence in text:
        # iterate through each word in a tweet/sentence, if not part of stopwords list then keep in tweet
        process_tweets.append(" ".join([stemmer.stem(i) for i in sentence.split() if i not in new_stopwords]))
    
    #return list of cleaned tweets
    return process_tweets

In [6]:
# a look at the training data
#training_data

In [7]:
# a look at the data that needs to be labeled
#data_to_label

In [8]:
# combined datasets
combined_set = training_data.append(data_to_label)

In [9]:
combined_set["clean_text"] = clean_stem(combined_set["text"], rawtweets = False)
#combined_set

In [10]:
# removing duplicates and resetting index
combined_set = combined_set.drop_duplicates(subset=["clean_text"], keep='first') # remove any duplicate tweets
combined_set = combined_set.reset_index(drop = True) # rest index

# w/ duplicates removed, taking out the training data
training_data = combined_set[~np.isnan(combined_set["Final Decision"])] # removing NaN values
#training_data # cleaned training data without duplicates

In [11]:
# initializing dtm, random forest classifier
vectorizer = CountVectorizer(min_df=5) # min_df is minimum document frequency to include token/word
rfc = RandomForestClassifier() # random forest classifier

In [12]:
# creating feature matrix, the frequency term matrices
dtm = vectorizer.fit_transform(combined_set["clean_text"]).toarray() # matrix of all tweets
training_dtm = dtm[0:len(training_data)] # features of only training data
data_to_label_dtm = dtm[len(training_data):len(combined_set)] # features of data to be labeled


In [13]:
# labels pro/anti-vaccine
training_labels = np.array(combined_set["Final Decision"][0:len(training_data)]) # labels of training set
rfc.fit(training_dtm, training_labels) # fitting training data


RandomForestClassifier()

In [14]:
# predicting rest of the tweets
label_pred = rfc.predict(data_to_label_dtm)

In [15]:
# dataframe with predicted labels
to_label_df = combined_set[np.isnan(combined_set["Final Decision"])]
to_label_df = to_label_df.reset_index(drop = True) # rest index
to_label_df["Predicted Label"] = label_pred.tolist()
#to_label_df

In [16]:
# getting tokens for dtm
tokens = vectorizer.get_feature_names() # our tokens, the words

#tokens = np.asarray(tokens).reshape(1, len(tokens)) # converting list to numpy array
#tweet_dtm = np.append(tokens, data_to_label_dtm, axis = 0)

# dataframe with tokens as column names
tokens = vectorizer.get_feature_names() # our tokens, the words
tweet_df = pd.DataFrame(data_to_label_dtm, columns = tokens) # creating dataframe
tweet_df

Unnamed: 0,abc,abl,aborigin,abort,abroad,absolut,academ,accept,access,accid,...,younger,youngest,your,youth,youv,yr,yrs,yup,zero,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3223,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3224,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3225,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# final dataframe with the tweets, sentiments, predicted labels, and the dtm

final_df = pd.concat([to_label_df, tweet_df], axis = 1) # combine predicted label dataframe and dtm dataframe
final_df = final_df.drop(["Final Decision"], axis=1)
final_df.to_csv("../Data/tweet_dtm.csv", sep = ",", index=False)
final_df

Unnamed: 0,text,positive,negative,assigned,clean_text,Predicted Label,abc,abl,aborigin,abort,...,younger,youngest,your,youth,youv,yr,yrs,yup,zero,zoom
0,"If a college forces the vaccine, they should ...",0.000000,1.000000,0,if forc su exist wreck fraud deliber withhold ...,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,My Deans List Sophomore unwelcome at her coll...,0.500000,0.500000,0,my dean list sophomor unwelcom w o covid she r...,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,are you paying out if your personal pocket fo...,0.000000,1.000000,0,pay person pocket scholarship gave away peopl ...,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,please help my daughter in #NJ () return to c...,1.000000,0.000000,1,help daughter nj return shes declin recov campus,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Deathsantis say vaccine cards are wrong.. But ...,0.652439,0.347561,1,deathsanti card wrong but mandat parti affili ...,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3222,Getting your COVID-19 vaccine will help protec...,1.000000,0.000000,1,get covid help protect famili friend colleg co...,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3223,Virologists are worried by the way vaccines ar...,0.309091,0.690909,0,virologist worri way vaccin prevent ill peopl ...,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3224,Many physicians recommend the jab because t...,0.746479,0.253521,1,mani physician recommend jab financi incent or...,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3225,The fear factor will never end.,0.000000,1.000000,0,the fear factor end,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
