#### Load modules

In [8]:
# import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500) # specifies number of rows to show
pd.options.display.max_colwidth = 1000
from matplotlib import pyplot as plt
%matplotlib inline

import mglearn
import random
import spacy   # lemmatization
import nltk
from nltk.corpus import stopwords ## Import stopwords with nltk.
import re
import numpy as np

# reference 
# https://realpython.com/sentiment-analysis-python/
# https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
# https://wandb.ai/akshayuppal12/Finetune-BERT-Text-Classification/reports/How-to-Fine-Tune-BERT-for-Text-Classification--Vmlldzo4OTk4MzY#setting-up-bert-for-text-classification


In [9]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn

from sklearn.pipeline import make_pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer   # bag of words 
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_score      # cross validation
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression      # logistic regression 
from sklearn.model_selection import GridSearchCV         # Grid search cross validation  

from sklearn.ensemble import RandomForestClassifier

import re

#### Read in Data

In [10]:
path = "collected_twts/Labeled_1/Labeled_2/labeled_tweets_totrain.json"  # Current
twt_label=pd.read_json(path, dtype=str)
twt_label["label"]=pd.to_numeric(twt_label["label"])
data=twt_label[["id","label","cleaned"]]
# remove duplicated records 
data = data.drop_duplicates(subset=['cleaned'])
print(data.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2065 entries, 0 to 2104
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       2065 non-null   object
 1   label    2065 non-null   int64 
 2   cleaned  2065 non-null   object
dtypes: int64(1), object(2)
memory usage: 64.5+ KB
None


In [11]:
data

Unnamed: 0,id,label,cleaned
0,1431545243825016837,-1,the science let's see if you actually understand the science 1 against what precisely does the vax stimulate antibody production 2 why should a person with covid antibodies get the vax 3 should one seek infection once vaxed
1,1370008698525990912,-1,i am against any vaccine if it doesn't work on kids kids do get the virus
2,1342494741376675840,-1,so your vax doesn't work cause if it did and the way its going to be taken it should be absolutely eradicated if your saying therebis many different strans then it is the fuckin flu and once again bigpharma is profiting off of us covidvaccine
3,1406079188956745731,-1,no if you're worried about covid and ok w getting vaccinated get vaccinated if you're not worried about covid or worried about the vaccine don't everybody gets what they took the risk of doing you can't force ppl into taking something bc you feel it's best for you and them
4,1403306558667345922,-1,the vaccine debate is over the vaccine is not necessary for all and should not be required and forced on healthy people
...,...,...,...
2099,1422604608824913923,0,it was good just a week ago new york gov cuomo urges businesses to adopt 'vaccineonly admission' delta yields more reinfections fauci says latest covid19 news
2100,1423790413664694272,0,i'm discussing vaccine is free but insulin is not tuesday aug 10 at 200 pm edt on join us
2101,1425508706876338186,0,come to my show tonight at union pool new tunes great band bring your vax pass tim kuhl 10pm the perfect man 915pm french cleats 830pm 10
2102,1428145994433187852,-1,it's time anyone who lives in long island ny please join us in the fight against agenda 21 and the global vaccine mandatesupport you're healthcare workersnurses and doctors who are for the right to choose


#### Remove Stopwords & Lemmatization

In [12]:
# prepare stopwords
# remove stop words using customized stop-words
with open("collected_twts/cust-stop-word_v5.txt") as file:
    lines = file.readlines()
    stopword_1=[line.rstrip() for line in lines]

print("customized stop words: {}".format(stopword_1))
# print("predefined stop words: {}".format(stopword_2))
stwd = stopword_1
data["cleaned"] = data["cleaned"].apply(lambda x: " ".join([word for word in x.split() if word not in (stwd)]))

# lemmatization
nlp = spacy.load("en_core_web_sm")
data["text_lemmatized"] = data["cleaned"].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))

print("\n")
print("The shape of the training data is: {}".format(data.shape))
print("pro-vaccin tweets: {}".format(len(data[data["label"]==1])/len(data)))
print("anti-vaccin tweets: {}".format(len(data[data["label"]==-1])/len(data)))
print("neutral-vaccin tweets: {}".format(len(data[data["label"]==0])/len(data)))

# view final data 
data

customized stop words: ['-PRON-', 'get', 'getting', 'got', 'incredibly', 'do', 'be', "that's", 'lol', 'still', 'currently', 'bro', 'even', 'absolutely', 'fkn', 'thereby', 'dummy', 'next', 'ya', 'rt', 'i', 'me', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'yours', 'yourself', 'yourselves', 'he', "he's", 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", "that'sits", 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'or', 'as', 'of', 'at', 'by', 'for', 'fr', 'with', 'about', 'between', 'into', 'through', 'during', 'before', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'how', 'both', 'each', 'other', 'some', 'such', 'own'

Unnamed: 0,id,label,cleaned,text_lemmatized
0,1431545243825016837,-1,science let's see if actually understand science 1 against what precisely vax stimulate antibody production 2 why should person covid antibodies vax 3 should one seek infection vaxed,science let us see if actually understand science 1 against what precisely vax stimulate antibody production 2 why should person covid antibodie vax 3 should one seek infection vaxe
1,1370008698525990912,-1,against any vaccine if doesn't work kids kids virus,against any vaccine if do not work kid kid virus
2,1342494741376675840,-1,your vax doesn't work cause if way its going taken should eradicated if your saying therebis many different strans fuckin flu bigpharma profiting us covidvaccine,your vax do not work cause if way its going take should eradicate if your say therebis many different strans fuckin flu bigpharma profit we covidvaccine
3,1406079188956745731,-1,no if worried covid ok w vaccinated vaccinated if not worried covid worried vaccine don't everybody gets what took risk can't force ppl taking something bc feel best,no if worry covid ok w vaccinate vaccinated if not worry covid worried vaccine do not everybody get what take risk can not force ppl take something bc feel good
4,1403306558667345922,-1,vaccine debate vaccine not necessary all should not required forced healthy people,vaccine debate vaccine not you all should not require force healthy people
...,...,...,...,...
2099,1422604608824913923,0,good week ago new york gov cuomo urges businesses adopt 'vaccineonly admission' delta yields more reinfections fauci says latest covid19 news,good week ago new york gov cuomo urge business adopt ' vaccineonly admission ' delta yield more reinfection fauci say late covid19 news
2100,1423790413664694272,0,i'm discussing vaccine free but insulin not tuesday aug 10 200 pm edt join us,I be discuss vaccine free but insulin not tuesday aug 10 200 pm edt join we
2101,1425508706876338186,0,come my show tonight union pool new tunes great band bring your vax pass tim kuhl 10pm perfect man 915pm french cleats 830pm 10,come my show tonight union pool new tune great band bring your vax pass tim kuhl 10 pm perfect man 915pm french cleat 830pm 10
2102,1428145994433187852,-1,time anyone lives long island ny please join us fight against agenda 21 global vaccine mandatesupport healthcare workersnurses doctors right choose,time anyone live long island ny please join we fight against agenda 21 global vaccine mandatesupport healthcare workersnurse doctor right choose


#### Build Pipelines 

#### Lemmatization

In [13]:
np.random.seed(250) # 210  125  888 240 900


# split dataset into train and test 
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['text_lemmatized'],data['label'],
                                                                    test_size=0.20, stratify = data['label'])  #  
print("Train set size: {}".format(train_x.shape))
print("Test set size: {}".format(test_x.shape))
print("Test anti-vax size: {}".format(sum(test_y==-1)))

Train set size: (1652,)
Test set size: (413,)
Test anti-vax size: 111


#### Linear SVM

Parameter grid: 

n gram range: unigram, bigram and trigram 

min_df: 1,2,3

In [15]:
pipe = make_pipeline(TfidfVectorizer(norm=None), LinearSVC(max_iter = 1500))
cv=StratifiedShuffleSplit(n_splits=10)    # ten-fold cross validation
param_grid = {"tfidfvectorizer__ngram_range":[(1,1),(1,2),[1,3]],
              "tfidfvectorizer__min_df":[1,2,3,4,5],
             'linearsvc__C': [0.001, 0.01, 0.1]}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv)
grid.fit(train_x, train_y)

print("Best cross-validation accuracy: {:.3f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test set score: {:.3f}".format(grid.score(test_x, test_y)))

Best cross-validation accuracy: 0.763
Best parameters:  {'linearsvc__C': 0.001, 'tfidfvectorizer__min_df': 2, 'tfidfvectorizer__ngram_range': (1, 2)}
Test set score: 0.746


### Predict

Search 2-2 NYS vaccine related tweets

In [16]:
# LINEAR SVC & parameter C  
# train_x, test_x, train_y, test_y

# tf-idf fit 
# s1: vectorize tweets 
vect = TfidfVectorizer(norm=None,ngram_range=(1,2),min_df=2)
vect.fit(data["text_lemmatized"])

# vectorize test and train  
train_x_vct = vect.transform(train_x)
test_x_vct = vect.transform(test_x)
labeled_vct = vect.transform(data["text_lemmatized"])

# model test 
model_test = LinearSVC(C=0.001)
model_test.fit(train_x_vct, train_y)
print("Training set - Test set accuracy score: {}".format(model_test.score(test_x_vct, test_y)))

# FINAL model to predict 
model=LinearSVC(C=0.001)
model.fit(labeled_vct, data["label"])
print("All set - Test set accuracy score: {}".format(model.score(test_x_vct, test_y)))

Training set - Test set accuracy score: 0.7554479418886199
All set - Test set accuracy score: 0.9806295399515739


In [17]:
print("SVM Training model metrics")
test_y_pred = model_test.predict(test_x_vct)
print("Test set accuracy score: {}". format(accuracy_score(test_y, test_y_pred)))
print("Test set F1 score: {}". format(f1_score(test_y, test_y_pred, average="macro")))
print("Test set Precision score: {}". format(precision_score(test_y, test_y_pred, average="macro")))
print("Test set Recall score: {}". format(recall_score(test_y, test_y_pred, average="macro")))
print("Classification report: {}". format(classification_report(test_y, test_y_pred, digits=3)))

SVM Training model metrics
Test set accuracy score: 0.7554479418886199
Test set F1 score: 0.7196226415094339
Test set Precision score: 0.7692515270330772
Test set Recall score: 0.6882064987328146
Classification report:               precision    recall  f1-score   support

          -1      0.663     0.532     0.590       111
           0      0.878     0.655     0.750        55
           1      0.767     0.879     0.819       247

    accuracy                          0.755       413
   macro avg      0.769     0.688     0.720       413
weighted avg      0.754     0.755     0.748       413



### Prediction NYS tweets 

In [84]:
## PREDICT _ NYS tweet dataset 
NY_vax = pd.read_json("collected_twts/search_2-2/labeling/toPredict_NYS.json", dtype=str)
## remove stopwords 
NY_vax["cleaned"] = NY_vax["cleaned"].apply(lambda x: " ".join([word for word in x.split() if word not in (stwd)]))
## lemmatization 
NY_vax["text_lemmatized"] = NY_vax["cleaned"].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))
## vectorize 
NY_vct = vect.transform(NY_vax["text_lemmatized"])
# ## predict 
# NY_vax["pred"] = model.predict(NY_vct)

### Prediction Conversation Tweets

In [85]:
## PREDICT _ NYS tweet dataset 
CONV_vax = pd.read_json("collected_twts/search_2-2/labeling/toPredict_CONV.json", dtype=str)
## remove stopwords 
CONV_vax["cleaned"] = CONV_vax["cleaned"].apply(lambda x: " ".join([word for word in x.split() if word not in (stwd)]))
## lemmatization 
CONV_vax["text_lemmatized"] = CONV_vax["cleaned"].apply(lambda row: " ".join([w.lemma_ for w in nlp(row)]))
## vectorize 
CONV_vct = vect.transform(CONV_vax["text_lemmatized"])
# ## predict 
# CONV_vax["pred"] = model.predict(CONV_vct)