In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import warnings
import nltk
warnings.filterwarnings("ignore")
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import gutenberg as cg
# sentence tokeniser
from nltk.tokenize import word_tokenize as wt 
from collections import Counter
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import textblob
import gensim
import re
import spacy

#### **Data Loading**

In [72]:
df = pd.read_csv('cleaned.csv')
df.head(5)

Unnamed: 0,cleaned,sentiment,sentiment_label_values
0,no thank you the new vaccine combines the expe...,negative,-1
1,scientificresearch safety and efficacy of dup...,neutral,0
2,dcp fireextinguisher for all classes kg kg kg ...,positive,1
3,rt discourse health news roundup vietnam repor...,neutral,0
4,health news roundup vietnam reports first case...,neutral,0


#### **Playing with Spacy**

In [73]:
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
displacy.render(doc, style="dep")

In [74]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
print()
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)


Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


#### **Main Preprocessing with Spacy**

In [75]:
from spacy.lang.en.stop_words import STOP_WORDS

print(STOP_WORDS,'\n') # <- set of Spacy's default stop words

# STOP_WORDS.add("your_additional_stop_word_here")
# We remove 'not' because it is needed for the analysis of reviews
all_stopwords = STOP_WORDS

{'who', 'can', 'thence', 'above', 'toward', 'myself', 'or', 'now', 'empty', "'m", 'really', 'mine', 'six', 'made', 'themselves', 'before', 'nowhere', 'make', 'neither', 'least', 'in', '’s', 'twelve', 'anything', 'except', 'on', 'ten', 'whither', 'many', 'former', 'often', 'where', 'fifteen', 'which', 'hence', 'see', 'part', 'however', '‘re', 'several', 'noone', 'being', 'and', 'wherein', 'why', 'mostly', 'unless', 'whose', 'already', 'will', 'throughout', 'so', 'please', 'amongst', 'be', 'she', 'eleven', 'whence', 'is', 'for', 'either', 'about', 'even', 'does', 'keep', 'hundred', "'re", 'put', 'meanwhile', 'eight', 'him', 'thus', 'if', 'only', '‘ve', 'his', '’m', 'ca', 'seem', 'always', 'my', 'all', 'ourselves', 'some', 'anyone', 'bottom', 'whole', 'since', 'beforehand', 'you', 'of', 'five', 'became', 'next', 'its', 'per', 'up', 'still', 'again', 'though', 'fifty', 'enough', 'under', 'but', 'off', 'too', 'might', 'say', 'besides', 'somewhere', 'by', "'ve", 'just', 'nothing', 'three', '

In [76]:
# all_stopwords.add('not') # add one stop word at a time
# all_stopwords.remove('not') # remove one stop word at a time

# Adding several stopwords
all_stopwords |= {'not',"no", "n't", 'n’t','n‘t','cannot','none','without','against'}
# Removing several stop words
all_stopwords-= {'not',"no", "n't", 'n’t','n‘t','cannot','none','without','against'}
print('My stops\n',all_stopwords) # has been removed

My stops
 {'who', 'can', 'thence', 'above', 'toward', 'myself', 'or', 'now', 'empty', "'m", 'really', 'mine', 'six', 'made', 'themselves', 'before', 'nowhere', 'make', 'neither', 'least', 'in', '’s', 'twelve', 'anything', 'except', 'on', 'ten', 'whither', 'many', 'former', 'often', 'where', 'fifteen', 'which', 'hence', 'see', 'part', 'however', '‘re', 'several', 'noone', 'being', 'and', 'wherein', 'why', 'mostly', 'unless', 'whose', 'already', 'will', 'throughout', 'so', 'please', 'amongst', 'be', 'she', 'eleven', 'whence', 'is', 'for', 'either', 'about', 'even', 'does', 'keep', 'hundred', "'re", 'put', 'meanwhile', 'eight', 'him', 'thus', 'if', 'only', '‘ve', 'his', '’m', 'ca', 'seem', 'always', 'my', 'all', 'ourselves', 'some', 'anyone', 'bottom', 'whole', 'since', 'beforehand', 'you', 'of', 'five', 'became', 'next', 'its', 'per', 'up', 'still', 'again', 'though', 'fifty', 'enough', 'under', 'but', 'off', 'too', 'might', 'say', 'besides', 'somewhere', 'by', "'ve", 'just', 'nothing', 

In [79]:
stop = set(all_stopwords) # My own stop words

In [91]:
df_preo = df

In [93]:
df_preo.head()

Unnamed: 0,cleaned,sentiment,sentiment_label_values
0,no thank you the new vaccine combines the expe...,negative,-1
1,scientificresearch safety and efficacy of dup...,neutral,0
2,dcp fireextinguisher for all classes kg kg kg ...,positive,1
3,rt discourse health news roundup vietnam repor...,neutral,0
4,health news roundup vietnam reports first case...,neutral,0


In [94]:
# https://stackoverflow.com/questions/55817040/removing-stop-words-using-spacy

df_preo['cleaned_spacy_two'] = df_preo['cleaned'].apply(lambda text: 
                         " ".join(token.lemma_ for token in nlp(text) 
                            if token.text not in stop))
df_preo.head(5)

Unnamed: 0,cleaned,sentiment,sentiment_label_values,cleaned_spacy_two
0,no thank you the new vaccine combines the expe...,negative,-1,no thank new vaccine combine experimental flu ...
1,scientificresearch safety and efficacy of dup...,neutral,0,scientificresearch safety efficacy dupilumab...
2,dcp fireextinguisher for all classes kg kg kg ...,positive,1,dcp fireextinguisher class kg kg kg kg availab...
3,rt discourse health news roundup vietnam repor...,neutral,0,rt discourse health news roundup vietnam repor...
4,health news roundup vietnam reports first case...,neutral,0,health news roundup vietnam report case monkey...


### **Data Transformation**

In [119]:
df_trans = df_preo[['sentiment_label_values','cleaned_spacy_two']]

In [120]:
new_order = [
    'cleaned_spacy_two',
    'sentiment_label_values'
]
df_trans = df_trans[new_order]
df_trans

Unnamed: 0,cleaned_spacy_two,sentiment_label_values
0,no thank new vaccine combine experimental flu ...,-1
1,scientificresearch safety efficacy dupilumab...,0
2,dcp fireextinguisher class kg kg kg kg availab...,1
3,rt discourse health news roundup vietnam repor...,0
4,health news roundup vietnam report case monkey...,0
...,...,...
995,scientificresearch health behaviour change u...,-1
996,americans worried mentalhealth covid new sur...,0
997,need covid resource city phoenix cover connect...,0
998,fauci admit lockdown negative consequence arti...,0


In [195]:
# Creating an unseen data for usage to be predicted on 
df_predict_set = df_trans.iloc[900:,:2]
# df_predict_set.to_frame().reset_index(drop=True)
df_predict_set.reset_index(drop=True)

Unnamed: 0,cleaned_spacy_two,sentiment_label_values
0,warning issue covid vaccine mrna find breast m...,0
1,common gene variant link covid mortality covid...,0
2,update covid coronavirus booster authorize you...,0
3,important covid datum point break prevail na...,0
4,alot prayer quick speedy recovery cheetay alla...,1
...,...,...
95,scientificresearch health behaviour change u...,-1
96,americans worried mentalhealth covid new sur...,0
97,need covid resource city phoenix cover connect...,0
98,fauci admit lockdown negative consequence arti...,0


In [196]:
df_predict_set.to_csv('predict_set.csv',index=False)

In [118]:
# For Train and testing
df_trans_new = df_trans.loc[:899,:]
df_trans_new.tail()

Unnamed: 0,cleaned_spacy_two,sentiment_label_values
895,covid vaccination camp precautionary dose az...,0
896,ignore health people die covid alp lnp varia...,-1
897,real plague endure bureaucrat chinese covid he...,0
898,lead cardiologist warn kid injure vaccine emer...,-1
899,late health safety daily thank covid covid,1


#### **Using CountVectorizer** 

In [243]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [244]:
vectorizer = CountVectorizer()

In [245]:
X  = vectorizer.fit_transform(df_trans_new.iloc[:,0]).toarray()
y = df_trans_new.iloc[:,-1]

In [246]:
# Saving BoW for prediction
import pickle
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

In [247]:
# returning the top ten features
# vectorizer.get_feature_names_out()

**The Model**

In [248]:
X_train, X_test, y_train,y_test= train_test_split(X,y,test_size=.2,random_state=42, stratify=y)

In [249]:
# import sklearn.metrics
# sorted(sklearn.metrics.SCORERS.keys())

In [250]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,cross_val_score,cross_val_predict

scv = StratifiedKFold(n_splits=10)

In [342]:
# Also try out Textblob classifier Later on 
cf = GaussianNB()
cf_2 = DecisionTreeClassifier()
cf_3 = RandomForestClassifier()
# # cf.fit(X_train.toarray(),y_train) # if using TfidfTransformer
# cf.fit(X_train,y_train)

In [343]:
# Cross Validatoin
print("Cross-validation scores:\n{}".format(
cross_val_score(cf, X_train, y_train, cv=scv)))

Cross-validation scores:
[0.68055556 0.56944444 0.66666667 0.66666667 0.70833333 0.75
 0.75       0.63888889 0.73611111 0.76388889]


In [344]:
# Cross Validatoin
print("Cross-validation scores:\n{}".format(
cross_val_score(cf_2, X_train, y_train, cv=scv)))

Cross-validation scores:
[0.81944444 0.73611111 0.66666667 0.75       0.76388889 0.70833333
 0.80555556 0.77777778 0.76388889 0.66666667]


In [345]:
# Cross Validatoin
print("Cross-validation scores:\n{}".format(
cross_val_score(cf_3, X_train, y_train, cv=scv)))

Cross-validation scores:
[0.75       0.70833333 0.70833333 0.70833333 0.69444444 0.77777778
 0.76388889 0.73611111 0.75       0.72222222]


In [346]:
# cf.fit(X_train.toarray(),y_train) # if using TfidfTransformer
cf_3.fit(X_train,y_train)

RandomForestClassifier()

In [347]:
pickle.dump(cf_3, open('cf_3.pkl', 'wb'))

In [348]:
# y_pred = cf.predict(X_test.toarray()) # if using TfidfTransformer
y_pred = cf_3.predict(X_test)

In [349]:
y_pred

array([ 0, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,
        0,  1,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  1,  0,  0,
        0, -1, -1,  0,  0,  0,  1, -1,  1,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0,  0,  0,  0,  0,  0, -1,  0,  0,  1,  0,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
       -1,  0,  0, -1,  0, -1, -1,  0,  0, -1,  1,  0,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0, -1, -1,  0,  0,
        0,  0,  0,  0,  0,  1,  0,  0,  0,  0], dtype=int64)

In [350]:
# https://datascience.stackexchange.com/questions/22762/understanding-predict-proba-from-multioutputclassifier

In [351]:
from sklearn.metrics import matthews_corrcoef, f1_score

matthews_corrcoef(y_test, y_pred)

0.5635479244235625

In [352]:
f1_score(y_test,y_pred, average="micro")

0.7666666666666667

In [353]:
f1_score(y_test,y_pred, average="weighted")

0.7375945720367486

In [354]:
f1_score(y_test,y_pred, average="macro")

0.6624090898505646

#### **Using TF-IDF** 

In [366]:
vectorizer = TfidfVectorizer()

X  = vectorizer.fit_transform(df_trans_new.iloc[:,0]).toarray()
y = df_trans_new.iloc[:,-1]

# Saving model for TFIDF
pickle.dump(vectorizer, open('vectorizer_tfidf.pkl', 'wb'))

In [367]:
X.shape # (900, 3630)

(900, 3630)

**The Model**

In [368]:
cf_3.fit(X_train,y_train)
pickle.dump(cf_3, open('cf_3_tfidf.pkl', 'wb'))
y_pred = cf_3.predict(X_test)
f1_score(y_test,y_pred, average="micro")

0.7555555555555555

In [369]:
f1_score(y_test,y_pred, average="weighted")

0.7223634784610393

In [370]:
f1_score(y_test,y_pred, average="macro")

0.6405939040085381

#### **Something important** 

If you don't understand what micro and macro average is, just remember the following 'A macro-average will compute the metric independently for each class and then take the average (hence treating all classes equally), whereas a micro-average will aggregate the contributions of all classes to compute the average metric. In a multi-class classification setup, micro-average is preferable if you suspect there might be class imbalance'

#### **Using Textblob** 