In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import pickle
import spacy

In [131]:
import warnings
warnings.filterwarnings("ignore")

#### **Prediction on unseen data**

In [4]:
df_predict_set = pd.read_csv('tweet_word2vex_for_prediction.csv')
df_predict_set.head()

Unnamed: 0,cleaned_tweets,sentiment_label_values
0,the lethal effects of covid will remain as lon...,-1
1,scientificresearch socioeconomic and immigrat...,0
2,what about covid health decisions go to bed o...,0
3,you can lose many things and rebuild you can s...,-1
4,we re worried about covid and about flu joint...,0


In [5]:
criteria = (df_predict_set.sentiment_label_values == 0)|(df_predict_set.sentiment_label_values == 1)
citeria_cols = ['cleaned_tweets','sentiment_label_values']
df_predict_set = df_predict_set.loc[criteria,citeria_cols]
df_predict_set.head()

Unnamed: 0,cleaned_tweets,sentiment_label_values
1,scientificresearch socioeconomic and immigrat...,0
2,what about covid health decisions go to bed o...,0
4,we re worried about covid and about flu joint...,0
5,here is another amazing benefit hospital decis...,1
6,excellent as always from wallace in on the gov...,1


#### **Loading Saved models**

In [21]:
# Wor2Vec
vectorizer = pickle.load(open('w2v_vectoriser.pkl', "rb"))

In [31]:
# # prediction_pipe_pca
# model_pca = pickle.load(open('w2v_model_pca.pkl', 'rb'))

# Model
model = pickle.load(open('w2v_model.pkl', 'rb'))

#### **Preprocessing**

In [6]:
nlp = spacy.load("en_core_web_sm")

In [7]:
from spacy.lang.en.stop_words import STOP_WORDS
all_stopwords = STOP_WORDS
# Removing several stop words
all_stopwords-= {'not',"no", "n't", 'n’t','n‘t','cannot','none','without','against'}

In [8]:
stop = set(all_stopwords) # My own stop words

In [9]:
def spacy_tokeniser(sent):
    sent = sent.strip().lower()
    doc = nlp(sent)
    mytokens = [word.lemma_ for word in doc if word not in stop]
    
    return mytokens

    # Vecctoriser of Word2vec
def sent_vec(sent):
    vector_size = vectorizer.vector_size
    w2v_resolution = np.zeros(vector_size)
    # print(w2v_resolution)
    ctr = 1
    for w in sent:
        if w in vectorizer:
            ctr += 1
            w2v_resolution += vectorizer[w]
    w2v_resolution = w2v_resolution/ctr
    # print(w2v_resolution)
    return w2v_resolution

In [10]:
df_predict_set['tokens'] = df_predict_set['cleaned_tweets'].apply(spacy_tokeniser)
df_predict_set['vectors'] = df_predict_set['tokens'].apply(sent_vec)

In [11]:
df_predict_set.sample(n=5, random_state=20)

Unnamed: 0,cleaned_tweets,sentiment_label_values,tokens,vectors
6,excellent as always from wallace in on the gov...,1,"[excellent, as, always, from, wallace, in, on,...","[0.3035253339447081, 0.039089504070580004, 0.1..."
28,the latest the health daily thanks to krstos m...,0,"[the, late, the, health, daily, thank, to, krs...","[0.3419655528333452, 0.19213455646402305, 0.12..."
16,the real plague we have been enduring bureaucr...,0,"[the, real, plague, we, have, be, endure, bure...","[0.07462941575795412, 0.34732417257813114, -0...."
2,what about covid health decisions go to bed o...,0,"[what, about, covid, health, decision, go, to,...","[0.33758400082588197, 0.06742650046944618, -0...."
13,wish could get medical care for health issues ...,0,"[wish, could, get, medical, care, for, health,...","[0.11148355801611445, 0.3333354221526967, -0.2..."


In [23]:
X = df_predict_set['vectors'].to_list()
y = df_predict_set['sentiment_label_values'].to_list()

In [24]:
len(X[0])

50

#### **Predictions**

In [32]:
# predcting
y_pred = model.predict(X)

In [33]:
print(y_pred)

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0]


In [36]:
y_pedict_proba = model.predict_proba(X)[:,1] >=0.2
y_pedict_proba_adjusted_threshold = y_pedict_proba.astype(int)
print(y_pedict_proba_adjusted_threshold)

[0 0 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0]


In [38]:
df_predicted = df_predict_set[['cleaned_tweets','sentiment_label_values']]
df_predicted.head()

Unnamed: 0,cleaned_tweets,sentiment_label_values
1,scientificresearch socioeconomic and immigrat...,0
2,what about covid health decisions go to bed o...,0
4,we re worried about covid and about flu joint...,0
5,here is another amazing benefit hospital decis...,1
6,excellent as always from wallace in on the gov...,1


In [41]:
# Appending predictions to dataset
df_predicted['predicted_label_logistic'] = y_pedict_proba_adjusted_threshold.tolist()
df_predicted

Unnamed: 0,cleaned_tweets,sentiment_label_values,predicted_label,predicted_label_logistic
1,scientificresearch socioeconomic and immigrat...,0,0,0
2,what about covid health decisions go to bed o...,0,0,0
4,we re worried about covid and about flu joint...,0,0,0
5,here is another amazing benefit hospital decis...,1,1,1
6,excellent as always from wallace in on the gov...,1,0,0
7,said people should not hold social events in ...,0,0,0
8,health amp human services and the urban leagu...,1,1,1
9,scream if one more person tells me to take my...,0,0,0
10,fauci awards mil to eco health they just rece...,0,1,1
12,join us for be health empowered featuring our ...,1,1,1


**KNN**

In [172]:
# Model
model_knn = pickle.load(open('w2v_model_knn.pkl', 'rb'))

In [173]:
# predcting
y_pred_knn = model_knn.predict(X)

In [174]:
print(y_pred_knn)

[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0]


In [175]:
y_pedict_proba_knn = model_knn.predict_proba(X)[:,1] >= 0.2
y_pedict_proba_adjusted_threshold_knn = y_pedict_proba_knn.astype(int)
print(y_pedict_proba_adjusted_threshold_knn)

[0 1 0 1 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0]


In [176]:
# Appending predictions to dataset
df_predicted['predicted_label_knn'] = y_pedict_proba_adjusted_threshold_knn.tolist()
df_predicted[['sentiment_label_values','predicted_label_logistic','predicted_label_knn']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn
1,0,0,0
2,0,0,1
4,0,0,0
5,1,1,1
6,1,0,0
7,0,0,0
8,1,1,1
9,0,0,1
10,0,1,0
12,1,1,1


**KNN without PCA**

In [177]:
# Model
model_knn_no_pca = pickle.load(open('w2v_model_knn_no_pca.pkl', 'rb'))

In [178]:
# predcting
y_pred_knn_no_pca = model_knn_no_pca.predict(X)

In [179]:
print(y_pred_knn_no_pca)

[0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0]


In [180]:
y_pedict_proba_knn_no_pca = model_knn_no_pca.predict_proba(X)[:,1] >= 0.4
y_pedict_proba_adjusted_threshold_knn_no_pca = y_pedict_proba_knn_no_pca.astype(int)
print(y_pedict_proba_adjusted_threshold_knn_no_pca)

[0 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0]


In [181]:
# Appending predictions to dataset
df_predicted['predicted_label_knn_no_pca'] = y_pedict_proba_adjusted_threshold_knn_no_pca.tolist()
df_predicted[['sentiment_label_values','predicted_label_logistic','predicted_label_knn','predicted_label_knn_no_pca']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn,predicted_label_knn_no_pca
1,0,0,0,0
2,0,0,1,1
4,0,0,0,0
5,1,1,1,1
6,1,0,0,0
7,0,0,0,0
8,1,1,1,1
9,0,0,1,0
10,0,1,0,0
12,1,1,1,1


**Random Forest**

In [182]:
# Model
model_rf = pickle.load(open('w2v_model_rf.pkl', 'rb'))

In [183]:
# predcting
y_pred_rf = model_rf.predict(X)

In [184]:
print(y_pred_rf)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]


In [185]:
y_pedict_proba_rf = model_rf.predict_proba(X)[:,1] >= 0.2
y_pedict_proba_adjusted_threshold_rf = y_pedict_proba_rf.astype(int)
print(y_pedict_proba_adjusted_threshold_rf)

[0 1 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0]


In [186]:
# Appending predictions to dataset
df_predicted['predicted_label_rf'] = y_pedict_proba_adjusted_threshold_rf.tolist()
df_predicted[['sentiment_label_values',
              'predicted_label_logistic',
              'predicted_label_knn',
              'predicted_label_rf']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn,predicted_label_rf
1,0,0,0,0
2,0,0,1,1
4,0,0,0,0
5,1,1,1,1
6,1,0,0,1
7,0,0,0,1
8,1,1,1,1
9,0,0,1,0
10,0,1,0,0
12,1,1,1,1


**Random Forest without PCA**

In [187]:
# Model
model_rf_no_pca = pickle.load(open('w2v_model_rf_no_pca.pkl', 'rb'))

In [188]:
# predcting
y_pred_rf_no_pca = model_rf_no_pca.predict(X)

In [189]:
print(y_pred_rf_no_pca)

[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [190]:
y_pedict_proba_rf_no_pca = model_rf_no_pca.predict_proba(X)[:,1] >= 0.25
y_pedict_proba_adjusted_threshold_rf_no_pca = y_pedict_proba_rf_no_pca.astype(int)
print(y_pedict_proba_adjusted_threshold_rf_no_pca)

[0 1 0 1 0 1 1 1 1 1 1 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0]


In [191]:
# Appending predictions to dataset
df_predicted['predicted_label_rf_no_pca'] = y_pedict_proba_adjusted_threshold_rf_no_pca.tolist()
df_predicted[['sentiment_label_values',
              'predicted_label_logistic',
              'predicted_label_knn',
              'predicted_label_rf',
              'predicted_label_rf_no_pca']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn,predicted_label_rf,predicted_label_rf_no_pca
1,0,0,0,0,0
2,0,0,1,1,1
4,0,0,0,0,0
5,1,1,1,1,1
6,1,0,0,1,0
7,0,0,0,1,1
8,1,1,1,1,1
9,0,0,1,0,1
10,0,1,0,0,1
12,1,1,1,1,1


**Random Forest Undersampler**

In [210]:
# Model
model_rf_undersampler = pickle.load(open('w2v_model_rf_undersampler.pkl', 'rb'))

In [211]:
# predcting
y_pred_rf_undersampler = model_rf_undersampler.predict(X)

In [212]:
print(y_pred_rf_undersampler)

[0 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0]


In [213]:
y_pedict_proba_rf_undersampler = model_rf_undersampler.predict_proba(X)[:,1] >= 0.54
y_pedict_proba_adjusted_threshold_rf_undersampler = y_pedict_proba_rf_undersampler.astype(int)
print(y_pedict_proba_adjusted_threshold_rf_undersampler)

[0 1 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 0 0]


In [214]:
# Appending predictions to dataset
df_predicted['predicted_label_rf_undersampler'] = y_pedict_proba_adjusted_threshold_rf_undersampler.tolist()
df_predicted[['sentiment_label_values',
              'predicted_label_logistic',
              'predicted_label_knn',
              'predicted_label_rf',
              'predicted_label_rf_no_pca',
              'predicted_label_rf_undersampler']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn,predicted_label_rf,predicted_label_rf_no_pca,predicted_label_rf_undersampler
1,0,0,0,0,0,0
2,0,0,1,1,1,1
4,0,0,0,0,0,0
5,1,1,1,1,1,1
6,1,0,0,1,0,0
7,0,0,0,1,1,0
8,1,1,1,1,1,1
9,0,0,1,0,1,0
10,0,1,0,0,1,0
12,1,1,1,1,1,1


**Random Forest Undersampler With PCA**

In [215]:
# Model
model_rf_undersampler_pca = pickle.load(open('w2v_model_rf_undersampler_pca.pkl', 'rb'))

In [216]:
# predcting
y_pred_rf_undersampler_pca = model_rf_undersampler_pca.predict(X)

In [217]:
print(y_pred_rf_undersampler_pca)

[0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0]


In [218]:
y_pedict_proba_rf_undersampler_pca = model_rf_undersampler_pca.predict_proba(X)[:,1] >= 0.43
y_pedict_proba_adjusted_threshold_rf_undersampler_pca = y_pedict_proba_rf_undersampler_pca.astype(int)
print(y_pedict_proba_adjusted_threshold_rf_undersampler_pca)

[0 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 0]


In [219]:
# Appending predictions to dataset
df_predicted['predicted_label_rf_undersampler_pca'] = y_pedict_proba_adjusted_threshold_rf_undersampler_pca.tolist()
df_predicted[['sentiment_label_values',
              'predicted_label_logistic',
              'predicted_label_knn',
              'predicted_label_rf',
              'predicted_label_rf_no_pca',
              'predicted_label_rf_undersampler',
              'predicted_label_rf_undersampler_pca']]

Unnamed: 0,sentiment_label_values,predicted_label_logistic,predicted_label_knn,predicted_label_rf,predicted_label_rf_no_pca,predicted_label_rf_undersampler,predicted_label_rf_undersampler_pca
1,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0
4,0,0,0,0,0,0,0
5,1,1,1,1,1,1,1
6,1,0,0,1,0,0,1
7,0,0,0,1,1,0,0
8,1,1,1,1,1,1,1
9,0,0,1,0,1,0,0
10,0,1,0,0,1,0,0
12,1,1,1,1,1,1,1


In [220]:
new_colum_order = ['cleaned_tweets', 'sentiment_label_values', 'predicted_label_logistic',
                'predicted_label_knn','predicted_label_knn_no_pca' ,'predicted_label_rf',
                'predicted_label_rf_no_pca',
                'predicted_label_rf_undersampler',
                'predicted_label_rf_undersampler_pca']
df_predicted= df_predicted[new_colum_order]

In [221]:
df_predicted.to_excel("predicted_w2v.xlsx",index=False)  
df_predicted.to_csv("predicted_w2v.csv", index=False)