## `Reference` : ##
https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e

In [1]:
import pandas as pd

df = pd.read_pickle("data_clean.pkl")
dict = {'Real': 0, 'Fake': 1}
df["Type"] = df["Type"].map(dict)
df

Unnamed: 0,Month,Text,Type
0,"Feb,20",kuala lumpur tourism art culture ministry focu...,0
1,"Feb,20",kuching sarawak record four new patient invest...,0
2,"Feb,20",johor baru police open investigation paper spr...,0
3,"Feb,20",johor baru malaysian love like red red rise bl...,0
4,"Feb,20",petaling jaya one day least love trump even an...,0
...,...,...,...
885,"Nov,20",najib loot rm billion bond epf pas rm billion ...,1
886,"Nov,20",regret inform kelantan state ministry health t...,1
887,"Nov,20",due influence majority malaysian face salary p...,1
888,"Nov,20",oldtown curry noodle restaurant rumor contain ...,1


In [2]:
X = df["Text"].values
y = df["Type"].values

In [3]:
type(y)

numpy.ndarray

In [4]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify=y)

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

words_to_index = tokenizer.word_index

In [6]:
import numpy as np
def read_glove_vector(glove_vec):
    with open(glove_vec, 'r', encoding='utf-8') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            w_line = line.split()
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:])

    return word_to_vec_map

In [7]:
GloVe_path = "C:\\Users\\munch\\Desktop\\NLP Pre-Processing\\glove.6B\\"
word_to_vec_map = read_glove_vector(GloVe_path + "glove.6B.50d.txt")

maxLen=150

In [8]:
len(words_to_index)

12158

In [9]:
from keras.layers.embeddings import Embedding

vocab_len = len(words_to_index) + 1
embed_vector_len = 50

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights=[emb_matrix], trainable=False)

In [10]:
from keras.preprocessing.sequence import pad_sequences

max_length = 150
X_indices = tokenizer.texts_to_sequences(X)

X_indices = pad_sequences(X_indices, maxlen=maxLen, padding="post")

In [11]:
X_indices.shape

(890, 150)

In [12]:
emb_matrix.shape

(12159, 50)

In [13]:
type(X_indices)

numpy.ndarray

In [14]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Flatten

feature_extractor = Sequential()
feature_extractor.add(Embedding(input_dim=vocab_len, output_dim=50, input_length=maxLen, weights=[emb_matrix], trainable=False))
feature_extractor.add(Flatten())
#model.add(Dense(1, activation="sigmoid"))

In [15]:
#model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
#print(model.summary())

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB(alpha=0.84)

from sklearn.svm import SVC
svm_clf = SVC(kernel="rbf", C=9.0, gamma=0.00126)

from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=5)

from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=42)

from sklearn.linear_model import LogisticRegression
log_reg_clf = LogisticRegression(solver="lbfgs", random_state=42, C=96)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.63, random_state=42
)

In [17]:
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=10)

In [18]:
arr = np.array([[-1, 2, 3],
                [2, 3, 4]])
np.abs(arr)

array([[1, 2, 3],
       [2, 3, 4]])

In [20]:
%%time
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import numpy as np
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

scores_mnb = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
scores_svm = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
scores_dt = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
scores_rf = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
scores_log_reg = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
scores_ada = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}
#scores_hard_voting = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}

lookup_clf = {0:scores_mnb, 1:scores_svm, 2:scores_dt, 3:scores_rf, 4:scores_log_reg, 5:scores_ada}


param_distributions_mnb = {'alpha': np.linspace(0, 2, 20), 'fit_prior': [True, False]}
param_distributions_svm = {"gamma": reciprocal(0.001, 0.1), "C": uniform(0.01, 10)}
param_distributions_dt = {"max_depth": uniform(5, 30)}
param_distributions_rf = {"n_estimators": np.arange(5, 100)}
param_distributions_log = {"C": uniform(0.01, 100)}
param_distributions_ada = {"learning_rate": uniform(0, 2)}

param_distributions = [param_distributions_mnb, param_distributions_svm, param_distributions_dt, param_distributions_rf, param_distributions_log, param_distributions_ada]

#scores_nn = {'train_acc':[], 'test_acc':[], 'train_f1':[], 'test_f1':[], 'train_precision':[], 'test_precision':[], 'train_recall':[], 'test_recall':[], 'train_auc':[], 'test_auc':[]}

for train_index, test_index in kf.split(X_indices, y):
    X_train, X_test, y_train, y_test = X_indices[train_index], X_indices[test_index], y[train_index], y[test_index]
    X_train_features = feature_extractor.predict(X_train)
    
    X_test_features = feature_extractor.predict(X_test)
    
    clf_list = [mnb_clf, svm_clf, dt_clf, rf_clf, log_reg_clf, ada_clf]
    
    for index, clf in enumerate(clf_list):
        print(f"Training {clf.__class__.__name__}")
        
        if index == 0:
            X_train_features = np.abs(X_train_features)
            X_test_features = np.abs(X_test_features)
        elif index == 1:
            X_train_features = feature_extractor.predict(X_train)
            X_test_features = feature_extractor.predict(X_test)
        
        rnd_search_cv = RandomizedSearchCV(clf, param_distributions[index], n_iter=5, verbose=0, cv=3)
        
        rnd_search_cv.fit(X_train_features, y_train)
        y_train_pred = rnd_search_cv.predict(X_train_features)
        #y_train_pred = [1 if pred>0.5 else 0 for pred in y_train_pred]

        y_pred = rnd_search_cv.predict(X_test_features)
        #y_pred = [1 if pred>0.5 else 0 for pred in predicted_prob]

        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_pred)

        train_f1 = f1_score(y_train, y_train_pred)
        test_f1 = f1_score(y_test, y_pred)

        train_precision = precision_score(y_train, y_train_pred)
        test_precision = precision_score(y_test, y_pred)

        train_recall = recall_score(y_train, y_train_pred)
        test_recall = recall_score(y_test, y_pred)

        train_roc_auc_score = roc_auc_score(y_train, y_train_pred)
        test_roc_auc_score = roc_auc_score(y_test, y_pred)

        scores_clf = lookup_clf[index]
        
        scores_clf["train_acc"].append(train_accuracy)
        scores_clf["test_acc"].append(test_accuracy)
        
        scores_clf["train_f1"].append(train_f1)
        scores_clf["test_f1"].append(test_f1)
        
        scores_clf["train_precision"].append(train_precision)
        scores_clf["test_precision"].append(test_precision)
        
        scores_clf["train_recall"].append(train_recall)
        scores_clf["test_recall"].append(test_recall)
        
        scores_clf["train_auc"].append(train_roc_auc_score)
        scores_clf["test_auc"].append(test_roc_auc_score)
        
        '''
        scores_clf = scores_nn

        scores_clf["train_acc"].append(train_accuracy)
        scores_clf["test_acc"].append(test_accuracy)

        scores_clf["train_f1"].append(train_f1)
        scores_clf["test_f1"].append(test_f1)

        scores_clf["train_precision"].append(train_precision)
        scores_clf["test_precision"].append(test_precision)

        scores_clf["train_recall"].append(train_recall)
        scores_clf["test_recall"].append(test_recall)

        scores_clf["train_auc"].append(train_roc_auc_score)
        scores_clf["test_auc"].append(test_roc_auc_score)
        '''

Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB




Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB




Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Training MultinomialNB
Training SVC
Training DecisionTreeClassifier
Training RandomForestClassifier
Training LogisticRegression
Training AdaBoostClassifier
Wall time: 1h 57min 21s


In [21]:
import statistics
clf_list = [mnb_clf, svm_clf, dt_clf, rf_clf, log_reg_clf, ada_clf]
ls = ["train_acc", "test_acc"]
for index, scores_clf in lookup_clf.items():
    clf_class = clf_list[index].__class__.__name__
    print(f"{clf_class}")
    for key, item in scores_clf.items():
        mean = statistics.mean(item)
        print(f"{key}: {mean}")
    print()

MultinomialNB
train_acc: 0.9053682896379526
test_acc: 0.9044943820224719
train_f1: 0.9052721926101617
test_f1: 0.9039830804598593
train_precision: 0.8961047450459029
test_precision: 0.8966323047129325
train_recall: 0.9146464646464647
test_recall: 0.9136363636363637
train_auc: 0.9054713804713805
test_auc: 0.9045959595959596

SVC
train_acc: 1.0
test_acc: 0.8853932584269663
train_f1: 1.0
test_f1: 0.8602579392099815
train_precision: 1.0
test_precision: 0.9854056553911205
train_recall: 1.0
test_recall: 0.7818181818181819
train_auc: 1.0
test_auc: 0.8842424242424243

DecisionTreeClassifier
train_acc: 1.0
test_acc: 0.9011235955056179
train_f1: 1.0
test_f1: 0.8999681995973186
train_precision: 1.0
test_precision: 0.8972138196517079
train_recall: 1.0
test_recall: 0.9045454545454545
train_auc: 1.0
test_auc: 0.9011616161616162

RandomForestClassifier
train_acc: 0.999625468164794
test_acc: 0.9247191011235955
train_f1: 0.9996204131927219
test_f1: 0.9183913100716247
train_precision: 1.0
test_precision

In [35]:
import statistics
for key, item in scores_nn.items():
    mean = statistics.mean(item)
    print(f"{key}: {mean}")

train_acc: 1.0
test_acc: 0.9910112359550561
train_f1: 1.0
test_f1: 0.990909090909091
train_precision: 1.0
test_precision: 0.990909090909091
train_recall: 1.0
test_recall: 0.990909090909091
train_auc: 1.0
test_auc: 0.991010101010101


In [77]:
loss, accuracy = model.evaluate(X_train_indices, y_train)



In [78]:
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
model.evaluate(X_test_indices, y_test)



[0.25239327549934387, 0.9400749206542969]

In [79]:
def add_score_prediction(data, text_list_idx):
    data['fake news score'] = 0
    text_list_idx = pad_sequences(text_list_idx, maxlen=maxLen, padding="post")
    text_preds = model.predict(text_list_idx)
    data['fake news score'] = text_preds
    pred_fake = np.array(list(map(lambda x: 'fake' if x > 0.5 else 'real', text_preds)))
    data['predicted type'] = 0
    data['predicted type'] = pred_fake
    return data

In [86]:
df_indices = tokenizer.texts_to_sequences(X)
predicted_df = add_score_prediction(df, df_indices)

In [87]:
predicted_df

Unnamed: 0,Month,Text,Type,fake news score,predicted type
0,"Feb,20",kuala lumpur tourism art culture ministry focu...,0,0.001334,real
1,"Feb,20",kuching sarawak record four new patient invest...,0,0.448223,real
2,"Feb,20",johor baru police open investigation paper spr...,0,0.002466,real
3,"Feb,20",johor baru malaysian love like red red rise bl...,0,0.001535,real
4,"Feb,20",petaling jaya one day least love trump even an...,0,0.031954,real
...,...,...,...,...,...
885,"Nov,20",najib loot rm billion bond epf pas rm billion ...,1,0.985940,fake
886,"Nov,20",regret inform kelantan state ministry health t...,1,0.978134,fake
887,"Nov,20",due influence majority malaysian face salary p...,1,0.989442,fake
888,"Nov,20",oldtown curry noodle restaurant rumor contain ...,1,0.624155,fake
