# Word Embeddings

In [1]:
import os
import pandas as pd
import numpy as np

clean_news_df = pd.read_csv(os.path.join(os.getcwd(),"dataset\\clean_news_df.csv"))

# remove empty values
clean_news_df.dropna(inplace=True)

In [2]:
clean_news_df

Unnamed: 0,true_or_fake,text,cleaned_text
0,true,"As U.S. budget fight looms, Republicans flip t...",budget fight loom republican flip fiscal scrip...
1,true,U.S. military to accept transgender recruits o...,military accept transgender recruit monday pen...
2,true,Senior U.S. Republican senator: 'Let Mr. Muell...,senior republican senator let mueller job wash...
3,true,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,true,Trump wants Postal Service to charge 'much mor...,trump want postal service charge much amazon s...
...,...,...,...
44893,fake,McPain: John McCain Furious That Iran Treated ...,mcpain john mccain furious iran treat sailor w...
44894,fake,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,justice yahoo settle mail privacy class action...
44895,fake,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,sunnistan ally safe zone plan take territorial...
44896,fake,How to Blow $700 Million: Al Jazeera America F...,blow million jazeera america finally call quit...


In [3]:
# split into training set and validation set

# train = 80, test = 20
# random_seed = 42

from sklearn.model_selection import train_test_split

X = clean_news_df['cleaned_text'].str.split()
y = clean_news_df['true_or_fake']

X_train,X_test,y_train,y_test = train_test_split(X,y,shuffle=True,random_state=42,test_size=0.2,stratify=y)

X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [4]:
# train a Word2Vec model

from gensim.models import Word2Vec
w2v_model = Word2Vec(X_train, vector_size=200, window=5, min_count=1)

In [5]:
vocab=list(w2v_model.wv.key_to_index.keys())
print(len(vocab))

87044


In [6]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])
X_train_avg = []
for v in X_train_vect:
        X_train_avg.append(v.mean(axis=0))

X_test_avg = []
for v in X_test_vect:
        X_test_avg.append(v.mean(axis=0))

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


# Model Building

## Gradient Boosting

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# loading the baseline model
gbc_base = GradientBoostingClassifier(random_state=42)
gbc_base.fit(X_train_avg, y_train)

# gbc_base.score(X_test_avg,y_test)

GradientBoostingClassifier(random_state=42)

In [8]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score, classification_report

gbc_base_preds = gbc_base.predict(X_test_avg)
print("Accuracy: ",accuracy_score(y_true=y_test,y_pred=gbc_base_preds))
print("Precision: ",precision_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'))
print("Recall: ",recall_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'))
print("F1-Score: ",f1_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'))
print(classification_report(y_test, gbc_base_preds))

print("Confusion Matrix [TP FP FN TN]: \n",confusion_matrix(y_true=y_test,y_pred=gbc_base_preds).ravel())


Accuracy:  0.958119848518601
Precision:  0.9527230590961762
Recall:  0.9598412327807612
F1-Score:  0.9562688997441265
              precision    recall  f1-score   support

        fake       0.96      0.96      0.96      4695
        true       0.95      0.96      0.96      4283

    accuracy                           0.96      8978
   macro avg       0.96      0.96      0.96      8978
weighted avg       0.96      0.96      0.96      8978

Confusion Matrix [TP FP FN TN]: 
 [4491  204  172 4111]


### Hyperparameter Tuning

In [9]:
# from sklearn.ensemble import GradientBoostingClassifier

# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import make_scorer

# #creating Scoring parameter: 
# scoring = {'accuracy': make_scorer(accuracy_score),
#            'precision': make_scorer(precision_score,pos_label='true'),'recall':make_scorer(recall_score,pos_label='true')}

# # A parameter distribution

# parameters = {
#     "loss":["deviance"],
#     "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
#     "min_samples_split": np.linspace(0.1, 0.5, 12),
#     "min_samples_leaf": np.linspace(0.1, 0.5, 12),
#     "max_depth":[3,5,8],
#     "max_features":["log2","sqrt"],
#     "criterion": ["friedman_mse",  "squared_error"],
#     "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
#     "n_estimators":[10]
#     }
# #passing the scoring function in the GridSearchCV
# clf = GridSearchCV(GradientBoostingClassifier(), parameters,scoring=scoring,refit=False,cv=5)

# clf.fit(X_train_avg, y_train)
# #converting the clf.cv_results to dataframe
# df=pd.DataFrame.from_dict(clf.cv_results_)
# #here Possible inputs for cross validation is cv=2, there two split split0 and split1
# df[['split0_test_accuracy','split1_test_accuracy','split0_test_precision','split1_test_precision','split0_test_recall','split1_test_recall']]

## Randomised Search

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV

parameters = {
    "loss":["deviance"],
    "learning_rate": [0.05, 0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 3),
    "min_samples_leaf": np.linspace(0.1, 0.5, 3),
    "min_weight_fraction_leaf":[0.0, 0.25, 0.5],
    "max_depth":[2,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "squared_error"],
    "subsample":[0.5, 0.75, 1.0],
    "n_estimators":[10,20]
    }

gbc_random_cv = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),param_distributions=parameters,n_iter=500,cv=4,scoring='accuracy',random_state=42)
gbc_random_cv.fit(X_train_avg, y_train)

RandomizedSearchCV(cv=4, estimator=GradientBoostingClassifier(random_state=42),
                   n_iter=500,
                   param_distributions={'criterion': ['friedman_mse',
                                                      'squared_error'],
                                        'learning_rate': [0.05, 0.1, 0.2],
                                        'loss': ['deviance'],
                                        'max_depth': [2, 5, 8],
                                        'max_features': ['log2', 'sqrt'],
                                        'min_samples_leaf': array([0.1, 0.3, 0.5]),
                                        'min_samples_split': array([0.1, 0.3, 0.5]),
                                        'min_weight_fraction_leaf': [0.0, 0.25,
                                                                     0.5],
                                        'n_estimators': [10, 20],
                                        'subsample': [0.5, 0.75, 1.0]},
          

In [24]:
gbc_random_cv.best_params_

{'subsample': 1.0,
 'n_estimators': 20,
 'min_weight_fraction_leaf': 0.0,
 'min_samples_split': 0.1,
 'min_samples_leaf': 0.1,
 'max_features': 'sqrt',
 'max_depth': 5,
 'loss': 'deviance',
 'learning_rate': 0.2,
 'criterion': 'squared_error'}

In [25]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score, classification_report

gbc_random_cv_preds = gbc_random_cv.predict(X_test_avg)
print("Accuracy: ",accuracy_score(y_true=y_test,y_pred=gbc_random_cv_preds))
print("Precision: ",precision_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'))
print("Recall: ",recall_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'))
print("F1-Score: ",f1_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'))
print(classification_report(y_test, gbc_random_cv_preds))

print("Confusion Matrix [TP FP FN TN]: \n",confusion_matrix(y_true=y_test,y_pred=gbc_random_cv_preds).ravel())


Accuracy:  0.9336155045667186
Precision:  0.9246717346233587
Recall:  0.9371935559187485
F1-Score:  0.9308905380333953
              precision    recall  f1-score   support

        fake       0.94      0.93      0.94      4695
        true       0.92      0.94      0.93      4283

    accuracy                           0.93      8978
   macro avg       0.93      0.93      0.93      8978
weighted avg       0.93      0.93      0.93      8978

Confusion Matrix [TP FP FN TN]: 
 [4368  327  269 4014]
