# Word Embeddings

In [1]:
import os
import pandas as pd
import numpy as np

clean_news_df = pd.read_csv(os.path.join(os.getcwd(),"dataset\\clean_news_df.csv"))

# remove empty values
clean_news_df.dropna(inplace=True)

In [2]:
clean_news_df

Unnamed: 0,true_or_fake,text,cleaned_text
0,true,"As U.S. budget fight looms, Republicans flip t...",budget fight loom republican flip fiscal scrip...
1,true,U.S. military to accept transgender recruits o...,military accept transgender recruit monday pen...
2,true,Senior U.S. Republican senator: 'Let Mr. Muell...,senior republican senator let mueller job wash...
3,true,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,true,Trump wants Postal Service to charge 'much mor...,trump want postal service charge much amazon s...
...,...,...,...
44893,fake,McPain: John McCain Furious That Iran Treated ...,mcpain john mccain furious iran treat sailor w...
44894,fake,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,justice yahoo settle mail privacy class action...
44895,fake,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,sunnistan ally safe zone plan take territorial...
44896,fake,How to Blow $700 Million: Al Jazeera America F...,blow million jazeera america finally call quit...


In [3]:
# split into training set and validation set

# train = 80, test = 20
# random_state = 42

from sklearn.model_selection import train_test_split

X = clean_news_df['cleaned_text'].str.split()
y = clean_news_df['true_or_fake']

X_train,X_test,y_train,y_test = train_test_split(X,y,shuffle=True,random_state=42,test_size=0.2,stratify=y)

X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [4]:
# train a Word2Vec model

from gensim.models import Word2Vec
w2v_model = Word2Vec(X_train, vector_size=200, window=5, min_count=1)

In [5]:
vocab=list(w2v_model.wv.key_to_index.keys())
print(len(vocab))

87044


In [6]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])
X_train_avg = []
for v in X_train_vect:
        X_train_avg.append(v.mean(axis=0))

X_test_avg = []
for v in X_test_vect:
        X_test_avg.append(v.mean(axis=0))

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


# Model Building

## Gradient Boosting

In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score,classification_report

# fitting the baseline model
gbc_base = GradientBoostingClassifier(random_state=42)
gbc_base.fit(X_train_avg, y_train)

# evaluate the baseline model
gbc_base_preds = gbc_base.predict(X_test_avg)
print("====== Baseline Model Evaluation ======")
print("Confusion Matrix [TP FP FN TN]: \n",confusion_matrix(y_true=y_test,y_pred=gbc_base_preds).ravel())
print(f"Accuracy : {accuracy_score(y_true=y_test,y_pred=gbc_base_preds):.3f}")
print(f"Precision: {precision_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'):.3f}",)
print(f"Recall   : {recall_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'):.3f}")
print(f"F1-Score : {f1_score(y_true=y_test,y_pred=gbc_base_preds,pos_label='true'):.3f}")
print("\nClassification Report\n",classification_report(y_true=y_test,y_pred=gbc_base_preds))

Confusion Matrix [TP FP FN TN]: 
 [4490  205  173 4110]
Accuracy : 0.958
Precision: 0.952
Recall   : 0.960
F1-Score : 0.956

Classification Report
               precision    recall  f1-score   support

        fake       0.96      0.96      0.96      4695
        true       0.95      0.96      0.96      4283

    accuracy                           0.96      8978
   macro avg       0.96      0.96      0.96      8978
weighted avg       0.96      0.96      0.96      8978



## Hyperparameter Tuning (RandomizedSearchCV)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import RandomizedSearchCV

parameters = {
    "loss":["deviance"],
    "learning_rate": [0.05, 0.1, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 4),
    "min_samples_leaf": np.linspace(0.1, 0.5, 4),
    "min_weight_fraction_leaf":[0.0, 0.25, 0.5],
    "max_depth":[2,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "squared_error"],
    "subsample":[0.5, 0.75, 1.0],
    "n_estimators":[10,20]
    }

gbc_random_cv = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42),param_distributions=parameters,n_iter=2000,cv=5,scoring='accuracy',random_state=42)
gbc_random_cv.fit(X_train_avg, y_train)

RandomizedSearchCV(cv=5, estimator=GradientBoostingClassifier(random_state=42),
                   n_iter=2000,
                   param_distributions={'criterion': ['friedman_mse',
                                                      'squared_error'],
                                        'learning_rate': [0.05, 0.1, 0.2],
                                        'loss': ['deviance'],
                                        'max_depth': [2, 5, 8],
                                        'max_features': ['log2', 'sqrt'],
                                        'min_samples_leaf': array([0.1       , 0.23333333, 0.36666667, 0.5       ]),
                                        'min_samples_split': array([0.1       , 0.23333333, 0.36666667, 0.5       ]),
                                        'min_weight_fraction_leaf': [0.0, 0.25,
                                                                     0.5],
                                        'n_estimators': [10, 20],
               

In [9]:
gbc_random_cv.best_params_

{'subsample': 1.0,
 'n_estimators': 20,
 'min_weight_fraction_leaf': 0.0,
 'min_samples_split': 0.1,
 'min_samples_leaf': 0.1,
 'max_features': 'sqrt',
 'max_depth': 5,
 'loss': 'deviance',
 'learning_rate': 0.2,
 'criterion': 'squared_error'}

In [10]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

# evaluate the tuned model
gbc_random_cv_preds = gbc_random_cv.predict(X_test_avg)
print("====== Hyperparameter Tuned Model Evaluation ======")
print("Confusion Matrix [TP FP FN TN]: \n",confusion_matrix(y_true=y_test,y_pred=gbc_random_cv_preds).ravel())
print(f"Accuracy : {accuracy_score(y_true=y_test,y_pred=gbc_random_cv_preds):.3f}")
print(f"Precision: {precision_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'):.3f}",)
print(f"Recall   : {recall_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'):.3f}")
print(f"F1-Score : {f1_score(y_true=y_test,y_pred=gbc_random_cv_preds,pos_label='true'):.3f}")

Confusion Matrix [TP FP FN TN]: 
 [4351  344  282 4001]
Accuracy : 0.930
Precision: 0.921
Recall   : 0.934
F1-Score : 0.927
