# Word Embeddings

In [1]:
import os
import pandas as pd
import numpy as np

clean_news_df = pd.read_csv(os.path.join(os.getcwd(),"dataset\\clean_news_df.csv"))

In [2]:
clean_news_df

Unnamed: 0,true_or_fake,text,cleaned_text
0,true,"As U.S. budget fight looms, Republicans flip t...",budget fight loom republican flip fiscal scrip...
1,true,U.S. military to accept transgender recruits o...,military accept transgender recruit monday pen...
2,true,Senior U.S. Republican senator: 'Let Mr. Muell...,senior republican senator let mueller job wash...
3,true,FBI Russia probe helped by Australian diplomat...,fbi russia probe help australian diplomat tip ...
4,true,Trump wants Postal Service to charge 'much mor...,trump want postal service charge much amazon s...
...,...,...,...
44893,fake,McPain: John McCain Furious That Iran Treated ...,mcpain john mccain furious iran treat sailor w...
44894,fake,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,justice yahoo settle mail privacy class action...
44895,fake,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,sunnistan ally safe zone plan take territorial...
44896,fake,How to Blow $700 Million: Al Jazeera America F...,blow million jazeera america finally call quit...


In [3]:
# remove empty values
clean_news_df.dropna(inplace=True)

In [4]:
# split into training set (test set?) and validation set

# train = 80, test = 20
# random_seed = 42

from sklearn.model_selection import train_test_split

X = clean_news_df['cleaned_text'].str.split()
y = clean_news_df['true_or_fake']

X_train,X_test,y_train,y_test = train_test_split(X,y,shuffle=True,random_state=42,test_size=0.2,stratify=y)

X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)


In [5]:
# train a Word2Vec model

from gensim.models import Word2Vec
w2v_model = Word2Vec(X_train, vector_size=200, window=5, min_count=1)

In [6]:
vocab=list(w2v_model.wv.key_to_index.keys())
print(len(vocab))

87044


In [7]:
vocab

['say',
 'trump',
 'state',
 'president',
 'would',
 'people',
 'year',
 'make',
 'one',
 'republican',
 'new',
 'take',
 'obama',
 'clinton',
 'also',
 'house',
 'government',
 'time',
 'reuters',
 'tell',
 'donald',
 'get',
 'call',
 'election',
 'country',
 'american',
 'right',
 'white',
 'could',
 'party',
 'campaign',
 'like',
 'vote',
 'two',
 'official',
 'know',
 'come',
 'news',
 'last',
 'report',
 'united',
 'use',
 'work',
 'include',
 'first',
 'want',
 'group',
 'law',
 'back',
 'even',
 'hillary',
 'washington',
 'see',
 'day',
 'video',
 'show',
 'former',
 'support',
 'give',
 'think',
 'court',
 'week',
 'medium',
 'security',
 'many',
 'woman',
 'attack',
 'national',
 'plan',
 'may',
 'bill',
 'police',
 'well',
 'political',
 'leader',
 'need',
 'million',
 'democrat',
 'russia',
 'image',
 'accord',
 'ask',
 'way',
 'since',
 'administration',
 'month',
 'percent',
 'tax',
 'leave',
 'issue',
 'america',
 'presidential',
 'senate',
 'member',
 'force',
 'democrat

In [8]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])
X_train_avg = []
for v in X_train_vect:
        X_train_avg.append(v.mean(axis=0))

X_test_avg = []
for v in X_test_vect:
        X_test_avg.append(v.mean(axis=0))

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test])


# Model Building

## Logistic Regression

## Naive Bayes

## Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest baseline model
rfb = RandomForestClassifier()
rfb.fit(X_train_avg, y_train)
rfb.score(X_test_avg,y_test)

0.9643573178881711

In [10]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np
# Number of trees in random forest
n_estimators = [int(x)for x in np.linspace(start=10, stop=80,num=10)]
# Number of features to consider at every split
max_features = ['auto','sqrt']
# Max number of levels in tree
max_depth = [2,4,6,8,10]
# Min number of samples required to split a node
min_samples_split=[2,3,5,10]
# Min number of samples required at each leaf node
min_samples_leaf=[1,2,3,4,5]
# Method of selecting sample for training each tree
bootstrap=[True,False]

In [11]:
# Parameter Grid
param_gridX = {
    'n_estimators': n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'bootstrap':bootstrap
}
print(param_gridX)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 4, 6, 8, 10], 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [1, 2, 3, 4, 5], 'bootstrap': [True, False]}


In [12]:
rf_model = RandomForestClassifier()

In [13]:
# Grid Search
from sklearn.model_selection import GridSearchCV
rf_grid = GridSearchCV(estimator=rf_model,param_grid=param_gridX,cv=10,verbose=2,n_jobs=4)

In [14]:
rf_grid.fit(X_train,y_train)

Fitting 10 folds for each of 4000 candidates, totalling 40000 fits


KeyboardInterrupt: 

In [None]:
rf_grid.best_params_

In [None]:
# Evaluation Metrics For Random Forest 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Get the best estimator from GridSearchCV
best_rf = rf_grid.best_estimator_

# Make predictions on the test set using the best estimator
y_pred = best_rf.predict(X_test)

# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)

# Compute precision
precision = precision_score(y_test, y_pred)

# Compute recall
recall = recall_score(y_test, y_pred)

# Compute F1-score
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# loading the baseline model
gbc = GradientBoostingClassifier()
gbc.fit(X_train_avg, y_train)

gbc.score(X_test_avg,y_test)

0.9591222989529962

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score, classification_report

gbc_preds = gbc.predict(X_test_avg)
print("Accuracy: ",accuracy_score(y_true=y_test,y_pred=gbc_preds))
print("Precision: ",precision_score(y_true=y_test,y_pred=gbc_preds,pos_label='true'))
print("Recall: ",recall_score(y_true=y_test,y_pred=gbc_preds,pos_label='true'))
print("F1-Score: ",f1_score(y_true=y_test,y_pred=gbc_preds,pos_label='true'))
print(classification_report(y_test, gbc_preds))

print("Confusion Matrix: \n",confusion_matrix(y_true=y_test,y_pred=gbc_preds).ravel())


Accuracy:  0.9591222989529962
Precision:  0.9549256505576208
Recall:  0.9596077515759981
F1-Score:  0.9572609758937929
              precision    recall  f1-score   support

        fake       0.96      0.96      0.96      4695
        true       0.95      0.96      0.96      4283

    accuracy                           0.96      8978
   macro avg       0.96      0.96      0.96      8978
weighted avg       0.96      0.96      0.96      8978

Confusion Matrix: 
 [4501  194  173 4110]


### Hyperparameter Tuning

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
#creating Scoring parameter: 
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),'recall':make_scorer(recall_score)}

# A sample parameter

parameters = {
    "loss":["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10]
    }
#passing the scoring function in the GridSearchCV
clf = GridSearchCV(GradientBoostingClassifier(), parameters,scoring=scoring,refit=False,cv=2, n_jobs=-1)

clf.fit(X_train_avg, y_train)
#converting the clf.cv_results to dataframe
df=pd.DataFrame.from_dict(clf.cv_results_)
#here Possible inputs for cross validation is cv=2, there two split split0 and split1
df[['split0_test_accuracy','split1_test_accuracy','split0_test_precision','split1_test_precision','split0_test_recall','split1_test_recall']]

PicklingError: Could not pickle the task to send it to the workers.