# 432 Project - Fall 2020

#### Student Names:
#### Firas Sawan       ID#26487815 
#### Giselle Martel    ID#26352936

# FakeNewsClassifiers

##### A comparison of different Machine Learning models at predicting whether news articles are "Fake News" or "Real News"

In [None]:
import tools as tools
import preprocess as preprocess
import model.logistic_regression as LR
import model.decision_tree as DT
import model.random_forest as RF
import model.support_vector_machine as SVC
import model.naive_bayesian_classifier as NB

import pandas as pd
import numpy as np


# Pre-Processing Data

In [None]:
print("\nPreprocessing of data...\n")

# Load the two labelled article sets; the second argument is the label
# handed to the parser for every row of that file.
fake_news = preprocess.parse_dataset("Fake_test.csv", "FAKE")
real_news = preprocess.parse_dataset("True_test.csv", "REAL")
print()

# Stack both sets along axis 0 into a single collection of all articles.
labelled_sets = [fake_news, real_news]
all_news = pd.concat(labelled_sets, axis=0)

# Tokenize each set on its own. Every call yields a pair:
# (all tokens of the set, tokens grouped per article).
(fake_news_all_tokens,
 fake_news_tokens_per_article) = preprocess.tokenize(fake_news, "fake_news")
(real_news_all_tokens,
 real_news_tokens_per_article) = preprocess.tokenize(real_news, "real_news")
print()

# Combine the fake and real token collections, fake first.
all_tokens = fake_news_all_tokens + real_news_all_tokens
tokens_per_article = fake_news_tokens_per_article + real_news_tokens_per_article
print()

# Produce the train/test split consumed by every classifier cell below.
X_train, X_test, y_train, y_test = preprocess.split_and_preprocess(
    all_tokens,
    tokens_per_article,
    all_news,
)

# Logistic Regression Classifier

In [None]:
%time
# Logisitic Regression Classifier
print("\nTesting Logistic Regression Classifier ...\n")

# set the hyperparams
C = np.logspace(-4,4,6)

# perform hyperparam search
estimators, accuracy, best_estimator, hyperparams = LR.logisitic_regression_hyperparam_search(X_train, y_train, C)

# calculate the training and testing scores and plot the result
trn_scores, test_scores = tools.calculate_estimator_scores([X_train, X_test, y_train, y_test], estimators)

# calculate model overfitting
overfitting = tools.determine_overfitting(trn_scores,test_scores)
print("\nLogistic Regression overfitting: {:.3f}\n".format(overfitting))

# plot the scores of each estimator
tools.plot_estimator_scores("LogisticRegression",trn_scores,test_scores)

# display details of best estimator
tools.display_best_estimator(accuracy, "LogisticRegression", hyperparams)

# use best estimator to make predictions
y_pred = LR.logistic_regression_predict(best_estimator, X_test)

tools.plot_predicted_labels(y_test, y_pred, "LogisticRegression")
tools.display_prediction_scores(y_test,y_pred)
tools.plot_confusion_matrix(y_test,y_pred,"LogisticRegression")


# Random Forest Classifier

In [None]:
%time
# Random Forest Classifer
print("\nTesting Random Forest Classifier ...\n")

# set the hyperparams
D = np.linspace(1,1000,20)
N = np.linspace(1,10,1, dtype="int32")

# perform hyperparam search
estimators, accuracy, best_estimator, hyperparams = RF.random_forest_hyperparam_search(X_train, y_train, D, N)

# calculate the training and testing scores and plot the result
trn_scores, test_scores = tools.calculate_estimator_scores([X_train, X_test, y_train, y_test], estimators)

# calculate model overfitting
overfitting = tools.determine_overfitting(trn_scores,test_scores)
print("\nRandom Forest overfitting: {:.3f}\n".format(overfitting))

# plot the scores of each estimator
tools.plot_estimator_scores("RandomForest",trn_scores,test_scores)

# display details of best estimator
tools.display_best_estimator(accuracy, "RandomForest", hyperparams)

# use best estimator to make predictions
y_pred = RF.random_forest_predict(best_estimator, X_test)

tools.plot_predicted_labels(y_test, y_pred, "RandomForest")
tools.display_prediction_scores(y_test,y_pred)
tools.plot_confusion_matrix(y_test,y_pred,"RandomForest")

tools.display_result(best_estimator, X_train)

# Decision Tree Classifier

In [None]:
%time
print("\nTesting Decision Tree Classifier ...\n")

# set the hyperparams
D = np.linspace(1,18,3)

# perform hyperparam search
estimators, accuracy, best_estimator, hyperparams = DT.decision_tree_hyperparam_search(X_train, y_train, D)

# calculate the training and testing scores and plot the result
trn_scores, test_scores = tools.calculate_estimator_scores([X_train, X_test, y_train, y_test], estimators)

# calculate model overfitting
overfitting = tools.determine_overfitting(trn_scores,test_scores)
print("\nDecision Tree overfitting: {:.3f}\n".format(overfitting))

# plot the scores of each estimator
tools.plot_estimator_scores("DecisionTree",trn_scores,test_scores)

# display details of best estimator
tools.display_best_estimator(accuracy, "DecisionTree", hyperparams)

# use best estimator to make predictions
y_pred = DT.decision_tree_predict(best_estimator, X_test)

tools.plot_predicted_labels(y_test, y_pred, "DecisionTree")
tools.display_prediction_scores(y_test,y_pred)
tools.plot_confusion_matrix(y_test,y_pred,"DecisionTree")

# Naive Bayes Classifier

In [None]:
%time
# Naive Bayes Classifier
print("\nTesting Naive Bayesian Classifier ...\n")

# set the hyperparams
A = np.linspace(0.05,1,12)
F = [True, False]

# perform hyperparam search
estimators, accuracy, best_estimator, hyperparams = NB.naive_bayes_hyperparam_search(X_train, y_train, A,F)

# calculate the training and testing scores and plot the result
trn_scores, test_scores = tools.calculate_estimator_scores([X_train, X_test, y_train, y_test], estimators)

# calculate model overfitting
overfitting = tools.determine_overfitting(trn_scores,test_scores)
print("\nNaive Bayes overfitting: {:.3f}\n".format(overfitting))

# plot the scores of each estimator
tools.plot_estimator_scores("NaiveBayes",trn_scores,test_scores)

# display details of best estimator
tools.display_best_estimator(accuracy, "NaiveBayes", hyperparams)

# use best estimator to make predictions
y_pred = NB.naive_bayesian_predict(best_estimator, X_test)

tools.plot_predicted_labels(y_test, y_pred, "NaiveBayes")
tools.display_prediction_scores(y_test,y_pred)
tools.plot_confusion_matrix(y_test,y_pred,"NaiveBayes")

tools.display_result(best_estimator, X_train)

# Support Vector Machine Classifier

In [None]:
%time
# Support Vector Classification
print("\nTesting SVM Classifier ...\n")

# set the hyperparams
C = np.logspace(-4,4,6)
G = np.logspace(-4,4,6)
K = ["rbf", "linear"]

# perform hyperparam search
estimators, accuracy, best_estimator, hyperparams = SVC.svc_hyperparam_search(X_train, y_train, C, G, K)

# calculate the training and testing scores and plot the result
trn_scores, test_scores = tools.calculate_estimator_scores([X_train, X_test, y_train, y_test], estimators)

# calculate model overfitting
overfitting = tools.determine_overfitting(trn_scores,test_scores)
print("\nSVC overfitting: {:.3f}\n".format(overfitting))

# plot the scores of each estimator
tools.plot_estimator_scores("SVC",trn_scores,test_scores)

# display details of best estimator
tools.display_best_estimator(accuracy, "SVC", hyperparams)

# use best estimator to make predictions
y_pred = SVC.support_vector_machine_predict(best_estimator, X_test)

tools.plot_predicted_labels(y_test, y_pred, "SVC")
tools.display_prediction_scores(y_test,y_pred)
tools.plot_confusion_matrix(y_test,y_pred,"SVC")

tools.display_result(best_estimator, X_train)

