# EE 460J Final Project: Fake News Classifier
## By: Andy Wu, Dylan Tabarini, Alex Raterink, ...

In [6]:
# imports
import numpy as np
import pandas as pd
import itertools
import statistics
import scipy
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from statistics import mean
from numpy import var, std

from IPython.display import Audio, display
print('exports done')

exports done


In [7]:
SEED = 42

In [8]:
# Define helper functions here
def alert_when_done():
  display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))

def convert_series_to_integer_labels(y_train):
    labels = y_train.to_numpy(dtype='str')
    for i in range(len(labels)):
        if labels[i] == 'fake':
            labels[i] = 1
        elif labels[i] == 'real':
            labels[i] = 0
        else:
            raise NameError("y_train data contains labels that are neither 'fake' nor 'real'!")
    return labels.astype(np.int8)

def convert_predictions_to_integer_labels(predictions):
    # Passed in predictions should be numpy arrays of np.str_ type
    for i in range(len(predictions)):
        if predictions[i] == 'fake':
            predictions[i] = 1
        elif predictions[i] == 'real':
            predictions[i] = 0
        else:
            raise NameError("y_train data contains labels that are neither 'fake' nor 'real'!")
    return predictions.astype(np.int8)

def convert_to_tfidf_train_and_test_sets(x_train, x_test):
    """ Returns a tuple containing the converted tfidf vectors: (tfidf_train, tfidf_test)"""
    # Initialize a TfidfVectorizer to filter out English stop words of the most common words and vectorize article text
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train['text'])
    tfidf_test = tfidf_vectorizer.transform(x_test['text'])
    
    # Convert back to a scipy.sparse.csr_matrix
    tfidf_train_sparse = scipy.sparse.csr_matrix(tfidf_train_df.values)
    tfidf_test_sparse = scipy.sparse.csr_matrix(tfidf_test_df.values)
    return (tfidf_train_sparse, tfidf_test_sparse)

def convert_to_tfidf_train_and_test_sets_with_sentiment_features(x_train, x_test):
    """ Returns a tuple containing the converted tfidf vectors: (tfidf_train, tfidf_test)"""
    # Initialize a TfidfVectorizer to filter out English stop words of the most common words and vectorize article text
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
    tfidf_train = tfidf_vectorizer.fit_transform(x_train['text'])
    tfidf_test = tfidf_vectorizer.transform(x_test['text'])
    
    # Convert the TF IDF sparse matrices into dataframes beforing adding Sentiment feature columns
    tfidf_train_df = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
    tfidf_test_df = pd.DataFrame.sparse.from_spmatrix(tfidf_test)
    
    # Add 'tfidf_' suffix to each of the converted tfidf vectors in the train/test sets
    tfidf_train_df.columns = ['tfidf_' + str(col) for col in tfidf_train_df.columns]
    tfidf_train_df
    tfidf_test_df.columns = ['tfidf_' + str(col) for col in tfidf_test_df.columns]
    tfidf_test_df
    
    # Add the 4 sentiment feature columns to both of the 2 converted dataframes
    tfidf_train_df['positive'] = x_train['positive'].tolist()
    tfidf_train_df['negative'] = x_train['negative'].tolist()
    tfidf_train_df['neutral'] = x_train['neutral'].tolist()
    tfidf_train_df['mixed'] = x_train['mixed'].tolist()

    tfidf_test_df['positive'] = x_test['positive'].tolist()
    tfidf_test_df['negative'] = x_test['negative'].tolist()
    tfidf_test_df['neutral'] = x_test['neutral'].tolist()
    tfidf_test_df['mixed'] = x_test['mixed'].tolist()

    # Convert back to a scipy.sparse.csr_matrix
    tfidf_train_sparse = scipy.sparse.csr_matrix(tfidf_train_df.values)
    tfidf_test_sparse = scipy.sparse.csr_matrix(tfidf_test_df.values)
    return (tfidf_train_sparse, tfidf_test_sparse)

def get_cv_score(model, x_df, labels, iterations=5, get_details=False, train_split_size=0.80):
    """ 
        Pass in an untrained model, x_df, and labels to return the average accuracy score across iterations (5 by default).
        Note if get_details is set to True, this method returns a tuple in the format:
        (mean, std, scores_array)
    """
    auc_scores=[]
    
    for i in range(iterations):
        x_train1, x_test1, y_train1, y_test1 = train_test_split(x_df, labels, train_size=train_split_size, test_size=1-train_split_size, random_state=i)
        tfidf_train_with_sentiment, tfidf_test_with_sentiment= convert_to_tfidf_train_and_test_sets_with_sentiment_features(x_train1, x_test1)
        model.fit(tfidf_train_with_sentiment, y_train1)
        true_binary_labels = convert_series_to_integer_labels(y_test1)
        predicted_binary_labels = convert_predictions_to_integer_labels(model.predict(tfidf_test_with_sentiment))
        auc_scores.append(roc_auc_score(true_binary_labels, predicted_binary_labels))
        
    if get_details:
        return (statistics.mean(auc_scores), np.std(auc_scores), auc_scores)
    else:
        return statistics.mean(auc_scores)
    
def get_confusion_matrix(model, x_df, labels):
    """ Given an untrained model and the true labels"""
    x_train1, x_test1, y_train1, y_test1 = train_test_split(x_df, labels, train_size=0.8, test_size=0.2, random_state=SEED)
    tfidf_train_with_sentiment, tfidf_test_with_sentiment= convert_to_tfidf_train_and_test_sets_with_sentiment_features(x_train1, x_test1)
    model.fit(tfidf_train_with_sentiment, y_train1)
    # Visualize the confusion matrix to gain insight into false postives and negatives
    predictions = model.predict(tfidf_test_with_sentiment)
    return confusion_matrix(y_test1, predictions, labels=['fake','real'])


In [11]:
# Read the csv data file. Note that you should replace the line below with the absolute UNIX path of the csv files
df = pd.read_csv("alldata_with_sentiment.csv")
df

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,0,covid started because we eat animals,vegan instagram users are pinning the coronavi...,fake,0.010846,0.088485,0.899144,0.001525
1,1,says michelle obama has people on her staff na...,glenn beck rekindled a falsehood about the siz...,fake,0.010427,0.332251,0.498293,0.159030
2,2,says president donald trump has signed more la...,vice president mike pence says that when it co...,real,0.011365,0.204091,0.781942,0.002602
3,3,us representatives promise implement of un gu...,a conservative website falsely claimed that u ...,fake,0.007493,0.347418,0.565760,0.079329
4,4,the federal government borrows billion every ...,hundreds of rhode islanders got phone calls la...,real,0.017309,0.313422,0.630314,0.038955
...,...,...,...,...,...,...,...,...
17482,17482,historically senate ratification of arms cont...,as the house and senate move into a brief lame...,real,0.006171,0.045163,0.934267,0.014399
17483,17483,since the affordable care act passed percent ...,policymakers and pundits are spending a lot of...,real,0.005834,0.292535,0.688702,0.012930
17484,17484,medicare spends billion a year on subsidies to...,in the final presidential debate oct moderator...,real,0.006582,0.557617,0.433883,0.001918
17485,17485,the obama administration is allowing state wai...,former president bill clinton used his elder s...,real,0.008514,0.020729,0.967239,0.003517


In [12]:
# Remove the unamed first column... this is duplicate of the index
df = df.drop(df.columns[0], axis=1)
df

Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,covid started because we eat animals,vegan instagram users are pinning the coronavi...,fake,0.010846,0.088485,0.899144,0.001525
1,says michelle obama has people on her staff na...,glenn beck rekindled a falsehood about the siz...,fake,0.010427,0.332251,0.498293,0.159030
2,says president donald trump has signed more la...,vice president mike pence says that when it co...,real,0.011365,0.204091,0.781942,0.002602
3,us representatives promise implement of un gu...,a conservative website falsely claimed that u ...,fake,0.007493,0.347418,0.565760,0.079329
4,the federal government borrows billion every ...,hundreds of rhode islanders got phone calls la...,real,0.017309,0.313422,0.630314,0.038955
...,...,...,...,...,...,...,...
17482,historically senate ratification of arms cont...,as the house and senate move into a brief lame...,real,0.006171,0.045163,0.934267,0.014399
17483,since the affordable care act passed percent ...,policymakers and pundits are spending a lot of...,real,0.005834,0.292535,0.688702,0.012930
17484,medicare spends billion a year on subsidies to...,in the final presidential debate oct moderator...,real,0.006582,0.557617,0.433883,0.001918
17485,the obama administration is allowing state wai...,former president bill clinton used his elder s...,real,0.008514,0.020729,0.967239,0.003517


# Note the our overall dataframe has a 17487 datapoints and now 7 features:
## statement: title of the article
## text: content of the article
## label: 'fake' or 'real'
## positive, negative, neutral, and mixed scores (4 MORE FEATURES)

In [13]:
# Parse the target labels from the dataframe
labels = df.label
labels

0        fake
1        fake
2        real
3        fake
4        real
         ... 
17482    real
17483    real
17484    real
17485    real
17486    real
Name: label, Length: 17487, dtype: object

In [14]:
x_df = df.drop('label', axis=1)
x_df

Unnamed: 0,statement,text,positive,negative,neutral,mixed
0,covid started because we eat animals,vegan instagram users are pinning the coronavi...,0.010846,0.088485,0.899144,0.001525
1,says michelle obama has people on her staff na...,glenn beck rekindled a falsehood about the siz...,0.010427,0.332251,0.498293,0.159030
2,says president donald trump has signed more la...,vice president mike pence says that when it co...,0.011365,0.204091,0.781942,0.002602
3,us representatives promise implement of un gu...,a conservative website falsely claimed that u ...,0.007493,0.347418,0.565760,0.079329
4,the federal government borrows billion every ...,hundreds of rhode islanders got phone calls la...,0.017309,0.313422,0.630314,0.038955
...,...,...,...,...,...,...
17482,historically senate ratification of arms cont...,as the house and senate move into a brief lame...,0.006171,0.045163,0.934267,0.014399
17483,since the affordable care act passed percent ...,policymakers and pundits are spending a lot of...,0.005834,0.292535,0.688702,0.012930
17484,medicare spends billion a year on subsidies to...,in the final presidential debate oct moderator...,0.006582,0.557617,0.433883,0.001918
17485,the obama administration is allowing state wai...,former president bill clinton used his elder s...,0.008514,0.020729,0.967239,0.003517


In [16]:
# Global train and test sets to use (with the added 4 columns)
x_train, x_test, y_train, y_test = train_test_split(x_df, labels, train_size=0.8, test_size=0.2, random_state=SEED)
tfidf_train_sparse, tfidf_test_sparse = convert_to_tfidf_train_and_test_sets_with_sentiment_features(x_train, x_test)
type(tfidf_train_sparse)
print("worked")

worked


# Feature Engineering (Adding 4 sentiment columns)

In [17]:
# Convert the TF IDF sparse matrices into dataframes if we want to add Sentiment feature columns
# tfidf_train_df = pd.DataFrame.sparse.from_spmatrix(tfidf_train)
# tfidf_test_df = pd.DataFrame.sparse.from_spmatrix(tfidf_test)

In [18]:
# Add 'tfidf_' suffix to each of the converted tfidf vectors in the training set
# tfidf_train_df.columns = ['tfidf_' + str(col) for col in tfidf_train_df.columns]
# tfidf_train_df

In [19]:
# Add 'tfidf_' suffix to each of the converted tfidf vectors in the test set
# tfidf_test_df.columns = ['tfidf_' + str(col) for col in tfidf_test_df.columns]
# tfidf_test_df

In [20]:
# Add the sentiment feature column to the 2 converted dataframes
# tfidf_train_df['positive'] = x_train['positive'].tolist()
# tfidf_train_df['negative'] = x_train['negative'].tolist()
# tfidf_train_df['neutral'] = x_train['neutral'].tolist()
# tfidf_train_df['mixed'] = x_train['mixed'].tolist()

# tfidf_test_df['positive'] = x_test['positive'].tolist()
# tfidf_test_df['negative'] = x_test['negative'].tolist()
# tfidf_test_df['neutral'] = x_test['neutral'].tolist()
# tfidf_test_df['mixed'] = x_test['mixed'].tolist()

# tfidf_train_df

# Classifier Training
## PassiveAggressiveClassifier

In [21]:
# Use a PassiveAggressiveClassifier and get the average AUC score across 5 iterations of a 80-20 split
pac_clf = PassiveAggressiveClassifier(class_weight='balanced', C=2, max_iter=1e4)
aucScore, stdDev, scoresArray = get_cv_score(pac_clf, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')

AUC score across 5 iterations: 0.6644367416932025
Standard Deviation across 5 iterations: 0.004906826893578784


In [22]:
print(scoresArray)

[0.6726355486280376, 0.6615907192514724, 0.6670737389342555, 0.6586822469793522, 0.6622014546728949]


In [23]:
# Visualize the confusion matrix for the PassiveAgressiveClassifier
conf_matrix = get_confusion_matrix(pac_clf, x_df, labels)
conf_matrix

array([[1115,  569],
       [ 568, 1246]])

In [24]:
# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

Number of true positives (FAKE article correctly classified): 1115
Number of true negatives (REAL article correctly classified): 1246
Number of false positives (REAL article wrongly classified as FAKE): 568
Number of false negatives (FAKE article wrongly classified as REAL): 569


## SGDClassifier

In [25]:
# Use a SGDClassifier and fit it on the transformed TF-IDF vectors
sgd_clf = SGDClassifier(penalty='l1', alpha=0.0001, max_iter=1e4, n_jobs=-1)
aucScore, stdDev, scoresArray = get_cv_score(sgd_clf, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')
print(scoresArray)

AUC score across 5 iterations: 0.7047333132766321
Standard Deviation across 5 iterations: 0.005755272949536468
[0.7048819604991997, 0.6937075099589546, 0.7066117364329679, 0.7091681506227709, 0.7092972088692677]


In [26]:
# Visualize the confusion matrix for the SGDClassifier
conf_matrix = get_confusion_matrix(sgd_clf, x_df, labels)
print(conf_matrix)

# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

[[1017  667]
 [ 319 1495]]
Number of true positives (FAKE article correctly classified): 1017
Number of true negatives (REAL article correctly classified): 1495
Number of false positives (REAL article wrongly classified as FAKE): 319
Number of false negatives (FAKE article wrongly classified as REAL): 667


## LogisticRegressor

In [28]:
# Use a LogisticRegressor and fit it on the transformed TF-IDF vectors
logistic_classifier = LogisticRegression(penalty='elasticnet', l1_ratio=1, C=1, solver='saga', max_iter=1e4, n_jobs=-1)
aucScore, stdDev, scoresArray = get_cv_score(logistic_classifier, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')
print(scoresArray)

AUC score across 5 iterations: 0.7138757318577511
Standard Deviation across 5 iterations: 0.004123961140671685
[0.7142530703084954, 0.7083498294268513, 0.710424483189877, 0.7198290875019009, 0.7165221888616308]


In [29]:
# Visualize the confusion matrix for the SGDClassifier
conf_matrix = get_confusion_matrix(logistic_classifier, x_df, labels)
print(conf_matrix)

# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

[[1125  559]
 [ 406 1408]]
Number of true positives (FAKE article correctly classified): 1125
Number of true negatives (REAL article correctly classified): 1408
Number of false positives (REAL article wrongly classified as FAKE): 406
Number of false negatives (FAKE article wrongly classified as REAL): 559


## XGBClassifier

In [30]:
# Use a XGBClassifier and fit it on the transformed TF-IDF vectors
xgb_clf = XGBClassifier(eta=0.01, n_estimators=350, max_depth = 9, n_jobs=-1, verbosity=1)
aucScore, stdDev, scoresArray = get_cv_score(xgb_clf, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')
print(scoresArray)

AUC score across 5 iterations: 0.7119332778245612
Standard Deviation across 5 iterations: 0.0060951437609422865
[0.7067515403847563, 0.7062777784134654, 0.7081365408666225, 0.7208975358970208, 0.717602993560941]


In [31]:
# Visualize the confusion matrix for the XGBClassifier
conf_matrix = get_confusion_matrix(xgb_clf, x_df, labels)
print(conf_matrix)

# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

[[1060  624]
 [ 359 1455]]
Number of true positives (FAKE article correctly classified): 1060
Number of true negatives (REAL article correctly classified): 1455
Number of false positives (REAL article wrongly classified as FAKE): 359
Number of false negatives (FAKE article wrongly classified as REAL): 624


## RandomForestClassifier

In [32]:
# Use a RandomForestClassifier and fit it on the transformed TF-IDF vectors
random_forest_clf = RandomForestClassifier(class_weight='balanced', 
                               criterion = 'gini',
                               n_estimators= 50, 
                               min_samples_split=2, 
                               min_samples_leaf=1,
                               min_weight_fraction_leaf=0.0,
                               max_features=None,
                               max_depth=None, 
                               bootstrap=True,
                               random_state=SEED,
                               n_jobs=-1, 
                               verbose=1)
aucScore, stdDev, scoresArray = get_cv_score(random_forest_clf, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')
print(scoresArray)
alert_when_done()

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.6min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.9min
[Parall

AUC score across 5 iterations: 0.6937681240564266
Standard Deviation across 5 iterations: 0.0020516539293107946
[0.6919626044794074, 0.691054186193628, 0.6938487947227739, 0.6954568867827342, 0.6965181481035898]


In [33]:
# Visualize the confusion matrix for the RandomForestClassifier
conf_matrix = get_confusion_matrix(random_forest_clf, x_df, labels)
print(conf_matrix)

# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 12.5min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.1s finished


[[1079  605]
 [ 459 1355]]
Number of true positives (FAKE article correctly classified): 1079
Number of true negatives (REAL article correctly classified): 1355
Number of false positives (REAL article wrongly classified as FAKE): 459
Number of false negatives (FAKE article wrongly classified as REAL): 605


## LinearSVC: Support Vector Machines

In [34]:
# Use a LinearSVC and fit it on the transformed TF-IDF vectors
svc_clf = LinearSVC(class_weight='balanced', penalty='l1', dual=False, C=0.3, max_iter=1e4, verbose=1)
aucScore, stdDev, scoresArray = get_cv_score(svc_clf, x_df, labels, iterations=5, get_details=True)
print(f'AUC score across 5 iterations: {aucScore}')
print(f'Standard Deviation across 5 iterations: {stdDev}')
print(scoresArray)
alert_when_done()

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]AUC score across 5 iterations: 0.7141340728798704
Standard Deviation across 5 iterations: 0.004489969355050637
[0.715412935570596, 0.7100148587438558, 0.7079491895338287, 0.7173787203147458, 0.7199146602363256]


In [35]:
# Visualize the confusion matrix for the LinearSVC
conf_matrix = get_confusion_matrix(svc_clf, x_df, labels)
print(conf_matrix)

# Interpret results of confusion matrix
print(f'Number of true positives (FAKE article correctly classified): {conf_matrix[0][0]}')
print(f'Number of true negatives (REAL article correctly classified): {conf_matrix[1][1]}')
print(f'Number of false positives (REAL article wrongly classified as FAKE): {conf_matrix[1][0]}')
print(f'Number of false negatives (FAKE article wrongly classified as REAL): {conf_matrix[0][1]}')

[LibLinear][[1125  559]
 [ 413 1401]]
Number of true positives (FAKE article correctly classified): 1125
Number of true negatives (REAL article correctly classified): 1401
Number of false positives (REAL article wrongly classified as FAKE): 413
Number of false negatives (FAKE article wrongly classified as REAL): 559


## The top 3 performing model types after 5-fold validation that we will choose to tune further are:
- 1.) LogisticRegressor: 71.5% --> Best AUC score
- 2.) LinearSVC: 71.3% --> Also had the lowest standard deviation of 0.00307
- 3.) XGBClassifier tuned with (eta=0.01, n_estimators=350, max_depth = 9): 71.1% --> Could possibly do better with more tuning

# 5-fold Cross-Validation and GridSearch Tuning

In [36]:
# Run GridSearchCV to find best params for XGBClassifier
# Find best n_estimators and max_depth hyperparms using 2D GridSearchCV
n_estimators = [x for x in range(100, 1000, 5)]
max_depth = [x for x in range(6, 12, 1)]
param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth}
gridsearch = GridSearchCV(XGBClassifier(
                                  eta=0.01, 
                                  n_estimators=350, 
                                  max_depth=10, 
                                  n_jobs=-1, 
                                  verbosity=1), param_grid, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)
gridsearch.fit(tfidf_train, y_train)

NameError: name 'tfidf_train' is not defined

In [None]:
gridsearch.cv_results_

In [None]:
# Run GridSearchCV to find best params for LogisticRegression
# Find best n_estimators and max_depth hyperparms using 2D GridSearchCV
l1_ratios = [x for x in range(0, 1, 0.1]
C_list = [x for x in range(0, 10, 0.1)]
param_grid = {'l1_ratio': l1_ratios,
              'C': C_list }
gridsearch2 = GridSearchCV(LogisticRegression(
                                            penalty='elasticnet', 
                                            l1_ratio=1, 
                                            C=1, 
                                            solver='saga', 
                                            max_iter=1e4, 
                                            n_jobs=-1), param_grid, scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)
gridsearch2.fit(tfidf_train, y_train)

In [None]:
gridsearch2.cv_results_