## I. Import Libraries and Data

In [None]:
import re
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from tpot import TPOTClassifier
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import coo_matrix, hstack

from sklearn.metrics import classification_report
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

## II. Define Functions

In [None]:
def cleanArticle(string):
    """Normalise raw article text before it is vectorised.

    The text is lowercased, every literal ``<br />`` tag is turned into a
    single space, and then every run of characters other than ASCII
    letters, digits and spaces is deleted (not replaced), so words joined
    only by punctuation are merged.

    Args:
        string: Raw article text.

    Returns:
        The cleaned, lowercase text.
    """
    lowered = string.lower()
    # Lowercasing first means "<BR />" is matched as well.
    without_breaks = " ".join(lowered.split("<br />"))
    disallowed = re.compile("[^A-Za-z0-9 ]+")
    return disallowed.sub("", without_breaks)

def read_perez_dataset(dataset_name):
    """Load one dataset folder from disk and split it into train/test sets.

    Every file under ``<PEREZ_DATASET_PATH>/<dataset_name>/fake`` and
    ``<PEREZ_DATASET_PATH>/<dataset_name>/legit`` becomes one row with the
    columns ``dataset_name``, ``news_type``, ``is_fake`` (1 for 'fake',
    0 for 'legit'), ``file_name``, ``news_headline``, ``news_content``,
    ``news_all`` and ``news_all_clean``.  When ``dataset_name`` is
    ``'fakeNewsDataset'`` a ``news_category`` column is added as well,
    taken from the file name up to the first '.', with digits removed
    (a file named ``abc12.x.txt`` yields ``abc``).

    The function reads the module-level globals ``PEREZ_DATASET_PATH`` and
    ``RANDOM_SEED``; both must be defined before it is called.

    Args:
        dataset_name: Name of the sub-folder of ``PEREZ_DATASET_PATH`` to read.

    Returns:
        A tuple ``(df, X_train, y_train, X_test, y_test)``.  ``df`` is the
        full frame; the ``X_*`` frames are ``df`` without the ``is_fake``,
        ``news_type`` and ``file_name`` columns, the ``y_*`` series are the
        matching ``is_fake`` values.  The test share is 20%.
    """

    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)

    print("Reading dataset")
    result_data_list = []
    data_dir = PEREZ_DATASET_PATH
    for news_type in ['fake', 'legit']:
        folder = os.path.join(data_dir, dataset_name, news_type)
        # os.listdir gives no ordering guarantee.  Sorting fixes the row
        # order, so the seeded split below selects the same articles on
        # every machine and filesystem.
        for fname in sorted(os.listdir(folder)):
            result_data = {
                'dataset_name': dataset_name,
                'news_type': news_type,
                'is_fake': 1 if news_type == 'fake' else 0,
            }
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname

            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')

            # Some articles don't have a headline, but only article body.
            if len(file_data) > 1:
                result_data['news_headline'] = file_data[0]
                # The second line is skipped -- presumably a blank separator
                # between headline and body; TODO confirm against the data.
                news_content_data = ' '.join(file_data[2:])
            else:
                result_data['news_headline'] = ''
                news_content_data = file_data[0]
            result_data['news_content'] = news_content_data
            result_data['news_all'] = ' '.join(file_data)
            result_data_list.append(result_data)

    df = pd.DataFrame(result_data_list)

    df['news_all_clean'] = df['news_all'].apply(cleanArticle)

    # The label and the columns that reveal it (news_type, file_name) are
    # removed from the feature frame.
    feature_frame = df.drop(['is_fake', 'news_type', 'file_name'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        feature_frame,
        df['is_fake'],
        test_size=.2,
        random_state=RANDOM_SEED,
    )

    print("Finished reading dataset")
    return df, X_train, y_train, X_test, y_test

def model_report(title, y_test, predictions, predictions_proba):
    """Print and plot an evaluation summary for a binary classifier.

    Output: the title, a classification report, a confusion-matrix heatmap
    titled with the accuracy rounded to two decimals, and a ROC curve
    labelled with its AUC.

    Args:
        title: Heading printed above the report.
        y_test: True labels.
        predictions: Predicted labels, compared against ``y_test``.
        predictions_proba: Scores passed to ``metrics.roc_curve`` to build
            the ROC curve.

    Returns:
        None.  Everything is printed or shown through matplotlib.
    """
    print(title)
    print("---------")
    print(classification_report(y_test, predictions))

    # Confusion matrix heatmap.
    accuracy = round(metrics.accuracy_score(y_test, predictions), 2)
    conf_mat = metrics.confusion_matrix(y_test, predictions)
    plt.figure(figsize=(3, 3))
    sns.heatmap(
        conf_mat,
        annot=True,
        fmt=".0f",
        linewidths=.5,
        square=True,
        cmap='Blues_r',
    )
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy: {0}'.format(accuracy), size=15)
    plt.show()

    # ROC curve; the thresholds returned by roc_curve are not needed.
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_test, predictions_proba)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b',
             label='AUC = %0.2f' % area_under_curve)
    plt.legend(loc='lower right')
    # Dashed diagonal: reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## III. Read in Data, Define Test/Train Set

In [None]:
# Seed used for NumPy's global RNG here and, through the module-level name,
# for the train/test split inside read_perez_dataset.
RANDOM_SEED = 42
# Dataset root, relative to the working directory; read_perez_dataset reads
# <root>/<dataset_name>/fake and <root>/<dataset_name>/legit beneath it.
PEREZ_DATASET_PATH = "../data/fakeNewsDatasets_Perez-Rosas2018"
np.random.seed(RANDOM_SEED)
# Full frame plus an 80/20 split of features and is_fake labels.
perez_full, train_data, train_labels, test_data, test_labels = read_perez_dataset('fakeNewsDataset')
# Preview the first rows of the training features.
train_data.head()

## IV. Quick Look at Data

In [None]:
# Show the cleaned text (output of cleanArticle) of the second training row.
train_data['news_all_clean'].iloc[1]

In [None]:
# Shape of the complete dataset frame (rows, columns).
print("full perez size: ", perez_full.shape)
# Shape of the training label series, i.e. the number of training articles.
print("train size: ",train_labels.shape)

In [None]:
# Count plot of articles per news_category, with one bar per is_fake value
# (1 = fake, 0 = legit) so class balance within each category is visible.
sns.catplot(x="news_category", kind = "count", hue="is_fake", data=perez_full)
plt.show()

In [None]:
# Number of articles for each (news_category, is_fake) combination.
perez_full.groupby(['news_category','is_fake']).size()

## V. Classification Models with Automated Machine Learning (TPOT)

### A. Unigram

#### 1. Define Hstacked Unigram (news_all_clean & news_category)

In [None]:
# Word counts for the cleaned article text.  The vocabulary is learned from
# the training split only and then applied to both splits.
vectorizer = CountVectorizer().fit(train_data['news_all_clean'])
train_news_all_clean_vec = vectorizer.transform(train_data['news_all_clean'])
test_news_all_clean_vec = vectorizer.transform(test_data['news_all_clean'])

# Token counts for the news category, again fitted on the training split.
# Note: `vectorizer` is rebound here, so the text vectorizer above is no
# longer reachable by name after this point.
vectorizer = CountVectorizer().fit(train_data['news_category'])
train_news_category_vec = vectorizer.transform(train_data['news_category'])
test_news_category_vec = vectorizer.transform(test_data['news_category'])

# Place the text columns and the category columns side by side.
train_vec = hstack((train_news_all_clean_vec, train_news_category_vec))
test_vec = hstack((test_news_all_clean_vec, test_news_category_vec))

#### 2. Run TPOT Optimizer

In [None]:
# Configure the TPOT pipeline search: generations=5, population_size=20,
# 5-fold cross-validation (cv=5) and a fixed random_state.
# "TPOT sparse" selects TPOT's built-in configuration for sparse input;
# train_vec is built with scipy.sparse.hstack.
pipeline_optimizer = TPOTClassifier(generations=5, population_size=20, cv=5,
                                    random_state=42, verbosity=2,
                                   config_dict = "TPOT sparse")
# The search itself is disabled because of its run time; un-comment to re-run.
#pipeline_optimizer.fit(train_vec, train_labels) #This takes a couple hours to run

#### 3. Output TPOT Algorithm

In [None]:
# Exports the pipeline found by the TPOT search as a Python script.  Only
# usable after pipeline_optimizer.fit(...) has been run, so it is disabled
# together with the fit call.
#pipeline_optimizer.export('tpot_unigram.py')

#### 4. Implement TPOT Algorithm

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, VarianceThreshold, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Pipeline taken from the TPOT export, fed with the in-memory unigram
# features instead of a data file.  The training data is split once more so
# the pipeline is scored on a TPOT-style hold-out (testing_*) as well as on
# the original test set (test_vec / test_labels).
features = train_vec
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, train_labels.values, random_state=42)

# Score on the training set was:0.6506291953660375
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.45),
    SelectPercentile(score_func=f_classif, percentile=32),
    # The dual formulation is implemented only by the liblinear solver.  It
    # was the default when this pipeline was exported; newer scikit-learn
    # defaults to lbfgs, which rejects dual=True, so the solver is pinned.
    LogisticRegression(C=0.1, dual=True, penalty="l2", solver="liblinear")
)

exported_pipeline.fit(training_features, training_target)

# Hard predictions and class probabilities for both evaluation sets.
TPOT_results = exported_pipeline.predict(testing_features)
ORIG_results = exported_pipeline.predict(test_vec)
TPOT_proba = exported_pipeline.predict_proba(testing_features)
ORIG_proba = exported_pipeline.predict_proba(test_vec)
print("accuracy on TPOT test set: ",np.mean(TPOT_results == testing_target))
print("accuracy on original test set: ",np.mean(ORIG_results == test_labels))

#### 5. Output Unigram Model Report

In [None]:
# Report on the original held-out test set.  Column 1 of predict_proba is
# passed as the ROC score; with 0/1 labels this should be the probability of
# is_fake == 1 (column order follows the classifier's classes_).
model_report("Original Test Set Report", test_labels, ORIG_results, ORIG_proba[:,1])