## Baseline Model

This workbook implements model that makes predictions based on word counts, using CountVectorizer

In [1]:
# General libraries.
import json
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import datetime

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture
from sklearn.model_selection import train_test_split

# SK-learn libraries for evaluation.
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *



### Load Data

In [2]:
datapath = 'data/fakeNewsDatasets_Perez-Rosas2018'

# Function to load news articles
def tabulate_data(dataset_name):
    """Create a Pandas dataframe out of input Perez-Rosas dataset files
    @param dataset_name: Name of the dataset (fakenews or celebrity)
    @returns Pandas dataframe with columns:
        dataset_name, news_type, news_category, news_headline, news_content
    """
    def remove_numbers(in_str):
        return re.sub(r'[0-9]+', '', in_str)

    result_data_list = []
    data_dir = datapath
    for news_type in ['fake', 'legit']:
        folder = '%s/%s/%s' % (data_dir, dataset_name, news_type)
        for fname in os.listdir(folder):
            result_data = {}
            result_data['dataset_name'] = dataset_name
            result_data['news_type'] = news_type
            if news_type == 'fake':
                result_data['is_fake'] = 1
            else:
                result_data['is_fake'] = 0
            if dataset_name == 'fakeNewsDataset':
                result_data['news_category'] = remove_numbers(fname.split('.')[0])
            result_data['file_name'] = fname
            filepath = os.path.join(folder, fname)
            with open(filepath, 'r', encoding="utf8") as f:
                file_data = f.read().split('\n')
                # Some articles don't have a headline, but only article body.
                if len(file_data) > 1:
                    news_content_data = ' '.join(file_data[2:])
                    result_data['news_headline'] = file_data[0]
                else:
                    news_content_data = file_data[0]
                    result_data['news_headline'] = ''
                result_data['news_content'] = news_content_data
                result_data['news_all'] = ' '.join(file_data[0:])
                result_data_list.append(result_data)
    df = pd.DataFrame(result_data_list)
    return df

In [3]:
# Load fake news data
fakenews_df = tabulate_data('fakeNewsDataset')
fakenews_df.head()

Unnamed: 0,dataset_name,file_name,is_fake,news_all,news_category,news_content,news_headline,news_type
0,fakeNewsDataset,polit19.fake.txt,1,FBI investigates computer link between Trump a...,polit,(CNN)Federal investigators and computer scie...,FBI investigates computer link between Trump a...,fake
1,fakeNewsDataset,tech028.fake.txt,1,Google steals user location information with a...,tech,Alphabet Inc's Google announced on Wednesday t...,Google steals user location information with a...,fake
2,fakeNewsDataset,polit34.fake.txt,1,"Biden: Trump was wiretapped, but not by US ...",polit,Joe Biden said President Donald Trump was in...,"Biden: Trump was wiretapped, but not by US",fake
3,fakeNewsDataset,edu27.fake.txt,1,"Harvard Law, Moving to Limit Applicant Pool, W...",edu,"Harvard Law School, moving to close its door...","Harvard Law, Moving to Limit Applicant Pool, W...",fake
4,fakeNewsDataset,edu14.fake.txt,1,Microsoft Aims to spread liberalism on the suc...,edu,"With the launch of ""Minecraft"" edition crea...",Microsoft Aims to spread liberalism on the suc...,fake


In [12]:
# Split data into training and test sets
newsVectors, newsVectors_test, classVector, classVector_test = train_test_split(fakenews_df['news_all'],
                                                                                fakenews_df['is_fake'], 
                                                            test_size = .2, random_state = 1)

### Functions to Preprocess Text

In [20]:
# *** Run this download once! ***

# nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /home/miketp333/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downlo

True

In [21]:
# Removing stopwords and punctuations

from string import punctuation
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize

stopwords_en = set(stopwords.words('english'))

# Stopwords from stopwords-json
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)

# Combine the stopwords
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct)

In [22]:
# Removing stemming and lemmatization

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# tagging part of speech so that lemmatization can be done
def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 
    
def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

In [23]:
# preprocessor

def preprocess_text(text):
    # Input: str, i.e. document/sentence
    # Output: list(str) , i.e. list of lemmas
    
    # Convert all text to lowercase
    text = text.lower()
    
    # Replace sequences of numbers with a single token
    text = re.sub(r'\d+','numseq',text)
    
    # Remove non letter characters
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    
    return [word for word in lemmatize_sent(text) 
            if word not in stoplist_combined
            and not word.isdigit()]


### Vectorize Data

In [24]:
vectorizer = CountVectorizer(analyzer=preprocess_text)
train_data = vectorizer.fit_transform(newsVectors)
test_data = vectorizer.transform(newsVectors_test)

print("Size of the vocabulary:", train_data.shape[1])

Size of the vocabulary: 5273


##### Multinomial Naive Bayes Model

In [29]:
# Multinomial modeling
alphas = {'alpha': [0.01, 0.05, 0.1, 0.5, 0.8, 1, 1.5, 5, 10]}
multi_gs = GridSearchCV(estimator=MultinomialNB(), param_grid=alphas)
multi_gs.fit(train_data, classVector)

multi_clf_best = MultinomialNB(alpha=multi_gs.best_params_['alpha'])
multi_clf_best.fit(train_data, classVector)
multi_clf_best_predicted = multi_clf_best.predict(test_data)
fpr, tpr, _ = metrics.roc_curve(classVector_test, multi_clf_best_predicted)

print('----- Multinomial Naive Bayes -----')
print (multi_gs.best_params_)
print("Multinomial accuracy: {:2.5f}%".format(accuracy_score(multi_clf_best_predicted, classVector_test) * 100))
print("Multinomial accuracy (AUC): {:2.5f}%".format(metrics.auc(fpr,tpr) * 100))

print('Number of predictions:', len(multi_clf_best_predicted))
print('Number predicted as Fake News:',sum(multi_clf_best_predicted))
print('-----------------------------------')
print('--- Confusion Matrix ---')
print(confusion_matrix(classVector_test, multi_clf_best_predicted))

----- Multinomial Naive Bayes -----
{'alpha': 10}
Multinomial accuracy: 23.95833%
Multinomial accuracy (AUC): 22.67857%
Number of predictions: 96
Number predicted as Fake News: 45
-----------------------------------
--- Confusion Matrix ---
[[17 39]
 [34  6]]


##### Logistic Regression Model

In [31]:
# Logisitc modeling

log_clf = LogisticRegression()
C = {"C": [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.5, 0.8, 1, 1.5, 5, 10]}
log_gs = GridSearchCV(log_clf,C)
log_gs.fit(train_data, classVector)

log_clf_best = LogisticRegression(C=log_gs.best_params_['C'])
log_clf_best.fit(train_data, classVector)
log_clf_best_predicted = log_clf_best.predict(test_data)
fpr, tpr, _ = metrics.roc_curve(classVector_test, log_clf_best_predicted)

print('----- Logistic Regression -----')
print (log_gs.best_params_)
print ("Logistic accuracy: {:2.5f}%".format(accuracy_score(log_clf_best_predicted, classVector_test) * 100))
print ("Logistic accuracy (AUC): {:2.5f}%".format(metrics.auc(fpr,tpr) * 100))

# print confusion matrix to identify mistakes
print('--- Confusion Matrix ---')
print (confusion_matrix(classVector_test, log_clf_best_predicted))
print('-----------------------------------')

----- Logistic Regression -----
{'C': 0.001}
Logistic accuracy: 44.79167%
Logistic accuracy (AUC): 44.46429%
--- Confusion Matrix ---
[[26 30]
 [23 17]]
-----------------------------------
