In [136]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
from spacy import displacy

import string
import re
import bs4 as BeautifulSoup
import fasttext

import re
import itertools
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings("ignore")

In [114]:
df = pd.read_csv('data/fakenews_corpus_2018.csv')

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130444 entries, 0 to 130443
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  130444 non-null  int64 
 1   title       130443 non-null  object
 2   label       130444 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.0+ MB


In [116]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,label
0,0,Surprise: Socialist Hotbed Of Venezuela Has Lo...,0
1,1,Water Cooler 1/25/18 Open Thread; Fake News ? ...,0
2,2,Veteran Commentator Calls Out the Growing “Eth...,0
3,3,"Lost Words, Hidden Words, Otters, Banks and Books",0
4,4,Red Alert: Bond Yields Are SCREAMING “Inflatio...,0


In [117]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [118]:
df.dropna(inplace=True)

In [119]:
df.reset_index(drop=True, inplace=True)

In [120]:
df.label.value_counts()

0    123849
1      6594
Name: label, dtype: int64

In [121]:
# normalising distribution of labels

df = pd.concat([df[df['label'] == 0].head(6594), df[df['label'] == 1]])

In [122]:
df.label.value_counts()

1    6594
0    6594
Name: label, dtype: int64

<h2>Define pipeline settings</h2>

In [94]:
# For the sake of simplicity and just being neat a seperate library is imported, holding the custom transformers

from custom_transformers import CharCounter
from custom_transformers import CaseCounter
from custom_transformers import StopWordCounter
from custom_transformers import WordPronCounter
from custom_transformers import WordNounCounter
from custom_transformers import WordAdjCounter

In [95]:
stop_words = stopwords.words("english")

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [96]:
def tokenize(text):

    '''
    INPUT: String to tokenise, detect and replace URLs
    OUTPUT: List of tokenised string items
    '''

    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)

    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)

    text = [w for w in text.split() if not w in stop_words]

    # Join list to string
    text = " ".join(text)

    # Replace URLs if any
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Setup tokens and lemmatize
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # Create tokens and lemmatize
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [123]:
def model_pipeline():
    
    '''
    INPUT: None
    OUTPUT: pipeline object used to .fit X_train and y_train datasets
    '''
    
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('char_counter', CharCounter()),
            ('case_counter', CaseCounter()),
            ('stop_counter', StopWordCounter()),
            ('pro_counter', WordPronCounter()),
            ('noun_counter', WordNounCounter()),
            ('adj_counter', WordAdjCounter())
        ])),
        ('clf', LogisticRegression())
    ])

    return pipeline

In [124]:
X = df['title']
Y = df['label']

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [138]:
model = model_pipeline()
model.fit(X_train, y_train);

In [127]:
y_pred = model.predict(X_test)

<h2>Accuracy results</h2>

In [128]:
def display_results(y_test, y_pred):
    
    '''
    INPUT: y_test, y_pred dfs
    OUTPUT: print average accuracy score
    '''
    
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Accuracy:", accuracy)

In [129]:
display_results(y_test, y_pred)

Accuracy: 0.8507734303912647


In [130]:
# For more granular performance evaluation, a classification report is initialised and displayed as a pd table

report = classification_report(y_true=y_test,
                               y_pred=y_pred,
                               target_names=['fake','true'],
                               output_dict=True)

In [131]:
pd.DataFrame(report).transpose().style\
.highlight_max(color='lightgreen', subset=['precision'])\
.highlight_min(color='lightgreen', subset=['precision'] )\
.highlight_max(color='lightgreen', subset=['recall'])\
.highlight_min(color='lightgreen', subset=['recall'] )\
.highlight_max(color='lightgreen', subset=['f1-score'])\
.highlight_min(color='lightgreen', subset=['f1-score'] )

Unnamed: 0,precision,recall,f1-score,support
fake,0.823428,0.884687,0.852959,1613.0
true,0.881074,0.81829,0.848522,1684.0
accuracy,0.850773,0.850773,0.850773,0.850773
macro avg,0.852251,0.851488,0.85074,3297.0
weighted avg,0.852872,0.850773,0.850693,3297.0


## Multiple classifier evaluation

In [137]:
clfs = []
clfs.append(RandomForestClassifier())
clfs.append(AdaBoostClassifier())
clfs.append(GradientBoostingClassifier())
clfs.append(SVC())
clfs.append(LogisticRegression())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())

classifier_name = []
mean_value = []
std_value = []

for classifier in clfs:
    model.set_params(clf = classifier)
    scores = cross_validate(model, X_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    
    for key, values in scores.items():
        
        classifier_name.append(classifier)
        mean_value.append(values.mean())
        std_value.append(values.std())
        
        print(key,' mean ', values.mean())
        print(key,' std ', values.std())

---------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  153.26657629013062
fit_time  std  2.336130197826069
score_time  mean  73.92878603935242
score_time  std  4.408559949784302
test_score  mean  0.8328773216297352
test_score  std  0.006350048555057398
---------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
-----------------------------------
fit_time  mean  1