In [55]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
from spacy import displacy

import string
import re
import bs4 as BeautifulSoup
import fasttext

import re
import itertools
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [56]:
df = pd.read_csv('data/fakenews_corpus_2018.csv')

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130444 entries, 0 to 130443
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  130444 non-null  int64 
 1   title       130443 non-null  object
 2   label       130444 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 3.0+ MB


In [58]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,label
0,0,Surprise: Socialist Hotbed Of Venezuela Has Lo...,0
1,1,Water Cooler 1/25/18 Open Thread; Fake News ? ...,0
2,2,Veteran Commentator Calls Out the Growing “Eth...,0
3,3,"Lost Words, Hidden Words, Otters, Banks and Books",0
4,4,Red Alert: Bond Yields Are SCREAMING “Inflatio...,0


In [59]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [92]:
df.dropna(inplace=True)

In [93]:
df.reset_index(drop=True, inplace=True)

In [106]:
df.label.value_counts()

0    123849
1      6594
Name: label, dtype: int64

<h2>Define pipeline settings</h2>

In [94]:
# For the sake of simplicity and just being neat a seperate library is imported, holding the custom transformers

from custom_transformers import CharCounter
from custom_transformers import CaseCounter
from custom_transformers import StopWordCounter
from custom_transformers import WordPronCounter
from custom_transformers import WordNounCounter
from custom_transformers import WordAdjCounter

In [95]:
stop_words = stopwords.words("english")

url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [96]:
def tokenize(text):

    '''
    INPUT: String to tokenise, detect and replace URLs
    OUTPUT: List of tokenised string items
    '''

    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)

    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)

    text = [w for w in text.split() if not w in stop_words]

    # Join list to string
    text = " ".join(text)

    # Replace URLs if any
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Setup tokens and lemmatize
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # Create tokens and lemmatize
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [97]:
def model_pipeline():
    
    '''
    INPUT: None
    OUTPUT: pipeline object used to .fit X_train and y_train datasets
    '''
    
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('char_counter', CharCounter()),
            ('case_counter', CaseCounter()),
            ('stop_counter', StopWordCounter()),
            ('pro_counter', WordPronCounter()),
            ('noun_counter', WordNounCounter()),
            ('adj_counter', WordAdjCounter())
        ])),
        ('clf', RandomForestClassifier())
    ])

    return pipeline

In [98]:
X = df['title']
Y = df['label']

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

In [100]:
model = model_pipeline()
model.fit(X_train, y_train);



In [101]:
y_pred = model.predict(X_test)

<h2>Accuracy results</h2>

In [102]:
def display_results(y_test, y_pred):
    
    '''
    INPUT: y_test, y_pred dfs
    OUTPUT: print average accuracy score
    '''
    
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Accuracy:", accuracy)

In [103]:
display_results(y_test, y_pred)

Accuracy: 0.9741804912452854


In [104]:
# For more granular performance evaluation, a classification report is initialised and displayed as a pd table

report = classification_report(y_true=y_test,
                               y_pred=y_pred,
                               target_names=['fake','true'],
                               output_dict=True)

In [105]:
pd.DataFrame(report).transpose().style\
.highlight_max(color='lightgreen', subset=['precision'])\
.highlight_min(color='lightgreen', subset=['precision'] )\
.highlight_max(color='lightgreen', subset=['recall'])\
.highlight_min(color='lightgreen', subset=['recall'] )\
.highlight_max(color='lightgreen', subset=['f1-score'])\
.highlight_min(color='lightgreen', subset=['f1-score'] )

Unnamed: 0,precision,recall,f1-score,support
fake,0.97511,0.998323,0.98658,31002.0
true,0.940299,0.509012,0.660484,1609.0
accuracy,0.97418,0.97418,0.97418,0.97418
macro avg,0.957704,0.753667,0.823532,32611.0
weighted avg,0.973393,0.97418,0.970491,32611.0
