In [955]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
from spacy import displacy

import string
import re
import bs4 as BeautifulSoup
import fasttext

import re
import itertools
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

In [956]:
stop_words = stopwords.words("english")
nlp = spacy.load('en_core_web_sm')

In [957]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

## Setup custom transformers for pipeine

Features extracted via custom transformers are based on trends discovered during data analysis

In [958]:
# create class for custom transformer
class CharCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    # transform method also takes a 2d array X
    def transform(self, X):
        
        n_char = X.str.len()
        
        X_values = n_char
        
        return pd.DataFrame(np.array(X_values))

In [959]:
# create class for custom transformer
class CaseCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    # transform method also takes a 2d array X
    def transform(self, X):
        
        n_title_case = pd.Series(X).apply(lambda x: sum(1 for c in x if c.isupper())).values
        n_char = X.str.len()
        
        X_values = n_title_case / n_char
        
        return pd.DataFrame(np.array(X_values))

In [960]:
# create class for custom transformer
class StopWordCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    
    # transform method also takes a 2d array X
    def transform(self, X):
        
        '''
        INPUT: string
        OUPUT: number of stopwords found (int)
        '''

        X_values = []

        for text in X:
            stop_words_found = []

            for i in text.split():
                if i in stop_words:
                    stop_words_found.append(1)

            X_values.append(sum(stop_words_found))

        return pd.DataFrame(np.array(X_values))

In [961]:
# create class for custom transformer
class WordPronCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    
    # transform method also takes a 2d array X
    def transform(self, X):
        
        X_values = []

        for text in X:
            pronouns = []

            for token in nlp(text):
                if token.pos_ == 'PRON':
                    pronouns.append(token)

            X_values.append(len(pronouns))
                
        return pd.DataFrame(np.array(X_values))

In [962]:
# create class for custom transformer
class WordNounCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    
    # transform method also takes a 2d array X
    def transform(self, X):
        
        X_values = []

        for text in X:
            pronouns = []

            for token in nlp(text):
                if token.pos_ == 'NOUN':
                    pronouns.append(token)

            X_values.append(len(pronouns))
                
        return pd.DataFrame(np.array(X_values))

In [963]:
# create class for custom transformer
class WordAdjCounter(BaseEstimator, TransformerMixin):
  
    # takes in a 2d array X for the feature data and a 1d array y for the target labels
    def fit(self, X, y=None):
        return self
    
    # transform method also takes a 2d array X
    def transform(self, X):
        
        X_values = []

        for text in X:
            pronouns = []

            for token in nlp(text):
                if token.pos_ == 'ADJ':
                    pronouns.append(token)

            X_values.append(len(pronouns))
                
        return pd.DataFrame(np.array(X_values))

## Load data and pipeline

In [964]:
def tokenize(text):

    '''
    INPUT: String to tokenise, detect and replace URLs
    OUTPUT: List of tokenised string items
    '''

    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)

    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)

    text = [w for w in text.split() if not w in stop_words]

    # Join list to string
    text = " ".join(text)

    # Replace URLs if any
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    # Setup tokens and lemmatize
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # Create tokens and lemmatize
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

In [965]:
def model_pipeline():
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('char_counter', CharCounter()),
            ('case_counter', CaseCounter()),
            ('stop_counter', StopWordCounter()),
            ('pro_counter', WordPronCounter()),
            ('noun_counter', WordNounCounter()),
            ('adj_counter', WordAdjCounter())
        ])),
        ('clf', RandomForestClassifier())
    ])

    return pipeline

In [966]:
df = pd.read_csv('https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/corona_fake.csv')

In [967]:
def load_data(df):

    '''
    INPUT: string directory of db
    OUTPUT: x message pd column, y categorical column labels, categorical names
    '''
    
    df = df.dropna(subset=['title'])
    df = df.dropna(subset=['text'])
    df = df.dropna(subset=['label'])
    
    df.drop(axis=1, labels='source', inplace=True)
    
    df['label'].replace('TRUE', 1, inplace=True)
    df['label'].replace('fake', 0, inplace=True)
    df['label'].replace('Fake', 0, inplace=True)
    
    df.reset_index(drop=True, inplace=True)

    # read in file
    X = df['title']
    Y = df['label']
    
    return X, Y

In [968]:
X, Y = load_data(df)

In [969]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

## Fit model and check accuracy

In [970]:
model = model_pipeline()
model.fit(X_train, y_train);

In [971]:
y_pred = model.predict(X_test)

In [972]:
def display_results(y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Accuracy:", accuracy)

In [973]:
display_results(y_test, y_pred)

Accuracy: 0.8246268656716418


In [974]:
report = classification_report(y_true=y_test,
                               y_pred=y_pred,
                               target_names=['fake','true'],
                               output_dict=True)

In [975]:
pd.DataFrame(report).transpose().style\
.highlight_max(color='lightgreen', subset=['precision'])\
.highlight_min(color='lightgreen', subset=['precision'] )\
.highlight_max(color='lightgreen', subset=['recall'])\
.highlight_min(color='lightgreen', subset=['recall'] )\
.highlight_max(color='lightgreen', subset=['f1-score'])\
.highlight_min(color='lightgreen', subset=['f1-score'] )

Unnamed: 0,precision,recall,f1-score,support
fake,0.830508,0.784,0.806584,125.0
true,0.82,0.86014,0.83959,143.0
accuracy,0.824627,0.824627,0.824627,0.824627
macro avg,0.825254,0.82207,0.823087,268.0
weighted avg,0.824901,0.824627,0.824196,268.0


## Hyperparameter tuning

In [976]:
pipeline = model_pipeline()

In [977]:
parameters = {
    'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
#     'features__text_pipeline__vect__max_df': (0.5, 1.0),
#     'clf__min_samples_split': [2, 3, 4],
    'features__transformer_weights': (
        {'text_pipeline': 1, 'char_counter': 0.5},
        {'text_pipeline': 0.5, 'char_counter': 1},

    )
}

cv = GridSearchCV(pipeline, param_grid=parameters)

In [978]:
cv.fit(X_train, y_train);

In [979]:
y_pred = model.predict(X_test)

In [980]:
def display_cv_results(cv, y_test, y_pred):
    labels = np.unique(y_pred)
    confusion_mat = confusion_matrix(y_test, y_pred, labels=labels)
    accuracy = (y_pred == y_test).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)
    print("\nBest Parameters:", cv.best_params_)

In [981]:
display_cv_results(cv, y_test, y_pred)

Labels: [0 1]
Confusion Matrix:
 [[ 98  27]
 [ 20 123]]
Accuracy: 0.8246268656716418

Best Parameters: {'features__text_pipeline__vect__ngram_range': (1, 1), 'features__transformer_weights': {'text_pipeline': 1, 'char_counter': 0.5}}


In [984]:
report = classification_report(y_true=y_test,
                               y_pred=y_pred,
                               target_names=['fake','true'],
                               output_dict=True)

In [985]:
pd.DataFrame(report).transpose().style\
.highlight_max(color='lightgreen', subset=['precision'])\
.highlight_min(color='lightgreen', subset=['precision'] )\
.highlight_max(color='lightgreen', subset=['recall'])\
.highlight_min(color='lightgreen', subset=['recall'] )\
.highlight_max(color='lightgreen', subset=['f1-score'])\
.highlight_min(color='lightgreen', subset=['f1-score'] )

Unnamed: 0,precision,recall,f1-score,support
fake,0.830508,0.784,0.806584,125.0
true,0.82,0.86014,0.83959,143.0
accuracy,0.824627,0.824627,0.824627,0.824627
macro avg,0.825254,0.82207,0.823087,268.0
weighted avg,0.824901,0.824627,0.824196,268.0


<h2>Multiple classifer test</h2>

In [982]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_validate

In [983]:
clfs = []
clfs.append(RandomForestClassifier())
clfs.append(AdaBoostClassifier())
clfs.append(GradientBoostingClassifier())
clfs.append(SVC())
clfs.append(LogisticRegression())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())

classifier_name = []
mean_value = []
std_value = []

for classifier in clfs:
    pipeline.set_params(clf = classifier)
    scores = cross_validate(pipeline, X_train, y_train)
    print('---------------------------------')
    print(str(classifier))
    print('-----------------------------------')
    
    for key, values in scores.items():
        
        classifier_name.append(classifier)
        mean_value.append(values.mean())
        std_value.append(values.std())
        
        print(key,' mean ', values.mean())
        print(key,' std ', values.std())

---------------------------------
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
-----------------------------------
fit_time  mean  15.30619223912557
fit_time  std  1.5044411739878019
score_time  mean  6.76354996363322
score_time  std  0.39624837654681416
test_score  mean  0.7952821990131009
test_score  std  0.02125308090319933
---------------------------------
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)
-----------------------------------
fit_time  mean  1