In [458]:
# https://github.com/feeblefruits/disaster_responseline_text_classifier/blob/master/models/train_classifier.py

In [459]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy
from spacy import displacy

import string
import re
import bs4 as BeautifulSoup
import fasttext

import re
import itertools
from collections import Counter

from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.base import BaseEstimator, TransformerMixin

In [460]:
# NLTK resources needed further down: tokenizer models for word_tokenize,
# the English stop-word list, and WordNet for WordNetLemmatizer.
# Newer NLTK releases (>= 3.8.2) load the tokenizer models from 'punkt_tab'
# instead of 'punkt'; both are requested so the notebook runs on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jacques/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jacques/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jacques/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [461]:
# English stop-word list from NLTK; read as a module-level global by
# tokenize() and StopWordCounter below.
stop_words = stopwords.words("english")
# spaCy English pipeline; WordPronounCounter uses its part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

In [462]:
# Pattern for http:// and https:// URLs, used by tokenize().
# Raw string: the regex escapes \( and \) are otherwise parsed as (invalid)
# Python string escapes, which recent Python versions warn about. The
# resulting pattern text is unchanged.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

In [463]:
# create class for custom transformer
class CaseCounter(BaseEstimator, TransformerMixin):
    '''
    Stateless transformer: for every text, the fraction of its characters
    that are upper case.
    '''

    def fit(self, X, y=None):
        '''
        Nothing is learned from the data; returns self so the transformer
        can be used inside a Pipeline / FeatureUnion.
        '''
        return self

    def transform(self, X):
        '''
        INPUT: collection of strings (pandas Series, list or array)
        OUTPUT: single-column DataFrame (default integer index) holding
                upper-case characters / total characters for each text;
                0.0 for an empty string
        '''
        # Coerce once so plain lists / arrays work too; `.str` is only
        # available on a pandas Series.
        texts = pd.Series(X)

        n_upper = texts.apply(
            lambda text: sum(1 for char in text if char.isupper())
        ).to_numpy(dtype=float)
        n_char = texts.str.len().to_numpy(dtype=float)

        # An empty string would give 0/0 = NaN; report 0.0 instead so the
        # feature column stays finite.
        ratio = np.divide(n_upper, n_char, out=np.zeros_like(n_upper), where=n_char > 0)

        return pd.DataFrame(ratio)

In [464]:
# create class for custom transformer
class StopWordCounter(BaseEstimator, TransformerMixin):
    '''
    Stateless transformer: for every text, the number of whitespace-separated
    tokens that appear in the module-level `stop_words` list.
    '''

    def fit(self, X, y=None):
        '''
        Nothing is learned from the data; returns self so the transformer
        can be used inside a Pipeline / FeatureUnion.
        '''
        return self

    def transform(self, X):
        '''
        INPUT: iterable of strings
        OUTPUT: single-column DataFrame with the stop-word count of each text
        '''
        # Build the lookup set once per call: membership tests against the
        # original list scan it for every token of every text.
        known_stop_words = set(stop_words)

        # Matching is exact (case-sensitive), the same as `token in stop_words`.
        counts = [
            sum(1 for token in text.split() if token in known_stop_words)
            for text in X
        ]

        return pd.DataFrame(np.array(counts))

In [465]:
# create class for custom transformer
class WordPronounCounter(BaseEstimator, TransformerMixin):
    '''
    Stateless transformer: for every text, the number of tokens the spaCy
    pipeline `nlp` tags as a pronoun (pos_ == 'PRON').
    '''

    def fit(self, X, y=None):
        '''
        Nothing is learned from the data; returns self so the transformer
        can be used inside a Pipeline / FeatureUnion.
        '''
        return self

    def transform(self, X):
        '''
        INPUT: iterable of strings
        OUTPUT: single-column DataFrame with one pronoun count per input text
        '''
        counts = []

        for text in X:
            n_pronouns = sum(1 for token in nlp(text) if token.pos_ == 'PRON')
            # Append once per text, after all of its tokens were inspected.
            # Appending inside the token loop yields one row per token (and
            # no row at all for an empty text), so the output no longer
            # lines up with the input samples.
            counts.append(n_pronouns)

        return pd.DataFrame(np.array(counts))

In [466]:
def tokenize(text):

    '''
    INPUT: String to tokenise, detect and replace URLs
    OUTPUT: List of tokenised string items (URLs replaced by "urlplaceholder",
            non-letters and stop words removed, remaining tokens lemmatized,
            lower-cased and stripped)
    '''

    # Replace URLs first. This has to happen before punctuation is stripped:
    # once ':' and '/' are gone, url_regex can no longer match anything.
    text = re.sub(url_regex, "urlplaceholder", text)

    # Remove punctuations and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)

    # Single character removal
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)

    # Removing multiple spaces
    text = re.sub(r'\s+', ' ', text)

    # Drop stop words (exact match against the module-level list) and
    # join the remaining words back into a string
    text = " ".join(word for word in text.split() if word not in stop_words)

    # Setup tokens and lemmatize
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    return [lemmatizer.lemmatize(tok).lower().strip() for tok in tokens]

In [467]:
def model_pipeline(random_state=None):

    '''
    Build the (unfitted) text-classification pipeline.

    INPUT: random_state - optional seed forwarded to RandomForestClassifier;
           the default None keeps the forest unseeded
    OUTPUT: sklearn Pipeline with two steps:
            'features' - FeatureUnion holding the 'nlp_pipe' sub-pipeline
                         (CountVectorizer using tokenize -> TfidfTransformer)
            'clf'      - RandomForestClassifier
    '''

    # bag-of-words counts re-weighted by tf-idf
    nlp_pipe = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
    ])

    # The union currently contains only the text features; the custom
    # CaseCounter / StopWordCounter / WordPronounCounter transformers are
    # not part of it.
    features = FeatureUnion([
        ('nlp_pipe', nlp_pipe),
    ])

    # round off with the classifier
    return Pipeline([
        ('features', features),
        ('clf', RandomForestClassifier(random_state=random_state)),
    ])

In [468]:
# Remote CSV (corona_fake.csv) read straight into a DataFrame.
data_url = 'https://raw.githubusercontent.com/susanli2016/NLP-with-Python/master/data/corona_fake.csv'
df = pd.read_csv(data_url)

In [469]:
def load_data(df):

    '''
    INPUT: DataFrame with at least 'title', 'text' and 'label' columns
           (the frame passed in is not modified)
    OUTPUT: X title column (pd.Series), Y label column (pd.Series) with
            'TRUE' encoded as 1 and 'fake' / 'Fake' encoded as 0;
            any other label value is passed through unchanged
    '''

    # Keep only rows where title, text and label are all present, then
    # renumber the rows from 0. Built as one chain that returns a new frame,
    # so nothing is mutated in place on a derived copy. A 'source' column
    # may be present but is not needed for the returned values.
    complete = df.dropna(subset=['title', 'text', 'label']).reset_index(drop=True)

    label_codes = {'TRUE': 1, 'fake': 0, 'Fake': 0}

    X = complete['title']
    # Compute the encoded labels as a new Series. Calling
    # `frame['label'].replace(..., inplace=True)` acts on a temporary object
    # and leaves the frame untouched when pandas Copy-on-Write is active.
    Y = complete['label'].map(lambda value: label_codes.get(value, value))

    return X, Y

In [470]:
# X holds the article titles, Y the encoded labels returned by load_data.
X, Y = load_data(df)

In [471]:
# Hold out 25% of the rows for evaluation.
# The labels returned by load_data are bound to `Y`; no lowercase `y` is
# defined in this notebook, so passing `y` fails on a fresh kernel.
# A fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [472]:
# Build a fresh, unfitted pipeline and train it on the training split.
model = model_pipeline()
# The trailing semicolon suppresses the notebook's display of the fitted estimator.
model.fit(X_train, y_train);



In [473]:
# Predict labels for the held-out titles.
y_pred = model.predict(X_test)

In [474]:
def display_results(y_test, y_pred):
    '''
    Print the label set, the confusion matrix and the accuracy of a set of
    predictions.

    INPUT: y_test true labels, y_pred predicted labels (same length and order)
    OUTPUT: None (results are printed)
    '''
    # Compare by position on plain arrays so a pandas index on either input
    # cannot affect the element-wise comparison.
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)

    # Take the classes from both arrays: a class the model never predicts
    # would otherwise vanish from the confusion matrix.
    labels = np.unique(np.concatenate([y_true, y_hat]))
    confusion_mat = confusion_matrix(y_true, y_hat, labels=labels)
    accuracy = (y_hat == y_true).mean()

    print("Labels:", labels)
    print("Confusion Matrix:\n", confusion_mat)
    print("Accuracy:", accuracy)

In [475]:
# Report confusion matrix and accuracy on the held-out split.
display_results(y_test, y_pred)

Labels: [0 1]
Confusion Matrix:
 [[ 93  44]
 [ 14 117]]
Accuracy: 0.7835820895522388
