# KPMG Data Kontest - Detection of fake news

## 1. Importing libraries

In [5]:
import numpy as np
import pandas as pd
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import argparse
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix

import six
from abc import ABCMeta
from scipy import sparse
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils import check_X_y, check_array
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import normalize, binarize, LabelBinarizer

import xgboost as xg
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,AdaBoostClassifier)
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

## 2. Importing Dataset

In [6]:
# dtype=object turns off pandas' per-column type inference for this file.
train = pd.read_csv(filepath_or_buffer="Train.csv", dtype=object)

In [7]:
# Keep only the first row for each distinct article body.
train = train.loc[~train['content'].duplicated()]

In [8]:
# Discard the 'Unnamed: 0' column.
train = train.drop(columns=['Unnamed: 0'])

In [13]:
# Preview the first 20 rows of the frame.
train.head(n=20)

Unnamed: 0,content,title,fake
0,VIDEO: The World Reacts to the Paris Attacks\r...,VIDEO: The World Reacts to the Paris Attacks,1
1,"The Philippine finance secretary, Jose Camacho...",Philippines Denies It Is Boycotting UBS,0
2,"Now 18M Users Strong, Edmodo Makes Its First A...","Now 18M Users Strong, Edmodo Makes Its First A...",1
3,Moral Hazard: The Federal Reserve And Financia...,Moral Hazard: The Federal Reserve And Financia...,1
4,Two Jewish Teens Beaten And Robbed Leaving Syn...,Anti-Semitism – Liberty News,1
5,A prosecutor is suing the judge overseeing the...,National Briefing: South: Georgia: Prosecutor ...,0
6,An expert at Sotheby's said that Mr. Shearman'...,An Old Master Sold at Auction Raises Doubts; S...,0
7,"In Piermont, housing prices have risen 8 to 10...","If You're Thinking of Living In/Piermont, N.Y....",0
8,FRANKEL-Hon. Marvin E. Yeshiva University and ...,"Paid Notice: Deaths FRANKEL, HON. MARVIN E.",0
9,-San Antonio Police Try to Silence Street Prea...,-San Antonio Police Try to Silence Street Prea...,1


In [12]:
# Show one full article as an example (index label 3).
train['content'][3]



## 3. Preprocessing

In [5]:
def strip_html(text):
    """Return the text of ``text`` as extracted by BeautifulSoup.

    The input is parsed with the ``html.parser`` backend and the result of
    ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every ``[...]`` group, brackets included, from ``text``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with each ``[`` ... ``]`` span replaced by the empty string.
    """
    # Raw string: '\[' and '\]' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace."""
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

def stem_words(text):
    """Return the Lancaster stem of ``text``.

    ``text`` is handed to the stemmer as a single token; it is not split
    into words here.
    """
    # ``stem`` is an instance method. Calling it on the class itself binds
    # ``text`` to ``self`` and raises a TypeError, so build a stemmer first.
    return LancasterStemmer().stem(text)

def lemmatize_verbs(text):
    """Lemmatize ``text`` as a verb (``pos='v'``) with a WordNetLemmatizer."""
    return WordNetLemmatizer().lemmatize(text, pos='v')

def denoise_text(text):
    """Clean one raw article.

    Steps, in order: strip HTML, remove ``[...]`` groups, then run
    ``contractions.fix`` on the result.
    """
    without_markup = strip_html(text)
    without_brackets = remove_between_square_brackets(without_markup)
    return contractions.fix(without_brackets)

In [6]:
# Apply denoise_text to every entry of the 'content' column, in place.
# The column position is looked up by name rather than hard-coded as 0, so the
# write lands on 'content' regardless of which other columns are present.
content_col = train.columns.get_loc('content')
for idx, item in enumerate(train['content']):
    train.iloc[idx, content_col] = denoise_text(item)
    # Progress marker every 5000 rows
    if idx % 5000 == 0:
        print('Here is the ',idx,'th item')

Here is the  0 th item
Here is the  5000 th item
Here is the  10000 th item
Here is the  15000 th item
Here is the  20000 th item
Here is the  25000 th item
Here is the  30000 th item
Here is the  35000 th item
Here is the  40000 th item
Here is the  45000 th item
Here is the  50000 th item
Here is the  55000 th item
Here is the  60000 th item
Here is the  65000 th item


In [9]:
# Number of columns in the frame. It is used as the column bound `:nbc-1` when
# the text fields of a row are joined, so it has to count columns, not rows:
# with the row count the slice covers every column, including the 'fake' label.
nbc = len(train.columns)

In [10]:
# One string per row: the cells selected by `:nbc-1`, stringified and joined by spaces.
trainheadlines = [
    ' '.join(str(cell) for cell in train.iloc[row, :nbc-1])
    for row in range(len(train.index))
]

In [11]:
# Training features: CountVectorizer performed better than TF-IDF in my use case.
# To try if time allows: GloVe, BERT, Gensim.
advancedvectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='ascii',
    decode_error='ignore',
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
)
advancedtrain = advancedvectorizer.fit_transform(trainheadlines)
print(advancedtrain.shape)

(70000, 7513439)


In [48]:
# Shape of the matrix returned by advancedvectorizer.fit_transform
advancedtrain.shape

(70000, 7513439)

In [12]:
# Targets: values of the 'fake' column, one label per training example
label_column = train['fake']
targets = label_column.values

In [14]:
# Split features and labels into train and test parts; random_state=0 fixes the shuffle.
(X_train, X_test,
 y_train, y_test) = train_test_split(advancedtrain, targets, random_state=0)

## 4. Building Models

I did not perform cross-validation in this notebook because of the short time left before the submission deadline.

### A. Logistic regression: logit with C=0.5 and lasso (L1) penalty

In [38]:
# L1-penalised logistic regression. The solver is named explicitly because the
# default solver of recent scikit-learn releases (lbfgs) does not support
# penalty='l1'; liblinear does.
advancedmodel = LogisticRegression(penalty='l1', C=0.5, solver='liblinear')
advancedmodel = advancedmodel.fit(X_train, y_train)

In [33]:
# Mean accuracy of the fitted model on the training part and on the held-out part
train_message = 'Accuracy of Logit classifier on training set: {:.6f}'
test_message = 'Accuracy of Logit classifier on test set: {:.6f}'
print(train_message.format(advancedmodel.score(X_train, y_train)))
print(test_message.format(advancedmodel.score(X_test, y_test)))

Accuracy of Logit classifier on training set: 0.996629
Accuracy of Logit classifier on test set: 0.968914


### B. NBSVM

In [49]:
class NBSVM(six.with_metaclass(ABCMeta, BaseEstimator, ClassifierMixin)):
    """One linear SVM per class, each trained on ratio-scaled features.

    For every class the smoothed per-feature counts are L1-normalised and
    turned into log-odds ``log(r / (1 - r))``. A separate ``LinearSVC`` is
    fitted per class on ``X`` multiplied feature-wise by that class's ratios,
    and ``predict`` returns the class whose SVM gives the largest decision
    value.

    Parameters
    ----------
    alpha : float, default=1.0
        Initial value of every ratio entry before the counts are added
        (additive smoothing).
    C : float, default=1.0
        Forwarded to ``LinearSVC(C=...)``.
    max_iter : int, default=10000
        Forwarded to ``LinearSVC(max_iter=...)``.
    """

    def __init__(self, alpha=1.0, C=1.0, max_iter=10000):
        self.alpha = alpha
        self.max_iter = max_iter
        self.C = C
        # Kept so the attribute exists before fit(); fit() rebuilds it.
        self.svm_ = []

    def fit(self, X, y):
        """Compute the per-class ratios and fit one LinearSVC per class.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)
            Non-negative feature values.
        y : array-like of shape (n_samples,)
            Class labels.

        Returns
        -------
        self
        """
        X, y = check_X_y(X, y, accept_sparse='csr')
        if not issparse(X):
            # ``.multiply`` used below is a sparse-matrix method.
            X = sparse.csr_matrix(X)
        _, n_features = X.shape

        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes_ = labelbin.classes_
        if Y.shape[1] == 1:
            # Two classes give a single indicator column; add its complement
            # so that there is one column per class.
            Y = np.concatenate((1 - Y, Y), axis=1)

        # Use float64 indicators for the count product in _compute_ratios.
        Y = Y.astype(np.float64)

        # Count raw events from data
        n_effective_classes = Y.shape[1]
        # Only the length of class_count_ is used later (in predict).
        self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
        self.ratios_ = np.full((n_effective_classes, n_features), self.alpha,
                               dtype=np.float64)
        self._compute_ratios(X, Y)

        # Start from an empty list on every fit. Appending to the list created
        # in __init__ would leave the models of a previous fit at the front,
        # and predict() indexes from the front.
        self.svm_ = []
        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            svm = LinearSVC(C=self.C, max_iter=self.max_iter)
            Y_i = Y[:, i]
            svm.fit(X_i, Y_i)
            self.svm_.append(svm)

        return self

    def predict(self, X):
        """Return, for each row of ``X``, the class with the largest SVM decision value.

        Parameters
        ----------
        X : array-like or sparse matrix of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Entries taken from ``self.classes_``.
        """
        if not issparse(X):
            # ``.multiply`` used below is a sparse-matrix method.
            X = sparse.csr_matrix(X)

        n_effective_classes = self.class_count_.shape[0]
        n_examples = X.shape[0]

        # D[i, j] is the decision value of class i's SVM for example j.
        D = np.zeros((n_effective_classes, n_examples))

        for i in range(n_effective_classes):
            X_i = X.multiply(self.ratios_[i])
            D[i] = self.svm_[i].decision_function(X_i)

        return self.classes_[np.argmax(D, axis=0)]

    def _compute_ratios(self, X, Y):
        """Count feature occurrences and compute ratios.

        Adds the per-class feature sums ``Y.T @ X`` to ``self.ratios_``,
        L1-normalises each row, replaces each entry ``r`` by
        ``log(r / (1 - r))`` and stores the result as a CSR matrix.

        Raises
        ------
        ValueError
            If ``X`` contains a negative value.
        """
        if np.any((X.data if issparse(X) else X) < 0):
            raise ValueError("Input X must be non-negative")

        self.ratios_ += safe_sparse_dot(Y.T, X)  # smoothing + per-class feature counts
        self.ratios_ = normalize(self.ratios_, norm='l1', axis=1, copy=False)
        # Element-wise log-odds of the normalised counts.
        self.ratios_ = np.log(np.divide(self.ratios_, (1 - self.ratios_)))
        check_array(self.ratios_)
        self.ratios_ = sparse.csr_matrix(self.ratios_)

In [29]:
advancedmodel3 = NBSVM()
advancedmodel3 = advancedmodel3.fit(X_train, y_train)

# Original note (translated): CountVectorizer with "balanced"; Logit with the full pre-processing.
# The messages name the model that is actually scored here (NBSVM, not Logit).
print('Accuracy of NBSVM classifier on training set: {:.6f}'.format(advancedmodel3.score(X_train, y_train)))
print('Accuracy of NBSVM classifier on test set: {:.6f}'.format(advancedmodel3.score(X_test, y_test)))

Accuracy of Logit classifier on training set: 1.000000
Accuracy of Logit classifier on test set: 0.964686


### C. Linear SVC with C = 0.05, squared hinge loss, lasso (L1) penalty and primal formulation

In [57]:
# L1-penalised linear SVM with squared hinge loss, solved in the primal (dual=False).
advancedmodel = LinearSVC(C=0.05, loss="squared_hinge", penalty='l1', dual=False)
advancedmodel = advancedmodel.fit(X_train, y_train)

# The messages name the model that is actually scored here (LinearSVC, not Logit).
print('Accuracy of LinearSVC classifier on training set: {:.6f}'.format(advancedmodel.score(X_train, y_train)))
print('Accuracy of LinearSVC classifier on test set: {:.6f}'.format(advancedmodel.score(X_test, y_test)))

Accuracy of Logit classifier on training set: 0.985867
Accuracy of Logit classifier on test set: 0.967371


### D. LightGBM

In [65]:
# Changing format: float32 feature matrices and numeric labels
# (labels that cannot be parsed become NaN because of errors='coerce').
X_train2, X_test2 = (matrix.astype('float32') for matrix in (X_train, X_test))
y_train2, y_test2 = (pd.to_numeric(labels, errors='coerce') for labels in (y_train, y_test))

In [67]:
# Binary LightGBM classifier. Only `objective` is passed: `application` is an
# alias of the same LightGBM parameter, so supplying both was redundant.
advancedmodel = LGBMClassifier(objective = 'binary',
                               n_jobs = -1,
                               verbose = 1,
                               max_depth = -1,
                               n_estimators = 500,
                               num_leaves = 200)
advancedmodel = advancedmodel.fit(X_train2, y_train2)

In [68]:
# Original note (translated): CountVectorizer with "balanced"
train_message = 'Accuracy of LB classifier on training set: {:.6f}'
test_message = 'Accuracy of LB classifier on test set: {:.6f}'
print(train_message.format(advancedmodel.score(X_train2, y_train2)))
print(test_message.format(advancedmodel.score(X_test2, y_test2)))

Accuracy of LB classifier on training set: 1.000000
Accuracy of LB classifier on test set: 0.979314


### To do: embeddings + LSTM, neural networks