In [102]:
import re
import time
import string
import json
import math
from collections import defaultdict
from collections import Counter
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
from scipy.sparse import lil_matrix, csr_matrix
from scipy.io import mmwrite
from scipy.io import mmread

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
word_net_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/guillaume/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load Data
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

train_set = pd.read_csv(train_data_path)

In [3]:
train_set.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


## TfIdf vectors creation
1. Create inversed index when tokenize / lemmatize ,etc. documents
2. Vectorize documents in sparse matrix

In [91]:
# Create translator to remove punctuation and digits
remove_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)

def tokenize(text):
    clean_text = text.replace('\n', ' ')
    # Remove punctuation
    clean_text = clean_text.translate(remove_punctuation)
    # Remove digits
    clean_text = clean_text.translate(remove_digits)
    # To lowercase
    clean_text = clean_text.lower()

    split_text = clean_text.split()

    treated_text = []
    for word in split_text:
        lemmatized_word = word_net_lemmatizer.lemmatize(word)
        if not lemmatized_word in stop_words and len(lemmatized_word) >= 3:
            treated_text.append(lemmatized_word)
    return treated_text
    

In [92]:
tfidf_vectorizer = TfidfVectorizer(max_features=None, strip_accents='unicode',\
               analyzer='word', ngram_range=(1,1), use_idf=1,\
               smooth_idf=1, sublinear_tf=1, stop_words='english', tokenizer=tokenize)

In [93]:
train_comments = train_set.iloc[:,1]
tfidf_vectorizer.fit(train_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize at 0x7fb487439d08>, use_idf=1,
        vocabulary=None)

In [95]:
X = tfidf_vectorizer.transform(train_comments)

In [96]:
Y = train_set[train_set.columns[2:]]
Y.shape

(159571, 6)

In [97]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)
X_train.shape, X_val.shape, len(Y_train), len(Y_val)

((127656, 165352), (31915, 165352), 127656, 31915)

In [98]:
random_forest = RandomForestClassifier(n_estimators=5)
random_forest.fit(X_train, Y_train)
print(random_forest.score(X_train, Y_train))
print(random_forest.score(X_val, Y_val))

0.985625430845
0.906814977283


In [106]:
class ModelEvaluator:
    """ This model implements some functions to 
    evaluate classifier model
    """

    def __init__(self, model_to_evaluate, X, Y):
        self.model_to_evaluate = model_to_evaluate
        self.X = X
        self.Y = Y
    
    def evaluate_against_dummy(self, nb_iterations, dummy_strategy):
        """ Implement a dummy classifier to check if 
        current classifier model is working
        """
        dummy_accuracies = []
        model_accuracies = []
        for i in range(0, nb_iterations):
            dummy = DummyClassifier(strategy=dummy_strategy)
            X_train, X_val, Y_train, Y_val = train_test_split(self.X, self.Y, test_size=0.2)
            dummy.fit(X_train, Y_train)
            dummy_accuracies.append(dummy.score(X_val, Y_val))
            model_accuracies.append(random_forest.score(X_val, Y_val))
        return sum(dummy_accuracies)/len(dummy_accuracies), sum(model_accuracies)/len(model_accuracies)
    
    def create_confusion_matrix(self):
        """ Build confusion matrix to 
        evaluate classifier perfomance on each 
        class
        """
        Y_pred = self.model_to_evaluate.predict(self.X)
        return confusion_matrix(self.Y, Y_pred)
    
    def plot_confusion_matrix(self, cm, classes,
                             title="Confusion matrix"):
        """
        This function prints and plots the confusion matrix.
        Seen on scikit-learn website
        """
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        print(cm)

        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        #plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
    
    def build_classification_report(self, classes):
        """ Build a classification report to evaluate model 
        on each class"""
        Y_pred = self.model_to_evaluate.predict(self.X)
        return classification_report(self.Y, Y_pred, target_names=classes)

In [107]:
rf_model_evaluator = ModelEvaluator(random_forest, X_val, Y_val)
rf_model_evaluator.evaluate_against_dummy(10, 'most_frequent')

(0.89935766880777057, 0.90784897383675389)

In [109]:

sgd_cr = rf_model_evaluator.build_classification_report(['toxic', 'severe_toxic', 'obscene', 'threat'
                                                'insult', 'identity_hate'])
print(sgd_cr)

               precision    recall  f1-score   support

        toxic       0.81      0.61      0.69      3077
 severe_toxic       0.38      0.14      0.20       325
      obscene       0.81      0.60      0.69      1702
 threatinsult       0.35      0.07      0.12       108
identity_hate       0.69      0.49      0.57      1582

  avg / total       0.75      0.53      0.62      7070



  .format(len(labels), len(target_names))


In [115]:
X_val[0,:].todense().shape

(1, 165352)