In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [2]:
import re
import time
import string
import json
import math
from collections import defaultdict
from collections import Counter
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
from scipy.sparse import lil_matrix, csr_matrix
from scipy.io import mmwrite
from scipy.io import mmread

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
word_net_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/guillaume/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load Data
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

train_set = pd.read_csv(train_data_path)

In [4]:
train_set.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


## TfIdf vectors creation
1. Create inversed index when tokenize / lemmatize ,etc. documents
2. Vectorize documents in sparse matrix

In [6]:
# Create translator to remove punctuation and digits
remove_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)

def custom_tokenize(text):
    clean_text = text.replace('\n', ' ')
    # Remove punctuation
    clean_text = clean_text.translate(remove_punctuation)
    # Remove digits
    clean_text = clean_text.translate(remove_digits)
    # To lowercase
    clean_text = clean_text.lower()

    split_text = clean_text.split()

    treated_text = []
    for word in split_text:
        lemmatized_word = word_net_lemmatizer.lemmatize(word)
        if not lemmatized_word in stop_words and len(lemmatized_word) >= 3:
            treated_text.append(lemmatized_word)
    return split_text
    

In [7]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=0.0001, max_df=0.99,
    max_features=None, strip_accents='unicode',\
    analyzer='word', ngram_range=(1,3), use_idf=1,\
    smooth_idf=1, sublinear_tf=1, tokenizer=custom_tokenize,
    stop_words='english')

In [8]:
train_comments = train_set.iloc[:,1]
tfidf_vectorizer.fit(train_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.99, max_features=None, min_df=0.0001,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function custom_tokenize at 0x7f470f377d90>, use_idf=1,
        vocabulary=None)

In [9]:
print("Longueur du vocabulaire :,", len(tfidf_vectorizer.vocabulary_))
print("Longueur des stop words :,", len(tfidf_vectorizer.stop_words_))

Longueur du vocabulaire :, 41298
Longueur des stop words :, 6663809


In [10]:
X = tfidf_vectorizer.transform(train_comments)

In [11]:
Y = train_set[train_set.columns[2:]]
Y.shape

(159571, 6)

In [12]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)
X_train.shape, X_val.shape, len(Y_train), len(Y_val)

((127656, 41298), (31915, 41298), 127656, 31915)

In [18]:
toxic_Y_train = Y_train['toxic']
toxic_Y_val = Y_val['toxic']

severe_toxic_Y_train = Y_train['severe_toxic']
severe_toxic_Y_val = Y_val['severe_toxic']

obscene_Y_train = Y_train['obscene']
obscene_Y_val = Y_val['obscene']

threat_Y_train = Y_train['threat']
threat_Y_val = Y_val['threat']

insult_Y_train = Y_train['insult']
insult_Y_val = Y_val['insult']

identity_hate_Y_train = Y_train['identity_hate']
identity_hate_Y_val = Y_val['identity_hate']

In [31]:
toxic_svm = LogisticRegression()
toxic_svm.fit(X_train, toxic_Y_train)
print("toxic done")
severe_toxic_svm = LogisticRegression()
severe_toxic_svm.fit(X_train, severe_toxic_Y_train)
print("severe_toxic done")
obscene_svm = LogisticRegression()
obscene_svm.fit(X_train, obscene_Y_train)
print("obscene done")
threat_svm = LogisticRegression()
threat_svm.fit(X_train, threat_Y_train)
print("threat done")
insult_svm = LogisticRegression()
insult_svm.fit(X_train, insult_Y_train)
print("insult done")
identity_hate_svm = LogisticRegression()
identity_hate_svm.fit(X_train, identity_hate_Y_train)
print("identity hate done")

toxic done
severe_toxic done
obscene done
threat done
insult done
identity hate done


In [32]:
print(roc_auc_score(toxic_svm.predict(X_val), toxic_Y_val))
print(roc_auc_score(severe_toxic_svm.predict(X_val), severe_toxic_Y_val))
print(roc_auc_score(obscene_svm.predict(X_val), obscene_Y_val))
print(roc_auc_score(threat_svm.predict(X_val), threat_Y_val))
print(roc_auc_score(insult_svm.predict(X_val), insult_Y_val))
print(roc_auc_score(identity_hate_svm.predict(X_val), identity_hate_Y_val))

0.945316471635
0.770466078348
0.943637659146
0.790412866084
0.894758479105
0.876450513153


In [34]:
toxic_svm.predict(X_val)

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
toxic_svm.predict_proba(X_val)[:, 1]

array([ 0.01356621,  0.14758645,  0.0401466 , ...,  0.01612235,
        0.01023697,  0.01620785])

In [33]:
print(roc_auc_score(toxic_svm.predict_proba(X_val)[:, 1], toxic_Y_val))
print(roc_auc_score(severe_toxic_svm.predict_proba(X_val), severe_toxic_Y_val))
print(roc_auc_score(obscene_svm.predict_proba(X_val), obscene_Y_val))
print(roc_auc_score(threat_svm.predict_proba(X_val), threat_Y_val))
print(roc_auc_score(insult_svm.predict_proba(X_val), insult_Y_val))
print(roc_auc_score(identity_hate_svm.predict_proba(X_val), identity_hate_Y_val))

ValueError: continuous format is not supported

In [82]:
random_forest = RandomForestClassifier(n_estimators=5)
random_forest.fit(X_train, Y_train)
print(roc_auc_score(random_forest.predict(X_train), Y_train))
print(roc_auc_score(random_forest.predict(X_val), Y_val))

0.937962971622
0.634883825785


In [28]:
class ModelEvaluator:
    """ This model implements some functions to 
    evaluate classifier model
    """

    def __init__(self, model_to_evaluate, X, Y):
        self.model_to_evaluate = model_to_evaluate
        self.X = X
        self.Y = Y
    
    def evaluate_against_dummy(self, nb_iterations, dummy_strategy):
        """ Implement a dummy classifier to check if 
        current classifier model is working
        """
        dummy_roc_auc = []
        model_roc_auc = []
        for i in range(0, nb_iterations):
            dummy = DummyClassifier(strategy=dummy_strategy)
            dummy.fit(X, Y)
            dummy_roc_auc.append(roc_auc_score(dummy.predict(X), Y))
            model_roc_auc.append(self.model_to_evaluate(dummy.predict(X), Y))
        return sum(dummy_roc_auc)/len(dummy_roc_auc), sum(model_roc_auc)/len(model_roc_auc)
    
    def create_confusion_matrix(self):
        """ Build confusion matrix to 
        evaluate classifier perfomance on each 
        class
        """
        Y_pred = self.model_to_evaluate.predict(self.X)
        return confusion_matrix(self.Y, Y_pred)
    
    def plot_confusion_matrix(self, cm, classes,
                             title="Confusion matrix"):
        """
        This function prints and plots the confusion matrix.
        Seen on scikit-learn website
        """
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        print(cm)

        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        #plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
    
    def build_classification_report(self, classes):
        """ Build a classification report to evaluate model 
        on each class"""
        Y_pred = self.model_to_evaluate.predict(self.X)
        return classification_report(self.Y, Y_pred, target_names=classes)

### Basic model

In [26]:
severe_toxic_svm_model_evaluator = ModelEvaluator(severe_toxic_svm, X_val, severe_toxic_Y_val)
severe_toxic_svm_model_evaluator.evaluate_against_dummy(10, 'most_frequent')

severe_toxic_svm_cr = severe_toxic_svm_model_evaluator.build_classification_report(['severe_toxic'])
print(severe_toxic_svm_cr)

              precision    recall  f1-score   support

severe_toxic       0.99      1.00      1.00     31610

 avg / total       0.99      0.99      0.99     31915



  .format(len(labels), len(target_names))


In [28]:
final_df = pd.DataFrame()

In [29]:
final_df['toxic'] = toxic_svm.predict(X_val)

In [33]:
final_df.head(50)

Unnamed: 0,toxic
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
