In [1]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [13]:
import re
import time
import string
import json
import math
from collections import defaultdict
from collections import Counter
import itertools

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
word_net_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/guillaume/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load Data
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

train_set = pd.read_csv(train_data_path)
test_set = pd.read_csv(test_data_path)

In [4]:
# Create translator to remove punctuation and digits
remove_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)

def custom_tokenize(text):
    clean_text = text.replace('\n', ' ')
    # Remove punctuation
    clean_text = clean_text.translate(remove_punctuation)
    # Remove digits
    clean_text = clean_text.translate(remove_digits)
    # To lowercase
    clean_text = clean_text.lower()

    split_text = clean_text.split()

    treated_text = []
    for word in split_text:
        lemmatized_word = word_net_lemmatizer.lemmatize(word)
        if not lemmatized_word in stop_words and len(lemmatized_word) >= 3:
            treated_text.append(lemmatized_word)
    return split_text

In [5]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=0.001, max_df=0.95,
    max_features=None, strip_accents='unicode',\
    analyzer='word', ngram_range=(1,2), use_idf=1,\
    smooth_idf=1, sublinear_tf=1, tokenizer=custom_tokenize,
    stop_words='english')

In [6]:
train_comments = train_set.iloc[:,1]
tfidf_vectorizer.fit(train_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.001,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function custom_tokenize at 0x7f25ac022730>, use_idf=1,
        vocabulary=None)

In [7]:
print("Longueur du vocabulaire :,", len(tfidf_vectorizer.vocabulary_))
print("Longueur des stop words :,", len(tfidf_vectorizer.stop_words_))

Longueur du vocabulaire :, 4462
Longueur des stop words :, 2753614


In [8]:
X_train = tfidf_vectorizer.transform(train_comments)
Y_train = train_set[train_set.columns[2:]]
Y_train.shape

(159571, 6)

In [9]:
test_comments = test_set.iloc[:,1]
X_test = tfidf_vectorizer.transform(test_comments)

In [10]:
toxic_Y_train = Y_train['toxic']

severe_toxic_Y_train = Y_train['severe_toxic']

obscene_Y_train = Y_train['obscene']

threat_Y_train = Y_train['threat']

insult_Y_train = Y_train['insult']

identity_hate_Y_train = Y_train['identity_hate']

In [14]:
toxic_svm = LogisticRegression()
toxic_svm.fit(X_train, toxic_Y_train)
print("toxic done")
severe_toxic_svm = LogisticRegression()
severe_toxic_svm.fit(X_train, severe_toxic_Y_train)
print("severe_toxic done")
obscene_svm = LogisticRegression()
obscene_svm.fit(X_train, obscene_Y_train)
print("obscene done")
threat_svm = LogisticRegression()
threat_svm.fit(X_train, threat_Y_train)
print("threat done")
insult_svm = LogisticRegression()
insult_svm.fit(X_train, insult_Y_train)
print("insult done")
identity_hate_svm = LogisticRegression()
identity_hate_svm.fit(X_train, identity_hate_Y_train)
print("identity hate done")

toxic done
severe_toxic done
obscene done
threat done
insult done
identity hate done


In [18]:
toxic_svm.predict(X_test)

array([1, 0, 0, ..., 0, 0, 1])

In [23]:
a = toxic_svm.predict_proba(X_test)
print(a)

[[ 0.00130187  0.99869813]
 [ 0.98889442  0.01110558]
 [ 0.9947901   0.0052099 ]
 ..., 
 [ 0.99345869  0.00654131]
 [ 0.98603307  0.01396693]
 [ 0.01098133  0.98901867]]


In [22]:
a[:, 1]

array([ 0.99869813,  0.01110558,  0.0052099 , ...,  0.00654131,
        0.01396693,  0.98901867])

In [24]:
final_df = pd.DataFrame()
final_df['toxic'] = toxic_svm.predict_proba(X_test)[:, 1]
final_df['severe_toxic'] = severe_toxic_svm.predict_proba(X_test)[:, 1]
final_df['obscene'] = obscene_svm.predict_proba(X_test)[:, 1]
final_df['threat'] = threat_svm.predict_proba(X_test)[:, 1]
final_df['insult'] = insult_svm.predict_proba(X_test)[:, 1]
final_df['identity_hate'] = identity_hate_svm.predict_proba(X_test)[:, 1]

In [25]:
final_df.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0.998698,0.341085,0.998705,0.056465,0.965762,0.497038
1,0.011106,0.002353,0.004587,0.001529,0.00788,0.003003
2,0.00521,0.000942,0.002676,0.000605,0.004825,0.00135
3,0.005389,0.002309,0.003868,0.001074,0.004389,0.00099
4,0.062305,0.002992,0.013439,0.002181,0.026345,0.002888
5,0.012037,0.001407,0.005055,0.000758,0.009451,0.001551
6,0.007365,0.001333,0.007294,0.000667,0.008109,0.001432
7,0.567738,0.003073,0.049525,0.003338,0.122666,0.005065
8,0.022942,0.002102,0.011456,0.002546,0.011198,0.004318
9,0.010744,0.00112,0.005672,0.001143,0.007471,0.001499


In [26]:
submissions = pd.read_csv("data/sample_submission.csv")
final_df.insert(0, 'id', submissions['id'])

In [27]:
final_df.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998698,0.341085,0.998705,0.056465,0.965762,0.497038
1,0000247867823ef7,0.011106,0.002353,0.004587,0.001529,0.00788,0.003003
2,00013b17ad220c46,0.00521,0.000942,0.002676,0.000605,0.004825,0.00135
3,00017563c3f7919a,0.005389,0.002309,0.003868,0.001074,0.004389,0.00099
4,00017695ad8997eb,0.062305,0.002992,0.013439,0.002181,0.026345,0.002888
5,0001ea8717f6de06,0.012037,0.001407,0.005055,0.000758,0.009451,0.001551
6,00024115d4cbde0f,0.007365,0.001333,0.007294,0.000667,0.008109,0.001432
7,000247e83dcc1211,0.567738,0.003073,0.049525,0.003338,0.122666,0.005065
8,00025358d4737918,0.022942,0.002102,0.011456,0.002546,0.011198,0.004318
9,00026d1092fe71cc,0.010744,0.00112,0.005672,0.001143,0.007471,0.001499


In [28]:
final_df.to_csv('out.csv', index = False)

In [60]:
final_df.shape

(153164, 7)