In [43]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [44]:
import re
import time
import string
import json
import math
from collections import defaultdict
from collections import Counter
import itertools

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
word_net_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/guillaume/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [26]:
# Load Data
train_data_path = "data/train.csv"
test_data_path = "data/test.csv"

train_set = pd.read_csv(train_data_path)
test_set = pd.read_csv(test_data_path)

In [27]:
# Create translator to remove punctuation and digits
remove_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
remove_digits = str.maketrans('', '', string.digits)

def custom_tokenize(text):
    clean_text = text.replace('\n', ' ')
    # Remove punctuation
    clean_text = clean_text.translate(remove_punctuation)
    # Remove digits
    clean_text = clean_text.translate(remove_digits)
    # To lowercase
    clean_text = clean_text.lower()

    split_text = clean_text.split()

    treated_text = []
    for word in split_text:
        lemmatized_word = word_net_lemmatizer.lemmatize(word)
        if not lemmatized_word in stop_words and len(lemmatized_word) >= 3:
            treated_text.append(lemmatized_word)
    return split_text

In [28]:
tfidf_vectorizer = TfidfVectorizer(
    min_df=0.001, max_df=0.95,
    max_features=None, strip_accents='unicode',\
    analyzer='word', ngram_range=(1,1), use_idf=1,\
    smooth_idf=1, sublinear_tf=1, tokenizer=custom_tokenize,
    stop_words='english')

In [29]:
train_comments = train_set.iloc[:,1]
tfidf_vectorizer.fit(train_comments)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=None, min_df=0.001,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words='english', strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function custom_tokenize at 0x7fc092c36d08>, use_idf=1,
        vocabulary=None)

In [30]:
print("Longueur du vocabulaire :,", len(tfidf_vectorizer.vocabulary_))
print("Longueur des stop words :,", len(tfidf_vectorizer.stop_words_))

Longueur du vocabulaire :, 3496
Longueur des stop words :, 173396


In [31]:
X_train = tfidf_vectorizer.transform(train_comments)
Y_train = train_set[train_set.columns[2:]]
Y_train.shape

(159571, 6)

In [32]:
test_comments = test_set.iloc[:,1]
X_test = tfidf_vectorizer.transform(test_comments)

In [34]:
toxic_Y_train = Y_train['toxic']

severe_toxic_Y_train = Y_train['severe_toxic']

obscene_Y_train = Y_train['obscene']

threat_Y_train = Y_train['threat']

insult_Y_train = Y_train['insult']

identity_hate_Y_train = Y_train['identity_hate']

In [35]:
toxic_svm = LinearSVC()
toxic_svm.fit(X_train, toxic_Y_train)
print("toxic done")
severe_toxic_svm = LinearSVC()
severe_toxic_svm.fit(X_train, severe_toxic_Y_train)
print("severe_toxic done")
obscene_svm = LinearSVC()
obscene_svm.fit(X_train, obscene_Y_train)
print("obscene done")
threat_svm = LinearSVC()
threat_svm.fit(X_train, threat_Y_train)
print("threat done")
insult_svm = LinearSVC()
insult_svm.fit(X_train, insult_Y_train)
print("insult done")
identity_hate_svm = LinearSVC()
identity_hate_svm.fit(X_train, identity_hate_Y_train)
print("identity hate done")

toxic done
severe_toxic done
obscene done
threat done
insult done
identity hate done


In [36]:
final_df = pd.DataFrame()
final_df['toxic'] = toxic_svm.predict(X_test)
final_df['severe_toxic'] = severe_toxic_svm.predict(X_test)
final_df['obscene'] = obscene_svm.predict(X_test)
final_df['threat'] = threat_svm.predict(X_test)
final_df['insult'] = insult_svm.predict(X_test)
final_df['identity_hate'] = identity_hate_svm.predict(X_test)

In [38]:
final_df.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,0,1,0,1,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,0,0,0,0,0,0
7,1,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [39]:
submissions = pd.read_csv("data/sample_submission.csv")
final_df.insert(0, 'id', submissions['id'])

In [40]:
final_df.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1,0,1,0,1,0
1,0000247867823ef7,0,0,0,0,0,0
2,00013b17ad220c46,0,0,0,0,0,0
3,00017563c3f7919a,0,0,0,0,0,0
4,00017695ad8997eb,0,0,0,0,0,0
5,0001ea8717f6de06,0,0,0,0,0,0
6,00024115d4cbde0f,0,0,0,0,0,0
7,000247e83dcc1211,1,0,0,0,0,0
8,00025358d4737918,0,0,0,0,0,0
9,00026d1092fe71cc,0,0,0,0,0,0


In [41]:
final_df.to_csv('out.csv', index = False)

In [42]:
final_df.shape

(153164, 7)