In [1]:
import pandas as pd
import numpy as np
import sys
import nltk
from sklearn.feature_extraction.text import CountVectorizer 
from nltk.stem.lancaster import LancasterStemmer
from collections import Counter
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import TruncatedSVD
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import matplotlib.pyplot as plt

import re

%matplotlib inline

In [2]:
data = pd.read_csv('train.csv')
count_vec = CountVectorizer(stop_words='english')
tokenizer_drop_stop_words = count_vec.build_analyzer()
stemmer = LancasterStemmer()

def spell_corr(x):
    spell_corr = TextBlob(x)
    return str(spell_corr.correct())

def preprocesser(x, tokenizer, drop_numbers=False, stemmer=stemmer, spell_check=False):
    if spell_check:
        x = spell_corr(x)
    tokens = [word for word in tokenizer(x)]
    if drop_numbers:
        tokens = [word for word in tokens if not word.isnumeric()]
    if stemmer:
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens

count_vec = CountVectorizer(tokenizer=lambda x: preprocesser(x, tokenizer_drop_stop_words, drop_numbers=True))
count_feature = count_vec.fit_transform(data['comment_text'])

In [4]:
target = data[['toxic','severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
X_train, X_test, y_train, y_test = train_test_split(count_feature, target,test_size = 0.2, random_state = 42)
svd = TruncatedSVD(n_components=150, random_state=42)
X_train = svd.fit_transform(X_train)  
X_test = svd.transform(X_test)
print('explained variance for 150 components: {}'.format(svd.explained_variance_ratio_.sum()))

explained variance for 150 components: 0.8416323145801105


In [11]:
sc = StandardScaler()
X_train_tr = sc.fit_transform(X_train)
X_test_tr = sc.transform(X_test)

In [13]:
parameters_NB = {'alpha': [1e-5, 1e-3, 0.1, 1.0, 5.0, 10., 30., 100., 500, 1000]}

for i, column in enumerate(y_train.columns): 
    model = BernoulliNB()
    grid_search = GridSearchCV(model, parameters_NB, cv = 3, scoring='roc_auc')
    grid_search.fit(X_train_tr, y_train[column])
    
    best_estimator = grid_search.best_estimator_
    best_estimator.fit(X_train_tr, y_train[column])
    y_pred = best_estimator.predict(X_test_tr)
    roc_auc = roc_auc_score(y_test[column], y_pred)
    
    print('baseline {} test score: {}'.format(column, roc_auc))

baseline toxic test score: 0.697216207516633
baseline severe_toxic test score: 0.8017077851250198
baseline obscene test score: 0.759839408800417
baseline threat test score: 0.6005078442972982
baseline insult test score: 0.7304157538406375
baseline identity_hate test score: 0.6891917925893991


In [15]:
parameters_LR = {'C': [1e-2, 0.1, 1.0, 5.0]}
for i, column in tqdm(enumerate(y_train.columns)): 
    model = LogisticRegression(class_weight='balanced')
    grid_search = GridSearchCV(model, parameters_LR, cv = 3, scoring='roc_auc')
    grid_search.fit(X_train_tr, y_train[column])
    
    best_estimator = grid_search.best_estimator_
    best_estimator.fit(X_train_tr, y_train[column])
    y_pred = best_estimator.predict(X_test_tr)
    roc_auc = roc_auc_score(y_test[column], y_pred)
    print('baseline {} test score: {}'.format(column, roc_auc))

0it [00:00, ?it/s]

baseline toxic test score: 0.8151042002104836


1it [05:44, 344.91s/it]

baseline severe_toxic test score: 0.9171302982130958


2it [13:27, 403.91s/it]

baseline obscene test score: 0.8879017434788484


3it [17:55, 358.47s/it]

baseline threat test score: 0.8608088161023058


4it [23:01, 345.30s/it]

baseline insult test score: 0.8225037722508821


5it [28:42, 344.54s/it]

baseline identity_hate test score: 0.8538341651451383


6it [34:56, 349.35s/it]


<matplotlib.figure.Figure at 0x2738ebd0518>