# Toxic Comments Detection

In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

from nltk.tokenize import TweetTokenizer
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
trainPath = "data/train.csv"
testPath = "data/test.csv"
df_train = pd.read_csv(trainPath)
df_test = pd.read_csv(testPath)
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


### Data Cleanning
#### Fill NA
#### Remove special character
#### Romove stop words
#### Stemming

In [3]:
tknzr = TweetTokenizer()

df_train['comment_text']= df_train['comment_text'].apply(lambda comment: regexp_tokenize(comment, pattern='[a-zA-Z]+'))
stopWords = set(stopwords.words('english'))
df_train['comment_text'] = df_train['comment_text'].apply(lambda comment:[token.lower() for token in comment])
df_train['comment_text'] = df_train['comment_text'].apply(lambda comment:[token for token in comment if token not in stopWords])
df_train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, edits, made, username, hardcore,...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[aww, matches, background, colour, seemingly, ...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, man, really, trying, edit, war, guy, con...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[make, real, suggestions, improvement, wondere...",0,0,0,0,0,0
4,0001d958c54c6e35,"[sir, hero, chance, remember, page]",0,0,0,0,0,0


In [4]:
# from nltk.stem import PorterStemmer
# porter = PorterStemmer()
from nltk.stem.lancaster import LancasterStemmer
lst = LancasterStemmer() # initiating LancasterStemmer
df_train['comment_text'] = df_train['comment_text'].apply(lambda comment:[lst.stem(token) for token in comment])
df_train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[expl, edit, mad, usernam, hardc, metallic, fa...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[aww, match, background, colo, seem, stuck, th...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, man, real, try, edit, war, guy, const, r...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[mak, real, suggest, improv, wond, sect, stat,...",0,0,0,0,0,0
4,0001d958c54c6e35,"[sir, hero, chant, rememb, pag]",0,0,0,0,0,0


In [5]:
df_train['comment_text'] = df_train['comment_text'].apply(lambda comment: ' '.join(comment))
df_train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,expl edit mad usernam hardc metallic fan rever...,0,0,0,0,0,0
1,000103f0d9cfb60f,aww match background colo seem stuck thank tal...,0,0,0,0,0,0
2,000113f07ec002fd,hey man real try edit war guy const remov rele...,0,0,0,0,0,0
3,0001b41b1c6bb37e,mak real suggest improv wond sect stat lat sub...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chant rememb pag,0,0,0,0,0,0


In [6]:
df_test['comment_text']= df_test['comment_text'].apply(lambda comment: regexp_tokenize(comment, pattern='[a-zA-Z]+'))
df_test['comment_text'] = df_test['comment_text'].apply(lambda comment:[token.lower() for token in comment])
stopWords = set(stopwords.words('english'))
df_test['comment_text'] = df_test['comment_text'].apply(lambda comment:[token for token in comment if token not in stopWords])

In [7]:

df_test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"[yo, bitch, ja, rule, succesful, ever, whats, ..."
1,0000247867823ef7,"[rfc, title, fine, imo]"
2,00013b17ad220c46,"[sources, zawe, ashton, lapland]"
3,00017563c3f7919a,"[look, back, source, information, updated, cor..."
4,00017695ad8997eb,"[anonymously, edit, articles]"


In [8]:
df_test['comment_text'] = df_test['comment_text'].apply(lambda comment:[lst.stem(token) for token in comment])
df_test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,"[yo, bitch, ja, rul, succes, ev, what, hat, sa..."
1,0000247867823ef7,"[rfc, titl, fin, imo]"
2,00013b17ad220c46,"[sourc, zaw, ashton, lapland]"
3,00017563c3f7919a,"[look, back, sourc, inform, upd, correct, form..."
4,00017695ad8997eb,"[anonym, edit, artic]"


In [9]:
df_test['comment_text'] = df_test['comment_text'].apply(lambda comment: ' '.join(comment))
df_test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rul succes ev what hat sad mofucka...
1,0000247867823ef7,rfc titl fin imo
2,00013b17ad220c46,sourc zaw ashton lapland
3,00017563c3f7919a,look back sourc inform upd correct form guess ...
4,00017695ad8997eb,anonym edit artic


In [10]:
list_sentences_train = df_train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = df_train[list_classes].values
list_sentences_test = df_test["comment_text"].fillna("_na_").values

In [11]:
# from sklearn.model_selection import train_test_split

X_train = df_train.comment_text
y_train = df_train[list_classes]

# X_train, X_validate, y_train, y_validate = train_test_split(
#     df_data, df_label, test_size=0.2, random_state=42)

X_test = df_test["comment_text"].fillna("_na_")

### Tokenizer

In [12]:
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(X_train.values)
# list_tokenized_validate = tokenizer.texts_to_sequences(X_validate.values)
list_tokenized_test = tokenizer.texts_to_sequences(X_test.values)

In [13]:
maxlen = 200
feature_tokenizer_train = pad_sequences(list_tokenized_train, maxlen=maxlen)
# feature_tokenizer_validate = pad_sequences(list_tokenized_validate, maxlen=maxlen)
feature_tokenizer_test = pad_sequences(list_tokenized_test, maxlen=maxlen)
print feature_tokenizer_train.shape

(159571, 200)


### Bag of words

In [14]:
vectorizer = CountVectorizer(max_features = 3000)
features_CountVectorizer_train = vectorizer.fit_transform(X_train)
# features_CountVectorizer_validate = vectorizer.transform(X_validate)
features_CountVectorizer_test = vectorizer.transform(X_test)
feature_names_CountVectorizer = vectorizer.get_feature_names()
print features_CountVectorizer_train.shape

(159571, 3000)


In [15]:
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from xgboost import plot_importance
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')

xgb = XGBRegressor()
xgb = xgb.fit(features_CountVectorizer_train, y_train.toxic)
imp = pd.DataFrame(xgb.feature_importances_,columns = ['Importance'],index = feature_names_CountVectorizer)
imp = imp.sort_values(['Importance'], ascending = False)

print(imp)

          Importance
fuck        0.138571
artic       0.047143
us          0.034286
would       0.031429
on          0.022857
suck        0.021429
idiot       0.021429
stupid      0.018571
shit        0.017143
gay         0.017143
bullshit    0.015714
crap        0.014286
asshol      0.014286
cunt        0.012857
ass         0.012857
bitch       0.012857
also        0.012857
faggot      0.012857
jerk        0.011429
dick        0.011429
pathet      0.011429
hel         0.011429
fag         0.010000
liar        0.010000
act         0.010000
hat         0.010000
moron       0.010000
see         0.008571
retard      0.008571
damn        0.008571
...              ...
fggt        0.000000
fict        0.000000
field       0.000000
fif         0.000000
fig         0.000000
fight       0.000000
fil         0.000000
fed         0.000000
febru       0.000000
feb         0.000000
fasc        0.000000
famili      0.000000
famy        0.000000
fan         0.000000
fantasy     0.000000
faq         0

In [16]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(xgb, prefit=True,threshold = 0.0000001)
features_CountVectorizer_train_new = model.transform(features_CountVectorizer_train)
features_CountVectorizer_test_new = model.transform(features_CountVectorizer_test)
# features_CountVectorizer_validate_new = model.transform(features_CountVectorizer_validate)
print features_CountVectorizer_train_new.shape
print features_CountVectorizer_test_new.shape
# print features_CountVectorizer_validate_new.shape
print "The number of selected features is: %d"%(features_CountVectorizer_train_new.shape[1])

(159571, 171)
(153164, 171)
The number of selected features is: 171


### Bag-of-words features with the tf-idf algorithm

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 3000)
features_TfidfVectorizer_train = vectorizer.fit_transform(X_train)
# features_TfidfVectorizer_validate = vectorizer.transform(X_validate)
features_TfidfVectorizer_test = vectorizer.transform(X_test)
feature_names_TfidfVectorizer = vectorizer.get_feature_names()

print features_TfidfVectorizer_train.shape

(159571, 3000)


In [18]:
xgb = XGBRegressor()
xgb = xgb.fit(features_TfidfVectorizer_train, y_train.toxic)
imp = pd.DataFrame(xgb.feature_importances_,columns = ['Importance'],index = feature_names_TfidfVectorizer)
imp = imp.sort_values(['Importance'], ascending = False)

print(imp)

          Importance
fuck        0.193410
suck        0.031519
crap        0.028653
artic       0.027221
stupid      0.027221
idiot       0.025788
ass         0.022923
gay         0.021490
hel         0.020057
shit        0.018625
asshol      0.017192
bullshit    0.017192
hat         0.015759
bitch       0.015759
moron       0.015759
cunt        0.014327
faggot      0.014327
pathet      0.014327
los         0.012894
liar        0.011461
die         0.011461
retard      0.011461
dick        0.011461
pen         0.011461
jerk        0.011461
piss        0.010029
us          0.010029
thank       0.010029
shut        0.010029
bastard     0.008596
...              ...
field       0.000000
fif         0.000000
fig         0.000000
fight       0.000000
fil         0.000000
film        0.000000
filt        0.000000
feedback    0.000000
fee         0.000000
fed         0.000000
fath        0.000000
fantasy     0.000000
faq         0.000000
far         0.000000
farm        0.000000
fart        0

In [19]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(xgb, prefit=True,threshold = 0.0000001)
features_TfidfVectorizer_train_selected = model.transform(features_TfidfVectorizer_train)
# features_TfidfVectorizer_validate_selected = model.transform(features_TfidfVectorizer_validate)
features_TfidfVectorizer_test_selected = model.transform(features_TfidfVectorizer_test)
print features_TfidfVectorizer_train_selected.shape
# print features_TfidfVectorizer_validate_selected.shape
print features_TfidfVectorizer_test_selected.shape
print "The number of selected features is: %d"%(features_TfidfVectorizer_train_selected.shape[1])

(159571, 148)
(153164, 148)
The number of selected features is: 148


### word2vec

In [20]:
from gensim.models.word2vec import Word2Vec

def tokenize(docs):
    pattern = re.compile('[\W_]+', re.UNICODE)
    sentences = []
    for d in docs:
        sentence = d.lower().split(" ")
        sentence = [pattern.sub('', w) for w in sentence]
        sentences.append( [w for w in sentence if w not in stopWords] )
    return sentences

def featurize_w2v(model, sentences):
    f = np.zeros((len(sentences), model.vector_size))
    for i,s in enumerate(sentences):
        for w in s:
            try:
                vec = model[w]
            except KeyError:
                continue
            f[i,:] = f[i,:] + vec
        f[i,:] = f[i,:] / len(s)
    return f

def delete_nans(features):
    rows_to_delete = []
    for i in range(len(features)):
        if np.isnan(features[i].sum()):
            rows_to_delete.append(i)
    return rows_to_delete

train_sentences = tokenize(X_train)
model = Word2Vec(train_sentences, size=500, window=5, min_count=6, sample=1e-3, workers=2)
model.init_sims(replace=True)

features_w2v_train = featurize_w2v(model, train_sentences)
rows_to_delete_train = delete_nans(features_w2v_train)
features_w2v_train = np.delete(features_w2v_train, rows_to_delete_train, 0)

# validate_sentences = tokenize(X_validate)
# features_w2v_validate = featurize_w2v(model, validate_sentences)
# rows_to_delete_validate = delete_nans(features_w2v_validate)
# features_w2v_validate = np.delete(features_w2v_validate, rows_to_delete_validate, 0)

test_sentences = tokenize(X_test)
features_w2v_test = featurize_w2v(model, test_sentences)
rows_to_delete_test = delete_nans(features_w2v_test)
features_w2v_test = np.delete(features_w2v_test, rows_to_delete_test, 0)

print features_w2v_train.shape
print rows_to_delete_train

(159557, 500)
[12003, 26302, 38016, 44715, 45864, 85789, 89363, 91131, 99486, 115981, 120508, 135342, 148563, 155735]


### Cross Validation

In [21]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
n_folds = 3

def cv(model, X_train, y_train):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    score = cross_val_score(model, X_train, y_train, scoring= 'roc_auc', cv = kf)
    return(score)

### LSTM

In [22]:
embed_size = 128
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
lstm_model = Sequential()
lstm_model.add(Embedding(max_features, output_dim=256))
lstm_model.add(LSTM(60))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(6, activation='sigmoid'))

lstm_model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [28]:
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

X_train_lstm, X_validate_lstm, y_train_lstm, y_validate_lstm = train_test_split(
    feature_tokenizer_train, y_train, test_size=0.33, random_state=42)
lstm_model.fit(X_train_lstm,y_train_lstm, batch_size=256, epochs=2, validation_split=0.33)

Train on 71631 samples, validate on 35281 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1329d7950>

In [29]:
prediction_lstm = lstm_model.predict(X_validate_lstm)
print roc_auc_score(y_validate_lstm.toxic, prediction_lstm[:,0])

0.9753588745307796


### SVM

In [30]:
# from sklearn import svm
# def build_model_svm(Tfid = False, svm_c = 0.1, svm_g = 0.005):
#     svm_clf = svm.SVC(C = svm_c, gamma = svm_g,probability=True)
#     if Tfid == 'Tfid':
#         svm_clf.fit(features_TfidfVectorizer_train, y_train.toxic)
#         pred = svm_clf.predict_proba(features_TfidfVectorizer_validate)
#     elif Tfid == 'Counter':
#         svm_clf.fit(features_CountVectorizer_train, y_train.toxic)
#         pred = svm_clf.predict_proba(features_CountVectorizer_validate) 
#     else:
#         svm_clf.fit(features_w2v_train, y_train[label].drop(y_train.index[rows_to_delete_train]))
#         pred = svm_clf.predict_proba(features_w2v_validate)
        
#     return {
#         "Tfid": Tfid,
#         "svm_c": svm_c,
#         "svm_g": svm_g,     
#         "auc": roc_auc_score(y_validate.toxic, pred[:,1])
#     }


In [31]:
# from itertools import product
# param_values = {
#   "svm_c": [1, 2, 5, 10, 20],
#     "svm_g": [0.05, 0.2, 0.5, 5, 50],
#     "Tfid": [True, False]
# }

# results = []
# max_auc = 0

# for p in product(*param_values.values()):
#     res = build_model_svm(**dict(zip(param_values.keys(), p)))
#     results.append(res)
#     if res.get('auc')>max_auc:
#         max_auc = res.get('auc')
#         Tfid_opt = res.get('Tfid')
#         svm_c_opt = res.get('svm_c')
#         svm_g_opt = res.get('svm_g')
#     print(res)

### Random Forest

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [33]:
def build_model_rf(Tfid = 'Tfid',n_trees= 100):
    rf_clf = RandomForestClassifier(n_estimators = n_trees)
    if Tfid == 'Tfid':
        score = cv(rf_clf, features_TfidfVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "n_trees": n_trees,
            "auc":  np.mean(score)
        }
    elif Tfid == 'Counter':
        score = cv(rf_clf, features_CountVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "n_trees": n_trees,
            "auc":  np.mean(score)
        }
    else:
        score = cv(rf_clf, features_w2v_train, y_train.toxic.drop(y_train.index[rows_to_delete_train]))
        return {
            "Tfid": Tfid,
            "n_trees": n_trees,
            "auc":  np.mean(score)
        }

In [34]:
from itertools import product
param_values = {
  "Tfid": ['Word2Vec','Tfid','Counter'],
  "n_trees": [10, 50, 100, 200]
}

results = []
max_auc = 0

for p in product(*param_values.values()):
    res = build_model_rf(**dict(zip(param_values.keys(), p)))
    results.append(res)
    if res.get('auc')>max_auc:
        max_auc = res.get('auc')
        Tfid_opt = res.get('Tfid')
        n_trees_opt = res.get('n_trees')
    print(res)

{'auc': 0.900472221463993, 'Tfid': 'Word2Vec', 'n_trees': 10}
{'auc': 0.932474776007839, 'Tfid': 'Word2Vec', 'n_trees': 50}
{'auc': 0.9368080706899624, 'Tfid': 'Word2Vec', 'n_trees': 100}
{'auc': 0.9390857666163148, 'Tfid': 'Word2Vec', 'n_trees': 200}
{'auc': 0.9224602572278231, 'Tfid': 'Tfid', 'n_trees': 10}
{'auc': 0.9479672713494175, 'Tfid': 'Tfid', 'n_trees': 50}
{'auc': 0.9525974483053274, 'Tfid': 'Tfid', 'n_trees': 100}
{'auc': 0.9539436649494664, 'Tfid': 'Tfid', 'n_trees': 200}
{'auc': 0.9248396733731972, 'Tfid': 'Counter', 'n_trees': 10}
{'auc': 0.9435143250272224, 'Tfid': 'Counter', 'n_trees': 50}
{'auc': 0.9462083713513394, 'Tfid': 'Counter', 'n_trees': 100}
{'auc': 0.9474485041968497, 'Tfid': 'Counter', 'n_trees': 200}


### NB

In [35]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
def build_model(Tfid = 'Tfid',nb_alpha=1.0):
    nb_clf = MultinomialNB(alpha=nb_alpha)
    if Tfid == 'Tfid':
        score = cv(nb_clf, features_TfidfVectorizer_train, y_train.toxic)
    elif Tfid == 'Counter':
        score = cv(nb_clf, features_CountVectorizer_train, y_train.toxic)
    return {
        "Tfid": Tfid,
        "nb_alpha": nb_alpha,
        "auc": np.mean(score)
    }

In [36]:
from itertools import product
param_values = {
  "Tfid": ['Tfid','Counter'],
  "nb_alpha": [0.01, 0.1, 1.0, 2,10]
}

results = []
max_auc = 0

for p in product(*param_values.values()):
    res = build_model(**dict(zip(param_values.keys(), p)))
    results.append(res)
    if res.get('auc')>max_auc:
        max_auc = res.get('auc')
        Tfid_opt = res.get('Tfid')
        nb_alpha_opt = res.get('nb_alpha')
    print(res)

{'nb_alpha': 0.01, 'auc': 0.9506579353461703, 'Tfid': 'Tfid'}
{'nb_alpha': 0.01, 'auc': 0.9209745849187193, 'Tfid': 'Counter'}
{'nb_alpha': 0.1, 'auc': 0.9518184484871535, 'Tfid': 'Tfid'}
{'nb_alpha': 0.1, 'auc': 0.9212816486662977, 'Tfid': 'Counter'}
{'nb_alpha': 1.0, 'auc': 0.9544086196190392, 'Tfid': 'Tfid'}
{'nb_alpha': 1.0, 'auc': 0.9221170163378615, 'Tfid': 'Counter'}
{'nb_alpha': 2, 'auc': 0.9541552674449356, 'Tfid': 'Tfid'}
{'nb_alpha': 2, 'auc': 0.9227563371912918, 'Tfid': 'Counter'}
{'nb_alpha': 10, 'auc': 0.9377407998272237, 'Tfid': 'Tfid'}
{'nb_alpha': 10, 'auc': 0.9234760607912956, 'Tfid': 'Counter'}


### Logistic Regression

In [37]:
def build_model_lr(Tfid = 'Tfid',lr_c= 1):
    lr_clf = LogisticRegression(C=lr_c, dual=False, n_jobs=1)
    if Tfid == 'Tfid':
        score = cv(lr_clf, features_TfidfVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }
    elif Tfid == 'Counter':
        score = cv(lr_clf, features_CountVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }
    else:
        score = cv(lr_clf, features_w2v_train, y_train.toxic.drop(y_train.index[rows_to_delete_train]))
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }


In [38]:
from sklearn.linear_model import LogisticRegression
from itertools import product
param_values = {
  "Tfid": ['Tfid','Counter','Word2Vec'],
  "lr_c": [0.01, 0.1, 1.0, 2.0 ,10.0]
}

results = []
max_auc = 0

for p in product(*param_values.values()):
    res = build_model_lr(**dict(zip(param_values.keys(), p)))
    results.append(res)
    if res.get('auc')>max_auc:
        max_auc = res.get('auc')
        Tfid_opt = res.get('Tfid')
        lr_c_opt = res.get('lr_c')
    print(res)

{'lr_c': 0.01, 'auc': 0.9402733356948132, 'Tfid': 'Tfid'}
{'lr_c': 0.01, 'auc': 0.9422624459267811, 'Tfid': 'Counter'}
{'lr_c': 0.01, 'auc': 0.9367029317189411, 'Tfid': 'Word2Vec'}
{'lr_c': 0.1, 'auc': 0.9585442438457297, 'Tfid': 'Tfid'}
{'lr_c': 0.1, 'auc': 0.9472235817634922, 'Tfid': 'Counter'}
{'lr_c': 0.1, 'auc': 0.9498878721034272, 'Tfid': 'Word2Vec'}
{'lr_c': 1.0, 'auc': 0.9653069476755065, 'Tfid': 'Tfid'}
{'lr_c': 1.0, 'auc': 0.9472481236806066, 'Tfid': 'Counter'}
{'lr_c': 1.0, 'auc': 0.9545608855556155, 'Tfid': 'Word2Vec'}
{'lr_c': 2.0, 'auc': 0.9649358449275244, 'Tfid': 'Tfid'}
{'lr_c': 2.0, 'auc': 0.9473891931486982, 'Tfid': 'Counter'}
{'lr_c': 2.0, 'auc': 0.9549138455818168, 'Tfid': 'Word2Vec'}
{'lr_c': 10.0, 'auc': 0.9618044230682744, 'Tfid': 'Tfid'}
{'lr_c': 10.0, 'auc': 0.94727544114222, 'Tfid': 'Counter'}
{'lr_c': 10.0, 'auc': 0.9553763416223734, 'Tfid': 'Word2Vec'}


### NB-LR

In [39]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from scipy import sparse

class NbLRClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1.1) / ((y==y_i).sum()+1)
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self
    
    def fit_w2v(self, x, y):
        # Check that X and y have correct shape
        y = y.values
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1.3) / ((y==y_i).sum()+1.3)
        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        
        x_nb = sparse.csr_matrix(x).multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [40]:
def build_model_nblr(Tfid = 'Tfid',lr_c= 1):
    model = NbLRClassifier(C=lr_c, dual=False, n_jobs=-1)
    if Tfid == 'Tfid':
        score = cv(model, features_TfidfVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }
    elif Tfid == 'Counter':
        score = cv(model, features_CountVectorizer_train, y_train.toxic)
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }
    else:
        score = cv(model, features_w2v_train, y_train.toxic.drop(y_train.index[rows_to_delete_train]))
        return {
            "Tfid": Tfid,
            "lr_c": lr_c,
            "auc":  np.mean(score)
        }

In [41]:
param_values = {
  "Tfid": ['Tfid','Counter'],
  "lr_c": [0.01, 0.1, 1.0, 2.0 ,10.0]
}

results = []
max_auc = 0

for p in product(*param_values.values()):
    res = build_model_nblr(**dict(zip(param_values.keys(), p)))
    results.append(res)
    if res.get('auc')>max_auc:
        max_auc = res.get('auc')
        Tfid_opt = res.get('Tfid')
        lr_c_opt = res.get('lr_c')
    print(res)

{'lr_c': 0.01, 'auc': 0.9573069086982664, 'Tfid': 'Tfid'}
{'lr_c': 0.01, 'auc': 0.9474459473514306, 'Tfid': 'Counter'}
{'lr_c': 0.1, 'auc': 0.9632403776358842, 'Tfid': 'Tfid'}
{'lr_c': 0.1, 'auc': 0.9480304962406572, 'Tfid': 'Counter'}
{'lr_c': 1.0, 'auc': 0.9661523550097125, 'Tfid': 'Tfid'}
{'lr_c': 1.0, 'auc': 0.947905421249103, 'Tfid': 'Counter'}
{'lr_c': 2.0, 'auc': 0.9659902996819992, 'Tfid': 'Tfid'}
{'lr_c': 2.0, 'auc': 0.9477916077995613, 'Tfid': 'Counter'}
{'lr_c': 10.0, 'auc': 0.9641658774712377, 'Tfid': 'Tfid'}
{'lr_c': 10.0, 'auc': 0.9477580550851467, 'Tfid': 'Counter'}


### Model Build and Predict in Testing Data Set

In [47]:
preds_nblr = np.zeros((features_TfidfVectorizer_test.shape[0], len(list_classes)))
nblr_model = NbLRClassifier(C=1, dual=False, n_jobs=-1)
for i,label in enumerate(list_classes):
    nblr_model.fit(features_TfidfVectorizer_train, y_train[label])
    preds_nblr[:,i] = nblr_model.predict_proba(features_TfidfVectorizer_test)[:,1]
print preds_nblr.shape

(153164, 6)


In [49]:
subm = pd.read_csv('data/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds_nblr, columns = list_classes)], axis=1)
submission.to_csv('submission_nblr.csv', index=False)
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999986,0.297667,0.999899,0.05062,0.993505,0.578811
1,0000247867823ef7,0.005749,0.002184,0.002309,0.000537,0.006962,0.00187
2,00013b17ad220c46,0.008466,0.000514,0.003402,2.8e-05,0.003644,0.000102
3,00017563c3f7919a,0.001476,0.001068,0.001384,0.000511,0.002428,0.000252
4,00017695ad8997eb,0.019654,0.001104,0.004853,0.000228,0.007441,0.000442


##### NBLR testing score is 0.9727

In [50]:
preds_lr = np.zeros((features_TfidfVectorizer_test.shape[0], len(list_classes)))
lr_model = lr_clf = LogisticRegression(C=1, dual=False, n_jobs=1)
for i,label in enumerate(list_classes):
    lr_model.fit(features_TfidfVectorizer_train, y_train[label])
    preds_lr[:,i] = lr_model.predict_proba(features_TfidfVectorizer_test)[:,1]
print preds_lr.shape

(153164, 6)


In [51]:
submid = pd.DataFrame({'id': subm["id"]})
submission_lr = pd.concat([submid, pd.DataFrame(preds_lr, columns = list_classes)], axis=1)
submission_lr.to_csv('submission_lr.csv', index=False)
submission_lr.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999649,0.239714,0.998739,0.054813,0.984641,0.512128
1,0000247867823ef7,0.009113,0.002605,0.004496,0.001958,0.0077,0.002291
2,00013b17ad220c46,0.009903,0.001353,0.004375,0.000438,0.004377,0.000684
3,00017563c3f7919a,0.003652,0.002097,0.003019,0.000897,0.003299,0.000759
4,00017695ad8997eb,0.015524,0.001183,0.004439,0.000719,0.005861,0.001251


##### LR testing score is 0.9713

In [53]:
lstm_model.fit(feature_tokenizer_train,y_train, batch_size=256, epochs=2, validation_split=0.2,verbose=0)
preds_lstm = lstm_model.predict(feature_tokenizer_test)
print preds_lr.shape

(153164, 6)


In [55]:
submid = pd.DataFrame({'id': subm["id"]})
submission_lstm = pd.concat([submid, pd.DataFrame(preds_lstm, columns = list_classes)], axis=1)
submission_lstm.to_csv('submission_lstm.csv', index=False)
submission_lstm.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993704,0.2807285,0.973476,0.04038447,0.88922,0.1894136
1,0000247867823ef7,0.000544,3.071439e-06,0.000101,7.989715e-06,5.7e-05,1.016809e-05
2,00013b17ad220c46,0.051714,0.0002639241,0.005475,0.0003494619,0.008154,0.0004944871
3,00017563c3f7919a,0.000105,5.036533e-08,1.8e-05,1.315698e-07,4e-06,2.272676e-07
4,00017695ad8997eb,0.003982,1.278094e-05,0.000589,3.120828e-05,0.000408,4.616031e-05


#### LSTM testing socre is 0.9764

### Ensemble

In [56]:
pred_avg = preds_lr*0.3 + preds_nblr*0.2 + preds_lstm*0.5

In [57]:
submid = pd.DataFrame({'id': subm["id"]})
submission_lstm = pd.concat([submid, pd.DataFrame(pred_avg, columns = list_classes)], axis=1)
submission_lstm.to_csv('submission_ensemble.csv', index=False)
submission_lstm.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996744,0.271812,0.98634,0.04676,0.938703,0.364107
1,0000247867823ef7,0.004155,0.00122,0.001861,0.000699,0.003731,0.001066
2,00013b17ad220c46,0.030521,0.000641,0.004731,0.000312,0.006119,0.000473
3,00017563c3f7919a,0.001443,0.000843,0.001192,0.000371,0.001478,0.000278
4,00017695ad8997eb,0.010579,0.000582,0.002597,0.000277,0.003451,0.000487


### Score is 0.9796 after model ensemble