**In this notebook, three models (NB-SVM, LSTM, LR) are trained. The final submission is the weighted average of the predictions of the three models.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what is mounted under ../input: capture the raw `ls` output first,
# then decode it as UTF-8 for printing.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

Now let's have a look at the three data files to get a sense of what they look like.


In [None]:
# Read the three competition files (train, test, sample submission) into DataFrames.
train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
sample_submission = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv"
)
# Last expression of the cell: preview the first ten training rows.
train.head(n=10)

Have a look at the lengths of the comments. I replace empty comments with "unknown" to avoid errors.

In [None]:
COMMENT = 'comment_text'

# Character length of every training comment, measured before missing values
# are replaced below.
lens = train[COMMENT].str.len()
lens.hist()

# Replace missing comments with a placeholder string. The filled column is
# assigned back explicitly: calling fillna(..., inplace=True) on a column
# selected from the frame is a chained in-place update, which pandas warns
# about and which does not modify the parent frame under Copy-on-Write.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

# The ten smallest lengths. Kept as the final expression of the cell so the
# notebook actually displays it (mid-cell the value was silently discarded).
sorted(lens.tolist())[:10]

In [None]:
import re, string
# One capture group matching a single punctuation/symbol character.
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split `s` into whitespace-separated tokens, with each punctuation mark as its own token."""
    # Pad every matched symbol with spaces so that split() isolates it.
    padded = re_tok.sub(r' \1 ', s)
    return padded.split()

In [None]:
n = train.shape[0]  # number of training rows

# Word uni- and bi-gram TF-IDF features built with the custom `tokenize`.
# use_idf / smooth_idf / sublinear_tf are boolean options: real booleans are
# passed instead of the integer 1, which the stricter parameter validation of
# newer scikit-learn releases rejects.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Learn vocabulary and IDF weights on the training comments only, then apply
# the same fitted transform to the test comments.
trn_term_doc = vec.fit_transform(train["comment_text"])
test_term_doc = vec.transform(test["comment_text"])

In [None]:
# Display both TF-IDF results (train, test) as the cell output.
trn_term_doc, test_term_doc

In [None]:
def pr(y_i, y):
    """Return the add-one-smoothed, per-feature ratio for the rows labelled `y_i`.

    Reads the module-level feature matrix `x`: the rows where `y == y_i` are
    summed column-wise, 1 is added, and the result is divided by the number of
    such rows plus 1.
    """
    class_mask = (y == y_i)
    feature_totals = x[class_mask].sum(0)
    return (feature_totals + 1) / (class_mask.sum() + 1)

In [None]:
# Short aliases for the TF-IDF results; pr() and get_mdl() read the global `x`.
x, test_x = trn_term_doc, test_term_doc

In [None]:
def get_mdl(y):
    """Fit the NB-weighted logistic regression for one label column.

    Parameters
    ----------
    y : object exposing `.values` (the caller passes a column of `train`)
        Binary labels aligned with the rows of the module-level matrix `x`.

    Returns
    -------
    tuple
        `(fitted LogisticRegression, r)`, where `r` is the log of
        `pr(1, y) / pr(0, y)`; the same `r` must be used to scale test features.
    """
    y = y.values
    # Log-ratio of the smoothed per-feature statistics of the two classes.
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver, so name
    # it explicitly; relying on the default solver fails on scikit-learn
    # versions whose default is lbfgs.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Scale every feature column by its log-ratio before fitting.
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# One column of test probabilities per label.
preds = np.zeros((len(test), len(label_cols)))

for col_idx, label in enumerate(label_cols):
    print('fit', label)
    clf, log_ratio = get_mdl(train[label])
    # Scale the test features with the same per-feature ratio used in training.
    scaled_test = test_x.multiply(log_ratio)
    preds[:, col_idx] = clf.predict_proba(scaled_test)[:, 1]

In [None]:
# Take the ids from the sample submission and attach the NB-SVM predictions.
subm = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
nbsvm_probs = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, nbsvm_probs], axis=1)
submission.to_csv('submission_NBSVM.csv', index=False)

Now try LSTM

In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Input root and competition sub-folder; both keep their trailing slash so the
# pieces can simply be concatenated.
path = '../input/'
comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE = path + 'glove6b50d/glove.6B.50d.txt'
TRAIN_DATA_FILE = path + comp + 'train.csv'
TEST_DATA_FILE = path + comp + 'test.csv'

In [None]:
# Hyperparameters shared by the tokenizer, the embedding matrix and the model.
embed_size = 50 # length of each word vector (the embedding file name says 50d)
max_features = 20000 # vocabulary size cap; also used as the Embedding layer's input size
maxlen = 100 # sequences are padded/truncated to this many tokens

In [None]:
# Reload the raw competition data for the LSTM pipeline.
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

# Target matrix: one column per label.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Comment texts as arrays, with missing comments replaced by the "_na_" placeholder.
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [None]:
# Fit the tokenizer on the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Training comments: text -> integer sequences -> fixed-length arrays.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)

# Test comments go through the same fitted tokenizer and padding length.
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Return `(word, vector)` where `vector` is `arr` converted to a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# Each line of the embedding file is split on whitespace: first field is the
# word, the remaining fields are its vector. The file is opened in a context
# manager so the handle is always closed, with an explicit UTF-8 encoding so
# that decoding does not depend on the platform default
# (assumes the GloVe text file is UTF-8 - TODO confirm).
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.strip().split()) for o in embedding_file)

# np.stack expects a sequence of arrays; materialise the dict view as a list,
# since newer NumPy versions do not accept a bare dict_values object.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / std over all embedding values, shown as the cell output.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
emb_mean, emb_std

In [None]:
word_index = tokenizer.word_index
# Number of usable rows: word ids 0 .. nb_words-1. The "+ 1" accounts for ids
# that start at 1 rather than 0 (presumably Keras reserves id 0 for padding -
# confirm against the Tokenizer docs). Without it, a vocabulary smaller than
# max_features makes the largest id fall outside the matrix (IndexError).
nb_words = min(max_features, len(word_index) + 1)
# Size the matrix by max_features so it always matches the Embedding layer,
# which is built with max_features input rows. Rows without a pretrained
# vector keep their random initialisation drawn from the pretrained
# embeddings' mean and standard deviation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue  # id beyond the vocabulary cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Network: embedding (initialised from embedding_matrix) -> bidirectional LSTM
# returning the full sequence -> max pooling over the sequence -> dense ReLU
# layer -> dropout -> 6 sigmoid outputs.
inp = Input(shape=(maxlen,))
# The intermediate tensors use their own name (`net`) instead of `x`: the
# NB-SVM helpers pr() and get_mdl() read the module-level `x` (the TF-IDF
# matrix), so rebinding `x` here would break them if those cells are re-run.
net = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
net = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(net)
net = GlobalMaxPool1D()(net)
net = Dense(50, activation="relu")(net)
net = Dropout(0.1)(net)
net = Dense(6, activation="sigmoid")(net)
model = Model(inputs=inp, outputs=net)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the whole training set for two epochs; no validation split is held out.
model.fit(x=X_t, y=y, batch_size=32, epochs=2)

In [None]:
# Predict all six label probabilities for the padded test sequences.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Overwrite the label columns of the sample submission with the predictions.
sample_submission_file = f'{path}{comp}sample_submission.csv'
submission2 = pd.read_csv(sample_submission_file)
submission2[list_classes] = y_test
submission2.to_csv('submission_LSTM.csv', index=False)

Now try a logistic regression (LR) model. The data used are from https://www.kaggle.com/eoveson/convai-datasets-baseline-models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy import sparse
from subprocess import check_output
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Rebind train/test to the ConvAI-augmented versions of the data.
train, test = (
    pd.read_csv('../input/convai-datasets-baseline-models/train_with_convai.csv'),
    pd.read_csv('../input/convai-datasets-baseline-models/test_with_convai.csv'),
)

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Start from a copy of the LSTM frame so the non-label columns carry over,
# then replace the label columns with the plain mean of NB-SVM and LSTM.
p_res = submission2.copy()
nbsvm_part = submission[label_cols]
lstm_part = submission2[label_cols]
p_res[label_cols] = (nbsvm_part + lstm_part) / 2
p_res.to_csv('submission.csv', index=False)

In [None]:
feats_to_concat = ['comment_text', 'toxic_level', 'attack', 'aggression']
# Stack the selected columns of train and test row-wise (train rows first).
alldata = pd.concat([train[feats_to_concat], test[feats_to_concat]], axis=0)
# Fill missing comments through the frame itself and rebind the result.
# An in-place fillna on the attribute-accessed column is a chained update,
# which pandas warns about and which does not reach `alldata` under
# Copy-on-Write.
alldata = alldata.fillna({'comment_text': 'unknown'})

In [None]:
# Both vectorizers are fitted on the combined train+test comments.
all_comments = alldata.comment_text

# Word unigrams, capped at 50000 features.
vect_words = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), max_features=50000)
all_words = vect_words.fit_transform(all_comments)

# Character 1- to 3-grams, capped at 20000 features.
vect_chars = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=20000)
all_chars = vect_chars.fit_transform(all_comments)

In [None]:
train_new = train
test_new = test

# alldata was built with the train rows first, so the first len(train_new)
# rows of each matrix are training rows and the rest are test rows.
n_train_rows = len(train_new)
train_words, test_words = all_words[:n_train_rows], all_words[n_train_rows:]
train_chars, test_chars = all_chars[:n_train_rows], all_chars[n_train_rows:]

In [None]:
feats = ['toxic_level', 'attack']
# Append the extra feature columns to the word and char TF-IDF blocks,
# using the same train/test row split as the TF-IDF matrices.
extra_feats = alldata[feats]
split_at = len(train_new)
train_feats = sparse.hstack([train_words, train_chars, extra_feats[:split_at]])
test_feats = sparse.hstack([test_words, test_chars, extra_feats[split_at:]])

In [None]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

only_col = ['toxic']

# One column of test probabilities per label.
preds = np.zeros((test_new.shape[0], len(col)))

# Fit an independent logistic regression per label on the stacked features.
for label_idx, label in enumerate(col):
    print('===Fit '+label)

    model = LogisticRegression(C=4.0, solver='sag')
    print('Fitting model')
    model.fit(train_feats, train_new[label])

    print('Predicting on test')
    positive_proba = model.predict_proba(test_feats)[:, 1]
    preds[:, label_idx] = positive_proba

In [None]:
# Take the ids from the sample submission and attach the LR predictions.
subm = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
submid = pd.DataFrame({'id': subm["id"]})
lr_probs = pd.DataFrame(preds, columns=col)
submission3 = pd.concat([submid, lr_probs], axis=1)
submission3.to_csv('submission_LR.csv', index=False)

In [None]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# `submission4` is never defined anywhere in this notebook, so copying it
# raised a NameError. Use the LSTM frame as the template instead, as the
# two-model blend does; only its non-label columns survive the assignment below.
p_res = submission2.copy()
# Weighted average of the three models: NB-SVM x2, LSTM x3, LR x4 (weights sum to 9).
p_res[label_cols] = (
    2 * submission[label_cols]
    + 3 * submission2[label_cols]
    + 4 * submission3[label_cols]
) / 9
p_res.to_csv('submission.csv', index=False)