In [5]:
import os
import re
import sys
import numpy as np
import pandas as pd

#import matplotlib.pyplot as plt
#%matplotlib inline

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import RNN, GRU, LSTM, Dense, Input, Embedding, Dropout, Activation, concatenate
from keras.layers import Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import initializers, regularizers, constraints, optimizers, layers

In [6]:
# Input locations: pre-trained GloVe vectors plus the competition CSV files.
embedding_file = 'glove.6B.300d.txt'

# Read the labelled training comments and the unlabelled test comments.
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')

In [None]:
# Summary statistics (pandas `describe`) for the training frame.
train_data.describe()

In [None]:
# Summary statistics (pandas `describe`) for the test frame.
test_data.describe()

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Per-column flag: True if the training frame has at least one missing value in that column.
train_data.isnull().any()

In [None]:
# Per-column flag: True if the test frame has at least one missing value in that column.
test_data.isnull().any()

In [26]:
# Preview the first rows of the training frame: id, comment text and the six label columns.
train_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Further action on the columns is not required, because no column has any missing data.

In [7]:
# The six binary target columns; one comment can carry several labels at once.
classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Label matrix with one row per training comment and one column per entry of `classes`.
y = train_data[classes].values

In [8]:
# Normalise the raw comment text of both splits in the same way:
# missing comments become the placeholder string "fillna", then everything is lower-cased.
train_sentences, test_sentences = (
    frame["comment_text"].fillna("fillna").str.lower()
    for frame in (train_data, test_data)
)

In [17]:
# Hyper-parameters shared by the tokenizer, the padding step and the embedding layer.
embed_size = 300    # width of each row of the embedding matrix (matches the 300d GloVe file)
max_len = 150       # every tokenized comment is padded/truncated to this many tokens
max_features = 50   # vocabulary cap handed to the Tokenizer and the Embedding layer

In [18]:
# Build the word -> index vocabulary from the training comments only;
# `num_words` caps how many of the most frequent words are used when encoding text.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_sentences.tolist())

In [19]:
# Encode each comment as a list of integer word ids using the fitted vocabulary.
tokenized_train_sentences, tokenized_test_sentences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_sentences, test_sentences)
)

In [20]:
# Spot-check: the lower-cased text of the training comment at index 1.
train_sentences[1]

"d'aww! he matches this background colour i'm seemingly stuck with. thanks.  (talk) 21:51, january 11, 2016 (utc)"

In [24]:
# Show the word -> index mapping learned by the tokenizer (indices start at 1).
# NOTE(review): this displays the entire vocabulary dict; consider showing only a slice.
tokenizer.word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'you': 6,
 'i': 7,
 'is': 8,
 'that': 9,
 'in': 10,
 'it': 11,
 'for': 12,
 'this': 13,
 'not': 14,
 'on': 15,
 'be': 16,
 'as': 17,
 'have': 18,
 'are': 19,
 'your': 20,
 'with': 21,
 'if': 22,
 'article': 23,
 'was': 24,
 'or': 25,
 'but': 26,
 'page': 27,
 'wikipedia': 28,
 'my': 29,
 'an': 30,
 'from': 31,
 'by': 32,
 'do': 33,
 'at': 34,
 'me': 35,
 'about': 36,
 'so': 37,
 'talk': 38,
 'what': 39,
 'can': 40,
 'there': 41,
 'all': 42,
 'has': 43,
 'will': 44,
 'please': 45,
 'no': 46,
 'would': 47,
 'one': 48,
 'like': 49,
 'just': 50,
 'they': 51,
 'he': 52,
 'which': 53,
 'any': 54,
 'been': 55,
 'should': 56,
 'more': 57,
 'we': 58,
 "don't": 59,
 'some': 60,
 'other': 61,
 'who': 62,
 'here': 63,
 'see': 64,
 'also': 65,
 'his': 66,
 'think': 67,
 'because': 68,
 'know': 69,
 'how': 70,
 'edit': 71,
 'am': 72,
 "i'm": 73,
 'people': 74,
 'why': 75,
 'up': 76,
 'only': 77,
 "it's": 78,
 'out': 79,
 'articles': 80,
 'use': 81,

In [22]:
# Spot-check: the integer encoding of the training comment at index 1.
tokenized_train_sentences[1]

[13, 21, 38]

In [14]:
# Spot-check: the lower-cased text of the training comment at index 5.
train_sentences[5]

'"\n\ncongratulations from me as well, use the tools well. \xa0· talk "'

In [15]:
# Spot-check: the integer encoding of the training comment at index 5.
tokenized_train_sentences[5]

[31, 35, 17, 81, 1, 38]

In [16]:
# Display the encoded training comments.
# NOTE(review): this dumps every encoded comment into the cell output; a slice such as
# `tokenized_train_sentences[:5]` would keep the notebook small and readable.
tokenized_train_sentences

[[75, 1, 29, 86, 51, 50, 15, 60, 7, 34, 4, 45, 59, 1, 31, 1, 38, 27, 73, 89],
 [52, 13, 73, 21, 94, 38],
 [73,
  14,
  2,
  71,
  78,
  50,
  9,
  13,
  8,
  4,
  2,
  35,
  3,
  29,
  38,
  27,
  52,
  2,
  57,
  36,
  1,
  93,
  1],
 [57,
  7,
  97,
  54,
  15,
  7,
  22,
  1,
  56,
  16,
  15,
  25,
  5,
  3,
  3,
  7,
  67,
  1,
  85,
  37,
  9,
  51,
  19,
  42,
  10,
  1,
  7,
  40,
  33,
  9,
  15,
  22,
  46,
  48,
  22,
  6,
  18,
  54,
  12,
  15,
  25,
  2,
  33,
  11,
  45,
  35,
  69,
  41,
  2,
  16,
  5,
  15,
  80,
  12,
  37,
  7,
  41,
  85,
  16,
  5,
  5,
  76,
  78,
  10,
  1,
  28,
  98,
  23],
 [6, 19, 29, 54, 6, 39, 27, 15],
 [31, 35, 17, 81, 1, 38],
 [6, 15, 29],
 [20, 2, 1, 23, 43, 55, 45, 59, 33, 11, 25, 6, 44, 16],
 [22,
  1,
  24,
  2,
  6,
  73,
  14,
  2,
  10,
  1,
  23,
  51,
  47,
  15,
  35,
  12,
  73,
  9,
  11,
  16,
  57,
  37,
  48,
  40,
  81,
  11,
  12,
  17,
  5,
  7,
  18,
  55,
  2,
  1,
  27,
  26,
  78,
  5,
  11,
  2,
  53,
  8,
  5,
  2

In [None]:
# Bring every encoded comment to exactly `max_len` ids so the batches are rectangular.
train_padding = pad_sequences(tokenized_train_sentences, maxlen=max_len)
test_padding = pad_sequences(tokenized_test_sentences, maxlen=max_len)

In [None]:
def get_coefs(word, *arr):
    """Pair an embedding token with its coefficients.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, i.e. the vector components (numbers or numeric strings).

    Returns:
        A `(word, vector)` tuple where `vector` is a 1-D float32 NumPy array built from `arr`.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding text file into {token: float32 vector}; each line is
# "<token> <v1> <v2> ..." separated by single spaces.
# The file is opened in a `with` block so the handle is closed once parsing finishes,
# and it is decoded explicitly as UTF-8 instead of the platform's default encoding
# (which is not UTF-8 on every OS and can fail or garble non-ASCII tokens).
with open(embedding_file, encoding='utf-8') as embedding_lines:
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in embedding_lines)

# Build the initial weights of the Embedding layer: row i holds the pre-trained
# vector of the word whose tokenizer index is i.
word_index = tokenizer.word_index

# Tokenizer indices start at 1 (row 0 is left as zeros for the padding id), so a
# vocabulary of V words needs V + 1 rows. Using len(word_index) alone was off by one:
# when the vocabulary is smaller than max_features, the highest index equals the
# row count and the assignment below raised IndexError.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Words ranked beyond the vocabulary cap have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the pre-trained file keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional-GRU multi-label classifier over padded sequences of word ids.
# (The input tensor is named `image_input` for historical reasons; it carries token ids.)
image_input = Input(shape=(max_len, ))
# The layer's vocabulary size is taken from the weight matrix itself, so the two can
# never disagree (a hard-coded max_features mismatches whenever the matrix has fewer rows).
X = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix])(image_input)
X = Bidirectional(GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(X)
# Dropout and R-Dropout sequence, inspired by Deep Learning with Python - Francois Chollet
# Summarise the per-timestep GRU outputs with both average and max pooling.
avg_pl = GlobalAveragePooling1D()(X)
max_pl = GlobalMaxPooling1D()(X)
conc = concatenate([avg_pl, max_pl])
# One independent sigmoid per label: the six classes are not mutually exclusive.
X = Dense(6, activation="sigmoid")(conc)
model = Model(inputs=image_input, outputs=X)

In [None]:
# Binary cross-entropy treats each of the six sigmoid outputs as its own yes/no problem.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training callbacks: keep the best weights on disk and stop when validation stalls.
saved_model = "weights_base.best.hdf5"
# Both callbacks watch the validation loss. The accuracy key is named 'val_acc' in older
# Keras releases and 'val_accuracy' in newer ones, so monitoring it by a fixed name makes
# the callbacks silently do nothing on the other version. 'val_loss' is always logged when
# validation data is used, and it is a more informative signal than accuracy for these
# heavily imbalanced multi-label targets.
checkpoint = ModelCheckpoint(saved_model, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]

In [None]:
# Training schedule.
epoch = 2        # number of passes over the training data
batch_sz = 32    # samples per gradient update

# Hold out the last 10% of the training rows for validation and train with the callbacks.
model.fit(
    train_padding,
    y,
    epochs=epoch,
    batch_size=batch_sz,
    callbacks=callbacks_list,
    validation_split=0.1,
)

In [None]:
# Score the padded test comments; the result has one column of probabilities per label.
test_values = model.predict(
    [test_padding],
    verbose=1,
    batch_size=1024,
)

In [None]:
# Write the predictions in the competition's submission format.
# The sample file supplies the row order and the non-label columns.
# NOTE(review): assumes its rows are in the same order as test.csv; confirm.
sample_submission = pd.read_csv('sample_submission.csv')
# Overwrite the six label columns with the predicted probabilities.
sample_submission[classes] = test_values
# index=False keeps the pandas row index out of the output file.
sample_submission.to_csv('submission.csv', index=False)