# Kaggle Toxic Comments Challenge

In [None]:
import pandas as pd
import numpy as np
import gensim
import keras

## Data Loading

In [None]:
# Load the training and test splits; both CSVs are indexed by their `id` column.
df, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Count missing values per column in the test set.
df_test.isnull().sum()

In [None]:
# Replace missing test values with empty strings, modifying df_test in place.
df_test.fillna('', inplace = True)

In [None]:
# Preview the first rows of the training data.
df.head()

In [None]:
# (rows, columns) of the training frame.
df.shape

In [None]:
# Print the column summary of the training frame.
df.info()

## Preprocessing

In [None]:
# Tokenize every training comment with gensim's simple_preprocess.
simple_tokens = df.comment_text.apply(gensim.utils.simple_preprocess)

In [None]:
simple_tokens

In [None]:
# Spot check: show the raw text of one comment, selected by its id.
df[df.index==999898414104]['comment_text'] # its tokenized form is: [it, staying, let, move, on, corbett]

In [None]:
# Fit a gensim phrase model on the tokenized comments and wrap it in a Phraser;
# the Phraser is the `tokenizer` applied to token lists throughout this notebook.
phrases = gensim.models.phrases.Phrases(simple_tokens)
tokenizer = gensim.models.phrases.Phraser(phrases)

In [None]:
tokenized_text = list(tokenizer[simple_tokens]) # a 2D list of all the keywords from comment_text

In [None]:
# Tokens of the first comment after phrase merging.
tokenized_text[0]

In [None]:
# Token <-> integer id mapping built over the whole tokenized corpus.
corpus_dict = gensim.corpora.dictionary.Dictionary(tokenized_text)

In [None]:
# Every column after the first is treated as a label column.
# NOTE(review): assumes the first non-index column of train.csv is comment_text — confirm.
TARGET_CLASSES = df.columns[1:]

In [None]:
TARGET_CLASSES

In [None]:
# Label matrix: one row per comment, one column per entry of TARGET_CLASSES.
targets = df[TARGET_CLASSES].values

## Analysis using seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Histogram of comment lengths, measured in tokens per comment.
tokens_per_comment = list(map(len, tokenized_text))
sns.distplot(
    tokens_per_comment,
    bins=100,
    kde=False,
    label='Number of tokens per comment.',
)
plt.ylabel("Frequency")
plt.xlabel("Tokens in a comment")
# Restrict the x-axis to the 0-400 token range.
plt.xlim((0, 400))

## Training word2vec on comment data

In [None]:
# Train a 100-dimensional word2vec model on the phrase-merged comments,
# ignoring tokens that occur fewer than 2 times (min_count=2).
# NOTE(review): `size=` looks like the gensim 3.x spelling of this argument — confirm the pinned gensim version.
word2vec = gensim.models.word2vec.Word2Vec(tokenized_text, window=5, size=100, min_count=2, workers=6)

In [None]:
# Sanity-check the learned vectors by listing the nearest neighbours of sample words.
word2vec.wv.most_similar('popularity')

In [None]:
word2vec.wv.most_similar('idiot')

## word2vec-based model

In [None]:
# Baseline features: one fixed-length vector per comment, computed as the mean of
# its in-vocabulary word vectors, each divided by that word's corpus count.
features = np.zeros((len(tokenized_text), word2vec.vector_size))
for row, doc in enumerate(tokenized_text):
    weighted_vectors = [
        word2vec.wv[token] / word2vec.wv.vocab[token].count
        for token in doc
        if token in word2vec.wv.vocab
    ]
    # Comments without any in-vocabulary token keep their all-zero row.
    if not weighted_vectors:
        continue
    features[row, :] = np.mean(weighted_vectors, axis=0)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Baseline classifier: two ReLU hidden layers over the averaged word2vec features,
# followed by one independent sigmoid output per target class.
model = Sequential([
    Dense(256, activation='relu', input_shape=(word2vec.vector_size,)),
    Dense(128, activation='relu'),
    Dense(len(TARGET_CLASSES), activation='sigmoid'),
])
model.compile(
    optimizer=Adam(lr=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline for 10 epochs with validation_split=0.1 (10% of the data held out for validation).
model.fit(features, targets, epochs=10, validation_split=0.1)

In [None]:
# Persist the baseline model: architecture as YAML and as JSON, weights as HDF5.
model_yaml = model.to_yaml()
model_json = model.to_json()
architecture_files = (
    ("model-baseline.yaml", model_yaml),
    ("model-baseline.json", model_json),
)
for file_name, serialized in architecture_files:
    with open(file_name, "w") as out_file:
        out_file.write(serialized)

# The weights go to their own HDF5 file.
model.save_weights("model-baseline.h5")
print("Saved model to disk")

## Sequential models

In [None]:
# Note: shifting indices by 1 as index 0 will be used for padding.
# Encode each tokenized comment as dictionary ids, shifted by 1 (see the note above this cell's code).
docs = [[idx + 1 for idx in corpus_dict.doc2idx(doc)]  for doc in tokenized_text]

In [None]:
# Fixed sequence length fed to the sequential models.
MAX_SEQ_LEN = 50
# Pad with 0; comments longer than MAX_SEQ_LEN are truncated with truncating='post'.
padded_docs = keras.preprocessing.sequence.pad_sequences(docs, maxlen=MAX_SEQ_LEN, truncating='post', value=0)

In [None]:
# Largest shifted token id across all comments; used below to size the embedding table.
max_idx = max(c for d in docs for c in d)
max_idx

In [None]:
# Build the embedding matrix row by row. Row 0 belongs to the '0' padding word;
# row idx + 1 belongs to dictionary token idx (ids were shifted by 1 for padding).
embedding_rows = [np.random.normal(size=word2vec.vector_size)]
for idx in range(max_idx):
    token = corpus_dict[idx]
    if token in word2vec.wv.vocab:
        embedding_rows.append(word2vec.wv[token])
    else:
        # No trained vector for this token: fall back to a random one.
        embedding_rows.append(np.random.normal(size=word2vec.vector_size))
embeddings = np.array(embedding_rows)

### CNN

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPool1D, Flatten, BatchNormalization
# Imported here as well so this cell does not depend on the baseline cell having run.
from keras.optimizers import Adam

# CNN over padded token-id sequences: embedding -> two (dropout, batch-norm,
# L2-regularised convolution, max-pool) stages -> flatten -> one sigmoid per target class.
model = Sequential()
# Initialise the embedding table from the `embeddings` matrix built earlier in this
# notebook (row 0 is the padding index). Without `weights=` the layer starts from
# random values and the word2vec vectors are never used. The table stays trainable.
model.add(Embedding(max_idx + 1, word2vec.vector_size, weights=[embeddings],
                    input_length=MAX_SEQ_LEN))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Convolution1D(52, 5, padding='same',
                        kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(MaxPool1D())
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Convolution1D(128, 3, padding='same',
                        kernel_regularizer=keras.regularizers.l2(0.01)))
model.add(MaxPool1D())
model.add(Flatten())
model.add(Dense(len(TARGET_CLASSES), activation='sigmoid',
                kernel_regularizer=keras.regularizers.l2(0.02)))
model.compile(Adam(0.001), 'binary_crossentropy')

In [None]:
# Train the CNN for 20 epochs in batches of 512 with validation_split=0.1.
model.fit(padded_docs, targets, batch_size=512, epochs=20, validation_split=0.1)

In [None]:
# Persist the CNN: architecture as YAML and as JSON, weights as HDF5.
model_yaml = model.to_yaml()
model_json = model.to_json()
architecture_files = (
    ("model-cnn.yaml", model_yaml),
    ("model-cnn.json", model_json),
)
for file_name, serialized in architecture_files:
    with open(file_name, "w") as out_file:
        out_file.write(serialized)

# The weights go to their own HDF5 file.
model.save_weights("model-cnn.h5")
print("Saved model to disk")

## Submission

In [None]:
def comment_to_sequential_input(comment):
    """Convert one raw comment into a fixed-length sequence of token ids.

    The comment is tokenized with ``gensim.utils.simple_preprocess``, passed
    through the phrase ``tokenizer``, mapped to ``corpus_dict`` ids shifted by
    1 (index 0 is reserved for padding), and padded or truncated to
    ``MAX_SEQ_LEN``.

    Padding and truncation use the same settings as the training sequences
    (``truncating='post'``, ``value=0``), so a long comment keeps its first
    ``MAX_SEQ_LEN`` tokens here exactly as it did during training. The
    ``pad_sequences`` default (``truncating='pre'``) would keep the last ones.

    Args:
        comment: Raw comment text.

    Returns:
        1-D array of length ``MAX_SEQ_LEN`` holding the shifted token ids.
    """
    tokens = tokenizer[gensim.utils.simple_preprocess(comment)]
    token2id = corpus_dict.token2id
    # NOTE(review): tokens missing from the word2vec vocabulary are dropped here,
    # while the training sequences keep every dictionary token — confirm intended.
    t_ids = [token2id[t] + 1 for t in tokens if t in word2vec.wv.vocab and t in token2id]
    return keras.preprocessing.sequence.pad_sequences(
        [t_ids], maxlen=MAX_SEQ_LEN, truncating='post', value=0)[0]

In [None]:
# Smoke test: score one hand-written comment and print each class probability as a percentage.
sample_sequence = comment_to_sequential_input("You are a jerk you freakin indian.")
test_input = [sample_sequence.reshape(1, -1)]
class_scores = model.predict(test_input)[0]
for target_class, score in zip(TARGET_CLASSES, class_scores):
    print("{}: {:.2f}%".format(target_class, score * 100))

In [None]:
# Encode every test comment as a padded id sequence.
test_inputs = np.array([comment_to_sequential_input(doc) for doc in df_test.comment_text])

In [None]:
# Score the test set with the current `model`.
test_outputs = model.predict(test_inputs)

In [None]:
# Scores for the first test comment.
test_outputs[0]

In [None]:
# Start the submission frame from the test ids only.
output_df = df_test.reset_index()[['id']].copy()

In [None]:
# Add one score column per target class, taking column i of the predictions for class i.
for i, target_class in enumerate(TARGET_CLASSES):
    output_df[target_class] = test_outputs[:, i]

In [None]:
# Inspect 10 sampled comments with a toxic score above 0.5, joined back to their text.
# NOTE(review): sample(10) raises if fewer than 10 rows pass the filter.
output_df[output_df.toxic > 0.5].sample(10, random_state=0).merge(df_test.reset_index(), on='id')

In [None]:
# Write the submission file without the pandas index.
output_df.to_csv('cnn-pred.csv', index=False)