# Import Libraries

In [1]:
import pandas as pd
import re
import numpy as np
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.models import Model
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import SpatialDropout1D, Dense, Flatten, GlobalMaxPooling1D, Activation, Dropout, GaussianNoise
from keras.layers import Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from IPython.display import display
import itertools
from nltk.corpus import words
%matplotlib inline
import matplotlib.pyplot as plt

  return f(*args, **kwds)
  return f(*args, **kwds)
Using TensorFlow backend.


# Load Cleaned Data

In [3]:
def _comment_texts(frame):
    """Return the `comment_text` column of `frame` as an array, with missing entries replaced by "_NaN_"."""
    return frame["comment_text"].fillna("_NaN_").values


# Read the cleaned train / test CSV files
train = pd.read_csv('./data/cleaned_train.csv')
test = pd.read_csv('./data/cleaned_test.csv')

# Label columns; `y` holds one column per label, in this order
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values

# Comment texts for each split
list_sentences_train = _comment_texts(train)
list_sentences_test = _comment_texts(test)

In [None]:
# Set parameters (shared by the tokenizer, the embedding matrix and the model)
embed_size   = 300    # dimensionality of each word vector
max_features = 100000 # vocabulary cap: how many unique words to use (i.e. number of rows in the embedding matrix)
maxlen       = 400   # number of tokens every comment is padded / truncated to

In [4]:
# Fit the vocabulary on the training comments only, then encode both splits
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list_sentences_train.tolist())

# Text -> list of integer word ids
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

# Bring every sequence to exactly `maxlen` ids, padding at the end
pad_options = dict(maxlen=maxlen, padding='post')
X_train = pad_sequences(list_tokenized_train, **pad_options)
X_test = pad_sequences(list_tokenized_test, **pad_options)

In [5]:
# Load the pre-trained GloVe vectors into a {word: vector} lookup.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
# The encoding is given explicitly so the read does not depend on the
# platform's default encoding (GloVe files are distributed as UTF-8 text).
with open('./data/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) <= embed_size:
            # Blank or truncated line: no room for a token plus a full vector.
            continue
        # The last `embed_size` fields are the vector; whatever precedes them is
        # the token (it may span several fields if it contained whitespace).
        word = ''.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs

In [6]:
# Statistics of the pre-trained vectors, used to initialise words GloVe does not cover.
# np.stack expects a real sequence, so the dict view is materialised as a list first.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Create embedding matrix using our vocabulary
word_index = tokenizer.word_index
# Tokenizer indices start at 1 (row 0 is the padding id), so a vocabulary of
# N words needs N + 1 rows; the matrix is still capped at `max_features` rows.
nb_words = min(max_features, len(word_index) + 1)

# Initialize every row randomly with the same mean / std as the GloVe vectors
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Overwrite the rows of words that have a pre-trained vector
for word, i in word_index.items():
    if i >= nb_words:
        continue  # index falls outside the matrix (word is beyond the vocabulary cap)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Training

In [7]:
from keras.engine import InputSpec, Layer
import tensorflow as tf
def auc_roc(y_true, y_pred):
    """Streaming AUC metric built from TensorFlow 1.x graph ops.

    Takes the (y_true, y_pred) argument pair and returns a tensor, so it is
    presumably meant to be passed in a Keras ``metrics=[...]`` list.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A tensor holding the running AUC; evaluating it also runs the
        metric's update op.
    """
    # tf.metrics.auc replaces the deprecated tf.contrib.metrics.streaming_auc.
    # Note the argument order is (labels, predictions), the reverse of the old call.
    value, update_op = tf.metrics.auc(y_true, y_pred)

    # Find the local variables created for this metric: those whose second
    # name-scope component contains 'auc_roc'. Variables without a scoped name
    # are skipped instead of raising IndexError.
    metric_vars = []
    for var in tf.local_variables():
        name_parts = var.name.split('/')
        if len(name_parts) > 1 and 'auc_roc' in name_parts[1]:
            metric_vars.append(var)

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # Force the update op to run whenever the metric value is read
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
    return value

In [8]:
# Initialize parameters
conv_filters = 180 # number of filters in each Conv1D layer of the model (the same value is used for every kernel size)

In [9]:
from keras.constraints import max_norm
def _ngram_branch(embedded, kernel_size):
    """Conv1D -> ReLU -> global max pooling over `embedded` for one kernel size.

    Args:
        embedded: output tensor of the embedding (+ dropout) layers.
        kernel_size: convolution window width, i.e. the n-gram length.

    Returns:
        The pooled tensor with `conv_filters` features per sample.
    """
    conv = Conv1D(filters=conv_filters, kernel_size=kernel_size, kernel_initializer='normal')(embedded)
    actv = Activation('relu')(conv)
    return GlobalMaxPooling1D()(actv)


inp = Input(shape=(X_train.shape[1],), dtype='int32')
# input_dim is taken from the weight matrix itself so the layer shape always
# matches the weights it is initialised with. The embeddings stay frozen.
emb = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix],
                input_length=maxlen, trainable=False)(inp)
emb = SpatialDropout1D(0.2)(emb)

# One convolution branch per kernel size, i.e. 1- to 4-grams
ngram_features = [_ngram_branch(emb, kernel_size) for kernel_size in (1, 2, 3, 4)]

# Gather all convolution branches
cnct = concatenate(ngram_features, axis=1)
drp1 = Dropout(0.6)(cnct)
dns1 = Dense(144, activation='relu')(drp1)
# One sigmoid unit per label column of `y`
out = Dense(y.shape[1], kernel_constraint=max_norm(3.), activation='sigmoid')(dns1)

In [10]:
# Wrap the input / output tensors into a model and compile it
adam = Adam(lr=1e-3, decay=1e-7)
model = Model(inputs=inp, outputs=out)
model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [11]:
# Set callback functions to early stop training and save the best model so far
import os

checkpoint_path = './models/best_CNN_model.h5'
# The checkpoint callback writes into this directory, so make sure it exists
# before training starts instead of failing at the first save.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

callbacks = [
    # Stop once val_loss has not improved for 10 consecutive epochs
    EarlyStopping(monitor='val_loss', patience=10),
    # Keep only the model with the lowest val_loss seen so far
    ModelCheckpoint(filepath=checkpoint_path, monitor='val_loss', save_best_only=True),
]

In [12]:
# Fit the network; early stopping may end training before the epoch limit
fit_options = dict(
    batch_size=256,        # observations per batch
    epochs=50,             # upper bound on the number of epochs
    validation_split=0.1,  # fraction of the training data held out for validation
    shuffle=True,          # shuffle the training samples
    callbacks=callbacks,   # early stopping + best-model checkpoint
    verbose=1,             # print progress for each epoch
)
history = model.fit(X_train, y, **fit_options)

Train on 143613 samples, validate on 15958 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50


In [14]:
# Reload the checkpoint written by ModelCheckpoint (save_best_only=True), so that
# predictions come from the lowest-val_loss model rather than the last epoch's weights.
from keras.models import load_model
model = load_model('./models/best_CNN_model.h5')

In [15]:
# Predict
# Output has one row per test comment and one sigmoid score per label column
preds = model.predict(X_test)

# Submit

In [16]:
# Assemble the submission: the test ids followed by one score column per label
pred_frame = pd.DataFrame(preds, columns=list_classes)
submid = pd.DataFrame({'id': test["id"]})
submission = pd.concat([submid, pred_frame], axis=1)

# Write without the index so the file contains only id + label columns
submission.to_csv('conv_glove_simple_sub.csv', index=False)