Trying Convolutional Neural Networks for text classification with Keras, since its API is straightforward.
All word embeddings are GloVe vectors (http://nlp.stanford.edu/data/glove.840B.300d.zip).

First copied an example from Keras: https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py and got a Public Test score of 0.049.
Then tried to emulate CNN-static from Yoon Kim (http://aclweb.org/anthology/D14-1181) and got a Public Test score of (TODO: score was not recorded).

In [27]:
from keras import backend as K
from keras.layers import Activation, Conv1D, Embedding, Dense, Dropout, Flatten, GlobalMaxPooling1D, Input, MaxPooling1D
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import regularizers
import numpy as np
import pandas as pd

In [3]:
# Load the labelled training set. Every column other than the id and the raw
# text is used as a target label.
df = pd.read_csv('train.csv')
classes = df.columns.drop(['id', 'comment_text'])
raw_comments = df['comment_text'].values
y = df[classes].values

In [4]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the input
# dimension of every Embedding layer.
MAX_WORDS = 20000
# Fixed sequence length: comments are padded to this length and it is the
# shape of every model Input.
LEN_SENTENCE = 100

In [8]:
# Fit the vocabulary on the training comments (capped via num_words=MAX_WORDS).
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(list(raw_comments))

# Map every comment to a sequence of word ids, then bring all sequences to
# exactly LEN_SENTENCE ids so they can be batched as one array.
tokens = tokenizer.texts_to_sequences(raw_comments)
X = sequence.pad_sequences(sequences=tokens, maxlen=LEN_SENTENCE)

In [9]:
# Dimensionality of the GloVe vectors in glove.840B.300d.txt.
EMBED_SIZE = 300


def get_coefs(word, *arr):
    """Return ``(word, vector)`` with the coefficients packed into a float32 array.

    word: the token (first field of a GloVe line).
    arr:  the remaining fields of the line, one numeric string per dimension.
    """
    return word, np.asarray(arr, dtype='float32')


embeddings_index = {}
# The context manager closes the (multi-GB) file as soon as loading finishes.
with open('glove.840B.300d.txt', 'r') as glove_file:
    for line in glove_file:
        # Split from the right so exactly EMBED_SIZE numeric fields are peeled
        # off. Some tokens in the 840B release reportedly contain whitespace,
        # which a plain split() would break apart and then fail to parse as floats.
        fields = line.rstrip().rsplit(' ', EMBED_SIZE)
        if len(fields) != EMBED_SIZE + 1:
            continue  # blank or truncated line: no usable vector
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

In [20]:
# Global mean / standard deviation over every coefficient of every pretrained
# vector; used to initialise rows for words that have no pretrained vector.
# list(...) hands np.stack a real sequence: on Python 3 dict.values() is a
# view, which newer NumPy versions refuse to stack.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean = all_embs.mean()
emb_std = all_embs.std()

In [24]:
word_index = tokenizer.word_index
# Tokenizer word ids start at 1 (0 is reserved for padding), so a table that
# holds id `len(word_index)` needs `len(word_index) + 1` rows. Without the +1
# the last word of a small vocabulary would be dropped.
nb_words = min(MAX_WORDS, len(word_index) + 1)

# Start from random rows drawn with the same mean/std as the pretrained
# vectors, so words missing from GloVe still get plausibly scaled values.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, EMBED_SIZE))
for word, i in word_index.items():
    if i >= nb_words:
        # Outside the table (rarer than the vocabulary cap allows).
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### Keras example model

This was based on the example from Keras. Surprisingly, it ended up performing better because the other models overfit more. The submitted version of this model was trained for only 1 epoch. As you can see from the output below, the log loss on the validation set increased in the second epoch despite a lower training log loss, which indicates overfitting.

In [134]:
# Single-branch CNN modelled on the Keras IMDB example: embedding -> dropout ->
# one width-3 convolution -> max over time -> hidden dense layer -> sigmoid
# output with one unit per label.
inp = Input(shape=(LEN_SENTENCE,))
keras_example_layers = [
    Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix], trainable=True),
    Dropout(0.2),
    Conv1D(250, 3, padding='valid', activation='relu', strides=1),
    GlobalMaxPooling1D(),
    Dense(250),
    Dropout(0.2),
    Activation('relu'),
    Dense(len(classes), activation='sigmoid'),
]

# Thread the input through the stack in order.
x = inp
for layer in keras_example_layers:
    x = layer(x)

model_keras = Model(inputs=inp, outputs=x)
model_keras.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [129]:
# Two epochs with 5% of the rows held out for validation.
model_keras.fit(
    X, y,
    batch_size=32,
    epochs=2,
    validation_split=0.05,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4ae1b65e10>

### Convolutional Neural Networks for Sentence Classification

The CNN-static, CNN-nonstatic, and CNN-multichannel versions from the paper were implemented. These all performed significantly worse than the model above as you can see from the training and test metrics. A custom regularization method was implemented below based on the description in the paper.

In [110]:
# Number of convolution filters created for each filter-window size.
NUM_FILTERS = 100
# Stride passed to every Conv1D layer in the sentence-classification models.
conv_strides = 1

In [115]:
# Regularization on weights of penultimate layer: L2 norms cannot be higher than 3
def custom_reg(weight_matrix, max_norm=3.0):
    """Scalar penalty on weight vectors whose L2 norm exceeds ``max_norm``.

    A Keras regularizer must return a scalar loss term. The previous version
    returned the L2-normalised weight tensor clipped at 3; clipping is a no-op
    on a unit-norm tensor and the result was not a scalar, so the norm limit
    was never actually applied.

    weight_matrix: kernel tensor; each slice along axis 0 is treated as one
        weight vector (for a Dense kernel, that is one vector per output unit).
    max_norm: largest L2 norm that incurs no penalty.

    Returns the sum over weight vectors of ``max(||w||^2 - max_norm^2, 0)``,
    which is zero exactly when every vector satisfies ``||w|| <= max_norm``.

    NOTE(review): this is a soft penalty. The paper rescales the weights after
    each update (a hard constraint); Keras exposes that as a kernel constraint
    rather than a regularizer -- consider switching if exact parity is wanted.
    """
    # Work with squared norms: avoids sqrt, whose gradient is undefined at 0.
    squared_norms = K.sum(K.square(weight_matrix), axis=0)
    excess = K.maximum(squared_norms - max_norm ** 2, 0.0)
    return K.sum(excess)

#### Static

In [130]:
# CNN-static: the pretrained embedding is frozen (trainable=False); only the
# convolution and output layers are learned. Without the explicit flag the
# Embedding layer defaults to trainable and this would duplicate the
# non-static variant.
inp = Input(shape=(LEN_SENTENCE,))
x = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix], trainable=False)(inp)

convolutions = []
for filter_window in [3, 4, 5]:
    conv = Conv1D(NUM_FILTERS, filter_window, padding='valid', activation='relu', strides=conv_strides)(x)
    # Max-over-time pooling: one value per filter. Global pooling needs no
    # hand-computed pool size (the old `/` yielded a float on Python 3).
    conv = GlobalMaxPooling1D()(conv)
    convolutions.append(conv)
x = Concatenate()(convolutions)

# Dropout belongs on the pooled features (the penultimate layer). Applying it
# after the sigmoid zeroes/rescales the predictions themselves during training
# and corrupts the loss.
x = Dropout(0.5)(x)
x = Dense(len(classes), activation='sigmoid', kernel_regularizer=custom_reg)(x)

cnn_static = Model(inputs=inp, outputs=x)
cnn_static.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [117]:
# Two epochs with 5% of the rows held out for validation.
cnn_static.fit(
    X, y,
    batch_size=50,
    epochs=2,
    validation_split=0.05,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4b287ee510>

#### Non-static

In [122]:
# CNN-non-static: starts from the pretrained embedding and fine-tunes it
# together with the rest of the network (trainable=True).
inp = Input(shape=(LEN_SENTENCE,))
x = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix], trainable=True)(inp)

convolutions = []
for filter_window in [3, 4, 5]:
    conv = Conv1D(NUM_FILTERS, filter_window, padding='valid', activation='relu', strides=conv_strides)(x)
    # Max-over-time pooling: one value per filter. Global pooling needs no
    # hand-computed pool size (the old `/` yielded a float on Python 3).
    conv = GlobalMaxPooling1D()(conv)
    convolutions.append(conv)
x = Concatenate()(convolutions)

# Dropout belongs on the pooled features (the penultimate layer). Applying it
# after the sigmoid zeroes/rescales the predictions themselves during training
# and corrupts the loss.
x = Dropout(0.5)(x)
x = Dense(len(classes), activation='sigmoid', kernel_regularizer=custom_reg)(x)

cnn_non_static = Model(inputs=inp, outputs=x)
cnn_non_static.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [123]:
# Two epochs with 5% of the rows held out for validation.
cnn_non_static.fit(
    X, y,
    batch_size=50,
    epochs=2,
    validation_split=0.05,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4ae2383150>

#### Multi-Channel

In [124]:
# CNN-multichannel: two copies of the pretrained embedding read the same input.
# One is fine-tuned, the other stays frozen; their vectors are concatenated
# along the feature axis before the convolutions.
inp = Input(shape=(LEN_SENTENCE,))

channel_dynamic = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix], trainable=True)(inp)
# trainable=False is required here: the Embedding default is trainable, which
# would make both channels fine-tuned.
channel_static = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix], trainable=False)(inp)
x = Concatenate()([channel_dynamic, channel_static])

convolutions = []
for filter_window in [3, 4, 5]:
    conv = Conv1D(NUM_FILTERS, filter_window, padding='valid', activation='relu', strides=conv_strides)(x)
    # Max-over-time pooling: one value per filter. Global pooling needs no
    # hand-computed pool size (the old `/` yielded a float on Python 3).
    conv = GlobalMaxPooling1D()(conv)
    convolutions.append(conv)
x = Concatenate()(convolutions)

# Dropout belongs on the pooled features (the penultimate layer). Applying it
# after the sigmoid zeroes/rescales the predictions themselves during training
# and corrupts the loss.
x = Dropout(0.5)(x)
x = Dense(len(classes), activation='sigmoid', kernel_regularizer=custom_reg)(x)

cnn_multichannel = Model(inputs=inp, outputs=x)
cnn_multichannel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [125]:
# Two epochs with 5% of the rows held out for validation.
cnn_multichannel.fit(
    X, y,
    batch_size=50,
    epochs=2,
    validation_split=0.05,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4ae23efe10>

# Fit on all training data, predict on test data, write to csv

In [135]:
# Train on the complete training set (no validation split) before predicting.
# NOTE(review): when the notebook is run top to bottom, model_keras has already
# been fit in the validation run earlier, so this call appears to continue from
# those weights rather than from a fresh model -- confirm that is intended.
model_keras.fit(
    X, y,
    batch_size=32,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4af18ee890>

In [136]:
sub = pd.read_csv('test.csv')
# Missing comments are replaced by a placeholder so the tokenizer only ever
# receives strings.
sub_tokens = tokenizer.texts_to_sequences(sub.comment_text.fillna("_na_").values)
# Same fixed length as the training sequences.
X_sub = sequence.pad_sequences(sub_tokens, maxlen=LEN_SENTENCE)
# Predict with the model fitted in this notebook. The name `model` is never
# defined here, so the old call only worked with leftover kernel state.
pred_sub = model_keras.predict(X_sub, batch_size=1024, verbose=2)

In [137]:
# Submission layout: the test-file columns minus the raw text, followed by one
# predicted-probability column per label.
submission_ids = sub.drop(columns=['comment_text'])
submission_preds = pd.DataFrame(pred_sub, columns=classes)
df_sub = pd.concat([submission_ids, submission_preds], axis=1)
df_sub.to_csv('conv_submission_keras_trainable_embedding_2epoch.csv', index=False)