Trying Convolutional Neural Networks for text classification using Keras, since its API is straightforward.
All word embeddings come from GloVe (http://nlp.stanford.edu/data/glove.840B.300d.zip).

First copied an example from Keras: https://github.com/keras-team/keras/blob/master/examples/imdb_cnn.py and got a Public Test score of 0.049.
Then tried to emulate CNN-static from Yoon Kim (http://aclweb.org/anthology/D14-1181); the Public Test score for that run was not recorded.

In [1]:
from keras.layers import Bidirectional, CuDNNGRU, Embedding, Dense, Dropout, Flatten, Input
from keras.models import Model
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras import regularizers
import numpy as np
import pandas as pd
import string

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the labelled training set: one free-text column plus one 0/1 column per label.
df = pd.read_csv('train.csv')
raw_comments = df['comment_text'].values
# Every column that is neither the row id nor the text is treated as a target label.
classes = df.columns.drop(['id', 'comment_text'])
y = df.loc[:, classes].values

In [3]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the
# input dimension of the word Embedding layers.
MAX_WORDS = 20000
# Number of token ids per comment after padding/truncation; also the
# input length of the word-level models.
LEN_SENTENCE = 150

In [4]:
# Fit the word vocabulary on the training comments, then convert every comment
# into a list of integer word ids.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(list(raw_comments))
tokens = tokenizer.texts_to_sequences(raw_comments)
# Bring every id sequence to exactly LEN_SENTENCE entries so the comments can be
# stacked into one 2-D array.
X = sequence.pad_sequences(tokens, maxlen=LEN_SENTENCE)

In [5]:
# Dimensionality of the glove.840B.300d vectors; defined first because the
# parser below uses it to bound the split.
EMBED_SIZE = 300


def get_coefs(word, *arr):
    """Turn one parsed GloVe record into a ``(word, vector)`` pair.

    Parameters
    ----------
    word : str
        The token at the start of the record.
    *arr :
        The vector components (strings or numbers convertible to float).

    Returns
    -------
    tuple
        ``(word, numpy.ndarray of dtype float32)``.
    """
    return word, np.asarray(arr, dtype='float32')


# Split from the right on single spaces, at most EMBED_SIZE times: some tokens
# in glove.840B.300d contain spaces themselves, so a plain whitespace split
# would push pieces of the token into the vector and make the float conversion
# fail. The context manager guarantees the (multi-GB) file handle is closed,
# and the explicit encoding keeps parsing independent of the platform locale.
with open('glove.840B.300d.txt', 'r', encoding='utf-8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ', EMBED_SIZE)) for line in glove_file
    )

In [6]:
# np.stack needs a real sequence (list/tuple) of arrays; a dict view is not
# accepted by current NumPy releases, so materialise it first.
all_embs = np.stack(list(embeddings_index.values()))
# Scalar mean / standard deviation over every component of every vector; used
# to draw random vectors for words that have no pre-trained embedding.
emb_mean = all_embs.mean()
emb_std = all_embs.std()

In [7]:
word_index = tokenizer.word_index
# Tokenizer word ids start at 1 (0 is left for padding), so a vocabulary of n
# words needs n + 1 rows before every word is addressable.
nb_words = min(MAX_WORDS, len(word_index) + 1)
# Always allocate MAX_WORDS rows so the matrix has exactly the shape of the
# Embedding(MAX_WORDS, EMBED_SIZE) layers that receive it as initial weights,
# even when the fitted vocabulary is smaller than MAX_WORDS.
# Rows are initialised from the GloVe mean/std so that words without a
# pre-trained vector start in the same value range as the known ones.
embedding_matrix = np.random.normal(emb_mean, emb_std, (MAX_WORDS, EMBED_SIZE))
for word, i in word_index.items():
    # Ids at or beyond the cap never reach the model; skip them.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Overwrite the random row with the pre-trained vector.
        embedding_matrix[i] = embedding_vector

### Keras example model

This was based on the Keras example (https://github.com/keras-team/keras/blob/master/examples/imdb_bidirectional_lstm.py), modified to use a CUDA-accelerated GRU (`CuDNNGRU`) as the recurrent layer.

In [8]:
# Word-level classifier: GloVe-initialised embedding -> bidirectional GRU ->
# dropout -> one sigmoid unit per label.
inp = Input(shape=(LEN_SENTENCE,))
embedded = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix])(inp)
encoded = Bidirectional(CuDNNGRU(64))(embedded)
regularized = Dropout(0.2)(encoded)
x = Dense(len(classes), activation='sigmoid')(regularized)

model_keras = Model(inputs=inp, outputs=x)
model_keras.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [9]:
# Train the word-level GRU on the full training matrix (no validation split).
model_keras.fit(
    X,
    y,
    batch_size=32,
    epochs=2,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f096cf1bfd0>

In [10]:
# Score the training comments with the fitted model and save the per-label
# sigmoid outputs next to the row ids.
X_pred = model_keras.predict([X], batch_size=1024)
pred_frame = pd.DataFrame(X_pred, columns=classes)
df_pred = pd.concat([df[['id']], pred_frame], axis=1)
df_pred.to_csv('train_RNN_GRU.csv', index=False)

### RCNN

In [13]:
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D

In [20]:
# Recurrent-convolutional variant: a width-3 convolution and max-pooling
# shorten the embedded sequence before it reaches the bidirectional GRU.
# Note: this rebinds `model_keras`, replacing the plain GRU model built earlier.
inp = Input(shape=(LEN_SENTENCE,))
embedded = Embedding(MAX_WORDS, EMBED_SIZE, weights=[embedding_matrix])(inp)
embedded = Dropout(0.2)(embedded)
conv_features = Conv1D(256, 3, activation='relu')(embedded)
pooled = MaxPooling1D(3)(conv_features)
encoded = Bidirectional(CuDNNGRU(64))(pooled)
encoded = Dropout(0.2)(encoded)
x = Dense(len(classes), activation='sigmoid')(encoded)

model_keras = Model(inputs=inp, outputs=x)
model_keras.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [22]:
# Train the RCNN, holding out 5% of the rows for validation.
model_keras.fit(
    X,
    y,
    validation_split=0.05,
    batch_size=32,
    epochs=2,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd54c128f10>

### Character based

In [3]:
# Character alphabet: lowercase letters, digits, punctuation and newline.
# Anything outside this alphabet is encoded as an all-zero row later on.
alphabet = ''.join([
    string.ascii_lowercase,
    string.digits,
    string.punctuation,
    '\n',
])
characters = list(alphabet)
NUM_CHARS = len(characters)
# Position of each character inside the one-hot vector.
char_vocab = dict(zip(characters, range(NUM_CHARS)))
char_set = set(char_vocab)

In [4]:
# Maximum number of leading characters of each comment that are one-hot
# encoded; kept small because a larger frame ran out of memory.
MAX_CHAR_LEN = 256

In [5]:
# One-hot character encoding of the training comments.
# Result shape: (number of comments, MAX_CHAR_LEN, NUM_CHARS); int8 keeps the
# dense tensor as small as possible.
X_char = np.zeros((len(raw_comments), MAX_CHAR_LEN, NUM_CHARS), dtype=np.int8)
for row, comment in enumerate(raw_comments):
    # Only the first MAX_CHAR_LEN characters of a comment are encoded.
    for pos, raw_char in enumerate(comment[:MAX_CHAR_LEN]):
        col = char_vocab.get(raw_char.lower())
        # Characters outside the alphabet leave their row all zeros.
        if col is not None:
            X_char[row, pos, col] = 1

In [10]:
# Character-level classifier: the one-hot character frames go straight into a
# bidirectional GRU, followed by a small dense head with dropout.
inp = Input(shape=(MAX_CHAR_LEN, NUM_CHARS))

encoded = Bidirectional(CuDNNGRU(64))(inp)
encoded = Dropout(0.2)(encoded)
hidden = Dense(200, activation='relu')(encoded)
hidden = Dropout(0.2)(hidden)
x = Dense(len(classes), activation='sigmoid')(hidden)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [12]:
# Train the character-level model, holding out 5% of the rows for validation.
model.fit(
    X_char,
    y,
    validation_split=0.05,
    batch_size=32,
    epochs=10,
    verbose=1,
)

Train on 91058 samples, validate on 4793 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 8096/91058 [=>............................] - ETA: 1:18 - loss: 0.0402 - acc: 0.9853

KeyboardInterrupt: 

# Fit on all training data, predict on test data, write to csv

In [None]:
# NOTE(review): this refits `model_keras` on the word-id matrix X, while the
# test-set cells that follow build character features - confirm which model
# is meant to produce the submission.
model_keras.fit(
    X,
    y,
    batch_size=32,
    epochs=2,
    verbose=1,
)

In [None]:
# Test-set rows to score; the `comment_text` column is what gets encoded and
# the remaining columns are carried into the submission file.
sub = pd.read_csv('test.csv')

In [None]:
# One-hot character encoding of the test comments, built exactly like the
# training tensor X_char so the character model sees the same representation:
#   * int8 storage instead of the default float64 (8x smaller dense tensor,
#     and no use of the removed `np.int` alias),
#   * each character is lower-cased individually, so a character whose
#     lower-case form has a different length cannot shift later positions.
test_comments = sub.comment_text.fillna("_na_").values
X_test = np.zeros((len(test_comments), MAX_CHAR_LEN, NUM_CHARS), dtype=np.int8)
for i, comment in enumerate(test_comments):
    # Only the first MAX_CHAR_LEN characters of a comment are encoded.
    for j, raw_char in enumerate(comment[:MAX_CHAR_LEN]):
        c = raw_char.lower()
        # Characters outside the alphabet leave their row all zeros.
        if c in char_set:
            X_test[i, j, char_vocab[c]] = 1

In [None]:
# Split up scoring because OOM: predict the test tensor in 10 chunks and
# stitch the per-chunk outputs back together in order.
# The chunks come from X_test (the character tensor built for the test set)
# and are scored with `model`, the character-level network whose input shape
# is (MAX_CHAR_LEN, NUM_CHARS); the previously referenced `X_sub` and
# `cnn_char` are not defined anywhere in this notebook.
# NOTE(review): confirm `model` is the network intended for the submission.
X_sub_parts = np.array_split(X_test, 10)
# A list (not a lazy `map` object) is required by np.concatenate.
pred_sub_parts = [
    model.predict([part], batch_size=512, verbose=2) for part in X_sub_parts
]
pred_sub = np.concatenate(pred_sub_parts)

In [None]:
# Assemble the submission: every test column except the raw text, followed by
# one prediction column per label.
submission_ids = sub.drop(columns=['comment_text'])
submission_scores = pd.DataFrame(pred_sub, columns=classes)
df_sub = pd.concat([submission_ids, submission_scores], axis=1)
df_sub.to_csv('rnn_keras_gru.csv', index=False)