In [6]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import gensim
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D
from keras.models import Model, Sequential
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPool1D, Flatten, BatchNormalization


# Input locations, relative to the notebook's working directory.
# NOTE(review): embedding_file is not referenced by any of the cells visible here.
embedding_file = '../inputs/glove.6B.100d.txt'
# CSV files read with pd.read_csv(..., index_col='id') in the cells below.
train_file = '../inputs/train.csv'
test_file = '../inputs/test.csv'

In [2]:
# Load the training and test tables, both indexed by their `id` column.
df = pd.read_csv(train_file, index_col='id')
df_test = pd.read_csv(test_file, index_col='id')
# One test input is missing data, so we will just replace it by an empty string.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object and is
# not guaranteed to modify df_test (pandas warns about it, and it does nothing
# under Copy-on-Write).
df_test['comment_text'] = df_test['comment_text'].fillna('')

# Import the stop-word set from sklearn.feature_extraction.text: the
# `sklearn.feature_extraction.stop_words` module path used previously is not
# importable on newer scikit-learn releases, while this one works on old and new.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Tokenise every training comment, fit a gensim Phrases model on those tokens and
# apply it, so tokenized_text holds one token list per comment (detected phrases
# are joined with '_', which a later cell splits apart again).
simple_tokens = df.comment_text.apply(gensim.utils.simple_preprocess)
phrases = gensim.models.phrases.Phrases(simple_tokens)
tokenizer = gensim.models.phrases.Phraser(phrases)
tokenized_text = list(tokenizer[simple_tokens])

# Token <-> integer id mapping built from the training corpus only.
corpus_dict = gensim.corpora.dictionary.Dictionary(tokenized_text)

# Every column after the first is treated as a label column.
# NOTE(review): presumably the first column is `comment_text` - confirm against train.csv.
TARGET_CLASSES = df.columns[1:]
targets = df[TARGET_CLASSES].values

# English stop words plus a few extra fragments to drop.
stoplist = list(ENGLISH_STOP_WORDS)
customlist = ['ll', 'd', 'm', 're']
stoplist.extend(customlist)

# Split every '_'-joined token back into its component words.
# Each sentence is rebuilt in one pass and written back with slice assignment, so
# the list objects held by tokenized_text are updated in place. This replaces
# pop()/insert() calls made while enumerate() was iterating over the same list,
# which was fragile and quadratic in the sentence length; the resulting token
# sequences are the same.
for sentence in tokenized_text:
    sentence[:] = [piece for word in sentence for piece in word.split('_')]

# Drop stop words from every sentence. Membership is tested against a set built
# once, instead of scanning the stoplist list for every token; the output is
# identical.
_stopword_set = set(stoplist)
clear_tokenized_text = [
    [token for token in sentence if token not in _stopword_set]
    for sentence in tokenized_text
]
    
# Word2Vec is trained on the comments with their token order reversed.
reversed_tokenized_text = [sentence[::-1] for sentence in tokenized_text]
word2vec = gensim.models.word2vec.Word2Vec(
    reversed_tokenized_text, window=5, size=300, min_count=2, workers=6)

# One feature row per comment: the mean of its in-vocabulary word vectors, each
# divided by that word's vocabulary count. Comments with no in-vocabulary token
# keep an all-zero row.
features = np.zeros((len(reversed_tokenized_text), word2vec.vector_size))
for row, sentence_tokens in enumerate(reversed_tokenized_text):
    weighted_vectors = [
        word2vec.wv[tok] / word2vec.wv.vocab[tok].count
        for tok in sentence_tokens
        if tok in word2vec.wv.vocab
    ]
    if weighted_vectors:
        features[row, :] = np.mean(weighted_vectors, axis=0)

MAX_SEQ_LEN = 150

# Note: shifting indices by 1 as index 0 will be used for padding.
docs = [
    [token_id + 1 for token_id in corpus_dict.doc2idx(doc)]
    for doc in clear_tokenized_text
]
padded_docs = pad_sequences(docs, maxlen=MAX_SEQ_LEN, truncating='post', value=0)
max_idx = max(index for doc in docs for index in doc)

# Embedding matrix with max_idx + 1 rows: row 0 is a random vector for the '0'
# padding word, row k + 1 belongs to dictionary id k. Words missing from the
# Word2Vec vocabulary get a random normal vector.
embedding_rows = [np.random.normal(size=word2vec.vector_size)]
for dict_id in range(max_idx):
    dict_word = corpus_dict[dict_id]
    if dict_word in word2vec.wv.vocab:
        embedding_rows.append(word2vec.wv[dict_word])
    else:
        embedding_rows.append(np.random.normal(size=word2vec.vector_size))
embeddings = np.array(embedding_rows)

In [3]:
# Fully connected classifier on the per-comment Word2Vec feature rows, with one
# sigmoid output per target class.
nn = Sequential([
    Dense(1024, activation='elu', input_shape=(word2vec.vector_size,)),
    Dense(512, activation='elu'),
    Dense(128, activation='elu'),
    Dense(len(TARGET_CLASSES), activation='sigmoid'),
])
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
nn.fit(features, targets, validation_split=0.1, epochs=10)

Train on 86265 samples, validate on 9586 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7feab12c1fd0>

In [9]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D

num_filters = 64
weight_decay = 1e-4  # L2 factor applied to the kernel of the 32-unit Dense layer

# 1D convolutional classifier over the padded index sequences.
# NOTE(review): the Embedding layer is created without initial weights, so the
# `embeddings` matrix built earlier is not used here - confirm this is intended.
cnn = Sequential([
    Embedding(max_idx + 1, word2vec.vector_size, input_length=MAX_SEQ_LEN),
    Conv1D(num_filters, 7, activation='relu', padding='same'),
    MaxPooling1D(2),
    Conv1D(num_filters, 7, activation='relu', padding='same'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)),
    Dense(len(TARGET_CLASSES), activation='sigmoid'),  # multi-label (k-hot encoding)
])

cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn.fit(padded_docs, targets, validation_split=0.1, epochs=5, batch_size=128)

Train on 86265 samples, validate on 9586 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fea8033ce48>

In [10]:
# Two stacked bidirectional LSTM layers over the padded index sequences, followed
# by a small dense head with one sigmoid output per target class.
# NOTE(review): as with the CNN, the Embedding layer is created without initial
# weights, so the `embeddings` matrix is not used - confirm this is intended.
rnn = Sequential([
    Embedding(max_idx + 1, word2vec.vector_size, input_length=MAX_SEQ_LEN),
    Bidirectional(LSTM(256, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)),
    Bidirectional(LSTM(256, return_sequences=False, dropout=0.1, recurrent_dropout=0.1)),
    Dense(256, activation='elu'),
    Dropout(0.25),
    Dense(128, activation='elu'),
    Dropout(0.25),
    Dense(len(TARGET_CLASSES), activation='sigmoid'),
])

rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
rnn.fit(padded_docs, targets, validation_split=0.05, epochs=3, batch_size=128)

Train on 91058 samples, validate on 4793 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fea50756f60>

In [11]:
# Prepare the test comments for prediction using ONLY the artefacts fitted on the
# training data (`tokenizer`, `stoplist`, `corpus_dict`, `word2vec`, `max_idx`,
# `MAX_SEQ_LEN`).
#
# Previously this cell refitted the phrase model, the dictionary and Word2Vec on
# the test set. That gave test words different integer ids and a different vector
# space from the ones the trained models saw, could produce ids beyond the size of
# the trained Embedding layers, and overwrote `df`, `TARGET_CLASSES` and `targets`
# with values derived from the unlabelled test frame.
df_test = pd.read_csv(test_file, index_col='id')
# One test input is missing data, so we will just replace it by an empty string.
# (Assigned back: fillna(inplace=True) on a column selection is not guaranteed to
# modify the frame.)
df_test['comment_text'] = df_test['comment_text'].fillna('')

# Same tokenisation as training: simple_preprocess, the training-fitted phraser,
# then '_'-joined tokens split back into their component words.
test_simple_tokens = df_test.comment_text.apply(gensim.utils.simple_preprocess)
test_tokenized_text = [
    [piece for word in sentence for piece in word.split('_')]
    for sentence in tokenizer[test_simple_tokens]
]

# Same stop-word removal as training.
_test_stopword_set = set(stoplist)
test_clear_tokenized_text = [
    [token for token in sentence if token not in _test_stopword_set]
    for sentence in test_tokenized_text
]

# Dense-model features: mean of count-scaled training Word2Vec vectors. The mean
# does not depend on token order, so the sentences are not reversed here.
# `features` and `padded_docs` are deliberately rebound to the test inputs because
# the prediction cell reads those names.
features = np.zeros((len(test_tokenized_text), word2vec.vector_size))
for row, sentence_tokens in enumerate(test_tokenized_text):
    weighted_vectors = [
        word2vec.wv[tok] / word2vec.wv.vocab[tok].count
        for tok in sentence_tokens
        if tok in word2vec.wv.vocab
    ]
    if weighted_vectors:
        features[row, :] = np.mean(weighted_vectors, axis=0)

# Sequence-model inputs: training dictionary ids shifted by 1 (0 is padding).
# doc2idx returns -1 for words unknown to the training dictionary; those, and any
# id beyond the range the Embedding layers were sized for, are mapped to 0.
docs = [
    [token_id + 1 if 0 <= token_id < max_idx else 0
     for token_id in corpus_dict.doc2idx(doc)]
    for doc in test_clear_tokenized_text
]
padded_docs = keras.preprocessing.sequence.pad_sequences(
    docs, maxlen=MAX_SEQ_LEN, truncating='post', value=0)

In [12]:
# Run each trained model on its matching input, in the order nn, cnn, rnn:
# `features` for the dense network, `padded_docs` for the CNN and the RNN.
y_nn, y_cnn, y_rnn = (
    model.predict(model_input, batch_size=1024, verbose=1)
    for model, model_input in ((nn, features), (cnn, padded_docs), (rnn, padded_docs))
)




NameError: name 'list_classes' is not defined

In [18]:
# Blend the CNN and RNN predictions with weights 4:6 and write the submission file.
y_hat = (4 * y_cnn + 6 * y_rnn) / 10

# Rebind `df` to the training frame, as this cell has always done.
df = pd.read_csv(train_file, index_col='id')
TARGET_CLASSES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

sample_submission = pd.read_csv('../inputs/sample_submission.csv')
# Fail with a clear message if the predictions do not line up row-for-row with
# the submission template.
if len(sample_submission) != len(y_hat):
    raise ValueError(
        'prediction rows (%d) do not match sample_submission rows (%d)'
        % (len(y_hat), len(sample_submission)))
sample_submission[TARGET_CLASSES] = y_hat
# Create the output directory if needed so that a long run is not lost to a
# missing folder when the CSV is written.
os.makedirs('../outputs', exist_ok=True)
sample_submission.to_csv('../outputs/submission.csv', index=False)