In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [65]:
from keras.preprocessing import text, sequence
from keras.layers import (Input, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Dense, 
                          SpatialDropout1D, Bidirectional, LSTM, concatenate)
from keras.models import Model

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
# Make the parent directory importable; it is placed at position 1 of sys.path.
sys.path.insert(1, os.pardir)

# Functions and constants

In [5]:
# Names of the label columns: selected from train_df as y_train and scored
# one by one in multi_roc_auc.
TARGETS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [95]:
def multi_roc_auc(y_true, y_score, verbose=False):
    ''' Compute roc auc for each target and then average them

    y_true - true targets: a dataframe with one column per entry of TARGETS,
             or a 2-D array-like whose columns follow the order of TARGETS
    y_score - predicted targets, in the same form as y_true
    verbose - when True, print the mean score

    Returns a tuple (mean_roc_score, roc_scores) where roc_scores maps each
    target name to its ROC AUC.
    '''
    def _as_frame(values):
        # Bare arrays carry no column names, so label their columns with
        # TARGETS to allow the per-target lookup below.
        if isinstance(values, pd.DataFrame):
            return values
        return pd.DataFrame(np.asarray(values), columns=TARGETS)

    y_true = _as_frame(y_true)
    y_score = _as_frame(y_score)

    roc_scores = {
        target: roc_auc_score(y_true=y_true[target], y_score=y_score[target])
        for target in TARGETS
    }
    mean_roc_score = np.mean(list(roc_scores.values()))

    if verbose:
        print('Mean ROC AUC overall all targets: {}'.format(mean_roc_score))

    return mean_roc_score, roc_scores

# Load data

In [6]:
# Directory that train.csv and test.csv are read from. The trailing slash
# matters: file names are appended to it by plain string concatenation.
data_dir = '../data/external/'

In [7]:
# Read the train and test CSV files from data_dir into dataframes.
train_df, test_df = [pd.read_csv(data_dir + csv_name) for csv_name in ('train.csv', 'test.csv')]

In [8]:
# Inputs are the comment_text column; labels are the TARGETS columns of train_df.
X_train = train_df['comment_text']
y_train = train_df[TARGETS]

X_test = test_df['comment_text']

# Word embeddings

In [9]:
# Passed to the tokenizer as num_words and used to size the embedding matrix.
vocab_size = 100000
# Length every token sequence is padded to (maxlen of pad_sequences).
max_sequence_len = 200
# Width of each embedding row; must equal the length of the vectors stored in
# glove_file, since those vectors are copied straight into the matrix rows.
embedding_dim = 100
# Text file of pre-trained vectors, one "word v1 v2 ..." entry per line.
glove_file = '../glove.6B/glove.6B.100d.txt'

In [11]:
# Fit the tokenizer's word index on the training comments only.
tk = text.Tokenizer(lower=True, num_words=vocab_size)
tk.fit_on_texts(X_train.tolist())

In [12]:
# Rectify issue with num_words in Keras
# See issue: https://github.com/keras-team/keras/issues/8583#issuecomment-346981336
# Keep only the entries whose index is at most vocab_size
# (<= because tokenizer is 1 indexed).
tk.word_index = {word: index for word, index in tk.word_index.items() if index <= vocab_size}
# Register the out-of-vocabulary token only when the tokenizer actually has
# one. This notebook builds the tokenizer without an oov_token, and an
# unconditional assignment would then add a None key to the vocabulary.
if tk.oov_token is not None:
    tk.word_index[tk.oov_token] = vocab_size + 1

In [14]:
# Convert each comment to a sequence of word indices, then pad the sequences
# with maxlen=max_sequence_len.
X_train_processed = sequence.pad_sequences(tk.texts_to_sequences(X_train),
                                           maxlen=max_sequence_len)
X_test_processed = sequence.pad_sequences(tk.texts_to_sequences(X_test),
                                          maxlen=max_sequence_len)

In [15]:
# Load embeddings into dictionary: word -> float32 vector.
# Each line of the file is a word followed by the components of its vector.
# The encoding is passed explicitly so that reading the file does not depend
# on the platform's default locale encoding.
embeddings_dict = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

In [18]:
word_index = tk.word_index

# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# Rows of words that have no pre-trained vector stay all zeros.
embedding_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    if i >= embedding_matrix.shape[0]:
        # The matrix has rows 0..vocab_size; an entry stored past that
        # (an OOV entry, when present, sits at vocab_size + 1) has no row.
        continue
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i, :] = embedding_vector

# Build and train bidirectional LSTM

In [84]:
# One fixed-length sequence of integer word indices per comment.
input_seq = Input(shape=(max_sequence_len,), dtype='int32')

# Embedding layer initialised from embedding_matrix and kept frozen.
x = Embedding(input_dim=vocab_size+1,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_sequence_len,
              trainable=False)(input_seq)

x = SpatialDropout1D(0.3)(x)
# return_sequences=True keeps the per-timestep outputs so they can be pooled.
x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.1))(x)

# Summarise the sequence with both max and average pooling, side by side.
max_pool = GlobalMaxPooling1D()(x)
avg_pool = GlobalAveragePooling1D()(x)
x = concatenate([max_pool, avg_pool])

# One sigmoid unit per target; sized from TARGETS rather than a hard-coded 6
# so the output width follows the label list.
output = Dense(len(TARGETS), activation='sigmoid')(x)

In [85]:
# Wrap the graph running from input_seq to output in a Model and print its summary.
model = Model(inputs=input_seq, outputs=output)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_15 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 200, 100)     10000100    input_15[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_7 (SpatialDro (None, 200, 100)     0           embedding_18[0][0]               
__________________________________________________________________________________________________
bidirectional_5 (Bidirectional) (None, 200, 256)     234496      spatial_dropout1d_7[0][0]        
__________________________________________________________________________________________________
global_max

In [90]:
# NOTE(review): fits on the first 100 training rows for a single epoch —
# looks like a quick smoke run rather than full training; confirm.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=X_train_processed[:100], y=y_train.values[:100],
          epochs=1, batch_size=32, verbose=1)

Epoch 1/1


<keras.callbacks.History at 0x1a6c1929b0>

# Score on train and test set

In [93]:
# Predict on the first 10 rows of the processed train and test sets.
preds_train, preds_test = [model.predict(rows[:10])
                           for rows in (X_train_processed, X_test_processed)]

In [None]:
# multi_roc_auc selects columns by target name, so hand it dataframes rather
# than bare arrays. The predictions cover only the leading rows of the
# training set, so the labels are cut to the same number of rows.
# NOTE(review): with so few rows a target may contain a single class, which
# roc_auc_score may reject — confirm on the real sample size.
preds_train_df = pd.DataFrame(preds_train, columns=TARGETS)
mean_roc_score, roc_scores = multi_roc_auc(y_true=y_train.iloc[:len(preds_train_df)],
                                           y_score=preds_train_df)

# Output predictions for submission

In [80]:
# Label the raw prediction array with the target names and line it up with the
# test rows it was computed from (the leading rows of test_df), then put the
# matching ids in front.
preds_test_df = pd.DataFrame(preds_test, columns=TARGETS,
                             index=test_df.index[:len(preds_test)])
output_df = pd.concat([test_df.loc[preds_test_df.index, 'id'], preds_test_df], axis=1)
output_df.head()

In [81]:
# index=False keeps the dataframe's row index out of the file, so the CSV
# holds only the id column and the target columns.
output_df.to_csv('../data/processed/lstm_glove_submission.csv', index=False)