In [1]:
import logging
import pickle
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight
from keras.layers import Concatenate, Dense, LSTM, Input, Activation, concatenate

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

In [3]:
MODEL_PATH = "../models/"
DATA_PATH = "../data/"

In [4]:
train = pd.read_csv(DATA_PATH + "preprocessed/train_ling.csv")

In [5]:
targets = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [21]:
meta_features = ['count_sent', 'count_word', 'count_unique_word', 'count_letters',
       'count_punctuations', 'count_words_upper', 'count_words_title',
       'count_stopwords', 'mean_word_len', 'word_unique_percent',
       'punct_percent', 'count_swear_words']

In [7]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,count_sent,...,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len,word_unique_percent,punct_percent,count_swear_words
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,2,...,41,264,10,2,11,16,5.162791,95.348837,23.255814,0
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1,...,17,112,12,1,3,2,5.588235,100.0,70.588235,0
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1,...,39,233,6,0,2,19,4.571429,92.857143,14.285714,0
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,5,...,82,622,21,5,7,55,4.486726,72.566372,18.584071,0
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1,...,13,67,5,0,2,5,4.230769,100.0,38.461538,0


In [8]:
def build_input_data(sentences, labels, vocabulary):
    x = np.array([[embeddings_index[vocabulary_inv[vocabulary['word']]] if word in vocabulary.keys() else len(vocabulary) - 1 for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [10]:
train_text = train['comment_text']

In [11]:
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)

In [12]:
vocabulary = tokenizer.word_index
vocabulary_inv = {v:k for k, v in vocabulary.items()}
embeddings_index = {}
EMBEDDING_DIM = 100
f = open("../../../embeddings/glove.6B." + str(EMBEDDING_DIM) + "d.txt")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [13]:
print('Found %s word vectors.' % len(embeddings_index))
embedding_matrix = np.zeros((len(vocabulary) + 1, EMBEDDING_DIM))
embedding_matrix[-1] = np.random.rand(EMBEDDING_DIM) # oov-vector
for word, i in vocabulary.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i - 1] = embedding_vector
embedding_layer = Embedding(embedding_matrix.shape[0],
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=200, trainable = False)

Found 400000 word vectors.


In [14]:
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_dev, y_train, y_dev = train_test_split(train, targets, test_size=0.3, random_state=42)

In [20]:
max_length = 200

x_train_texts = tokenizer.texts_to_sequences(x_train['comment_text'])
x_train_texts = pad_sequences(x_train_texts, maxlen=max_length, padding='post')

x_dev_texts = tokenizer.texts_to_sequences(x_dev['comment_text'])
x_dev_texts = pad_sequences(x_dev_texts, maxlen=max_length, padding='post')

In [46]:
sequence_input = Input(shape=(max_length,), dtype='float32')
metadata_input = Input(shape=(len(meta_features),), dtype='float32')
embedded_sequences = embedding_layer(sequence_input)
lstm = keras.layers.CuDNNLSTM(10, return_sequences=False)(embedded_sequences)
concatenated_data = Concatenate(axis=1)([lstm, metadata_input])
dense_1 = Dense(50, activation='relu')(concatenated_data)
# dense_2 = Dense(1, activation='relu')(dense_1)
output_1 = Dense(units=1, activation='sigmoid', name = 'output_1')(dense_1)
output_2 = Dense(units=1, activation='sigmoid', name = 'output_2')(dense_1)
output_3 = Dense(units=1, activation='sigmoid', name = 'output_3')(dense_1)
output_4 = Dense(units=1, activation='sigmoid', name = 'output_4')(dense_1)
output_5 = Dense(units=1, activation='sigmoid', name = 'output_5')(dense_1)
output_6 = Dense(units=1, activation='sigmoid', name = 'output_6')(dense_1)
model = Model(inputs=[sequence_input,metadata_input], outputs=[output_1, output_2, output_3, output_4, output_5, output_6])
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 100)     21033800    input_8[0][0]                    
__________________________________________________________________________________________________
cu_dnnlstm_3 (CuDNNLSTM)        (None, 10)           4480        embedding_1[3][0]                
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
concatenat

In [44]:
separate_targets_train = [y_train[:, i] for i in range(0, y_train.shape[1])]

In [83]:
separate_targets_dev = [y_dev[:, i] for i in range(0, y_dev.shape[1])]

In [52]:
separate_class_weights = [class_weight.compute_class_weight('balanced', np.unique(separate_targets_train[i]),separate_targets_train[i]) for i in range(0, len(separate_targets_train))]

In [58]:
separate_class_weights_dict = [{0:x[0], 1: x[1]} for x in separate_class_weights]

In [59]:
separate_class_weights_dict

[{0: 0.5530365294542862, 1: 5.213732262882749},
 {0: 0.5050140157337915, 1: 50.360234445446345},
 {0: 0.527848137156683, 1: 9.477261157305277},
 {0: 0.5015356017134083, 1: 163.30263157894737},
 {0: 0.5258353654517893, 1: 10.176658163265307},
 {0: 0.5043937286635478, 1: 57.39928057553957}]

In [53]:
separate_class_weights

[array([0.55303653, 5.21373226]),
 array([ 0.50501402, 50.36023445]),
 array([0.52784814, 9.47726116]),
 array([  0.5015356 , 163.30263158]),
 array([ 0.52583537, 10.17665816]),
 array([ 0.50439373, 57.39928058])]

In [62]:
multiple_class_weights = {'output_' + str(i + 1): x for i, x in enumerate(separate_class_weights_dict)}

In [72]:
separate_targets_train_ = [y_train[i, :] for i in range(0, y_train.shape[0])]
separate_targets_dev_ = [y_dev[i, :] for i in range(0, y_dev.shape[0])]

In [67]:
len(x_train_texts)

111699

In [84]:
model.fit([x_train_texts, x_train[meta_features]], separate_targets_train, validation_data=([x_dev_texts, x_dev[meta_features]], separate_targets_dev),
          epochs=20, batch_size=20, class_weight = multiple_class_weights)

Train on 111699 samples, validate on 47872 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fd4f7de8f98>

In [85]:
pred_dev = model.predict([x_dev_texts, x_dev[meta_features]])

In [94]:
def mean_roc_auc(y_true, y_pred):
    roc_auc_scores = []
    for i in range(0, y_true.shape[1]):
        roc_auc_scores.append(metrics.roc_auc_score(y_true[:, i], y_pred[:, i]))
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [95]:
mean_roc_auc(y_dev, np.hstack(pred_dev))

[0.9276342117896024, 0.9751219474279966, 0.9553500442588447, 0.9423514378099153, 0.9494162589756695, 0.9451369859003185]


0.9491684810270579

In [96]:
model_json = model.to_json()
with open(MODEL_PATH + "keras_contextual_lstm_classification_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(MODEL_PATH + "keras_contextual_lstm_classification_model.h5")
print("Saved model to disk")

Saved model to disk


In [97]:
model_name = 'keras_contextual_lstm_classification_model'

In [101]:
test = pd.read_csv(DATA_PATH + "preprocessed/test_ling.csv")

In [102]:
x_dev_texts = tokenizer.texts_to_sequences(test['comment_text'])
x_dev_texts = pad_sequences(x_dev_texts, maxlen=max_length, padding='post')

In [103]:
pred = model.predict([x_dev_texts, test[meta_features]])

In [107]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = np.hstack(pred)
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)