In [1]:
import numpy as np
import pandas as pd
import pickle
from keras.models import Model
from keras.layers import Dense, Embedding, Input, PReLU, Flatten
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Vocabulary size: input dimension of the Embedding layer (and num_words when the tokenizer is fitted).
max_features = 20000
# Token length every comment is padded/truncated to; also the model's input length.
maxlen = 100

In [3]:
DATA_PATH = "../data/"
# Fixed seed so the shuffle below -- and with it the tail of rows that
# validation_split holds out during fit -- is identical on every fresh run.
SHUFFLE_SEED = 42
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv(DATA_PATH + "train.csv")
test = pd.read_csv(DATA_PATH + "test.csv")

# Shuffle all training rows once (frac=1 keeps every row), reproducibly.
train = train.sample(frac=1, random_state=SHUFFLE_SEED)

# Missing comments are replaced by a placeholder token so the tokenizer always gets a string.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# Multi-label target matrix: one 0/1 column per entry of list_classes.
y = train[list_classes].values

In [4]:
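# One-time setup, kept for reference: fit the tokenizer on the training comments and
# cache it to disk. Uncomment and run once if ../ling_src/tokenizer.p does not exist yet.
# tokenizer = text.Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(list_sentences_train))
# pickle.dump(tokenizer, open('../ling_src/tokenizer.p', 'wb'))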

In [5]:
# Load the cached, already-fitted tokenizer. The context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load a tokenizer.p you produced yourself.
with open('../ling_src/tokenizer.p', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [6]:
# Step 1: map every comment to a list of integer token ids (train first, then test).
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(sentences)
    for sentences in (list_sentences_train, list_sentences_test)
)

# Step 2: pad/truncate every id list to exactly `maxlen` entries.
X_train, X_test = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (list_tokenized_train, list_tokenized_test)
)

In [7]:
# Sanity check: padded training matrix is (number of comments, maxlen).
X_train.shape

(159571, 100)

In [8]:
# Sanity check: label matrix is (number of comments, len(list_classes)).
y.shape

(159571, 6)

In [9]:
def f2_score(y_true, y_pred):
    """Mean per-row F-beta score with beta = 2.

    Predictions are binarised with ``tf.round`` (an implicit 0.5 threshold),
    precision and recall are computed along axis 1 for every row, and rows
    whose score comes out as NaN (a 0/0 division) count as 0 in the mean.

    Args:
        y_true: tensor of 0/1 labels; reduced along axis 1.
        y_pred: tensor of scores with the same layout as ``y_true``.

    Returns:
        Scalar tensor: the F2 score averaged over rows.
    """
    labels = tf.cast(y_true, "int32")
    decisions = tf.cast(tf.round(y_pred), "int32")  # implicit 0.5 threshold via tf.round

    # Per-row counts: actual positives, predicted positives, true positives.
    hits = labels * decisions
    n_actual = tf.reduce_sum(labels, axis=1)
    n_predicted = tf.reduce_sum(decisions, axis=1)
    n_hits = tf.reduce_sum(hits, axis=1)

    precision = n_hits / n_predicted
    recall = n_hits / n_actual

    # F-beta, beta = 2: (1 + beta^2) * P * R / (beta^2 * P + R).
    score = 5 * precision * recall / (4 * precision + recall)
    # Undefined rows (NaN) are scored as zero rather than poisoning the mean.
    score = tf.where(tf.is_nan(score), tf.zeros_like(score), score)
    return tf.reduce_mean(score)

In [10]:
import keras.backend as K

#-----------------------------------------------------------------------------------------------------------------------------------------------------
# AUC for a binary classifier
def auc(y_true, y_pred):
    """Approximate the ROC AUC of one binary classifier by a threshold sweep.

    The true-alert rate (binary_PTA) and false-alert rate (binary_PFA) are
    evaluated at 1000 evenly spaced thresholds in [0, 1]; the area is the sum
    of true-alert rate times the drop in false-alert rate between consecutive
    thresholds.

    Args:
        y_true: tensor of 0/1 labels for a single class.
        y_pred: tensor of scores for the same class.

    Returns:
        Scalar tensor with the approximated area under the ROC curve.
    """
    thresholds = np.linspace(0, 1, 1000)
    true_alert_rates = tf.stack(
        [binary_PTA(y_true, y_pred, cut) for cut in thresholds], axis=0)
    false_alert_rates = tf.stack(
        [binary_PFA(y_true, y_pred, cut) for cut in thresholds], axis=0)

    # Prepend a false-alert rate of 1 so the first threshold also gets a bin.
    false_alert_rates = tf.concat([tf.ones((1,)), false_alert_rates], axis=0)

    # The rates fall as the threshold rises, so negate the forward difference
    # to obtain non-negative bin widths.
    forward_diff = false_alert_rates[1:] - false_alert_rates[:-1]
    bin_widths = -forward_diff

    weighted = true_alert_rates * bin_widths
    return K.sum(weighted, axis=0)

#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PFA: probability of false alert (false-positive rate) for a binary classifier
def binary_PFA(y_true, y_pred, threshold=0.5):
    """Probability of false alert (false-positive rate) at a given threshold.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of scores; entries ``>= threshold`` count as alerts.
        threshold: decision threshold (number or tensor). The default is a
            plain float rather than a backend variable, so no graph state is
            created when the function is defined.

    Returns:
        Scalar tensor FP / N. When there are no negative labels (N == 0) the
        result is 0 instead of NaN.
    """
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # N = total number of negative labels
    N = K.sum(1 - y_true)
    # FP = total number of false alerts, alerts from the negative class labels
    FP = K.sum(y_pred - y_pred * y_true)
    # Clamp the denominator: unchanged whenever N >= 1, avoids 0/0 when N == 0.
    return FP / K.maximum(N, K.epsilon())
#-----------------------------------------------------------------------------------------------------------------------------------------------------
# PTA: probability of true alert (true-positive rate / recall) for a binary classifier
def binary_PTA(y_true, y_pred, threshold=0.5):
    """Probability of true alert (true-positive rate) at a given threshold.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor of scores; entries ``>= threshold`` count as alerts.
        threshold: decision threshold (number or tensor). The default is a
            plain float rather than a backend variable, so no graph state is
            created when the function is defined.

    Returns:
        Scalar tensor TP / P. When there are no positive labels (P == 0) the
        result is 0 instead of NaN.
    """
    y_pred = K.cast(y_pred >= threshold, 'float32')
    # P = total number of positive labels
    P = K.sum(y_true)
    # TP = total number of correct alerts, alerts from the positive class labels
    TP = K.sum(y_pred * y_true)
    # Clamp the denominator: unchanged whenever P >= 1, avoids 0/0 when P == 0.
    return TP / K.maximum(P, K.epsilon())

In [11]:
def mean_roc_auc(y_true, y_pred):
    """Keras metric: ROC AUC computed per output column, then averaged.

    Args:
        y_true: label tensor, one column per class.
        y_pred: model output tensor with the same layout; its static shape
            supplies the number of columns.

    Returns:
        Scalar tensor with the mean of the per-column AUC values.
    """
    # The metric only builds graph ops; Keras evaluates them in its own
    # session, so no tf.Session or variable initialiser is needed here.
    n_classes = y_pred.shape.as_list()[1]

    # Pass the raw scores: auc() sweeps thresholds over them, and rounding to
    # 0/1 first would collapse the whole ROC curve to a single point.
    # Argument order follows auc(y_true, y_pred).
    roc_aucs = [auc(y_true[:, i], y_pred[:, i]) for i in range(n_classes)]
    return tf.reduce_mean(tf.stack(roc_aucs, axis=0))

In [12]:
def get_model():
    """Build and compile the dense baseline classifier.

    Architecture: token ids of length ``maxlen`` -> Embedding(max_features, 100)
    -> Flatten -> two (Dense(50, relu) + Dropout(0.5)) stages -> Dense(6, sigmoid).
    Compiled with binary cross-entropy, the Adam optimizer and ``mean_roc_auc``
    as the reported metric.

    Returns:
        The compiled Keras ``Model``.
    """
    embedding_dim = 100
    hidden_units = 50
    drop_rate = 0.5

    token_ids = Input(shape=(maxlen, ))
    hidden = Embedding(max_features, embedding_dim)(token_ids)
    hidden = Flatten()(hidden)

    # Two identical fully connected stages, each followed by dropout.
    for _ in range(2):
        hidden = Dense(hidden_units, activation="relu")(hidden)
        hidden = Dropout(drop_rate)(hidden)

    # One independent sigmoid output per label.
    class_probs = Dense(6, activation="sigmoid")(hidden)

    classifier = Model(inputs=token_ids, outputs=class_probs)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=[mean_roc_auc],
    )
    return classifier

In [16]:
# Build the compiled network defined by get_model().
model = get_model()
# NOTE(review): this value is overwritten by a later cell before model.fit runs.
batch_size = 10
# Upper bound on training epochs; the EarlyStopping callback may stop sooner.
epochs = 5

In [18]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          2000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                500050    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
__________

In [19]:
# Directory prefix for the checkpoint file written during training.
MODEL_PATH = "../models/"

In [20]:
# Checkpoint file name; also reused as the stem of the submission and train-prediction file names.
model_name = "deep_dense_model_mean_roc_auc.hdf5"

In [21]:
# Checkpoint target: the model is saved here whenever validation loss improves.
file_path = MODEL_PATH + model_name
checkpoint = ModelCheckpoint(
    file_path,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
# Stop training once val_loss has gone 3 epochs without improving.
early = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=3,
)
callbacks_list = [checkpoint, early]

In [28]:
# One batch = 20% of the training rows (replaces the earlier batch_size = 10).
# NOTE(review): presumably chosen so every batch contains positives of each class for the
# AUC metric -- confirm; a batch this large yields only a handful of weight updates per epoch.
batch_size = int(np.round(0.2*X_train.shape[0]))

In [None]:
# Train with the last 20% of rows held out for validation; the checkpoint callback
# writes the lowest-val_loss weights to file_path.
model.fit(X_train, y, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=callbacks_list)
# Restore the best checkpoint so the evaluation and predictions that follow do not
# use the (possibly worse) final-epoch weights.
model.load_weights(file_path)

Train on 127656 samples, validate on 31915 samples
Epoch 1/5


In [None]:
# Loss and metric on the full training arrays (this includes the rows held out by validation_split).
model.evaluate(X_train, y)

In [None]:
# Sigmoid outputs for the test comments: one row per comment, one column per class.
y_test = model.predict(X_test)

In [None]:
# Hard 0/1 labels from the predicted probabilities.
# NOTE(review): rounding discards the score ranking; if the submission is scored by
# ROC AUC, writing y_test (raw probabilities) would be preferable -- confirm.
y_test_rounded = np.round(y_test)

In [None]:
# Start from the sample submission file and overwrite its label columns with the predictions.
# Assumes its row order matches test.csv -- TODO confirm.
sample_submission = pd.read_csv("../submissions/sample_submission.csv")
sample_submission[list_classes] = y_test_rounded
# Output file name is derived from the model name; the index is not written.
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)

In [None]:
# Model outputs for every training row (including the rows held out for validation during fit).
y_train = model.predict(X_train)

In [None]:
# Persist the training-set predictions. The context manager closes the file so the
# pickle is fully flushed to disk.
# NOTE(review): rows follow the shuffled order of `train`, not the row order of train.csv.
with open("../submissions/TRAIN_" + model_name + ".p", "wb") as train_pred_file:
    pickle.dump(y_train, train_pred_file)