In [1]:
import numpy as np
import pandas as pd 

import os

from tensorflow.keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
# Read the training and test CSV files from the local ./dataset folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

In [3]:
# Series.fillna returns a NEW Series; it does not modify the frame in place.
# The placeholder must therefore be applied in the same chain that builds
# X_train / X_test, otherwise missing comments stay NaN and break the
# tokeniser further down.
X_train = train["comment_text"].fillna("fillna").str.lower()

# One binary target column per toxicity category (multi-label problem).
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

X_test = test["comment_text"].fillna("fillna").str.lower()

In [4]:
# Vocabulary cap passed to the tokenizer and used as the embedding input size.
max_features = 100000

# Every comment is padded/truncated to this many tokens.
maxlen = 150

# Dimensionality of each learned word embedding vector.
embed_size = 300

In [5]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : tuple
        A 2-item ``(X_val, y_val)`` pair. It is unpacked directly, so any
        other length raises ``ValueError``.
    interval : int
        The score is computed on every epoch whose 0-based index is a
        multiple of ``interval`` (``interval=1`` means every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__. The previous
        # ``super(Callback, self)`` started the lookup *after* Callback and
        # therefore skipped the base-class initialisation entirely.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score.

        ``epoch`` is 0-based; it is printed 1-based. ``logs`` is accepted for
        signature compatibility and is not read (``None`` default avoids a
        shared mutable default argument).
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [6]:
# Build the vocabulary from both the train and test comments, keeping only
# the `max_features` most frequent words when converting to sequences.
tok = text.Tokenizer(num_words=max_features, lower=True)
tok.fit_on_texts(list(X_train) + list(X_test))

# Store the integer sequences under their own names instead of overwriting
# X_train / X_test: the raw text stays available, so this cell can be re-run
# without fitting the tokenizer on already-encoded data.
train_seqs = tok.texts_to_sequences(X_train)
test_seqs = tok.texts_to_sequences(X_test)

# Pad/truncate every sequence to exactly `maxlen` tokens.
x_train = sequence.pad_sequences(train_seqs, maxlen=maxlen)
x_test = sequence.pad_sequences(test_seqs, maxlen=maxlen)

In [7]:
# Input: one padded sequence of `maxlen` token ids per sample.
sequence_input = Input(shape=(maxlen,))

# Trainable embedding followed by dropout over whole embedding channels.
x = Embedding(max_features, embed_size)(sequence_input)
x = SpatialDropout1D(0.2)(x)

# Bidirectional GRU returning the full sequence, then a 1D convolution on top.
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)

# Summarise the sequence with both average and max pooling, then concatenate.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])

# Disabled experiment kept for reference:
# x = Dense(128, activation='relu')(x)
# x = Dropout(0.1)(x)

# Six independent sigmoid outputs, one per toxicity label.
preds = Dense(6, activation="sigmoid")(x)
model = Model(sequence_input, preds)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=1e-3), metrics=['accuracy'])

In [8]:
# Training schedule used by model.fit.
batch_size, epochs = 512, 10

# Hold out 10% of the padded training data for validation; the fixed
# random_state keeps the split reproducible.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.9,
    random_state=233,
)

In [9]:
# File that receives the weights of the best epoch seen so far.
filepath = "weights_base.best.hdf5"

# Both callbacks must watch the SAME log key. Previously the checkpoint used
# 'val_accuracy' while early stopping used 'val_acc'; only one of those keys
# exists in the logs, so one callback silently never fired.
# NOTE(review): 'val_accuracy' is the key produced by metrics=['accuracy'] in
# TF 2.x; very old tf.keras releases reported 'val_acc' instead - confirm
# against the installed version.
monitor_metric = "val_accuracy"

checkpoint = ModelCheckpoint(filepath, monitor=monitor_metric, verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor=monitor_metric, mode="max", patience=5)

# Prints validation ROC-AUC after every epoch.
ra_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

callbacks_list = [ra_val, checkpoint, early]

In [10]:
# Train on the 90% split, validating on the held-out part after each epoch.
# Left as a bare expression so the returned History object is the cell output.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10
 ROC-AUC - epoch: 1 - score: 0.969294
Epoch 2/10
 ROC-AUC - epoch: 2 - score: 0.974636
Epoch 3/10
 ROC-AUC - epoch: 3 - score: 0.977190
Epoch 4/10
 ROC-AUC - epoch: 4 - score: 0.977951
Epoch 5/10
 ROC-AUC - epoch: 5 - score: 0.979389
Epoch 6/10
 ROC-AUC - epoch: 6 - score: 0.980099
Epoch 7/10
 ROC-AUC - epoch: 7 - score: 0.980414
Epoch 8/10
 ROC-AUC - epoch: 8 - score: 0.978704
Epoch 9/10
 ROC-AUC - epoch: 9 - score: 0.978335
Epoch 10/10
 ROC-AUC - epoch: 10 - score: 0.976920


<tensorflow.python.keras.callbacks.History at 0x1f89489bd68>

In [12]:
# Restoring the checkpointed weights is disabled, so the predictions below
# come from whatever weights the model holds after the final training epoch.
# model.load_weights(filepath)
print('Predicting....')
y_pred = model.predict(
    x_test,
    batch_size=1024,
    verbose=1,
)

Predicting....


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [13]:
# Echo the prediction array as the cell output (one row per test comment,
# one column per output unit of the model).
y_pred

array([[9.9874687e-01, 4.6806693e-02, 9.7717893e-01, 1.6734600e-03,
        5.0345832e-01, 1.5222132e-02],
       [4.7385693e-06, 4.0531158e-06, 2.1755695e-06, 1.9073486e-06,
        1.4901161e-06, 2.9802322e-07],
       [2.5877357e-04, 4.3898821e-05, 3.8564205e-05, 2.1368265e-05,
        7.4177980e-05, 9.6261501e-06],
       ...,
       [5.4588914e-04, 3.6597252e-05, 6.2760711e-04, 1.8417835e-05,
        2.3573637e-05, 2.5868416e-05],
       [2.8312206e-04, 1.0460615e-05, 8.9108944e-06, 3.3676624e-05,
        2.0116568e-05, 2.7751923e-04],
       [9.9993539e-01, 3.4895778e-02, 9.4702518e-01, 3.0955374e-03,
        8.1735265e-01, 6.9559216e-03]], dtype=float32)

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15793186596492628765, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 3141979340
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 11135713829224998369
 physical_device_desc: "device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1"]