In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")

In [3]:
from keras.models import model_from_json

In [4]:
from keras.models import Model
from keras.layers import Input,Dense,Embedding,SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

In [5]:
import os
# Ask OpenMP-backed libraries to use at most 4 threads.
# NOTE(review): numpy, sklearn and keras are already imported in the cells above;
# OpenMP runtimes commonly read this variable when they initialise, so confirm
# that setting it this late still has an effect (or move it before those imports).
os.environ['OMP_NUM_THREADS']='4'

In [6]:
# Path to the pre-trained word-vector text file. It is parsed further down as one
# token followed by space-separated float components per line; the "50d" in the
# file name matches embed_size = 50 used for the embedding matrix.
Embedding_file = './embeddings/glove.6B.50d.txt'

In [7]:
# Load the labelled training comments, the unlabelled test comments and the
# submission template, in that order.
train, test, submission = map(
    pd.read_csv,
    ("./input/train.csv", "./input/test.csv", "./input/sample_submission.csv"),
)

In [8]:
# Subset of the training frame whose "threat" label is set (value >= 1).
threat = train.loc[train["threat"] >= 1]

In [9]:
# Inspect a single row of the threat subset, selected by index label 1017.
threat.loc[1017]

id                                                02c6e41e4b317ac3
comment_text     WOULDN'T BE THE FIRST TIME BITCH. FUCK YOU I'L...
toxic                                                            1
severe_toxic                                                     1
obscene                                                          1
threat                                                           1
insult                                                           1
identity_hate                                                    1
Name: 1017, dtype: object

In [10]:
# Raw comment strings as NumPy arrays; missing comments are replaced by the
# literal placeholder string "fillna".
x_train = train.comment_text.fillna("fillna").values
x_test = test.comment_text.fillna("fillna").values
# Target matrix: one column per label, in the same order as the submission columns.
y_train = train.loc[:, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [12]:
# Check the container type of the extracted comments.
type(x_train)

numpy.ndarray

# Embeddings and feature extraction

In [14]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the
# Embedding layer's input dimension.
max_features = 30000
# Fixed sequence length that every comment is padded/truncated to; also the
# model's input shape.
maxlen = 100
# Dimensionality of each word vector (matches the 50d embedding file).
embed_size = 50

In [15]:
# Fit a single vocabulary over the train and test comments together;
# num_words caps the vocabulary used when encoding.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*x_train, *x_test])
# Encode every comment as a (variable-length) list of integer word ids.
X_train_1, X_test_1 = map(tokenizer.texts_to_sequences, (x_train, x_test))

In [16]:
# Bring every encoded comment to exactly `maxlen` ids so the batches are rectangular.
x_train_1, x_test_1 = (
    sequence.pad_sequences(encoded, maxlen=maxlen)
    for encoded in (X_train_1, X_test_1)
)

In [17]:
# Number of token ids in the second training comment, before padding.
len(X_train_1[1])

17

In [18]:
# Show the integer token ids produced for the second training comment.
X_train_1[1]

[52,
 2911,
 13,
 450,
 3782,
 72,
 4871,
 2676,
 21,
 95,
 46,
 912,
 3225,
 1024,
 616,
 9983,
 216]

In [19]:
def get_coefs(word, *arr):
    """Pair a token with its embedding vector.

    Parameters
    ----------
    word : object
        The token; returned unchanged.
    *arr :
        The vector components, one per positional argument (numeric values or
        numeric strings). They are packed into a single float32 array.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a float32 ``numpy.ndarray``.
    """
    vector = np.array(arr, dtype=np.float32)
    return (word, vector)

In [20]:
# Quick check of get_coefs. The list is passed as ONE positional argument here,
# so the resulting array is 2-D (shape (1, 3)); the real call below unpacks the
# split line with `*`, which yields a 1-D vector per word.
get_coefs(1,[1,2,3])

(1, array([[1., 2., 3.]], dtype=float32))

In [21]:
# Parse the embedding file into {token: float32 vector}. Each line is split on
# single spaces: the first field is the token, the rest are vector components.
# The file is opened in a `with` block so the handle is always closed, and the
# encoding is pinned to UTF-8 so parsing does not depend on the platform default.
with open(Embedding_file, encoding="utf-8") as embedding_fh:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_fh)

In [22]:
# Build the initial Embedding weights: row i holds the pre-trained vector of the
# word whose tokenizer id is i; words without a pre-trained vector keep a zero row.
word_index = tokenizer.word_index
# Tokenizer ids start at 1 (id 0 is the padding value), so a vocabulary of
# len(word_index) words needs len(word_index) + 1 rows. Without the "+ 1" the
# last word id indexes past the end of the matrix whenever the vocabulary is
# smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    # Skip ids that fall outside the matrix (words rarer than the vocabulary cap).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
# Check the container type of the embedding matrix.
type(embedding_matrix)

numpy.ndarray

In [39]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of an epoch.

    Parameters
    ----------
    validation_data : tuple
        ``(X_val, y_val)`` pair: the inputs passed to ``model.predict`` and the
        true labels passed to ``roc_auc_score``. It is unpacked into exactly two
        items, so the empty default raises ``ValueError``.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple of
        ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. super(Callback, self) would
        # start the lookup *after* Callback and skip its __init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute and print ROC-AUC on the held-out data.

        ``logs`` is accepted for interface compatibility and is not used;
        ``None`` replaces the former shared mutable ``{}`` default.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # The epoch index is zero-based; report it one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [40]:
def get_model():
    """Build and compile the bidirectional-GRU comment classifier.

    Architecture: integer token ids of length ``maxlen`` -> Embedding
    initialised from ``embedding_matrix`` -> SpatialDropout1D(0.2) ->
    Bidirectional GRU (80 units, full sequence returned) -> average pooling
    and max pooling over time, concatenated -> Dense layer with 6 sigmoid
    outputs (one per label).

    Returns
    -------
    keras.models.Model
        The compiled model.
    """
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(6, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    # The six sigmoid outputs are independent labels (one comment can carry
    # several at once), so each output needs its own binary cross-entropy.
    # categorical_crossentropy assumes a single distribution over mutually
    # exclusive classes and gives a wrong training signal here.
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    return model

In [41]:
# Build and compile the network defined by get_model().
model = get_model()


In [None]:
batch_size = 32
epochs = 2

# Split the padded integer sequences (x_train_1), not the raw comment strings in
# x_train: the model's input layer expects `maxlen` token ids per sample.
# 95% of the rows are used for fitting, 5% are held out; the seed fixes the split.
X_tra, X_val, y_tra, y_val = train_test_split(x_train_1, y_train, train_size=0.95, random_state=233)
# Print the ROC-AUC on the held-out rows after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)


In [None]:
# Persist the trained model in two files: the architecture as JSON text ...
model_json = model.to_json()
with open("model_GRU.json", mode="w") as json_file:
    json_file.write(model_json)
# ... and the learned weights as HDF5.
model.save_weights("model_GRU.h5")
print("Saved model to disk")

In [None]:
# List the current working directory (presumably to confirm the model files
# written by the previous cell are present).
!ls

In [None]:
# Rebuild the architecture from the saved JSON description. The `with` block
# guarantees the file handle is closed even if reading fails.
with open('model_GRU.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Restore the trained weights into the freshly built model.
loaded_model.load_weights("model_GRU.h5")
print("Loaded model from disk")

In [None]:
# Predict on the padded integer sequences (x_test_1); x_test holds the raw
# comment strings, which the model cannot consume.
y_pred = loaded_model.predict(x_test_1, batch_size=1024)
# Fill the six label columns of the submission template, in the same order as
# the training targets, and write the file without the index column.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
submission.to_csv('submission_GRU_glove.csv', index=False)