In [1]:
import os, re
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import GridSearchCV
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional
from keras.layers import GlobalMaxPool1D, Conv1D, MaxPooling1D, Flatten, GRU
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import AUC
from keras.models import load_model

Using TensorFlow backend.


# Load the preprocessed data and embedding weights

In [2]:
# Training inputs, preprocessing variant 1 (described further down in this
# notebook as "without punctuation and digits").
x_tr_1 = np.genfromtxt(fname='data_preproc/x_tr_1.csv')

In [3]:
# Sanity check: dimensions of the variant-1 training array.
np.shape(x_tr_1)

(159571, 200)

In [4]:
# Counterpart of x_tr_1 for the second split (presumably the test set --
# confirm against the preprocessing notebook).
x_ts_1 = np.genfromtxt(fname='data_preproc/x_ts_1.csv')

In [5]:
# Sanity check: dimensions of the variant-1 x_ts array.
np.shape(x_ts_1)

(153164, 200)

In [6]:
# Training inputs, preprocessing variant 2 (described further down in this
# notebook as "without URLs, abbreviations and tags").
x_tr_2 = np.genfromtxt(fname='data_preproc/x_tr_2.csv')

In [7]:
# Sanity check: dimensions of the variant-2 training array.
np.shape(x_tr_2)

(159571, 200)

In [8]:
# Counterpart of x_tr_2 for the second split (presumably the test set --
# confirm against the preprocessing notebook).
x_ts_2 = np.genfromtxt(fname='data_preproc/x_ts_2.csv')

In [9]:
# Sanity check: dimensions of the variant-2 x_ts array.
np.shape(x_ts_2)

(153164, 200)

In [10]:
# Training inputs, preprocessing variant 3 (described further down in this
# notebook as "with lemmatization").
x_tr_3 = np.genfromtxt(fname='data_preproc/x_tr_3.csv')

In [11]:
# Sanity check: dimensions of the variant-3 training array.
np.shape(x_tr_3)

(159571, 200)

In [12]:
# Counterpart of x_tr_3 for the second split (presumably the test set --
# confirm against the preprocessing notebook).
x_ts_3 = np.genfromtxt(fname='data_preproc/x_ts_3.csv')

In [13]:
# Sanity check: dimensions of the variant-3 x_ts array.
np.shape(x_ts_3)

(153164, 200)

In [14]:
# Training inputs, preprocessing variant 4 (described further down in this
# notebook as "with stemming").
x_tr_4 = np.genfromtxt(fname='data_preproc/x_tr_4.csv')

In [15]:
# Sanity check: dimensions of the variant-4 training array.
np.shape(x_tr_4)

(159571, 200)

In [16]:
# Counterpart of x_tr_4 for the second split (presumably the test set --
# confirm against the preprocessing notebook).
x_ts_4 = np.genfromtxt(fname='data_preproc/x_ts_4.csv')

In [17]:
# Sanity check: dimensions of the variant-4 x_ts array.
np.shape(x_ts_4)

(153164, 200)

In [18]:
# Embedding weight matrix stored as 'emb_matr_glove.csv' (presumably GloVe
# vectors, going by the file name); later passed to build_model().
emb_matrix_glove = np.genfromtxt(fname='data_preproc/emb_matr_glove.csv')

In [19]:
# Sanity check: dimensions of the first embedding matrix.
np.shape(emb_matrix_glove)

(30000, 200)

In [20]:
# Embedding weight matrix stored as 'emb_matr_ft.csv' (presumably fastText
# vectors, going by the file name -- confirm); later passed to build_model().
emb_matrix_ft = np.genfromtxt(fname='data_preproc/emb_matr_ft.csv')

In [21]:
# Sanity check: dimensions of the second embedding matrix.
np.shape(emb_matrix_ft)

(30000, 200)

In [22]:
# Names of the six label columns, in the order used for the target matrix.
col = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Read the raw training table and keep only the label columns as a 2-D
# array with one column per entry of `col`.
train = pd.read_csv(filepath_or_buffer='data/train.csv/train.csv')
y_tr = train.loc[:, col].values

# Build the model

In [23]:
# max_len: sequence length handed to the Embedding layer (input_length).
# max_feature: vocabulary size handed to the Embedding layer (input_dim).
max_len, max_feature = 200, 30000

In [24]:
def build_model(emb_matrix):
    """
    Build and compile the LSTM classifier with preloaded embedding weights.

    emb_matrix: 2-D array used to initialise the weights of the Embedding
        layer. Its first dimension must equal the module-level `max_feature`;
        its second dimension is used as the embedding size.

    return: a compiled Keras model that maps sequences of `max_len` token
        indices to 6 sigmoid outputs.
    """

    # Read the embedding size from the matrix itself so it can never disagree
    # with the weights being loaded (previously a hard-coded 200).
    emb_dim = emb_matrix.shape[1]

    # The Flatten + Dense head needs a fixed sequence length, so the input is
    # declared with that length instead of an open-ended `None`.
    inp = Input(shape=(max_len,))
    emb = Embedding(max_feature, emb_dim, input_length=max_len, weights=[emb_matrix])(inp)
    lstm = LSTM(units=60, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)(emb)
    fl = Flatten()(lstm)
    d1 = Dense(100, activation='relu')(fl)
    out = Dense(6, activation='sigmoid')(d1)
    model = Model(inputs=inp, outputs=out)

    # NOTE: the tracked metric is AUC but it is registered under the name
    # 'acc'; the checkpoint callback in this notebook monitors 'val_acc', so
    # the name must stay in sync with it.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='acc')])

    return model

In [25]:
# Make sure the checkpoint directory exists; otherwise the first save at the
# end of epoch 1 fails after a full epoch of training.
os.makedirs('model_', exist_ok=True)

# Keep only the best model according to the validation metric named 'acc'
# (which the model builders register as AUC). `mode='max'` states the
# direction explicitly instead of relying on Keras inferring it from the
# 'acc' substring in the monitored name.
# NOTE: this single instance is passed to every fit() call below, so its
# running best carries over between models (see the "did not improve from
# 0.98144" logs on freshly built models). best_lstm.h5 therefore ends up
# holding the best model across all runs, not per run.
cp = ModelCheckpoint('model_/best_lstm.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# Stop as soon as the validation loss fails to improve (Keras defaults:
# monitor='val_loss', patience=0); the monitor is spelled out for clarity.
es = EarlyStopping(monitor='val_loss', verbose=1)

callbacks = [cp, es]

In [26]:
# Run 1/4 with emb_matrix_glove -- inputs x_1: without punctuation and digits.
model1 = build_model(emb_matrix=emb_matrix_glove)

# 10% of the training rows are held out for validation; the shared
# checkpoint / early-stopping callbacks are attached.
history1_1 = model1.fit(
    x=x_tr_1,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.97971, saving model to model_/best_lstm.h5
Epoch 2/3

Epoch 00002: val_acc improved from 0.97971 to 0.98144, saving model to model_/best_lstm.h5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98144
Epoch 00003: early stopping


In [27]:
# Run 2/4 with emb_matrix_glove -- inputs x_2: without URLs, abbreviations
# and tags.
model2 = build_model(emb_matrix=emb_matrix_glove)

# Same training settings as the other runs, so results stay comparable.
history1_2 = model2.fit(
    x=x_tr_2,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98144
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98144
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98144
Epoch 00003: early stopping


In [28]:
# Run 3/4 with emb_matrix_glove -- inputs x_3: with lemmatization.
model3 = build_model(emb_matrix=emb_matrix_glove)

# Same training settings as the other runs, so results stay comparable.
history1_3 = model3.fit(
    x=x_tr_3,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98144
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98144
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98144


In [29]:
# Run 4/4 with emb_matrix_glove -- inputs x_4: with stemming.
model4 = build_model(emb_matrix=emb_matrix_glove)

# Same training settings as the other runs, so results stay comparable.
history1_4 = model4.fit(
    x=x_tr_4,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98144
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98144
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98144
Epoch 00003: early stopping


In [30]:
# Run 1/4 with emb_matrix_ft -- inputs x_1: without punctuation and digits.
model1_2 = build_model(emb_matrix=emb_matrix_ft)

# Same training settings as the emb_matrix_glove runs; only the embedding
# weights differ.
history_1 = model1_2.fit(
    x=x_tr_1,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc improved from 0.98144 to 0.98328, saving model to model_/best_lstm.h5
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328
Epoch 00003: early stopping


In [31]:
# Run 2/4 with emb_matrix_ft -- inputs x_2: without URLs, abbreviations
# and tags.
model2_2 = build_model(emb_matrix=emb_matrix_ft)

# Same training settings as the emb_matrix_glove runs; only the embedding
# weights differ.
history_2 = model2_2.fit(
    x=x_tr_2,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328


In [32]:
# Run 3/4 with emb_matrix_ft -- inputs x_3: with lemmatization.
model3_2 = build_model(emb_matrix=emb_matrix_ft)

# Same training settings as the emb_matrix_glove runs; only the embedding
# weights differ.
history_3 = model3_2.fit(
    x=x_tr_3,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328
Epoch 00003: early stopping


In [33]:
# Run 4/4 with emb_matrix_ft -- inputs x_4: with stemming.
model4_2 = build_model(emb_matrix=emb_matrix_ft)

# Same training settings as the emb_matrix_glove runs; only the embedding
# weights differ.
history_4 = model4_2.fit(
    x=x_tr_4,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328


# Model without a pretrained embedding matrix

In [34]:
def build_model_():
    """
    Build and compile the LSTM classifier without preloaded embedding weights.

    The layer stack matches `build_model`, except that the Embedding layer is
    given no `weights` argument and therefore starts from Keras' default
    initialisation.

    return: a compiled Keras model that maps sequences of `max_len` token
        indices to 6 sigmoid outputs.
    """

    # The Flatten + Dense head needs a fixed sequence length, so the input is
    # declared with that length instead of an open-ended `None`.
    inp = Input(shape=(max_len,))
    emb = Embedding(max_feature, 200, input_length=max_len)(inp)
    lstm = LSTM(units=60, dropout=0.3, recurrent_dropout=0.3, return_sequences=True)(emb)
    fl = Flatten()(lstm)
    d1 = Dense(100, activation='relu')(fl)
    out = Dense(6, activation='sigmoid')(d1)

    model = Model(inputs=inp, outputs=out)

    # NOTE: the tracked metric is AUC but it is registered under the name
    # 'acc'; the checkpoint callback in this notebook monitors 'val_acc', so
    # the name must stay in sync with it.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='acc')])

    return model

In [35]:
# Run 1/4 without preloaded embedding weights -- inputs x_tr_1.
model1_3 = build_model_()

history3_1 = model1_3.fit(
    x=x_tr_1,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328
Epoch 00003: early stopping


In [36]:
# Run 2/4 without preloaded embedding weights -- inputs x_tr_2.
model2_3 = build_model_()

history3_2 = model2_3.fit(
    x=x_tr_2,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328
Epoch 00003: early stopping


In [37]:
# Run 3/4 without preloaded embedding weights -- inputs x_tr_3.
model3_3 = build_model_()

history3_3 = model3_3.fit(
    x=x_tr_3,
    y=y_tr,
    batch_size=128,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98328
Epoch 00003: early stopping


In [38]:
# Run 4/4 without preloaded embedding weights -- inputs x_tr_4.
model4_3 = build_model_()

# NOTE(review): this run uses batch_size=32 while every other run in the
# notebook uses 128 -- confirm whether that is intentional, since it makes
# this result not directly comparable with the others.
history3_4 = model4_3.fit(
    x=x_tr_4,
    y=y_tr,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98328
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98328
Epoch 00002: early stopping
