In [1]:
import os, re
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional
from keras.layers import GlobalMaxPool1D, Conv1D, MaxPooling1D, Flatten, AveragePooling1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import AUC
from keras.models import load_model

Using TensorFlow backend.


In [2]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. The setting is applied to every detected GPU (not just the first),
# and the loop is simply a no-op on a CPU-only machine, where indexing
# physical_devices[0] would raise IndexError.
# NOTE(review): per the TensorFlow docs this has to run before the GPUs are
# initialised, so keep this cell ahead of any cell that builds a model.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

# Load preprocessed data and embedding weights

In [3]:
# Training inputs, preprocessing variant 1 (described in the training cells of
# this notebook as "without digits and punctuation").
# The file is parsed with np.genfromtxt's default settings.
x_tr_1 = np.genfromtxt('data_preproc/x_tr_1.csv')

In [4]:
# Quick check of the loaded array's dimensions.
x_tr_1.shape

(159571, 200)

In [5]:
# NOTE(review): presumably the test-set counterpart of x_tr_1 — confirm against
# the preprocessing step. It is not used by any cell visible in this section.
x_ts_1 = np.genfromtxt('data_preproc/x_ts_1.csv')

In [6]:
# Quick check of the loaded array's dimensions.
x_ts_1.shape

(153164, 200)

In [7]:
# Training inputs, preprocessing variant 2 (described in the training cells of
# this notebook as "without URLs, stopwords and abbreviations").
# The file is parsed with np.genfromtxt's default settings.
x_tr_2 = np.genfromtxt('data_preproc/x_tr_2.csv')

In [8]:
# Quick check of the loaded array's dimensions.
x_tr_2.shape

(159571, 200)

In [9]:
# NOTE(review): presumably the test-set counterpart of x_tr_2 — confirm against
# the preprocessing step. It is not used by any cell visible in this section.
x_ts_2 = np.genfromtxt('data_preproc/x_ts_2.csv')

In [10]:
# Quick check of the loaded array's dimensions.
x_ts_2.shape

(153164, 200)

In [11]:
# Training inputs, preprocessing variant 3 (described in the training cells of
# this notebook as "with lemmatization").
# The file is parsed with np.genfromtxt's default settings.
x_tr_3 = np.genfromtxt('data_preproc/x_tr_3.csv')

In [12]:
# Quick check of the loaded array's dimensions.
x_tr_3.shape

(159571, 200)

In [13]:
# NOTE(review): presumably the test-set counterpart of x_tr_3 — confirm against
# the preprocessing step. It is not used by any cell visible in this section.
x_ts_3 = np.genfromtxt('data_preproc/x_ts_3.csv')

In [14]:
# Quick check of the loaded array's dimensions.
x_ts_3.shape

(153164, 200)

In [15]:
# Training inputs, preprocessing variant 4 (described in the training cells of
# this notebook as "with stemming").
# The file is parsed with np.genfromtxt's default settings.
x_tr_4 = np.genfromtxt('data_preproc/x_tr_4.csv')

In [16]:
# Quick check of the loaded array's dimensions.
x_tr_4.shape

(159571, 200)

In [17]:
# NOTE(review): presumably the test-set counterpart of x_tr_4 — confirm against
# the preprocessing step. It is not used by any cell visible in this section.
x_ts_4 = np.genfromtxt('data_preproc/x_ts_4.csv')

In [18]:
# Quick check of the loaded array's dimensions.
x_ts_4.shape

(153164, 200)

In [19]:
# Matrix later passed to build_model() as the initial Embedding-layer weights.
# NOTE(review): the name suggests GloVe vectors — confirm against the
# preprocessing step that wrote this file.
emb_matrix_glove = np.genfromtxt('data_preproc/emb_matr_glove.csv')

In [20]:
# Quick check of the matrix dimensions; build_model() uses it as Embedding
# weights, so it has to match the Embedding layer's (input_dim, output_dim).
emb_matrix_glove.shape

(30000, 200)

In [21]:
# Second matrix later passed to build_model() as the initial Embedding-layer weights.
# NOTE(review): "ft" presumably stands for fastText — confirm against the
# preprocessing step that wrote this file.
emb_matrix_ft = np.genfromtxt('data_preproc/emb_matr_ft.csv')

In [22]:
# Quick check of the matrix dimensions; build_model() uses it as Embedding
# weights, so it has to match the Embedding layer's (input_dim, output_dim).
emb_matrix_ft.shape

(30000, 200)

In [23]:
# The six binary label columns; their order fixes the order of the six
# sigmoid outputs the models are trained against.
col = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Raw training table (note the nested "train.csv/train.csv" path).
train = pd.read_csv('data/train.csv/train.csv')

# Label array with one column per entry of `col`.
y_tr = train[col].values

# Convolutional Neural Network

In [24]:
# Passed to the Embedding layers as input_length (tokens per input sequence).
max_len = 200
# Passed to the Embedding layers as input_dim (number of rows in the embedding table).
max_feature = 30000

In [31]:
# The checkpoint below is written into 'model_/'; create the directory first so
# the first save does not fail on a fresh checkout.
os.makedirs('model_', exist_ok=True)

# Keep only the best model seen so far. 'val_acc' is the validation value of the
# metric the models register under the name 'acc' (an AUC, see build_model), so
# higher is better; mode='max' states that explicitly instead of relying on
# Keras inferring it from the metric name.
# NOTE: this single instance is passed to every fit() call in the notebook, and
# the recorded logs show its "best" value carrying over from one model to the
# next, so best_cnn.h5 ends up holding the best model across ALL runs.
cp = ModelCheckpoint('model_/best_cnn.h5', monitor='val_acc', mode='max', verbose=1, save_best_only=True)

# Same behaviour as a bare EarlyStopping(verbose=1), with the defaults spelled
# out: training stops the first time val_loss fails to improve.
# NOTE(review): early stopping watches val_loss while the checkpoint watches
# val_acc — confirm that this asymmetry is intended.
es = EarlyStopping(monitor='val_loss', patience=0, verbose=1)

callbacks = [cp, es]

In [32]:
def build_model(emb_matrix):
    """
    Build and compile the CNN classifier with a pre-initialised Embedding layer.

    Architecture: Embedding -> Conv1D(128 filters, kernel 3, relu, 'same')
    -> GlobalMaxPool1D -> Dropout(0.3) -> Dense(100, relu) -> Dropout(0.1)
    -> Dense(6, sigmoid). Compiled with the Adam optimizer and binary
    cross-entropy loss.

    emb_matrix: 2-D array of shape (max_feature, embedding_dim) used as the
        initial weights of the Embedding layer. The embedding width is taken
        from the matrix itself rather than being hard-coded.

    return: the compiled model

    raises ValueError: if emb_matrix is not 2-D or its number of rows differs
        from max_feature.
    """
    # Fail early with a readable message instead of an opaque weight-shape
    # error from inside Keras.
    if len(emb_matrix.shape) != 2 or emb_matrix.shape[0] != max_feature:
        raise ValueError(
            'emb_matrix must have shape ({}, embedding_dim), got {}'.format(
                max_feature, tuple(emb_matrix.shape)))
    emb_dim = emb_matrix.shape[1]

    inp = Input(shape=(None,))
    emb = Embedding(max_feature, emb_dim, input_length=max_len, weights=[emb_matrix])(inp)
    conv = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(emb)
    # Max over the sequence axis collapses the sequence to one vector per sample.
    glbm = GlobalMaxPool1D()(conv)
    drp1 = Dropout(rate=0.3)(glbm)
    d1 = Dense(units=100, activation='relu')(drp1)
    drp2 = Dropout(rate=0.1)(d1)
    # One independent sigmoid per label (multi-label output).
    out = Dense(6, activation='sigmoid')(drp2)
    model = Model(inputs=inp, outputs=out)
    # The metric is an AUC registered under the name 'acc'. The notebook's
    # ModelCheckpoint monitors 'val_acc', so the name must stay in sync with it;
    # the logged 'acc' / 'val_acc' values are AUC, not accuracy.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[AUC(name='acc')])

    return model

In [33]:
# Fit with x_tr_1: training data without digits and punctuation.
# Embedding weights are initialised from emb_matrix_glove.

model1 = build_model(emb_matrix_glove)

# validation_split=0.1 holds out 10% of the samples for validation.
history1_1 = model1.fit(x_tr_1, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc improved from -inf to 0.98194, saving model to model_/best_cnn.h5
Epoch 2/3

Epoch 00002: val_acc improved from 0.98194 to 0.98199, saving model to model_/best_cnn.h5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98199
Epoch 00003: early stopping


In [34]:
# Fit with x_tr_2: without URLs, stopwords and abbreviations.
# Embedding weights are initialised from emb_matrix_glove.

model2 = build_model(emb_matrix_glove)

# validation_split=0.1 holds out 10% of the samples for validation.
history1_2 = model2.fit(x_tr_2, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc improved from 0.98199 to 0.98216, saving model to model_/best_cnn.h5
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98216
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98216
Epoch 00003: early stopping


In [35]:
# Fit with x_tr_3: with lemmatization.
# Embedding weights are initialised from emb_matrix_glove.

model3 = build_model(emb_matrix_glove)

# validation_split=0.1 holds out 10% of the samples for validation.
history1_3 = model3.fit(x_tr_3, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98216
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98216
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98216


In [36]:
# Fit with x_tr_4: with stemming.
# Embedding weights are initialised from emb_matrix_glove.

model4 = build_model(emb_matrix_glove)

# validation_split=0.1 holds out 10% of the samples for validation.
history1_4 = model4.fit(x_tr_4, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98216
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98216
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98216


In [37]:
# Fit with x_tr_1: training data without digits and punctuation.
# Embedding weights are initialised from emb_matrix_ft.

model1_2 = build_model(emb_matrix_ft)

# validation_split=0.1 holds out 10% of the samples for validation.
history2_1 = model1_2.fit(x_tr_1, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98216
Epoch 2/3

Epoch 00002: val_acc improved from 0.98216 to 0.98421, saving model to model_/best_cnn.h5
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
Epoch 00003: early stopping


In [38]:
# Fit with x_tr_2: without URLs, stopwords and abbreviations.
# Embedding weights are initialised from emb_matrix_ft.

model2_2 = build_model(emb_matrix_ft)

# validation_split=0.1 holds out 10% of the samples for validation.
history2_2 = model2_2.fit(x_tr_2, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
Epoch 00003: early stopping


In [39]:
# Fit with x_tr_3: with lemmatization.
# Embedding weights are initialised from emb_matrix_ft.

model3_2 = build_model(emb_matrix_ft)

# validation_split=0.1 holds out 10% of the samples for validation.
history2_3 = model3_2.fit(x_tr_3, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 00002: early stopping


In [40]:
# Fit with x_tr_4: with stemming.
# Embedding weights are initialised from emb_matrix_ft.

model4_2 = build_model(emb_matrix_ft)

# validation_split=0.1 holds out 10% of the samples for validation.
history2_4 = model4_2.fit(x_tr_4, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421


# Model without embedding weights initialization

In [41]:
def build_model_():
    """
    Build and compile the CNN classifier without supplying Embedding weights.

    The Embedding layer (max_feature rows, 200 columns) receives no `weights`
    argument, so it starts from the layer's default initialisation. The rest
    of the stack is Conv1D -> GlobalMaxPool1D -> Dropout -> Dense -> Dropout
    -> Dense(6, sigmoid), compiled with Adam and binary cross-entropy.

    return: the compiled model
    """
    tokens = Input(shape=(None,))

    # Feature extractor: embedding lookup, one convolution, max over the sequence.
    x = Embedding(max_feature, 200, input_length=max_len)(tokens)
    x = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(x)
    x = GlobalMaxPool1D()(x)

    # Classifier head with dropout before and after the hidden layer.
    x = Dropout(rate=0.3)(x)
    x = Dense(units=100, activation='relu')(x)
    x = Dropout(rate=0.1)(x)
    probs = Dense(6, activation='sigmoid')(x)

    cnn = Model(inputs=tokens, outputs=probs)
    # The metric is an AUC that is reported under the name 'acc'.
    cnn.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[AUC(name='acc')],
    )
    return cnn

In [42]:
# Fit with x_tr_1: training data without digits and punctuation.
# The Embedding layer is built without initial weights (build_model_).

model1_3 = build_model_()

# validation_split=0.1 holds out 10% of the samples for validation.
history3_1 = model1_3.fit(x_tr_1, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
Epoch 00003: early stopping


In [43]:
# Fit with x_tr_2: without URLs, stopwords and abbreviations.
# The Embedding layer is built without initial weights (build_model_).

model2_3 = build_model_()

# validation_split=0.1 holds out 10% of the samples for validation.
history3_2 = model2_3.fit(x_tr_2, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
Epoch 00003: early stopping


In [44]:
# Fit with x_tr_3: with lemmatization.
# The Embedding layer is built without initial weights (build_model_).

model3_3 = build_model_()

# validation_split=0.1 holds out 10% of the samples for validation.
history3_3 = model3_3.fit(x_tr_3, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
Epoch 00003: early stopping


In [45]:
# Fit with x_tr_4: with stemming.
# The Embedding layer is built without initial weights (build_model_).

model4_3 = build_model_()

# validation_split=0.1 holds out 10% of the samples for validation.
history3_4 = model4_3.fit(x_tr_4, y_tr, batch_size=128, epochs=3, validation_split=0.1, callbacks=callbacks)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 143613 samples, validate on 15958 samples
Epoch 1/3

Epoch 00001: val_acc did not improve from 0.98421
Epoch 2/3

Epoch 00002: val_acc did not improve from 0.98421
Epoch 3/3

Epoch 00003: val_acc did not improve from 0.98421
