In [2]:
import os, re
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional
from keras.layers import GlobalMaxPool1D, Conv1D, MaxPooling1D, Flatten, AveragePooling1D
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from keras.metrics import AUC
from keras.models import load_model

Using TensorFlow backend.


In [3]:
# Ask TensorFlow to grow GPU memory on demand for each detected GPU.
# Looping over the device list (instead of indexing [0]) keeps this cell from
# raising IndexError on a CPU-only machine and applies the same setting to
# every GPU that is present.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# Load preprocessed data and embedding weights

In [4]:
# Training matrix, variant 1 (described further down as "without digits and punctuation").
x_tr_1 = np.genfromtxt(
    fname='data_preproc/x_tr_1_it.csv',
)

In [5]:
# Sanity check of the loaded array's dimensions; the recorded run shows (7410, 200).
x_tr_1.shape

(7410, 200)

In [6]:
# Variant-1 matrix for the "ts" split — presumably the test-set counterpart of x_tr_1 (TODO confirm).
x_ts_1 = np.genfromtxt(
    fname='data_preproc/x_ts_1_it.csv',
)

In [7]:
# Sanity check of the loaded array's dimensions; the recorded run shows (1998, 200).
x_ts_1.shape

(1998, 200)

In [8]:
# Training matrix, variant 2 (described further down as "without URLs, stopwords and abbreviations").
x_tr_2 = np.genfromtxt(
    fname='data_preproc/x_tr_2_it.csv',
)

In [9]:
# Sanity check of the loaded array's dimensions; the recorded run shows (7410, 200).
x_tr_2.shape

(7410, 200)

In [10]:
# Variant-2 matrix for the "ts" split — presumably the test-set counterpart of x_tr_2 (TODO confirm).
x_ts_2 = np.genfromtxt(
    fname='data_preproc/x_ts_2_it.csv',
)

In [11]:
# Sanity check of the loaded array's dimensions; the recorded run shows (1998, 200).
x_ts_2.shape

(1998, 200)

In [12]:
# Training matrix, variant 3 (described further down as "with lemmatization").
x_tr_3 = np.genfromtxt(
    fname='data_preproc/x_tr_3_it.csv',
)

In [13]:
# Sanity check of the loaded array's dimensions; the recorded run shows (7410, 200).
x_tr_3.shape

(7410, 200)

In [14]:
# Variant-3 matrix for the "ts" split — presumably the test-set counterpart of x_tr_3 (TODO confirm).
x_ts_3 = np.genfromtxt(
    fname='data_preproc/x_ts_3_it.csv',
)

In [15]:
# Sanity check of the loaded array's dimensions; the recorded run shows (1998, 200).
x_ts_3.shape

(1998, 200)

In [16]:
# Training matrix, variant 4 (described further down as "with stemming").
x_tr_4 = np.genfromtxt(
    fname='data_preproc/x_tr_4_it.csv',
)

In [17]:
# Sanity check of the loaded array's dimensions; the recorded run shows (7410, 200).
x_tr_4.shape

(7410, 200)

In [18]:
# Variant-4 matrix for the "ts" split — presumably the test-set counterpart of x_tr_4 (TODO confirm).
x_ts_4 = np.genfromtxt(
    fname='data_preproc/x_ts_4_it.csv',
)

In [19]:
# Sanity check of the loaded array's dimensions; the recorded run shows (1998, 200).
x_ts_4.shape

(1998, 200)

In [20]:
# Embedding weight matrix, later passed to build_model to initialise the Embedding layer.
emb_matrix_ft = np.genfromtxt(
    fname='data_preproc/emb_matr_ft_it.csv',
)

In [21]:
# Sanity check of the embedding matrix dimensions; the recorded run shows (30000, 200).
emb_matrix_ft.shape

(30000, 200)

In [22]:
# Training targets: the two label columns of the training CSV, taken together
# as a 2-column array (one row per CSV row).
# NOTE(review): 'opos' / 'oneg' are presumably the positive / negative polarity
# flags, and the rows are assumed to line up with the x_tr_* matrices — confirm.
col = ['opos', 'oneg']
train = pd.read_csv(
    'data_it/training_set_sentipolc16.csv/training_set_sentipolc16.csv'
)
y_tr = train[col].values

# Convolutional Neural Network

In [23]:
# Sequence length per sample; equals the 200 columns reported for the x_tr_* / x_ts_* arrays.
max_len = 200
# Vocabulary size; equals the 30000 rows reported for emb_matrix_ft.
max_feature = 30000

In [68]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall over the given tensors.

    Counts the rounded, [0, 1]-clipped entries of ``y_true * y_pred`` as
    true positives and the rounded, clipped entries of ``y_true`` as all
    actual positives, then divides the former by the latter.
    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors.

    Counts the rounded, [0, 1]-clipped entries of ``y_true * y_pred`` as
    true positives and the rounded, clipped entries of ``y_pred`` as all
    predicted positives, then divides the former by the latter.
    ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator, as the two component
    metrics already do, so that inputs with no true positives
    (precision == recall == 0) yield 0 instead of NaN from a 0/0 division.
    """
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))


In [74]:
# Training callbacks, built once and passed to every model.fit call in this notebook:
#   * ModelCheckpoint writes the model to models_it/best_cnn_IT.h5 whenever
#     val_accuracy reaches a new maximum.
#   * EarlyStopping stops training after 10 epochs without improvement of its
#     monitored quantity; no `monitor` is given, so the Keras default applies.
# NOTE(review): the checkpoint tracks val_accuracy while early stopping uses its
# default monitor (val_loss per the Keras docs) — confirm the mismatch is intended.
callbacks = [
    ModelCheckpoint(
        filepath='models_it/best_cnn_IT.h5',
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1,
    ),
    EarlyStopping(patience=10, verbose=1),
]
cp, es = callbacks

In [75]:
def build_model(emb_matrix):
    """
    Build and compile the 1-D convolutional classifier.

    Layers, in order: Embedding -> Conv1D(128 filters, kernel size 3, ReLU,
    'same' padding) -> GlobalMaxPool1D -> Dropout(0.3) -> Dense(100, ReLU)
    -> Dropout(0.1) -> Dense(2, sigmoid). The model is compiled with the
    Adam optimizer, binary cross-entropy loss and the accuracy metric.

    emb_matrix: 2-D array-like of shape (vocabulary size, embedding dimension)
        used as the initial weights of the Embedding layer.
    return: the compiled Sequential model.
    """
    # Size the Embedding layer from the matrix itself so the layer always
    # agrees with the weights it is initialised with (previously the sizes
    # were duplicated as hard-coded values).
    vocab_size, emb_dim = np.shape(emb_matrix)

    model = Sequential([
        Embedding(vocab_size, emb_dim, input_length=max_len, weights=[emb_matrix]),
        Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'),
        GlobalMaxPool1D(),
        Dropout(rate=0.3),
        Dense(units=100, activation='relu'),
        Dropout(rate=0.1),
        # Two independent sigmoid outputs, one per target column.
        Dense(2, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [76]:
# Experiment 1 — input x_tr_1: training data without digits and punctuation.
# A new model is built, then trained with the last 10% of samples held out for validation.
# NOTE(review): `callbacks` is the same list object passed to the later fits as well.
model1 = build_model(emb_matrix_ft)

history1_1 = model1.fit(
    x=x_tr_1,
    y=y_tr,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6669 samples, validate on 741 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.72470, saving model to models_it/best_cnn_IT.h5
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.72470 to 0.74224, saving model to models_it/best_cnn_IT.h5
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.74224
Epoch 4/50

Epoch 00004: val_accuracy improved from 0.74224 to 0.76991, saving model to models_it/best_cnn_IT.h5
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.76991
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.76991
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.76991
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.76991
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.76991
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.76991
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.76991
Epoch 12/50

Epoch 00012: val_accuracy did not improve from 0.76991
Epoch 13/50

In [77]:
# Experiment 2 — input x_tr_2: without URLs, stopwords and abbreviations.
# NOTE(review): `callbacks` is the list already used for model1; the recorded log
# of this run starts from the previous run's best val_accuracy, so the checkpoint
# file keeps the best across experiments rather than per model — confirm intended.
model2 = build_model(emb_matrix_ft)

history1_2 = model2.fit(
    x=x_tr_2,
    y=y_tr,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6669 samples, validate on 741 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from 0.76991 to 0.81781, saving model to models_it/best_cnn_IT.h5
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.81781
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.81781
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.81781
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.81781
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.81781
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.81781
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.81781
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.81781
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.81781
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.81781
Epoch 12/50

Epoch 00012: val_accuracy did not improve from 0.81781
Epoch 00012: early stopping


In [78]:
# Experiment 3 — input x_tr_3: with lemmatization.
# NOTE(review): `callbacks` is shared with the earlier fits; the recorded log of
# this run starts from the previous run's best val_accuracy, so the checkpoint
# file keeps the best across experiments rather than per model — confirm intended.
model3 = build_model(emb_matrix_ft)

history1_3 = model3.fit(
    x=x_tr_3,
    y=y_tr,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6669 samples, validate on 741 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from 0.81781 to 0.82119, saving model to models_it/best_cnn_IT.h5
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.82119
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.82119
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.82119
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.82119
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.82119
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.82119
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.82119
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.82119
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.82119
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.82119
Epoch 12/50

Epoch 00012: val_accuracy did not improve from 0.82119
Epoch 13/50

Epoch 00013: val_accuracy did not improve from 0.82119
Epoch 00013: early stopping


In [79]:
# Experiment 4 — input x_tr_4: with stemming.
# NOTE(review): `callbacks` is shared with the earlier fits; in the recorded log
# this run never beats the previous best val_accuracy, so no checkpoint of
# model4 is written — confirm a single cross-experiment best is intended.
model4 = build_model(emb_matrix_ft)

history1_4 = model4.fit(
    x=x_tr_4,
    y=y_tr,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
    callbacks=callbacks,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6669 samples, validate on 741 samples
Epoch 1/50

Epoch 00001: val_accuracy did not improve from 0.82119
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.82119
Epoch 3/50

Epoch 00003: val_accuracy did not improve from 0.82119
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.82119
Epoch 5/50

Epoch 00005: val_accuracy did not improve from 0.82119
Epoch 6/50

Epoch 00006: val_accuracy did not improve from 0.82119
Epoch 7/50

Epoch 00007: val_accuracy did not improve from 0.82119
Epoch 8/50

Epoch 00008: val_accuracy did not improve from 0.82119
Epoch 9/50

Epoch 00009: val_accuracy did not improve from 0.82119
Epoch 10/50

Epoch 00010: val_accuracy did not improve from 0.82119
Epoch 11/50

Epoch 00011: val_accuracy did not improve from 0.82119
Epoch 12/50

Epoch 00012: val_accuracy did not improve from 0.82119
Epoch 00012: early stopping
