In [1]:
from pathlib import Path
import sys
sys.path.append("../../")

from config import Config
import pandas as pd
import numpy as np
np.random.seed(42)

Loading the dataset, pre-processing it and tokenizing it.

In [2]:
from preprocessing.pipeline import ItalianTweetsPreprocessingPipeline
from preprocessing.tokenization import ItalianTweetsTokenizer

In [3]:
df = pd.read_csv(Config.TRAINING_DATASET_PATH)

In [4]:
#automatically pre-processes the dataset
itt = ItalianTweetsTokenizer()
dft = itt.tokenize(df, 'text')

In [5]:
dft[0].shape

(50,)

In [6]:
tok = itt.tokenizer

In [7]:
num_words = len(tok.word_index) + 1

In [8]:
import tensorflow as tk
from tensorflow import keras

In [9]:
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

In [67]:
model = keras.Sequential()
model.add(keras.layers.Embedding(num_words, 32, input_length=Config.SEQUENCE_LENGTH, mask_zero=True))
model.add(keras.layers.GRU(32, return_sequences=True, recurrent_dropout=0.4))
model.add(keras.layers.GRU(32, recurrent_dropout=0.3, return_sequences=True))
model.add(keras.layers.GRU(16, recurrent_dropout=0.3))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [68]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 32)            466784    
_________________________________________________________________
gru_11 (GRU)                 (None, 50, 32)            6336      
_________________________________________________________________
gru_12 (GRU)                 (None, 50, 32)            6336      
_________________________________________________________________
gru_13 (GRU)                 (None, 16)                2400      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 481,873
Trainable params: 481,873
Non-trainable params: 0
_________________________________________________________________
None


In [69]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(dft, df['irony'].to_numpy(), test_size=0.2)

In [70]:
x_train[0].reshape(1,50).shape

(1, 50)

In [71]:
x_train.shape

(3181, 50)

In [72]:
x_train_reshaped = np.array([x.reshape(1,50) for x in x_train])
x_train_reshaped

array([[[   1,  939,  112, ...,    0,    0,    0]],

       [[ 635,   42,    1, ...,    0,    0,    0]],

       [[   1,   81,    1, ...,    0,    0,    0]],

       ...,

       [[   1,  429, 1834, ...,    0,    0,    0]],

       [[   1,    1,    1, ...,    0,    0,    0]],

       [[1028,    1, 1230, ...,    0,    0,    0]]], dtype=int32)

In [73]:
model.fit(x_train, y_train, batch_size=10, epochs=10, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10


<tensorflow.python.keras.callbacks.History at 0x1481c9e50>

In [74]:
model.evaluate(x_test, y_test)



[0.9949962496757507, 0.5804020166397095]

In [76]:
predicted = model.predict_classes(x_test).ravel()
predicted == y_test

array([False,  True, False, False,  True,  True,  True,  True, False,
        True,  True, False, False,  True,  True, False, False,  True,
        True, False, False, False,  True,  True, False, False, False,
        True,  True, False,  True,  True,  True, False,  True, False,
       False, False,  True, False, False, False,  True,  True,  True,
       False, False,  True,  True, False,  True,  True,  True, False,
        True,  True, False, False,  True,  True, False, False, False,
        True,  True,  True, False, False,  True, False,  True,  True,
       False, False, False,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True, False,  True, False, False,  True, False,
        True, False,  True,  True,  True,  True, False, False,  True,
       False,  True,  True, False,  True,  True, False,  True, False,
        True, False,  True,  True,  True,  True, False, False,  True,
        True,  True,

In [58]:
(model.predict(x_test[0].reshape(1,50)) > 0.5).astype("int32")

array([[1]], dtype=int32)