# Grid search 

In [1]:
import sys
sys.path.append("../")

import itertools
import random
import pandas as pd
import numpy as np
from config import Config
import keras


# Sequence length declared to the Embedding layer via `input_length`.
# (The misspelled name is kept because other cells reference it.)
INPUT_LENGHT = 194
# Second positional argument of the Embedding layer: size of each token vector.
EMBEDDING_SIZE = 128
# Fraction of the shuffled samples held out as the validation set.
VALIDATION_SPLIT = 0.1

In [2]:
from preprocessing.tokenizationBagWords import tokenize_frame
from keras.layers import Conv1D, Dropout, Flatten, Dense

In [3]:
# Candidate recurrent stacks. Each stack is a list with one tuple per GRU layer,
# laid out as (units, return_sequences, dropout_rate); a dropout_rate of 0.0
# means no Dropout layer is added after that GRU.
hidden_layers = [
    [(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)],
    [(128, True, 0.5), (32, False, 0.2)],
    [(128, True, 0.5), (64, False, 0.2)],
    [(128, True, 0.0), (32, False, 0.0)],
    [(128, False, 0.0)],
    [(128, False, 0.2)],
]
loss = ['binary_crossentropy']
epochs = [10]

# Cartesian product of the three axes: every entry is (stack, loss, epochs).
combinations = list(itertools.product(hidden_layers, loss, epochs))
combinations

# NOTE(review): not referenced by any other visible cell.
validation_performance = []

# Load the raw training set and run it through the project preprocessing pipeline.
training_data = pd.read_csv(Config.TRAINING_DATASET_PATH)

from preprocessing.pipeline import ItalianTweetsPreprocessingPipeline
pp = ItalianTweetsPreprocessingPipeline()
preprocessed_training_data = pp.apply(training_data)

# Targets: one (irony, sarcasm) pair per sample.
y_1 = preprocessed_training_data['irony'].tolist()
y_2 = preprocessed_training_data['sarcasm'].tolist()
y = list(zip(y_1, y_2))

# tokenize_frame returns three values; the second is unused here because only
# one frame is passed in. The third is later used as the Embedding input size.
x, _, num_words = tokenize_frame(preprocessed_training_data['text'])


# Shuffle inputs and targets together so every sample keeps its own label pair.
# NOTE(review): the shuffle is unseeded, so the train/validation partition
# changes on every run; seed `random` beforehand if runs must be comparable.
paired_samples = list(zip(x, y))
random.shuffle(paired_samples)
x, y = zip(*paired_samples)

# Hold out the trailing VALIDATION_SPLIT fraction of the shuffled samples.
validate_size = int(len(x) * VALIDATION_SPLIT)
# Slice on an explicit, non-negative boundary. With negative indexing a
# validate_size of 0 would make `x[:-0]` an empty training set and `x[-0:]`
# a validation set containing every sample.
split_at = len(x) - validate_size
x_train = np.asarray(x[:split_at])
x_validate = np.asarray(x[split_at:])
y_train = np.asarray(y[:split_at])
y_validate = np.asarray(y[split_at:])




In [5]:
# Grid search: train one model per hyperparameter combination and score it on
# the held-out validation split. Every `results` entry has the form
# [combination, {metric_name: value}], built from `model.evaluate`.
results = []
for index, combination in enumerate(combinations):
    print("index: " + str(index) + " Configurazione: " + str(combination))
    layer_specs, loss_name, n_epochs = combination

    # Each candidate is built from scratch; release the global Keras state left
    # by the previous one so memory does not keep growing across iterations.
    keras.backend.clear_session()

    model = keras.Sequential()
    model.add(keras.layers.Embedding(num_words, EMBEDDING_SIZE, input_length=INPUT_LENGHT, mask_zero=True))
    # layer_specs holds one (units, return_sequences, dropout_rate) tuple per GRU layer.
    for units, return_sequences, dropout_rate in layer_specs:
        model.add(keras.layers.GRU(units, return_sequences=return_sequences))
        if dropout_rate != 0.0:
            model.add(keras.layers.Dropout(dropout_rate))
    # Two sigmoid units: one independent output per target column.
    model.add(keras.layers.Dense(2, activation='sigmoid'))
    model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])

    model.fit(x_train, y_train, epochs=n_epochs, verbose=0)
    result = model.evaluate(x_validate, y_validate)
    results.append([combination, dict(zip(model.metrics_names, result))])

index: 0 Configurazione: ([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'binary_crossentropy', 10)
index: 1 Configurazione: ([(128, True, 0.5), (32, False, 0.2)], 'binary_crossentropy', 10)
index: 2 Configurazione: ([(128, True, 0.5), (64, False, 0.2)], 'binary_crossentropy', 10)
index: 3 Configurazione: ([(128, True, 0.0), (32, False, 0.0)], 'binary_crossentropy', 10)
index: 4 Configurazione: ([(128, False, 0.0)], 'binary_crossentropy', 10)
index: 5 Configurazione: ([(128, False, 0.2)], 'binary_crossentropy', 10)


In [6]:
metric = 'accuracy'

# Pair every tested combination with its validation score and order the pairs
# from best to worst (ties keep their original grid order).
values = sorted(
    ((combo, scores[metric]) for combo, scores in results),
    key=lambda pair: pair[1],
    reverse=True,
)

print('Best Hyperparameter')
for ranked_combo, ranked_score in values:
    print(ranked_combo, ranked_score)

Best Hyperparameter
([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'binary_crossentropy', 10) 1.0
([(128, True, 0.5), (32, False, 0.2)], 'binary_crossentropy', 10) 1.0
([(128, True, 0.5), (64, False, 0.2)], 'binary_crossentropy', 10) 1.0
([(128, True, 0.0), (32, False, 0.0)], 'binary_crossentropy', 10) 1.0
([(128, False, 0.0)], 'binary_crossentropy', 10) 1.0
([(128, False, 0.2)], 'binary_crossentropy', 10) 1.0


# Best Model

In [None]:
from pathlib import Path
import sys
sys.path.append("../")


from config import Config
import pandas as pd

In [None]:
training_data = pd.read_csv(Config.TRAINING_DATASET_PATH)
# The 'id' and 'topic' columns are discarded from the test set right after loading.
test_data = pd.read_csv(Config.TEST_DATASET_PATH).drop(columns=['id', 'topic'])

In [None]:
from preprocessing.pipeline import ItalianTweetsPreprocessingPipeline
# Single pipeline instance, applied to both the training and the test frame below.
pp = ItalianTweetsPreprocessingPipeline()

In [None]:
# Run the same preprocessing over both frames, training set first.
preprocessed_training_data, preprocessed_test_data = (
    pp.apply(training_data),
    pp.apply(test_data),
)


In [None]:
# Inputs are the 'text' column; targets are the two label columns, kept as a DataFrame.
label_columns = ['irony', 'sarcasm']
x_train, x_test = preprocessed_training_data['text'], preprocessed_test_data['text']
y_train, y_test = preprocessed_training_data[label_columns], preprocessed_test_data[label_columns]

In [None]:
import tensorflow as tk
from tensorflow import keras
from config import Config

# Fraction of the training data that `model.fit` holds out via `validation_split`.
VALIDATION_SIZE = 0.1
# Default size of each token vector produced by the Embedding layer.
EMBEDDING_SIZE = 128
# Sequence length declared to the Embedding layer via `input_length`.
# (The misspelled name is kept because other cells reference it.)
INPUT_LENGHT = 194

from preprocessing.tokenizationBagWords import tokenize_frame
import numpy as np


In [None]:
# Reproducibility setup: the same seed is pushed into every random number
# generator used by this notebook (Python hashing, `random`, NumPy, TensorFlow).
seed_value= 450

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting
# it from a running kernel likely affects only child processes - confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator.
import numpy as np
np.random.seed(seed_value)

# 4. Seed TensorFlow's global pseudo-random generator (TF2 API).
import tensorflow as tf
tf.random.set_seed(seed_value)
# TF1-compatible spelling of the same call, kept for reference:
# tf.compat.v1.set_random_seed(seed_value)

# 5. Install a TF1-compat session limited to one intra-op and one inter-op
#    thread as the Keras backend session (presumably to avoid nondeterminism
#    from parallel op scheduling).
# NOTE(review): this uses the standalone `keras` backend while the model code in
# this notebook imports `keras` from `tensorflow`; confirm both refer to the
# same backend and that `set_session` exists in the installed version.
from keras import backend as K
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)

In [None]:
def model_gru(x, y, num_words: int, combination, embedding_size=EMBEDDING_SIZE):
    """Build, compile and train a stacked-GRU model with two sigmoid outputs.

    Args:
        x: Training inputs, passed unchanged to `model.fit`. Assumed to be
            token-id sequences of length INPUT_LENGHT - TODO confirm against
            the tokenizer output.
        y: Training targets, passed unchanged to `model.fit`; the model emits
            two values per sample.
        num_words: First argument of the Embedding layer (vocabulary size).
        combination: Indexable hyperparameter triple
            `(layer_specs, loss, epochs)`, where `layer_specs` is a sequence
            of `(units, return_sequences, dropout_rate)` tuples, one per GRU
            layer. A dropout_rate of 0.0 adds no Dropout layer.
        embedding_size: Size of each token vector produced by the Embedding
            layer. Defaults to the module-level EMBEDDING_SIZE.

    Returns:
        The fitted `keras.Sequential` model.
    """
    layer_specs, loss_name, n_epochs = combination[0], combination[1], combination[2]

    model = keras.Sequential()
    # Use the caller-supplied embedding_size; the parameter used to be ignored
    # in favour of the global constant.
    model.add(keras.layers.Embedding(num_words, embedding_size, input_length=INPUT_LENGHT, mask_zero=True))
    for units, return_sequences, dropout_rate in layer_specs:
        model.add(keras.layers.GRU(units, return_sequences=return_sequences))
        if dropout_rate != 0.0:
            model.add(keras.layers.Dropout(dropout_rate))
    model.add(keras.layers.Dense(2, activation='sigmoid'))
    model.compile(loss=loss_name, optimizer='adam', metrics=['accuracy'])

    # VALIDATION_SIZE is the fraction of (x, y) Keras sets aside for validation.
    model.fit(x, y, epochs=n_epochs, validation_split=VALIDATION_SIZE)

    return model


In [17]:
# Tokenize the training and test texts in a single call, then convert both
# results to NumPy arrays for Keras.
train_texts, test_texts = x_train.tolist(), x_test.tolist()
tokenized_train, tokenized_test, num_words = tokenize_frame(train_texts, test_texts)
x_train_gru, x_test_gru = np.asarray(tokenized_train), np.asarray(tokenized_test)

# First-listed configuration from the grid search: three stacked GRU layers,
# binary cross-entropy loss, 10 epochs.
best_layers = [(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)]
hyperparameters = [best_layers, 'binary_crossentropy', 10]
model = model_gru(x_train_gru, y_train, num_words, hyperparameters)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
from custom_metrics import computePerformanceTaskB_2output

# Score the trained model on the test set.
print("F1 Average Task A-B")
# The helper's return value is the last expression of the cell, so the notebook
# displays it as the cell output.
# NOTE(review): the helper comes from the project's custom_metrics module; the
# meaning of its last argument (here the 'irony' column) is not visible from
# this notebook - confirm against its definition.
computePerformanceTaskB_2output(model, x_test_gru, y_test, y_test['irony'])

F1 Average Task A-B


[0.5480795538251079, 0.3123566819848675]