In [None]:
!git clone https://github.com/gerzin/IronySarcasmDetectorIT.git
!cd /content/IronySarcasmDetectorIT
!git pull
!cd ..

Cloning into 'IronySarcasmDetectorIT'...
remote: Enumerating objects: 301, done.[K
remote: Counting objects: 100% (301/301), done.[K
remote: Compressing objects: 100% (215/215), done.[K
remote: Total 301 (delta 159), reused 169 (delta 74), pack-reused 0[K
Receiving objects: 100% (301/301), 1.14 MiB | 9.87 MiB/s, done.
Resolving deltas: 100% (159/159), done.
fatal: not a git repository (or any of the parent directories): .git


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so its files are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys

# Make the cloned repository importable (the notebook later imports
# `preprocessing.pipeline` and `custom_metrics` from it).
REPO_DIR = '/content/IronySarcasmDetectorIT'
# Guarded so that re-running this cell does not pile up duplicate entries.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Only let TensorFlow log messages of level ERROR through.
tf.get_logger().setLevel('ERROR')
# Name of the GPU device TensorFlow reports (an empty string when none is available);
# it is reused later in `tf.device(device_name)` when the classifier is built.
device_name = tf.test.gpu_device_name()
print(device_name)

/device:GPU:0


<b>DEFINE MODELS</b>

In [None]:
%%capture
!pip install emoji
!pip install transformers

In [None]:
from preprocessing.pipeline import ItalianTweetsPreprocessingPipeline

# Location of the training set inside the cloned repository.
TRAINING_CSV = "/content/IronySarcasmDetectorIT/datasets/training_ironita2018.csv"

# The pipeline is created with lowercasing switched off; it is reused further
# down for the final train/test preprocessing as well.
pre_pipeline = ItalianTweetsPreprocessingPipeline(to_lowercase=False)

# Read the raw CSV and run it through the preprocessing pipeline in one step.
df = pre_pipeline.apply(pd.read_csv(TRAINING_CSV))

In [None]:
from pathlib import Path
class ModelsConfig:
    """Constants shared by the tokenizer and model helpers defined in this notebook."""
    # NOTE(review): not referenced by any of the visible notebook cells.
    SEQUENCE_LENGTH = 50
    # Pretrained model identifier; default `model_url` of get_bert_tokenizer() and
    # get_bert_gru_classifier().
    BERT_ITA_XXL_CASED = "dbmdz/bert-base-italian-xxl-cased"
    # Default `tok_len` of get_bert_tokenizer().
    # NOTE(review): tokenize() and the model inputs work with length 128, not this value.
    BERT_TOKENIZER_LENGTH = 80
    # NOTE(review): not referenced by the visible cells; the model is saved to /content/GRU/model.
    BERT_MODEL_NAME = "bertlstm.h5"

In [None]:
from transformers import TFBertModel, AutoTokenizer
import numpy as np
import tensorflow as tf


def get_bert_tokenizer(model_url=ModelsConfig.BERT_ITA_XXL_CASED, tok_len=ModelsConfig.BERT_TOKENIZER_LENGTH):
    """Load the pretrained tokenizer for ``model_url``.

    Args:
        model_url: identifier of the pretrained model whose tokenizer is loaded.
        tok_len: value forwarded to ``from_pretrained`` as ``max_length``.

    Returns:
        The tokenizer object produced by ``AutoTokenizer.from_pretrained``.
    """
    # Options forwarded verbatim to from_pretrained.
    tokenizer_options = {
        'add_special_tokens': True,
        'max_length': tok_len,
        'pad_to_max_length': True,
    }
    return AutoTokenizer.from_pretrained(model_url, **tokenizer_options)

def tokenize(sentences, tokenizer, max_length=128):
    """Encode sentences into fixed-length model input arrays.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: object exposing ``encode_plus`` (for example the tokenizer
            returned by get_bert_tokenizer()).
        max_length: length every encoded sequence is padded or truncated to.
            Defaults to 128, the input length the classifier in this notebook
            is built with.

    Returns:
        A tuple ``(input_ids, attention_masks, token_type_ids)`` of int32
        numpy arrays with one row per sentence.
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in sentences:
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        # flag; together with truncation=True every row has `max_length` items.
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_length,
                                       padding='max_length', truncation=True,
                                       return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return (np.asarray(input_ids, dtype='int32'),
            np.asarray(input_masks, dtype='int32'),
            np.asarray(input_segments, dtype='int32'))


def get_bert_gru_classifier(hidden_layers, model_url=ModelsConfig.BERT_ITA_XXL_CASED, seq_len=128):
    """Build a BERT + stacked-GRU classifier with a 2-unit sigmoid output.

    Args:
        hidden_layers: non-empty sequence of ``(units, return_sequences,
            dropout_rate)`` tuples, one per GRU layer, applied in order. A
            Dropout layer is added after a GRU only when its ``dropout_rate``
            is non-zero.
        model_url: identifier of the pretrained BERT model to load.
        seq_len: length of the token-id and attention-mask input sequences.
            Defaults to 128, the length tokenize() pads to by default.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_token, masked_token]``
        and producing 2 sigmoid outputs. The BERT weights are frozen.

    Raises:
        ValueError: if ``hidden_layers`` is empty.
    """
    if len(hidden_layers) == 0:
        # Previously this surfaced as an unbound-variable error further down.
        raise ValueError("hidden_layers must contain at least one (units, return_sequences, dropout_rate) tuple")

    with tf.device(device_name):
        bert = TFBertModel.from_pretrained(model_url)

        input_ids_in = tf.keras.layers.Input(shape=(seq_len,), name='input_token', dtype='int32')
        input_masks_in = tf.keras.layers.Input(shape=(seq_len,), name='masked_token', dtype='int32')

        # Element 0 of the BERT output is the sequence fed to the recurrent stack.
        X = bert(input_ids_in, attention_mask=input_masks_in)[0]

        for layer in hidden_layers:
            units, return_sequences, dropout_rate = layer[0], layer[1], layer[2]
            X = tf.keras.layers.GRU(units, return_sequences=return_sequences)(X)
            if dropout_rate != 0.0:
                X = tf.keras.layers.Dropout(dropout_rate)(X)

        X = tf.keras.layers.Dense(2, activation='sigmoid')(X)

        model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

        # Freeze BERT by reference instead of slicing model.layers[:3], which
        # only worked as long as the two inputs and BERT came first in the
        # layer ordering (the Input layers have no weights to freeze).
        bert.trainable = False

        return model

In [None]:
# Build the tokenizer with the defaults taken from ModelsConfig; it is shared by
# the grid search and the final fit below.
tokenizer = get_bert_tokenizer()

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

<b>GRID SEARCH</b>

In [None]:
import itertools
from sklearn.utils import shuffle

# Fraction of the (shuffled) training data held out for validation.
VALIDATION_SPLIT = 0.1
# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 450

df_gridSearch = shuffle(df, random_state=SPLIT_SEED)
validation_size = int(len(df_gridSearch) * VALIDATION_SPLIT)

# The first `validation_size` shuffled rows form the validation set, the rest
# the training set; .iloc makes the positional slicing explicit.
x_train_gs = df_gridSearch['text'].iloc[validation_size:]
y_train_gs = df_gridSearch[['irony', 'sarcasm']].iloc[validation_size:]
x_vali_gs = df_gridSearch['text'].iloc[:validation_size]
y_vali_gs = df_gridSearch[['irony', 'sarcasm']].iloc[:validation_size]

# tokenize() returns (ids, masks, segments); the trailing segment ids are
# dropped because the model only takes ids and masks.
x_train_gs = tokenize(x_train_gs, tokenizer)[:-1]
x_vali_gs = tokenize(x_vali_gs, tokenizer)[:-1]

# Candidate GRU stacks: one (units, return_sequences, dropout_rate) tuple per layer.
hidden_layers = [
    [(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)],
    [(128, True, 0.5), (32, False, 0.2)],
    [(128, True, 0.5), (16, False, 0.2)],
    [(128, True, 0.5), (32, False, 0.0)],
    [(128, False, 0.0)],
    [(128, False, 0.2)],
    [(64, False, 0.0)],
    [(32, False, 0.0)],
]

loss = ['binary_crossentropy', 'categorical_crossentropy']
epochs = [10]

# Every (hidden_layers, loss, epochs) triple to evaluate.
combinations = list(itertools.product(hidden_layers, loss, epochs))








In [None]:
# Validation inputs/targets are built once and reused both for early stopping
# during fit() and for the final evaluate() of every candidate.
vali_inputs = (x_vali_gs[0], x_vali_gs[1])
vali_targets = np.asarray(y_vali_gs)

validation_performance = []
for index, combination in enumerate(combinations):
    print(index)
    layers_spec, loss_name, n_epochs = combination

    # Release the Keras global state left over from the previous candidate so
    # memory does not grow with every model built in the loop.
    tf.keras.backend.clear_session()

    # A fresh callback per candidate, so no early-stopping state is carried
    # over from one model to the next.
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor='val_loss')

    model = get_bert_gru_classifier(hidden_layers=layers_spec)
    model.compile(loss=loss_name, optimizer='adam', metrics=['acc'])

    # validation_data is required for `val_loss` to exist; without it the
    # EarlyStopping callback above never has a value to monitor.
    model.fit(x=(x_train_gs[0], x_train_gs[1]), y=np.asarray(y_train_gs),
              validation_data=(vali_inputs, vali_targets),
              epochs=n_epochs, verbose=0, callbacks=[early_stopping_cb])

    result = model.evaluate(x=vali_inputs, y=vali_targets)
    performance = [combination, dict(zip(model.metrics_names, result))]
    print(performance)
    validation_performance.append(performance)

In [None]:
# GRID SEARCH RESULTS
# Hard-coded copy of the results so the ranking below can be reproduced without
# re-running the search. Each entry is [(hidden_layers, loss, epochs), {'loss': ..., 'acc': ...}].
# NOTE(review): 'acc' is presumably resolved by Keras to a different accuracy function
# for each of the two losses, so values may not be comparable across losses — confirm
# before ranking on this metric.
validation_performance = [
                   [([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'binary_crossentropy', 10), {'loss': 0.4383414936065674, 'acc': 1.0}],
                   [([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'categorical_crossentropy', 10), {'loss': 0.43652093410491943, 'acc': 1.0}],
                   [([(128, True, 0.5), (32, False, 0.2)], 'binary_crossentropy', 10), {'loss': 0.5214934945106506, 'acc': 0.9471032619476318}],
                   [([(128, True, 0.5), (32, False, 0.2)], 'categorical_crossentropy', 10), {'loss': 0.4481615126132965, 'acc': 0.8287153840065002}],
                   [([(128, True, 0.5), (16, False, 0.2)], 'binary_crossentropy', 10), {'loss': 0.5187110304832458, 'acc': 0.9748110771179199}],
                   [([(128, True, 0.5), (16, False, 0.2)], 'categorical_crossentropy', 10), {'loss': 0.4440704584121704, 'acc': 0.997481107711792}],
                   [([(128, True, 0.5), (32, False, 0.0)], 'binary_crossentropy', 10), {'loss': 0.5266370177268982, 'acc': 0.9622166156768799}],
                   [([(128, True, 0.5), (32, False, 0.0)], 'categorical_crossentropy', 10), {'loss': 0.4356771409511566, 'acc': 1.0}],
                   [([(128, False, 0.0)], 'binary_crossentropy', 10), {'loss': 0.6273298859596252, 'acc': 0.992443323135376}],
                   [([(128, False, 0.0)], 'categorical_crossentropy', 10), {'loss': 0.41610702872276306, 'acc': 0.982367753982544}],
                   [([(128, False, 0.2)], 'binary_crossentropy', 10), {'loss': 0.6418942809104919, 'acc': 0.8942065238952637}],
                   [([(128, False, 0.2)], 'categorical_crossentropy', 10), {'loss': 0.4152664542198181, 'acc': 0.997481107711792}],
                   [([(64, False, 0.0)], 'binary_crossentropy', 10), {'loss': 0.5600593090057373, 'acc': 0.9773299694061279}],
                   [([(64, False, 0.0)], 'categorical_crossentropy', 10), {'loss': 0.42633721232414246, 'acc': 0.992443323135376}],
                   [([(32, False, 0.0)], 'binary_crossentropy', 10), {'loss': 0.5144029259681702, 'acc': 0.9219143390655518}],
                   [([(32, False, 0.0)], 'categorical_crossentropy', 10), {'loss': 0.44485291838645935, 'acc': 0.9596977233886719}]

]

In [None]:
# Metric used to rank the grid-search candidates.
metric = 'acc'

# Pair every hyperparameter combination with its score on `metric`, then order
# the pairs from best to worst (ties keep their original relative order).
values = sorted(
    ((entry[0], entry[1][metric]) for entry in validation_performance),
    key=lambda tup: tup[1],
    reverse=True,
)

print('Best Hyperparameters')
for combination, score in values:
    print(combination, score)

Best Hyperparameters
([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'binary_crossentropy', 10) 1.0
([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'categorical_crossentropy', 10) 1.0
([(128, True, 0.5), (32, False, 0.0)], 'categorical_crossentropy', 10) 1.0
([(128, True, 0.5), (16, False, 0.2)], 'categorical_crossentropy', 10) 0.997481107711792
([(128, False, 0.2)], 'categorical_crossentropy', 10) 0.997481107711792
([(128, False, 0.0)], 'binary_crossentropy', 10) 0.992443323135376
([(64, False, 0.0)], 'categorical_crossentropy', 10) 0.992443323135376
([(128, False, 0.0)], 'categorical_crossentropy', 10) 0.982367753982544
([(64, False, 0.0)], 'binary_crossentropy', 10) 0.9773299694061279
([(128, True, 0.5), (16, False, 0.2)], 'binary_crossentropy', 10) 0.9748110771179199
([(128, True, 0.5), (32, False, 0.0)], 'binary_crossentropy', 10) 0.9622166156768799
([(32, False, 0.0)], 'categorical_crossentropy', 10) 0.9596977233886719
([(128, True, 0.5), (32, False, 0.2)], 'binar

<b>FITTING THE BEST MODEL</b>

In [None]:
# Seed value shared by every random number generator seeded in this cell.
# NOTE(review): this cell sits after the grid-search cells, so a top-to-bottom run
# executes the grid search before any of these seeds are set.
seed_value= 450

# 1. Set the `PYTHONHASHSEED` environment variable at a fixed value
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# presumably only affects child processes, not this already-running kernel — confirm.
import os
os.environ['PYTHONHASHSEED']=str(seed_value)

# 2. Set the `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)

# 3. Set the `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(seed_value)
# TF1-style equivalent of the call above: tf.compat.v1.set_random_seed(seed_value)

# 5. Configure a new global `tensorflow` session
# Both thread pools are limited to a single thread (presumably to avoid run-to-run
# differences caused by parallel op scheduling), and the resulting TF1-compat session
# is registered with the Keras backend.
# NOTE(review): this uses the standalone `keras` package while the models are built
# with `tf.keras` — verify that the session set here is the one actually used.
from keras import backend as K
session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
K.set_session(sess)

In [None]:
from custom_metrics import computePerformanceTaskB_2output
# Reload and preprocess the full training set, independently of the frame used
# for the grid search.
df_train = pd.read_csv("/content/IronySarcasmDetectorIT/datasets/training_ironita2018.csv")
df_train = pre_pipeline.apply(df_train)

# The gold test set goes through the same preprocessing pipeline.
df_test = pd.read_csv("/content/IronySarcasmDetectorIT/datasets/test_gold_ironita2018.csv")
df_test = pre_pipeline.apply(df_test)

x_train = df_train['text']
y_train = df_train[['irony', 'sarcasm']]

x_test = df_test['text']
y_test = df_test[['irony', 'sarcasm']]

# tokenize() returns (ids, masks, segments); only ids and masks feed the model.
x_test = tokenize(x_test, tokenizer)[:-1]
x_train = tokenize(x_train, tokenizer)[:-1]

# Top-ranked (hidden_layers, loss, epochs) combination from the grid search.
best_combination = ([(128, True, 0.5), (64, True, 0.2), (16, False, 0.0)], 'binary_crossentropy', 10)
model = get_bert_gru_classifier(hidden_layers=best_combination[0])
model.compile(loss=best_combination[1], optimizer='adam', metrics=['acc'])

# The final model is fitted on the whole training set with no validation data,
# so `val_loss` never exists and a callback monitoring it would never act.
# Monitor the training loss instead, which keeps every training example in use.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor='loss')

model.fit(x=(x_train[0], x_train[1]), y=np.asarray(y_train), epochs=best_combination[2], verbose=0,
          callbacks=[early_stopping_cb])

# Score the fitted model on the test set with the project's task-B metric helper.
print(computePerformanceTaskB_2output(model, x_test, y_test, y_test['irony']))





Downloading:   0%|          | 0.00/520M [00:00<?, ?B/s]

Some layers from the model checkpoint at dbmdz/bert-base-italian-xxl-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-italian-xxl-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


[0.7121154496281271, 0.5357410280490237]


<b>SAVE MODEL</b>

In [None]:
# Export the fitted model to /content/GRU/model using the 'tf' save format.
model.save("/content/GRU/model", save_format='tf')

In [None]:
# Recursively (-r) pack the saved model directory into /content/model.zip for download.
!zip -r "/content/model.zip" "/content/GRU/model"

In [None]:
# `files` is Colab's browser-download helper; it is imported here because no
# other visible cell brings it into scope (the bare call raised NameError).
from google.colab import files

files.download("/content/model.zip")