In [None]:
!git clone https://github.com/gerzin/IronySarcasmDetectorIT.git
!cd /content/IronySarcasmDetectorIT
!git pull
!cd ..

In [2]:
import sys

# Make the modules of the cloned repository importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate entries.
REPO_DIR = '/content/IronySarcasmDetectorIT'
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
# Only surface TensorFlow log records of level ERROR or above.
tf.get_logger().setLevel('ERROR')
# gpu_device_name() yields an empty string when no GPU is available; fall back
# to an explicit CPU device so the printed value and the later
# `tf.device(device_name)` scope are always meaningful.
device_name = tf.test.gpu_device_name() or '/device:CPU:0'
print(device_name)

/device:GPU:0


In [None]:
import tensorflow.keras as keras

LOADING AND PREPROCESSING THE DATA WITH THE PREPROCESSING PIPELINE WE MADE

In [4]:
%%capture
!pip install emoji
!pip install transformers

In [5]:
from preprocessing.pipeline import ItalianTweetsPreprocessingPipeline

# Instantiate the project's pre-processing pipeline (with to_lowercase=False)
# and run it over the training CSV right after reading it.
pre_pipeline = ItalianTweetsPreprocessingPipeline(to_lowercase=False)
df = pre_pipeline.apply(
    pd.read_csv("/content/IronySarcasmDetectorIT/datasets/training_ironita2018.csv")
)

SPLITTING THE DATASET INTO TEST, TRAIN AND VALIDATION SETS

In [7]:
from models.utils.model_selection import train_test_validation_split

# The helper's result is unpacked as three (features, labels) pairs, in the
# order train, test, validation.
train_set, test_set, validation_set = train_test_validation_split(df)
X_train, y_train = train_set
X_test, y_test = test_set
X_val, y_val = validation_set

In [8]:
from pathlib import Path
class ModelsConfig:
    """Constants shared by the model-related cells of this notebook."""
    # Not referenced by any of the visible cells.
    SEQUENCE_LENGTH = 50
    # Identifier of the pretrained Italian BERT checkpoint; used as the default
    # model for the tokenizer and loaded by get_bert_lstm_classifier.
    BERT_ITA_XXL_CASED = "dbmdz/bert-base-italian-xxl-cased"
    # Default `tok_len` of get_bert_tokenizer. Note that tokenize() pads and
    # truncates to 128 tokens, not to this value.
    BERT_TOKENIZER_LENGTH = 20
    # Not referenced by any of the visible cells; presumably the file name used
    # when saving the model — TODO confirm.
    BERT_MODEL_NAME = "bertlstm.h5"

In [9]:
from transformers import TFBertModel, AutoTokenizer, BertConfig
import numpy as np
import tensorflow as tf

def save_and_download_model(model, name):
  """Save `model` under /content, zip the result and download the archive.

  params:
    model - object exposing a Keras-style `save(path, save_format=...)` method
    name  - name of the saved model inside /content; the archive is written
            to /content/<name>.zip
  """
  import shutil  # local import: the archive is built in Python instead of shelling out to `zip`

  model.save(f"/content/{name}", save_format='tf')
  # The previous `!zip -r f"/content/{name}.zip" ...` handed the Python f-string
  # prefix to the shell, so the archive was not created at /content/<name>.zip.
  # make_archive appends the ".zip" suffix itself and returns the final path.
  archive_path = shutil.make_archive(f"/content/{name}", "zip", root_dir="/content", base_dir=name)
  # NOTE(review): `files` is not defined in the visible cells; presumably it is
  # `google.colab.files` (`from google.colab import files`) — confirm.
  files.download(archive_path)

def get_bert_tokenizer(model_url=ModelsConfig.BERT_ITA_XXL_CASED, tok_len=ModelsConfig.BERT_TOKENIZER_LENGTH):
    """Load the pretrained tokenizer identified by `model_url`.

    params:
      model_url - checkpoint identifier handed to AutoTokenizer.from_pretrained
      tok_len   - value forwarded to the tokenizer as `max_length`

    returns:
      the tokenizer object created by AutoTokenizer.from_pretrained
    """
    tokenizer_options = dict(add_special_tokens=True, max_length=tok_len, pad_to_max_length=True)
    return AutoTokenizer.from_pretrained(model_url, **tokenizer_options)

def tokenize(sentences, tokenizer, max_length=128):
    """Encode every sentence with `tokenizer` and stack the results.

    params:
      sentences  - iterable of texts to encode
      tokenizer  - object exposing a Hugging Face style `encode_plus` method
      max_length - length every sequence is padded / truncated to
                   (default 128, the input length expected by the classifier)

    returns:
      tuple (input ids, attention masks, token type ids), each converted to an
      int32 numpy array with one row per sentence
    """
    input_ids, input_masks, input_segments = [], [], []
    for sentence in sentences:
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoded = tokenizer.encode_plus(sentence, add_special_tokens=True, max_length=max_length,
                                        padding='max_length', truncation=True,
                                        return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
        input_segments.append(encoded['token_type_ids'])

    return (np.asarray(input_ids, dtype='int32'),
            np.asarray(input_masks, dtype='int32'),
            np.asarray(input_segments, dtype='int32'))


TOKENIZING THE SEQUENCES

In [10]:
tokenizer = get_bert_tokenizer()
# Only the token ids and the attention mask are needed, so the trailing
# token-type-ids array returned by tokenize() is sliced off for every split.
X_train_tok, X_val_tok, X_test_tok = (
    tokenize(split, tokenizer)[:-1] for split in (X_train, X_val, X_test)
)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]



MODEL DEFINITION AND GRID SEARCH

In [None]:
import itertools

# Candidate values explored by the grid search, one list per hyper-parameter.
lstm_units = [8, 16, 32, 64]
lstm_dense = [8, 16, 32, 64]
lstm_dropouts = [0.25, 0.45]
epochs = [5]

# Cartesian product of the lists above: every tuple is
# (lstm units, dense units, dropout, epochs).
combinations = list(
    itertools.product(lstm_units, lstm_dense, lstm_dropouts, epochs)
)
print(f"tot. combinations = {len(combinations)}")

def get_bert_lstm_classifier(params=(16, 32, 0.45, 5)):
    """
    Build and compile the BERT + bidirectional LSTM model.

    params:
      params - sequence containing [#lstm outputspace dimensionality, #dense layer units, dropout, # epochs].
               Only the first three entries are read here; the epoch count is
               consumed by the caller when fitting.

    returns:
      a compiled tf.keras.Model whose inputs are [token ids, attention mask]
      (both declared with shape (128,)) and whose output is a 2-unit softmax.
    """
    model_url = ModelsConfig.BERT_ITA_XXL_CASED
    bert_config = BertConfig.from_pretrained(model_url, output_hidden_states=True)
    with tf.device(device_name):
        bert = TFBertModel.from_pretrained(model_url, config=bert_config)

        input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
        input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

        # First element of the BERT output, used here as the per-token embedding.
        embedding_layer = bert(input_ids_in, attention_mask=input_masks_in)[0]

        X = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(params[0], return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
        # Concatenate the LSTM output with the BERT embedding along the feature axis.
        X = tf.keras.layers.Concatenate(axis=-1)([X, embedding_layer])

        X = tf.keras.layers.MaxPooling1D(20)(X)
        X = tf.keras.layers.SpatialDropout1D(0.4)(X)
        X = tf.keras.layers.Flatten()(X)
        X = tf.keras.layers.Dense(params[1], activation="relu")(X)

        X = tf.keras.layers.Dropout(params[2])(X)
        X = tf.keras.layers.Dense(2, activation='softmax')(X)

        model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

        # Freeze the pretrained encoder by name instead of relying on it being
        # among the first three entries of `model.layers` (the other two being
        # the weight-less input layers).
        bert.trainable = False

        opt = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-3)
        model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['acc'])
        return model

# Grid search: train one model per hyper-parameter combination and record its
# loss / metrics on the validation split.
validation_performance = []
index = 0  # position of the first combination to evaluate; raise it to resume an interrupted search
for index, combination in enumerate(combinations[index:], start=index):
  print(index)
  # Reset the global Keras state before building the next model so that the
  # models from earlier iterations do not accumulate across the whole search.
  tf.keras.backend.clear_session()
  model = get_bert_lstm_classifier(params=combination)

  # combination[3] is the number of epochs of this run.
  model.fit(x=X_train_tok, y=y_train, epochs=combination[3])
  result = model.evaluate(x=X_val_tok, y=y_val)
  performance = [combination, dict(zip(model.metrics_names, result))]
  print(performance)
  validation_performance.append(performance)

RESULTS OF THE GRID SEARCH

In [None]:
# Metric used to rank the recorded grid-search runs.
metric = 'acc'

# Recorded runs: [hyper-parameter tuple, {'loss': ..., 'acc': ...}] per entry.
validation_performance = [
  [(8, 8, 0.25, 5), {'loss': 0.5023014545440674, 'acc': 0.9949748516082764}],
  [(8, 8, 0.45, 5), {'loss': 0.6422484517097473, 'acc': 1.0}],
  [(8, 16, 0.25, 5), {'loss': 0.5010782480239868, 'acc': 0.9698492288589478}],
  [(8, 16, 0.45, 5), {'loss': 0.5436665415763855, 'acc': 1.0}],
  [(8, 32, 0.25, 5), {'loss': 0.5828566551208496, 'acc': 1.0}],
  [(8, 32, 0.45, 5), {'loss': 0.6800474524497986, 'acc': 1.0}],
  [(8, 64, 0.25, 5), {'loss': 0.5809416770935059, 'acc': 1.0}],
  [(8, 64, 0.45, 5), {'loss': 0.8537607789039612, 'acc': 1.0}],
  [(16, 8, 0.25, 5), {'loss': 0.5134936571121216, 'acc': 0.69597989320755}],
  [(16, 8, 0.45, 5), {'loss': 0.5282988548278809, 'acc': 1.0}],
  [(16, 16, 0.25, 5), {'loss': 0.595130205154419, 'acc': 1.0}],
  [(16, 16, 0.45, 5), {'loss': 0.5020710825920105, 'acc': 0.9321607947349548}],
  [(16, 32, 0.25, 5), {'loss': 0.5691903233528137, 'acc': 1.0}],
  [(16, 32, 0.45, 5), {'loss': 0.4975298345088959, 'acc': 0.9748743772506714}],
  [(16, 64, 0.25, 5), {'loss': 0.5888412594795227, 'acc': 1.0}],
  [(16, 64, 0.45, 5), {'loss': 0.675396740436554, 'acc': 1.0}],
  [(32, 8, 0.25, 5), {'loss': 0.5186547040939331, 'acc': 0.8919597864151001}],
  [(32, 8, 0.45, 5), {'loss': 0.5218737125396729, 'acc': 1.0}],
  [(32, 16, 0.25, 5), {'loss': 0.5134365558624268, 'acc': 0.8366833925247192}],
  [(32, 16, 0.45, 5), {'loss': 0.5705238580703735, 'acc': 1.0}],
  [(32, 32, 0.25, 5), {'loss': 0.6709557175636292, 'acc': 1.0}],
  [(32, 32, 0.45, 5), {'loss': 0.5036795139312744, 'acc': 0.9447236061096191}],
  [(32, 64, 0.25, 5), {'loss': 0.929313600063324, 'acc': 1.0}],
  [(32, 64, 0.45, 5), {'loss': 0.8172656297683716, 'acc': 1.0}],
  [(64, 8, 0.25, 5), {'loss': 0.6308498978614807, 'acc': 1.0}],
  [(64, 8, 0.45, 5), {'loss': 0.7422505021095276, 'acc': 1.0}],
  [(64, 16, 0.25, 5), {'loss': 0.5153309106826782, 'acc': 1.0}],
  [(64, 16, 0.45, 5), {'loss': 0.5317749381065369, 'acc': 1.0}]
]

# Pair every hyper-parameter tuple with its score on `metric`, highest score
# first. The sort is stable, so runs with equal scores keep their recorded order.
values = sorted(
  ((run_params, run_scores[metric]) for run_params, run_scores in validation_performance),
  key=lambda pair: pair[1],
  reverse=True,
)

print('Best Hyperparameters')
for run_params, score in values:
  print(run_params, score)

TEST ON THE GOLD SET

In [33]:
from custom_metrics import computePerformanceTaskB_2output

In [34]:
# Read the gold test set and run it through the same pre-processing pipeline
# object that was applied to the training data.
gold_dataset = pre_pipeline.apply(
    pd.read_csv("/content/IronySarcasmDetectorIT/datasets/test_gold_ironita2018.csv")
)

In [35]:
# Labels of the gold set: the 'irony' and 'sarcasm' columns.
# Note: this rebinds `y_test`, which previously held the labels of the test
# split produced by train_test_validation_split.
y_test = gold_dataset[['irony', 'sarcasm']]

In [None]:
# Tokenize the gold-set texts; the trailing token-type-ids array is dropped,
# leaving only (token ids, attention masks).
x_test = tokenize(gold_dataset['text'], tokenizer)[:-1]

In [None]:
# F1 of both tasks
# NOTE(review): in the visible cells `model` is only assigned inside the
# grid-search loop, so this evaluates whichever combination was trained last
# (not necessarily the best one) and fails on a kernel where that cell was
# not run — confirm this is intended.
computePerformanceTaskB_2output(model, x_test, y_test, y_test['irony'])