### Import necessary packages

In [53]:
# Install the third-party packages this notebook needs into the Colab runtime.
# Versions are not pinned; the model config logged further down in this notebook
# reports transformers_version 4.20.1 for the recorded run.
!pip install transformers seqeval[gpu]
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [54]:
import torch
import numpy as np
import pandas as pd
import datasets
from collections import Counter
from datasets import load_dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Load data

In [55]:
torch.cuda.is_available()
device = torch.device("cuda")
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/MyDrive/eth_CS4NLP/project/qadata'
train_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/final_train.csv'
dev_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/final_dev.csv'
test_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/vardial_test.txt'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
final_dev.csv	 final_train.gsheet  vardial_dev.txt   vardial_train_1.csv
final_train.csv  pred_bert_output    vardial_test.txt  vardial_train_2.csv


In [56]:
# Class index -> dialect code. The codes are listed in index order, so the
# position of each code in the tuple is its integer label.
dial_label = dict(enumerate((
    'EML',
    'NAP',
    'PMS',
    'FUR',
    'LLD',
    'LIJ',
    'LMO',
    'ROA_TARA',
    'SCN',
    'VEC',
    'SC',
)))

# Dialect code -> class index: the exact inverse of dial_label.
fold_label = {code: index for index, code in dial_label.items()}

def explain_label(label : int) -> str:
    """
    Translate an integer class index into its dialect code.
    :param int label: class index to look up in ``dial_label``
    :return: dialect code stored under that index
    :raises KeyError: if the index is not a key of ``dial_label``
    """
    dialect_code = dial_label[label]
    return dialect_code

def encode_label(label : str) -> int:
    """
    Translate a dialect code into its integer class index.
    :param str label: dialect code to look up in ``fold_label``
    :return: class index stored under that code
    :raises KeyError: if the code is not a key of ``fold_label``
    """
    class_index = fold_label[label]
    return class_index

In [79]:
# Dev and train splits are read with pandas' default CSV settings.
df_dev = pd.read_csv(dev_path)
df_train = pd.read_csv(train_path)

# The test file is parsed as tab-separated, header-less text into a single
# `text` column; a constant placeholder label of 0 is attached to every row.
df_test = pd.read_csv(test_path, sep="\t", names=["text"]).assign(label=0)
df_test.shape

(11087, 2)

In [69]:
# Peek at the first training row to check the column layout.
df_train.head(1)

Unnamed: 0,text,label
0,El record de parteçipasion el xe del Bresa .,9


In [59]:
# Label vocabulary: id2label maps class index -> dialect code (codes listed in
# index order), label2id is the reverse lookup and num_labels the class count.
id2label = dict(enumerate([
    'EML', 'NAP', 'PMS', 'FUR', 'LLD', 'LIJ',
    'LMO', 'ROA_TARA', 'SCN', 'VEC', 'SC',
]))
label2id = dict(zip(id2label.values(), id2label.keys()))
num_labels = len(id2label)
print(id2label)

{0: 'EML', 1: 'NAP', 2: 'PMS', 3: 'FUR', 4: 'LLD', 5: 'LIJ', 6: 'LMO', 7: 'ROA_TARA', 8: 'SCN', 9: 'VEC', 10: 'SC'}


In [82]:
from datasets import Dataset

# Wrap each pandas split in a Hugging Face Dataset, in train / dev / test order.
train_ds, dev_ds, test_ds = (
    Dataset.from_pandas(split_frame)
    for split_frame in (df_train, df_dev, df_test)
)

# Show one training instance as a sanity check.
train_ds[18]

{'label': 2,
 'text': "Damentre ch'a lo fasìa, a l'ha sentù 'n sìfol daré 'd chiel:"}

## Tokenizing data - Model name

**model_name**
- [x] m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0
- [x] 'dbmdz/bert-base-italian-cased'
- [x] 'dbmdz/bert-base-italian-xxl-cased'
- [x] 'dbmdz/bert-base-italian-uncased'
- [x] mrm8488/bert-italian-finedtuned-squadv1-it-alfa

In [61]:

# Identifier of the pretrained checkpoint; this cell only records the name.
# The tokenizer and the classification model are both loaded from it further down.
model_name = 'dbmdz/bert-base-italian-xxl-cased'


In [62]:
# Load the pretrained tokenizer for `model_name` via AutoTokenizer
# (this replaces an earlier XLNetTokenizer experiment).
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file https://huggingface.co/dbmdz/bert-base-italian-xxl-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a6f63287bf77965c2a075e2eb7b021b66b863c245cb1ac8cd51d73c9b9711f11.d4216e94150242f24c22ebc3ff97fb4079388b3e36c7e029e29e0f833f5db329
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-italian-xxl-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32102
}

loading file https://huggingface.co/dbmdz/bert-base-italian-xxl-cas

In [63]:

def encode_dataset(dataset: "datasets.arrow_dataset.Dataset",
                   max_length: int = 50,
                   text_tokenizer=None) -> list:
  '''
  Tokenize every instance of a dataset into fixed-length model inputs.

  :param dataset: iterable of records exposing a 'text' entry and an integer
      'label' entry
  :param int max_length: length every example is padded / truncated to
      (default 50, the value that used to be hard-coded)
  :param text_tokenizer: tokenizer callable to use; when omitted the
      notebook-level ``tokenizer`` is used, as before
  :return: list with one encoding per instance; each tensor returned by the
      tokenizer has its leading batch axis removed, and a scalar 'labels'
      LongTensor holding the class index is added
  '''
  # Only touch the notebook-level tokenizer when no explicit one was supplied.
  active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

  encoded_dataset = []
  for item in dataset:
    # Tokenize the sentence as a batch of one.
    sentence_encoded = active_tokenizer(item['text'],
                                        return_tensors="pt",
                                        padding='max_length',
                                        truncation=True,
                                        max_length=max_length)

    # Drop only the leading batch axis. A bare torch.squeeze() would also
    # collapse any other size-1 axis (e.g. when max_length == 1).
    for key in list(sentence_encoded.keys()):
      sentence_encoded[key] = sentence_encoded[key].squeeze(0)

    # Class index stored as a 0-dimensional LongTensor.
    sentence_encoded['labels'] = torch.tensor(int(item['label']), dtype=torch.long)
    encoded_dataset.append(sentence_encoded)

  return encoded_dataset

In [83]:
# Tokenizing datasets
# Each split is encoded by its own statement, so splits that were already
# encoded stay available if a later call fails.
encoded_dataset_train = encode_dataset(train_ds)
encoded_dataset_dev = encode_dataset(dev_ds)
encoded_dataset_test = encode_dataset(test_ds)


In [67]:
# Example of dataset
# Print each token of encoded training example 50 next to its entry in "input_ids".
# Note: despite its name, `token_label` holds the input id of the token, not a class label.
for token, token_label in zip(tokenizer.convert_ids_to_tokens(encoded_dataset_train[50]["input_ids"]), encoded_dataset_train[50]["input_ids"]):
  print('{0:10}  {1}'.format(token, token_label))

[CLS]       102
In          369
tu          241
1999        5285
u           349
l           181
'           1553
è           198
s           109
##tà        232
##u         30887
istitu      1639
##ì         30946
##u         30887
u           349
Nu          9531
##cle       2867
##o         30879
Opera       9434
##tivo      705
per         156
l           181
'           1553
Archeo      26104
##logia     1783
Sub         13804
##acque     28740
##a         30878
pru         7693
##g         30891
##è         30915
##ttu       794
da          203
So          756
##pri       787
##nte       1696
##nd        21433
##enza      351
per         156
i           134
Beni        12194
Archeo      26104
##logici    6412
della       213
Liguria     21764
,           1307
che         158
a           111
l           181
[SEP]       103


# Fine-tuning the model

In [71]:
# Common training arguments
# NOTE(review): output_dir is a relative path, so checkpoints land on the mounted
# Drive only if the working directory is /content — confirm.
training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='logs',
    no_cuda=False,
    output_dir='drive/MyDrive/eth_CS4NLP/project',
)

# Dictionary to save the results
models_performance = {}

# FINE-TUNING PROCESS

# Create the sequence classifier on top of the pretrained checkpoint.
# The label maps defined earlier in the notebook are passed along so the model
# config (and every saved checkpoint) carries the dialect codes instead of the
# generic LABEL_0 ... LABEL_10 placeholders.
# The former XLNet layer-freezing snippet (kept as a dead string literal) was
# removed: it addressed `model.transformer`, which is not how this cell's model
# is built.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Create trainer (training split only; no evaluation dataset is attached).
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=encoded_dataset_train,
)

# Fine-tuning
trainer.train()


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/dbmdz/bert-base-italian-xxl-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a6f63287bf77965c2a075e2eb7b021b66b863c245cb1ac8cd51d73c9b9711f11.d4216e94150242f24c22ebc3ff97fb4079388b3e36c7e029e29e0f833f5db329
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-italian-xxl-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
  

Step,Training Loss
500,0.2975
1000,0.1204
1500,0.1013
2000,0.0856
2500,0.0757
3000,0.0499
3500,0.049
4000,0.0458
4500,0.0427


Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-500
Configuration saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/config.json
Model weights saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/tokenizer_config.json
Special tokens file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-1000
Configuration saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/tokenizer_config.json
Special tokens file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-1500
Configuration saved i

TrainOutput(global_step=4942, training_loss=0.09112088509485916, metrics={'train_runtime': 4903.0665, 'train_samples_per_second': 129.013, 'train_steps_per_second': 1.008, 'total_flos': 1.62545363251374e+16, 'train_loss': 0.09112088509485916, 'epoch': 2.0})

In [72]:
# Run the fine-tuned model on the dev split and take the arg-max class per example.
preds = trainer.predict(encoded_dataset_dev)
predictions = preds.predictions.argmax(-1)

# Flat arrays of gold and predicted class ids.
true_values = np.array(preds.label_ids).flatten()
predicted_values = np.array(predictions).flatten()

# Keep only the positions whose gold label differs from -100.
# NOTE(review): the labels built by encode_dataset are class ids, so this filter
# presumably removes nothing here — confirm before relying on it.
proc_predicted_values = list(predicted_values[true_values != -100])
proc_true_values = list(true_values[true_values != -100])

# Evaluate the model: accuracy, micro/macro precision, recall and F1, plus the
# raw and the row-normalized confusion matrix.
model_performance = {
    'accuracy': accuracy_score(proc_true_values, proc_predicted_values),
    'precision_micro': precision_score(proc_true_values, proc_predicted_values, average='micro'),
    'precision_macro': precision_score(proc_true_values, proc_predicted_values, average='macro'),
    'recall_micro': recall_score(proc_true_values, proc_predicted_values, average='micro'),
    'recall_macro': recall_score(proc_true_values, proc_predicted_values, average='macro'),
    'f1_micro': f1_score(proc_true_values, proc_predicted_values, average='micro'),
    'f1_macro': f1_score(proc_true_values, proc_predicted_values, average='macro'),
    'confusion_matrix': confusion_matrix(proc_true_values, proc_predicted_values),
    'confusion_matrix_normalized': confusion_matrix(proc_true_values, proc_predicted_values, normalize='true'),
}

***** Running Prediction *****
  Num examples = 79070
  Batch size = 128


In [73]:
# Display the full metrics dictionary, including both confusion matrices.
model_performance

{'accuracy': 0.983407107626154,
 'confusion_matrix': array([[ 1397,     2,     4,     4,     5,     6,    35,     1,     1,
             7,     2],
        [    0,  4293,    10,     1,     2,     5,    18,    25,    46,
            20,     7],
        [    9,     4, 11865,     0,     5,     9,    71,     1,    16,
            25,    11],
        [    7,     2,     4,  2838,     5,     5,    29,     2,     7,
             4,     3],
        [    4,     2,    12,     1,  4635,     0,    35,     0,     0,
             9,     6],
        [    3,     9,     9,     3,     0,  3785,    33,     6,    17,
            42,     5],
        [   36,     6,    37,     6,    16,    32, 17001,     3,    29,
            78,    22],
        [    0,    10,     2,     4,     1,     1,     3,  2520,     9,
            14,     2],
        [    1,    11,    12,     1,     2,     3,    31,    16, 11860,
            22,    16],
        [    1,     3,    13,     7,     6,    14,   103,    11,    20,
         108

# Models evaluation

Accuracy, precision, recall and F1 (micro- and macro-averaged) of the fine-tuned model on the dev set.

In [74]:
# Human-readable summary of the scalar metrics; each pair is
# (name shown in the report, key in model_performance).
print('------------Model performance------------')
print('\n'.join(
    f'  {shown_name}: {model_performance[metric_key]}'
    for shown_name, metric_key in (
        ('accuracy', 'accuracy'),
        ('f1-micro', 'f1_micro'),
        ('f1-macro', 'f1_macro'),
        ('precision_macro', 'precision_macro'),
        ('precision_micro', 'precision_micro'),
        ('recall_macro', 'recall_macro'),
        ('recall_micro', 'recall_micro'),
    )
))
print()

------------Model performance------------
  accuracy: 0.983407107626154
  f1-micro: 0.983407107626154
  f1-macro: 0.9805004385512308
  precision_macro: 0.982040962941178
  precision_micro: 0.983407107626154
  recall_macro: 0.9790006433082545
  recall_micro: 0.983407107626154



In [75]:
# Put the predicted class ids currently held in `predicted_values` into a
# one-column frame (`pred_label`) and display it.
pred_df = pd.DataFrame({'pred_label':predicted_values})
pred_df

Unnamed: 0,pred_label
0,5
1,9
2,6
3,6
4,9
...,...
79065,10
79066,6
79067,3
79068,1


In [76]:
# Write the predictions to a CSV file without the index column, then hand the
# file to the browser through the Colab `files` helper.
from google.colab import files
pred_df.to_csv('new_pred_dbmdz_bert-base-italian-xxl-cased.csv',index=False)
files.download('new_pred_dbmdz_bert-base-italian-xxl-cased.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Model Prediction

In [92]:
# Score the encoded test split with the fine-tuned model and take the arg-max
# class per example.
preds = trainer.predict(encoded_dataset_test)
predictions = preds.predictions.argmax(-1)

# Flat arrays of the label ids attached to the split and of the predicted ids.
# The test frame was given a constant placeholder label of 0, so `true_values`
# carries no gold information here.
true_values = np.array(preds.label_ids).flatten()
predicted_values = np.array(predictions).flatten()

***** Running Prediction *****
  Num examples = 11087
  Batch size = 128


In [94]:
# Put the predicted class ids held in `predicted_values` into a one-column frame,
# write it to CSV without the index column and download it via the Colab helper.
pred_test = pd.DataFrame({'pred_label':predicted_values})
from google.colab import files
pred_test.to_csv('new_test_pred_dbmdz_bert-base-italian-xxl-cased.csv',index=False)
files.download('new_test_pred_dbmdz_bert-base-italian-xxl-cased.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [95]:
# Sanity check: number of exported prediction rows and columns.
pred_test.shape

(11087, 1)