### Import necessary packages

In [1]:
!pip install transformers seqeval[gpu]
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.3 MB/s 
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.3 MB/s 
Building wheels for collected packages: seq

In [2]:
import torch
import numpy as np
import pandas as pd
import datasets
from collections import Counter
from datasets import load_dataset
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1, 1, 1, 1, 1, 1, 0.38, 1, 0.66, 1, 1]))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

### Load data

In [4]:
torch.cuda.is_available()
device = torch.device("cuda")
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/MyDrive/eth_CS4NLP/project/qadata'
train_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/df_traindev.csv'
dev_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/final_dev.csv'
test_path = '/content/drive/MyDrive/eth_CS4NLP/project/qadata/vardial_test.txt'

Mounted at /content/drive
df_traindev.csv      pred_bert_devset	    vardial_train_1.csv
final_dev.csv	     Record_outputs.gsheet  vardial_train_2.csv
final_train.csv      vardial_dev.txt
output_pred_testset  vardial_test.txt


In [5]:
dial_label = {
    0 : 'EML',
    1 : 'NAP',
    2 : 'PMS',
    3 : 'FUR',
    4 : 'LLD',
    5 : 'LIJ',
    6 : 'LMO',
    7 : 'ROA_TARA',
    8 : 'SCN', 
    9 : 'VEC',
    10 : 'SC'
}

fold_label = {
    'EML' : 0,
    'NAP' : 1,
    'PMS' : 2,
    'FUR' : 3,
    'LLD' : 4,
    'LIJ' : 5,
    'LMO' : 6,
    'ROA_TARA' : 7,
    'SCN' : 8,
    'VEC' : 9,
    'SC' : 10
}

def explain_label(label : int) -> str:
    """ 
    Given an integer label, convert it to the corresponding string label
    :param int label: integer label to be converted
    :return: string corresponding to the given label
    """
    return dial_label[label]

def encode_label(label : str) -> int:
    """ 
    Given a string label, encode it to the corresponding index
    :param string label: string label to be converted
    :return: int corresponding to the given label
    """
    return fold_label[label]

In [6]:
df_dev = pd.read_csv(dev_path)
df_train = pd.read_csv(train_path)
# testset
df_test = pd.read_csv(test_path, sep = "\t", names=["text"])
df_test['label']=0 #add label column to the testset and assign 0 to all row
df_test.shape

(11087, 2)

In [7]:
# Create dictionaries to transform from labels to id and vice-versa.
id2label = {0 : 'EML',
    1 : 'NAP',
    2 : 'PMS',
    3 : 'FUR',
    4 : 'LLD',
    5 : 'LIJ',
    6 : 'LMO',
    7 : 'ROA_TARA',
    8 : 'SCN', 
    9 : 'VEC',
    10 : 'SC'}
label2id = {v:k for k,v in id2label.items()}
num_labels = len(id2label)
print(id2label)

{0: 'EML', 1: 'NAP', 2: 'PMS', 3: 'FUR', 4: 'LLD', 5: 'LIJ', 6: 'LMO', 7: 'ROA_TARA', 8: 'SCN', 9: 'VEC', 10: 'SC'}


In [8]:
from datasets import Dataset
train_ds = Dataset.from_pandas(df_train)
dev_ds = Dataset.from_pandas(df_dev)
test_ds = Dataset.from_pandas(df_test)
# Example of instance of the dataset
train_ds[18]

{'id': 32711.0,
 'label': 0,
 'text': " 'l è 'l nòm 'd un domìni genèric. Al funsiòuna da 'l dicèmber dal 2016.",
 'title': '.كاثوليك',
 'url': 'https://eml.wikipedia.org/wiki?curid=32711'}

## Tokenizing data - Model name

**model_name**
- [] m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0
- [] 'dbmdz/bert-base-italian-cased'
- [x] 'dbmdz/bert-base-italian-xxl-cased'
- [] 'dbmdz/bert-base-italian-uncased'
- [] mrm8488/bert-italian-finedtuned-squadv1-it-alfa

In [9]:

# Load BERT tokenizer.
model_name = 'dbmdz/bert-base-italian-xxl-cased'


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230k [00:00<?, ?B/s]

In [11]:

def encode_dataset(dataset: datasets.arrow_dataset.Dataset) -> list:
  '''
  Transforming each instance of the dataset with the Tokenizer
  '''
  encoded_dataset = []
  for item in dataset:
    # Tokenize the sentence.
    sentence_encoded = tokenizer(item['text'],
                                return_tensors="pt", 
                                padding='max_length', 
                                truncation=True, 
                                max_length=70)
    
    sentence_encoded['labels'] = torch.LongTensor(np.array([item['label']]))
    encoded_dataset.append(sentence_encoded)

  # Reduce dimensionality of tensors.
  for item in encoded_dataset:
    for key in item:
      item[key] = torch.squeeze(item[key])
  return encoded_dataset

In [12]:
# Tokenizing datasets
encoded_dataset_train = encode_dataset(train_ds)
encoded_dataset_dev = encode_dataset(dev_ds)
encoded_dataset_test = encode_dataset(test_ds)


In [13]:
# Example of dataset
for token, token_label in zip(tokenizer.convert_ids_to_tokens(encoded_dataset_train[50]["input_ids"]), encoded_dataset_train[50]["input_ids"]):
  print('{0:10}  {1}'.format(token, token_label))

[CLS]       102
I           184
an          221
1970        12365
,           1307
cia         2718
##m         30889
##ê         31028
an          221
Est         11965
##à         30914
##n         30880
##ta        115
,           1307
i           134
[UNK]       101
chi         524
[UNK]       101
an          221
ch          471
'           1553
i           134
partì       22841
##sen       6775
da          203
'           1553
l           181
1970        12365
inf         1238
##ì         30946
##n         30880
a           111
'           1553
l           181
1979        10542
cia         2718
##p         30888
##ê         31028
dèi         28237
##nte       1696
##r         30882
.           697
[SEP]       103
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD]       0
[PAD] 

# Fine tunning of the model

In [14]:
# Common training arguments
training_args = TrainingArguments(
    num_train_epochs=2,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    logging_dir='logs',
    no_cuda=False,  
    output_dir = 'drive/MyDrive/eth_CS4NLP/project',
)

# Dictionary to save the results
models_performance ={}

# FINE TUNING PROCESS

# create model
#model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = AutoModelForSequenceClassification.from_pretrained(model_name,  num_labels=num_labels)#,class_weights=[1, 1, 1, 1, 1, 1, 0.38, 1, 0.66, 1, 1] )
#0:1, 1:1, 2:1, 3:1, 4:1, 5:1, 6:0.38, 7:1, 8:0.66, 9:1, 10:1

'''model.transformer.mask_emb.requires_grad = False
model.transformer.word_embedding.weight.requires_grad = False
for name, param in model.transformer.layer.named_parameters():
  try:
    layer = int(name[:2])
  except ValueError:
    try:
      layer = int(name[:1])
    except ValueError:
      layer = 0
  if layer <= 20:
    param.requires_grad = False'''


# Create trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=encoded_dataset_train,)

# Fine tunning
trainer.train()


Downloading:   0%|          | 0.00/425M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification w

Step,Training Loss
500,0.1316
1000,0.0269
1500,0.0209
2000,0.0166
2500,0.0084
3000,0.008
3500,0.0066


Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-500
Configuration saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/config.json
Model weights saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/tokenizer_config.json
Special tokens file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-500/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-1000
Configuration saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/config.json
Model weights saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/tokenizer_config.json
Special tokens file saved in drive/MyDrive/eth_CS4NLP/project/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to drive/MyDrive/eth_CS4NLP/project/checkpoint-1500
Configuration saved i

TrainOutput(global_step=3934, training_loss=0.028500288952166622, metrics={'train_runtime': 5561.6076, 'train_samples_per_second': 90.535, 'train_steps_per_second': 0.707, 'total_flos': 1.811426508764124e+16, 'train_loss': 0.028500288952166622, 'epoch': 2.0})

# Models evaluation

In [15]:
# Get the predicted labels
preds = trainer.predict(encoded_dataset_dev)
predictions = preds.predictions.argmax(-1)

# Create array with predicted labels and expected.
true_values = np.array(preds.label_ids).flatten()
predicted_values = np.array(preds.predictions.argmax(-1)).flatten()

# Filter the labels. We only produce a label for each word. We filter labels
# of subwords and special tokens, such as PAD
proc_predicted_values = [prediction for prediction, label in zip(predicted_values, true_values) if label != -100]
proc_true_values = [label for prediction, label in zip(predicted_values, true_values) if label != -100]

# Evaluate models
model_performance = {}
model_performance['accuracy'] = accuracy_score(proc_true_values, proc_predicted_values)
model_performance['precision_micro'] = precision_score(proc_true_values, proc_predicted_values, average='micro')
model_performance['precision_macro'] = precision_score(proc_true_values, proc_predicted_values, average='macro')
model_performance['recall_micro'] = recall_score(proc_true_values, proc_predicted_values, average='micro')
model_performance['recall_macro'] = recall_score(proc_true_values, proc_predicted_values, average='macro')
model_performance['f1_micro'] = f1_score(proc_true_values, proc_predicted_values, average='micro')
model_performance['f1_macro'] = f1_score(proc_true_values, proc_predicted_values, average='macro')

model_performance['confusion_matrix'] = confusion_matrix(proc_true_values, proc_predicted_values)
model_performance['confusion_matrix_normalized'] = confusion_matrix(proc_true_values, proc_predicted_values, normalize='true')

***** Running Prediction *****
  Num examples = 79070
  Batch size = 128


In [16]:
model_performance

{'accuracy': 0.9516377893006197,
 'confusion_matrix': array([[ 1252,     1,    87,    16,     2,     9,    74,     0,     8,
             5,    10],
        [    4,  4123,    22,    13,     4,    33,    38,    10,    84,
            86,    10],
        [   10,    15, 11132,    40,     8,    52,   459,     2,    38,
            57,   203],
        [    4,     1,    11,  2803,     4,     6,    47,     5,     4,
            12,     9],
        [    9,     3,    23,    12,  4507,    11,    72,     0,     3,
            50,    14],
        [    6,     9,    44,    13,     0,  3589,    75,     2,    26,
            99,    49],
        [   82,     6,   123,    45,    17,    69, 16619,     1,    49,
           200,    55],
        [    1,    44,     7,    11,     3,    13,    18,  2400,    19,
            44,     6],
        [    1,    18,    11,    25,     2,    24,    69,     2, 11724,
            23,    76],
        [    3,     3,    15,    33,     6,    58,   378,     9,    59,
         10

F1-micro and F1-macro for each model

In [17]:
print('------------Model performance------------')
print(f'  accuracy: {model_performance["accuracy"]}')
print(f'  f1-micro: {model_performance["f1_micro"]}')
print(f'  f1-macro: {model_performance["f1_macro"]}')
print(f'  precision_macro: {model_performance["precision_macro"]}')
print(f'  precision_micro: {model_performance["precision_micro"]}')
print(f'  recall_macro: {model_performance["recall_macro"]}')
print(f'  recall_micro: {model_performance["recall_micro"]}')
print()

------------Model performance------------
  accuracy: 0.9516377893006197
  f1-micro: 0.9516377893006197
  f1-macro: 0.9462851287278432
  precision_macro: 0.9518242179651747
  precision_micro: 0.9516377893006197
  recall_macro: 0.9415209777791005
  recall_micro: 0.9516377893006197



In [18]:
pred_df = pd.DataFrame({'pred_label':predicted_values})
pred_df

Unnamed: 0,pred_label
0,5
1,9
2,6
3,6
4,9
...,...
79065,10
79066,6
79067,3
79068,1


In [None]:
from google.colab import files
pred_df.to_csv('final_dev_pred_dbmdz_bert-base-italian-xxl-cased.csv',index=False)
files.download('final_dev_pred_dbmdz_bert-base-italian-xxl-cased.csv')

# Model Prediction

In [19]:
# Get the predicted labels
preds = trainer.predict(encoded_dataset_test)
predictions = preds.predictions.argmax(-1)

# Create array with predicted labels and expected.
true_values = np.array(preds.label_ids).flatten()
predicted_values = np.array(preds.predictions.argmax(-1)).flatten()

***** Running Prediction *****
  Num examples = 11087
  Batch size = 128


In [20]:
pred_test = pd.DataFrame({'pred_label':predicted_values})
pred_test['pred_label']=pred_test['pred_label'].apply(explain_label)
pred_test['text']=df_test['text']
pred_test.head()

Unnamed: 0,pred_label,text
0,VEC,"E lì è montagnie, là o' se trova asai falchoni..."
1,VEC,"Ma alora l é proprio un zoo, l à 'sontà l Pirata."
2,FUR,Al è ancje un aspiet faunistic di segnalâ.
3,SCN,Ca sempe chesto é stato!
4,FUR,Ancje achì al à lis sôs sodisfazions:


In [23]:
encoded_dataset_test.shape

AttributeError: ignored

In [21]:
from google.colab import files
pred_test.to_csv('finalbert_len70_test_pred.csv',index=False)
files.download('finalbert_len70_test_pred.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from numpy import savetxt
logits =preds[0]
# save to csv file
savetxt('logit_len70_test.csv', logits, delimiter=',')
files.download('logit_len70_test.csv')

In [None]:
#softmax on the logits
qa = preds[0]
for i in range (3):
  input_array = qa[i]
  input = torch.from_numpy(input_array)
  m = nn.Softmax()
  qa[i] = m(input)
  from numpy import savetxt
savetxt('logit_softmax.csv', qa, delimiter=',')
files.download('logit_softmax.csv')

In [None]:
arrayA = preds[0]
arrayB = predictions
df_preds = pd.DataFrame(zip(arrayA, arrayB), columns=['logits','predicted_label'])
df_preds.head(3)
'''df_preds.to_csv('finalbert_probability.csv',index=False)
files.download('finalbert_probability.csv')'''