In [1]:
try:
    from google.colab import drive
    COLAB = True
    !pip install transformers
    !pip install datasets
    !pip install sentence_transformers
except:
    COLAB = False

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from pathlib import Path
import csv

## Preparation of the dataset

### Load the dataset

In [3]:
data_dir = Path("../data")
training_set_dir = "haspeede2_dev"
training_file = "haspeede2_dev_taskAB.tsv"

if COLAB:
    drive.mount("/content/drive", force_remount=True)
    training_set_dir = "/content/drive/My Drive"

train_path = data_dir / training_set_dir / training_file

train_set = []

with open(train_path, 'r') as file:
    reader = csv.DictReader(file, delimiter='\t')
    for row in reader:
        train_set.append(row)

train_docs = [doc['text '] for doc in train_set]

Mounted at /content/drive


### Tokenize

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
model_name = "dbmdz/bert-base-italian-cased"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
pooler = model.pooler

In [None]:
train_sample = train_docs[0]

tokens = tokenizer.tokenize(train_sample, add_special_tokens=True)
input_ids = tokenizer.encode(train_sample, add_special_tokens=True)

print("Tokens:", tokens)
print("Token ids:", input_ids)

Tokens: ['[CLS]', 'È', 'terrorismo', 'anche', 'questo', ',', 'per', 'mettere', 'in', 'uno', 'stato', 'di', 'sogg', '##ez', '##ione', 'le', 'persone', 'e', 'rende', '##rle', 'inno', '##cue', ',', 'mentre', 'qualcuno', '.', '.', '.', 'UR', '##L', '[SEP]']
Token ids: [102, 696, 11601, 409, 395, 1307, 156, 3234, 139, 610, 482, 120, 10590, 30394, 256, 199, 1022, 126, 4101, 6546, 6870, 11356, 1307, 1105, 1776, 697, 697, 697, 17943, 30909, 103]


As you can see the tokenizer adds the special token `[CLS]`, which is normally used by the classification head of the transformer.
We want to extract the embedding for the `[CLS]` token of each document as a sentence embedding.

## Run the model

### Extract embedding of [CLS] token in last hidden state for one document

In [None]:
input_ids = torch.tensor([input_ids])

with torch.no_grad():
    outputs = model(input_ids)

In [None]:
doc_embeddings = outputs.last_hidden_state[0]
word_embeddings = {token: emb
                   for token, emb
                   in zip(tokens, doc_embeddings)}
cls_embedding = word_embeddings['[CLS]']
print(cls_embedding.shape)
print(cls_embedding)

torch.Size([768])
tensor([ 3.3091e-01, -2.5560e-01,  9.9565e-02, -1.6235e-01,  2.8167e-02,
         1.1574e-01,  6.7760e-03, -1.2880e-01,  1.5049e-01,  1.2580e-01,
        -3.6794e-01, -1.3459e-01, -1.5786e-02,  6.4181e-01, -2.0602e-01,
        -7.0953e-03,  4.4382e-02,  6.7526e-01, -3.3225e-01, -1.3388e-02,
        -4.0139e-02,  4.1250e-01, -2.0346e-01,  7.2367e-02, -3.0406e-01,
        -4.3439e-02, -9.8926e-02,  1.2743e-01, -1.2440e-01,  4.2027e-01,
         3.3305e-01, -1.8008e-01, -4.2361e-01, -6.3529e-01,  3.4179e-01,
         3.3845e-01, -2.4885e-02, -3.7214e-01, -3.2806e-01, -1.1229e-01,
         6.0399e-01, -1.6509e-01, -3.6538e-01,  4.1586e-01, -5.6831e-01,
        -4.3380e-01,  3.4057e-01,  2.9778e-01,  1.8465e-01,  1.7047e-01,
         1.3323e-01, -1.1138e-01,  1.2110e-01, -4.5228e-02, -5.6285e-01,
        -1.0581e-01, -9.3740e-02,  1.5095e-01,  2.1296e-01,  6.9111e-02,
         3.5689e-01,  1.1326e-01, -3.1078e-01, -2.1829e-01,  5.4454e-01,
        -3.6558e-01, -4.3426e-01,

### Run the model on training documents

We use a SentenceTransformer which will tokenize and encode the documents, return pooled sentence embeddings and tokens embeddings.

In [11]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer(model_name)
outputs = model.encode(train_docs, output_value=None, convert_to_numpy=True ,batch_size=16, show_progress_bar=True)

No sentence-transformers model found with name dbmdz/bert-base-italian-cased. Creating a new one with mean pooling.
Batches: 100%|██████████| 428/428 [05:24<00:00,  1.32it/s]


In [None]:
outputs[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'token_embeddings', 'sentence_embedding'])

In [None]:
cls_embeddings = [output['token_embeddings'][0] for output in outputs]
sent_embeddings = [output['sentence_embedding'] for output in outputs]

In [18]:
import pandas as pd

In [None]:
cls_embeddings_df = pd.DataFrame(torch.stack(cls_embeddings), index=[doc['id'] for doc in train_set])
cls_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,0.330913,-0.255601,0.099565,-0.162353,0.028167,0.11574,0.006777,-0.128799,0.150491,0.1258,...,-0.167303,0.202621,-0.450907,0.971462,0.071537,0.061021,0.101291,-0.336357,-0.018048,0.419867
2045,-0.176426,0.43014,-0.020807,0.300881,-0.077966,-0.20765,-0.422625,-0.218152,-0.23188,-0.323087,...,-0.362934,0.090953,-0.012831,0.149438,0.072081,-0.019213,-0.187425,0.001177,0.002967,-0.141162
61,0.094847,-0.043276,0.030565,0.295105,-0.137413,-0.192275,-0.487759,0.237893,-0.025203,-0.364422,...,-0.157764,0.1966,-0.21017,-0.216122,-0.208825,0.084819,-0.156896,5.9e-05,-0.066262,-0.047716
1259,-0.04331,0.053228,-0.073848,0.045386,-0.065473,-0.121833,-0.112266,-0.03686,0.073647,-0.110107,...,-0.06518,0.03857,-0.116088,-0.050137,-0.215983,0.059105,-0.093424,-0.018517,0.073619,-0.088922
949,-0.008078,-0.031456,-0.093861,-0.106218,0.025641,-0.139674,-0.073613,-0.047472,0.142491,-0.187894,...,-0.066059,0.021286,-0.143739,0.113479,-0.063397,0.001536,-0.103806,-0.02371,0.287298,-0.046536


In [None]:
sent_embeddings_df = pd.DataFrame(torch.stack(sent_embeddings), index=[doc['id'] for doc in train_set])
sent_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,-0.15818,-0.095946,0.093634,0.379927,0.079886,0.367576,-0.116891,-0.129829,-0.059884,-0.211322,...,0.135857,0.111227,-0.172573,-0.249292,-0.004123,0.140916,0.235578,-0.268505,0.058081,-0.10975
2045,-0.171309,0.073012,0.085175,-0.00499,0.017174,0.32158,0.016343,-0.209061,-0.003585,-0.026075,...,-0.176989,0.156726,0.005133,0.099889,0.25676,-0.079786,-0.344093,-0.093583,-0.134889,-0.043373
61,-0.246598,0.161771,0.133039,0.402667,0.133085,0.591527,-0.285158,-0.072103,-0.00971,-0.057412,...,0.428057,0.72611,0.082723,0.085876,0.479539,-0.006699,-0.274346,-0.274852,0.033653,0.056907
1259,0.093394,0.146723,0.193343,0.154804,-0.043327,0.205783,-0.028684,-0.420744,0.006844,0.04095,...,0.006352,-0.053703,0.148035,-0.131488,0.198386,0.173133,-0.248725,0.083605,-0.231437,-0.054648
949,0.057813,0.234703,0.269424,0.047158,0.111977,0.288683,-0.123735,-0.315585,0.041978,0.15596,...,0.185213,0.439666,-0.106471,0.012941,0.306468,-0.099841,-0.299336,-0.047937,0.14251,-0.115172


In [None]:
results_dir = Path("../results")
cls_embs_csv = "sent_embs_cls_train.csv"
sent_embs_csv = "sent_embs_pooled_train.csv"

cls_embeddings_df.to_csv(results_dir / cls_embs_csv)
sent_embeddings_df.to_csv(results_dir / sent_embs_csv)

## Alternative model: AlBERTo
Since the sentence embeddings extracted from the BERT base model yield bad performances, we'll now try using AlBERTo, the first italian BERT model for Twitter languange understanding.
HF: [m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0](https://huggingface.co/m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0)

!!Add reference
  

```
  @InProceedings{PolignanoEtAlCLIC2019,
    author    = {Marco Polignano and Pierpaolo Basile and Marco de Gemmis and Giovanni Semeraro and Valerio Basile},
    title     = {{AlBERTo: Italian BERT Language Understanding Model for NLP Challenging Tasks Based on Tweets}},
    booktitle = {Proceedings of the Sixth Italian Conference on Computational Linguistics (CLiC-it 2019)},
    year      = {2019},
    publisher = {CEUR},
    journal={CEUR Workshop Proceedings},
    volume={2481},
    url={https://www.scopus.com/inward/record.uri?eid=2-s2.0-85074851349&partnerID=40&md5=7abed946e06f76b3825ae5e294ffac14},
    document_type={Conference Paper},
    source={Scopus}
```  

### Tokenize

In [9]:
model_name = "m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/740M [00:00<?, ?B/s]

In [12]:
pooler = model.pooler

In [13]:
train_sample = train_docs[0]

tokens = tokenizer.tokenize(train_sample, add_special_tokens=True)
input_ids = tokenizer.encode(train_sample, add_special_tokens=True)

print("Tokens:", tokens)
print("Token ids:", input_ids)

Tokens: ['[CLS]', 'e', 'terrorismo', 'anche', 'questo', '[UNK]', 'per', 'mettere', 'in', 'uno', 'stato', 'di', 'soggezione', 'le', 'persone', 'e', 'render', '##le', 'innocue', '[UNK]', 'mentre', 'qualcuno', '[UNK]', '[UNK]', '[UNK]', 'ur', '##l', '[SEP]']
Token ids: [2, 13, 4923, 23, 79, 1, 22, 605, 24, 153, 184, 12, 49535, 40, 234, 13, 20897, 1041, 90954, 1, 408, 271, 1, 1, 1, 14013, 902, 3]


### Run the model on training documents

In [14]:
model = SentenceTransformer(model_name)
outputs = model.encode(train_docs, output_value=None, convert_to_numpy=True ,batch_size=16, show_progress_bar=True)



Batches:   0%|          | 0/428 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/740M [00:00<?, ?B/s]

In [15]:
outputs[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'token_embeddings', 'sentence_embedding'])

In [16]:
cls_embeddings = [output['token_embeddings'][0] for output in outputs]
sent_embeddings = [output['sentence_embedding'] for output in outputs]

In [19]:
cls_embeddings_df = pd.DataFrame(torch.stack(cls_embeddings), index=[doc['id'] for doc in train_set])
cls_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,0.76047,0.355727,0.137676,0.726672,-2.574655,0.532849,-0.55717,0.991323,1.060009,1.304483,...,1.570961,0.55143,1.155125,-0.538862,-0.37452,-1.204794,-0.279697,0.47014,-0.23385,0.586996
2045,0.518653,-0.293679,0.897923,0.939204,-1.704334,-0.979642,-0.562784,0.861344,1.091535,1.580141,...,0.838025,0.897236,0.382713,0.015039,-0.517843,-0.360566,-0.452484,0.644569,-0.130329,0.309873
61,0.115341,-0.349727,0.932148,0.28088,-1.950502,-0.653146,-0.628392,1.047905,0.908965,0.88058,...,1.181039,1.18234,0.5042,-0.307846,-0.903916,-0.378418,-0.335408,1.087677,-0.558886,1.066654
1259,0.193663,-0.478392,-0.421455,0.399824,-2.44101,-2.016848,-1.222893,1.228242,1.13783,0.676532,...,0.282832,0.679007,-0.052451,0.247086,-0.596628,0.294246,-0.16122,0.476959,-0.525229,0.145264
949,-0.066913,0.163852,0.749034,0.494453,-1.948138,0.647739,-0.576177,0.213923,1.084984,1.494357,...,1.011368,0.536175,0.886733,-0.752798,-0.571639,-1.107355,0.005425,0.552605,-0.146093,0.227068


In [20]:
sent_embeddings_df = pd.DataFrame(torch.stack(sent_embeddings), index=[doc['id'] for doc in train_set])
sent_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,0.64436,-0.322945,-0.067169,0.433594,-0.808968,-0.152856,-0.647717,0.184886,0.296216,0.320702,...,1.217888,-0.477605,0.415001,-0.565366,0.155336,-0.465147,0.415667,-0.456462,0.38162,-0.039221
2045,0.450706,-0.424301,0.411552,0.328261,0.030625,-0.186563,-0.68652,0.170887,-0.283004,0.599489,...,1.025982,-0.918605,0.689864,-0.410859,-0.162232,-0.233352,0.712453,0.026411,0.085908,0.151146
61,0.201871,-0.709691,0.472239,0.021727,-0.761589,-0.614881,-0.017946,-0.493089,0.020694,0.596931,...,0.950691,-0.165642,0.930925,-0.179106,-0.87867,0.208187,0.963974,0.759507,-0.602236,0.605712
1259,0.591678,-0.093739,0.016063,-0.190001,-1.040889,-0.33911,-0.732731,0.51493,0.003258,0.302663,...,1.096214,-0.606475,-0.055335,0.177271,-0.339264,0.044101,0.427433,0.326121,0.159303,0.334155
949,-0.378547,-0.582111,-0.293675,0.505529,-1.042519,1.041653,-1.042608,-0.270629,1.146067,0.159086,...,1.316795,-0.54402,0.471582,-0.037146,-0.36886,-0.030393,0.960981,-0.530281,-0.155071,-0.379835


In [22]:
results_dir = Path("../results")
alberto_cls_embs_csv = "alberto_sent_embs_cls_train.csv"
alberto_sent_embs_csv = "alberto_sent_embs_pooled_train.csv"

cls_embeddings_df.to_csv(results_dir / alberto_cls_embs_csv)
sent_embeddings_df.to_csv(results_dir / alberto_sent_embs_csv)

OSError: Cannot save file into a non-existent directory: '../results'