In [1]:
try:
    from google.colab import drive
    COLAB = True
    !pip install transformers
    !pip install datasets
    !pip install sentence_transformers
except:
    COLAB = False

In [None]:
from pathlib import Path
import csv

## Preparation of the dataset

### Load the dataset

In [None]:
# Location of the training TSV, relative to this notebook.
data_dir = Path("../data")
training_set_dir = "haspeede2_dev"
training_file = "haspeede2_dev_taskAB.tsv"

if COLAB:
    # On Colab the file is read from the mounted Google Drive instead.
    drive.mount("/content/drive", force_remount=True)
    training_set_dir = "/content/drive/My Drive"

# When `training_set_dir` is absolute (the Colab case) pathlib discards
# `data_dir`, so the path points directly inside the mounted drive.
train_path = data_dir / training_set_dir / training_file

# Read every row of the tab-separated file as a dict keyed by column header.
# The encoding is explicit so the result does not depend on the platform
# default (assumes the TSV is UTF-8 encoded -- TODO confirm), and newline=''
# is what the csv module recommends for files handed to a reader.
with open(train_path, 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file, delimiter='\t')
    train_set = list(reader)

# NOTE(review): the key really is 'text ' with a trailing space; presumably it
# mirrors the header in the TSV file -- verify before "cleaning" it up.
train_docs = [doc['text '] for doc in train_set]

### Tokenize

In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

# Model identifier shared by the tokenizer, the transformer model and the
# SentenceTransformer created later in this notebook.
model_name = "dbmdz/bert-base-italian-cased"

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the tokenizer and the transformer model registered under `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [6]:
# Keep a handle on the model's `pooler` attribute.
# NOTE(review): `pooler` is not referenced again in the visible cells --
# confirm it is still needed.
pooler = model.pooler

In [7]:
# Walk through tokenization on the first training document.
train_sample = train_docs[0]

# Two views of the same document, both including the special tokens:
# the vocabulary ids fed to the model and the corresponding token strings.
input_ids = tokenizer.encode(train_sample, add_special_tokens=True)
tokens = tokenizer.tokenize(train_sample, add_special_tokens=True)

print("Tokens:", tokens)
print("Token ids:", input_ids)

Tokens: ['[CLS]', 'È', 'terrorismo', 'anche', 'questo', ',', 'per', 'mettere', 'in', 'uno', 'stato', 'di', 'sogg', '##ez', '##ione', 'le', 'persone', 'e', 'rende', '##rle', 'inno', '##cue', ',', 'mentre', 'qualcuno', '.', '.', '.', 'UR', '##L', '[SEP]']
Token ids: [102, 696, 11601, 409, 395, 1307, 156, 3234, 139, 610, 482, 120, 10590, 30394, 256, 199, 1022, 126, 4101, 6546, 6870, 11356, 1307, 1105, 1776, 697, 697, 697, 17943, 30909, 103]


As you can see, the tokenizer adds the special token `[CLS]` at the start of the sequence (and `[SEP]` at the end). The `[CLS]` token is normally used by the classification head of the transformer.
We want to extract the embedding of the `[CLS]` token of each document and use it as a sentence embedding.

## Run the model

### Extract embedding of [CLS] token in last hidden state for one document

In [8]:
# Turn the token ids into a batch containing a single sequence.
# `as_tensor(...).reshape(1, -1)` yields the same (1, seq_len) tensor whether
# `input_ids` is still the plain list produced by the tokenizer or was already
# converted by a previous run of this cell, so the cell can be re-run safely.
input_ids = torch.as_tensor(input_ids).reshape(1, -1)

# Inference only: no gradients need to be tracked.
with torch.no_grad():
    outputs = model(input_ids)

In [9]:
# Last-layer hidden states of the only document in the batch.
doc_embeddings = outputs.last_hidden_state[0]

# Token string -> embedding lookup.
# NOTE: tokens that occur more than once (e.g. ',' or '.') collapse to a single
# key holding the embedding of their last occurrence, so this mapping is lossy.
word_embeddings = dict(zip(tokens, doc_embeddings))

# Take [CLS] by position instead of going through the lossy mapping; this is
# also how the [CLS] embedding is selected for the full training set.
assert tokens[0] == '[CLS]', "expected [CLS] as the first token"
cls_embedding = doc_embeddings[0]
print(cls_embedding.shape)
print(cls_embedding)

torch.Size([768])
tensor([ 3.3091e-01, -2.5560e-01,  9.9565e-02, -1.6235e-01,  2.8167e-02,
         1.1574e-01,  6.7760e-03, -1.2880e-01,  1.5049e-01,  1.2580e-01,
        -3.6794e-01, -1.3459e-01, -1.5786e-02,  6.4181e-01, -2.0602e-01,
        -7.0953e-03,  4.4382e-02,  6.7526e-01, -3.3225e-01, -1.3388e-02,
        -4.0139e-02,  4.1250e-01, -2.0346e-01,  7.2367e-02, -3.0406e-01,
        -4.3439e-02, -9.8926e-02,  1.2743e-01, -1.2440e-01,  4.2027e-01,
         3.3305e-01, -1.8008e-01, -4.2361e-01, -6.3529e-01,  3.4179e-01,
         3.3845e-01, -2.4885e-02, -3.7214e-01, -3.2806e-01, -1.1229e-01,
         6.0399e-01, -1.6509e-01, -3.6538e-01,  4.1586e-01, -5.6831e-01,
        -4.3380e-01,  3.4057e-01,  2.9778e-01,  1.8465e-01,  1.7047e-01,
         1.3323e-01, -1.1138e-01,  1.2110e-01, -4.5228e-02, -5.6285e-01,
        -1.0581e-01, -9.3740e-02,  1.5095e-01,  2.1296e-01,  6.9111e-02,
         3.5689e-01,  1.1326e-01, -3.1078e-01, -2.1829e-01,  5.4454e-01,
        -3.6558e-01, -4.3426e-01,

### Run the model on training documents

We use a `SentenceTransformer`, which tokenizes and encodes the documents and returns both the pooled sentence embeddings and the token embeddings.

In [10]:
from sentence_transformers import SentenceTransformer

In [11]:
# NOTE: this rebinds `model`, which previously held the AutoModel instance,
# to a SentenceTransformer built from the same `model_name`.
model = SentenceTransformer(model_name)

# Encode every training document. With output_value=None each element of
# `outputs` is a dict of per-document fields rather than a bare embedding.
outputs = model.encode(
    train_docs,
    output_value=None,
    convert_to_numpy=True,
    batch_size=16,
    show_progress_bar=True,
)

No sentence-transformers model found with name dbmdz/bert-base-italian-cased. Creating a new one with mean pooling.
Batches: 100%|██████████| 428/428 [05:24<00:00,  1.32it/s]


In [12]:
# Inspect which fields the output for the first document carries.
outputs[0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'token_embeddings', 'sentence_embedding'])

In [13]:
# Per document: the embedding of the first token ([CLS]) ...
cls_embeddings = [
    encoded_doc['token_embeddings'][0]
    for encoded_doc in outputs
]
# ... and the pooled sentence embedding.
sent_embeddings = [
    encoded_doc['sentence_embedding']
    for encoded_doc in outputs
]

In [14]:
import pandas as pd

In [15]:
# One row per document (indexed by the document id), one column per
# embedding dimension.
# The stacked tensor is detached, moved to the CPU and converted to NumPy
# before it reaches pandas, so this also works when the embeddings were
# computed on a GPU (e.g. on Colab).
cls_embeddings_df = pd.DataFrame(
    torch.stack(cls_embeddings).detach().cpu().numpy(),
    index=[doc['id'] for doc in train_set],
)
cls_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,0.330913,-0.255601,0.099565,-0.162353,0.028167,0.11574,0.006777,-0.128799,0.150491,0.1258,...,-0.167303,0.202621,-0.450907,0.971462,0.071537,0.061021,0.101291,-0.336357,-0.018048,0.419867
2045,-0.176426,0.43014,-0.020807,0.300881,-0.077966,-0.20765,-0.422625,-0.218152,-0.23188,-0.323087,...,-0.362934,0.090953,-0.012831,0.149438,0.072081,-0.019213,-0.187425,0.001177,0.002967,-0.141162
61,0.094847,-0.043276,0.030565,0.295105,-0.137413,-0.192275,-0.487759,0.237893,-0.025203,-0.364422,...,-0.157764,0.1966,-0.21017,-0.216122,-0.208825,0.084819,-0.156896,5.9e-05,-0.066262,-0.047716
1259,-0.04331,0.053228,-0.073848,0.045386,-0.065473,-0.121833,-0.112266,-0.03686,0.073647,-0.110107,...,-0.06518,0.03857,-0.116088,-0.050137,-0.215983,0.059105,-0.093424,-0.018517,0.073619,-0.088922
949,-0.008078,-0.031456,-0.093861,-0.106218,0.025641,-0.139674,-0.073613,-0.047472,0.142491,-0.187894,...,-0.066059,0.021286,-0.143739,0.113479,-0.063397,0.001536,-0.103806,-0.02371,0.287298,-0.046536


In [16]:
# One row per document (indexed by the document id), one column per
# dimension of the pooled sentence embedding.
# The stacked tensor is detached, moved to the CPU and converted to NumPy
# before it reaches pandas, so this also works when the embeddings were
# computed on a GPU (e.g. on Colab).
sent_embeddings_df = pd.DataFrame(
    torch.stack(sent_embeddings).detach().cpu().numpy(),
    index=[doc['id'] for doc in train_set],
)
sent_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
2066,-0.15818,-0.095946,0.093634,0.379927,0.079886,0.367576,-0.116891,-0.129829,-0.059884,-0.211322,...,0.135857,0.111227,-0.172573,-0.249292,-0.004123,0.140916,0.235578,-0.268505,0.058081,-0.10975
2045,-0.171309,0.073012,0.085175,-0.00499,0.017174,0.32158,0.016343,-0.209061,-0.003585,-0.026075,...,-0.176989,0.156726,0.005133,0.099889,0.25676,-0.079786,-0.344093,-0.093583,-0.134889,-0.043373
61,-0.246598,0.161771,0.133039,0.402667,0.133085,0.591527,-0.285158,-0.072103,-0.00971,-0.057412,...,0.428057,0.72611,0.082723,0.085876,0.479539,-0.006699,-0.274346,-0.274852,0.033653,0.056907
1259,0.093394,0.146723,0.193343,0.154804,-0.043327,0.205783,-0.028684,-0.420744,0.006844,0.04095,...,0.006352,-0.053703,0.148035,-0.131488,0.198386,0.173133,-0.248725,0.083605,-0.231437,-0.054648
949,0.057813,0.234703,0.269424,0.047158,0.111977,0.288683,-0.123735,-0.315585,0.041978,0.15596,...,0.185213,0.439666,-0.106471,0.012941,0.306468,-0.099841,-0.299336,-0.047937,0.14251,-0.115172


In [None]:
# Destination of the exported embeddings.
results_dir = Path("../results")
cls_embs_csv = "sent_embs_cls_train.csv"
sent_embs_csv = "sent_embs_pooled_train.csv"

# `to_csv` does not create missing directories, so make sure the target exists.
results_dir.mkdir(parents=True, exist_ok=True)

# Write both embedding tables; the document ids are saved as the index column.
cls_embeddings_df.to_csv(results_dir / cls_embs_csv)
sent_embeddings_df.to_csv(results_dir / sent_embs_csv)