In [23]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

### Tokenizer

In [3]:
# Setting `clean_up_tokenization_spaces` explicitly avoids this warning:
#   FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default.
#   This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, clean_up_tokenization_spaces=True)

In [4]:
# Inspect the loaded tokenizer: its repr lists vocab size, max length and special tokens.
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [5]:
# Calling the tokenizer directly returns input_ids, token_type_ids and attention_mask.
tokenizer("Hello World")

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [7]:
# Split the text into tokens only (no IDs, no special tokens).
tokens = tokenizer.tokenize("Hello World")
tokens

['hello', 'world']

In [8]:
# Map each token to its vocabulary ID.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[7592, 2088]

In [10]:
# Inverse mapping: vocabulary IDs back to tokens.
words = tokenizer.convert_ids_to_tokens(ids)
words

['hello', 'world']

In [12]:
# Build a string from the IDs. Like convert_ids_to_tokens, but with a single output.
tokenizer.decode(ids)

'hello world'

In [16]:
# encode() adds the special tokens. Here these are CLS (101) and SEP (102).
ids = tokenizer.encode("Hello World")
ids

[101, 7592, 2088, 102]

In [18]:
# Here we can see the special tokens.
tokenizer.convert_ids_to_tokens(ids)

['[CLS]', 'hello', 'world', '[SEP]']

In [19]:
# Decoding the encoded IDs keeps the special tokens in the resulting string.
tokenizer.decode(ids)

'[CLS] hello world [SEP]'

In [21]:
# Full tokenizer output for one sentence, as plain Python lists (no tensors yet).
model_input = tokenizer("Hello World")
model_input

{'input_ids': [101, 7592, 2088, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}

In [22]:
# A list of sentences is tokenized as a batch; without padding the
# resulting sequences keep their own (different) lengths.
data = [
    "I like cats.",
    "Do you like cats?"
]
tokenizer(data)

{'input_ids': [[101, 1045, 2066, 8870, 1012, 102], [101, 2079, 2017, 2066, 8870, 1029, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

### Model 

In [24]:
# Tokenizers and models are tied to each other: load the model from the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Passing the plain lists returned by the tokenizer as model input raises an error:
# outputs = model(**model_input)

In [28]:
# return_tensors="pt" makes the tokenizer return PyTorch tensors instead of lists.
model_input = tokenizer("Hello World", return_tensors="pt")

In [31]:
# Forward pass on the tensor inputs; the result carries the classification logits.
output = model(**model_input)
output

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3001, -0.6187]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [38]:
# Same as output[0]
output.logits

tensor([[ 0.3001, -0.6187]], grad_fn=<AddmmBackward0>)

In [41]:
# To use these outputs for scores such as F1 or AUC in sklearn, we need NumPy arrays.
# In order: detach() -> removes the gradients from the tensor
# cpu() -> moves the data from the GPU to the CPU
# numpy() -> converts to a NumPy array
output.logits.detach().cpu().numpy()

array([[ 0.30013636, -0.61868644]], dtype=float32)

In [45]:
# To build a batch, all sequences must have the same length, so we pass
# truncation=True and padding=True; otherwise the tokenizer fails with:
#   Unable to create tensor, you should probably activate truncation and/or padding
#   with 'padding=True' 'truncation=True' to have batched tensors with the same length.
# padding can also be set to a padding strategy among ['longest', 'max_length', 'do_not_pad'].
data = [
    "I like cats.",
    "Do you like cats?"
]
model_inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True)

In [46]:
# Inspect the batch: the shorter sentence is padded with 0 and masked out in attention_mask.
model_inputs

{'input_ids': tensor([[ 101, 1045, 2066, 8870, 1012,  102,    0],
        [ 101, 2079, 2017, 2066, 8870, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}

In [47]:
# Forward pass on the padded batch: one row of logits per input sentence.
outputs = model(**model_inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1823, -0.5322],
        [ 0.1686, -0.5445]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)