In [35]:
import datasets
from datasets import load_dataset
# Nel corso era in datasets load_dataset. Dalla versione di Settembre, HF ora ha spostato le metrics in evaluate
from evaluate import load as load_metric
import numpy as np

# ATTENZIONE: per poter usare i TrainingArguments, conviene installare transformers[torch]
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Questo lo separiamo perchè è inferenza
from transformers import pipeline
from pprint import pprint
from torchinfo import summary

In [2]:
raw_datasets = load_dataset("glue", "sst2")

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [4]:
raw_datasets["train"]

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [5]:
# dir ci serve a capire tutti gli attributi e i metodi di un oggetto. 
# Essendo di tipo Dataset, vediamo cosa possiamo fare
dir(raw_datasets["train"])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguo

In [6]:
# Arrow dataset è un dataset di pyarrow. Migliora le performances per operazioni sulle colonne
type(raw_datasets["train"])

datasets.arrow_dataset.Dataset

In [7]:
# Ci mostra qualche esempio per ogni riga, spiegando il tipo delle colonne
raw_datasets["train"].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [8]:
# Nota bene: pyarrow crea dizionario di liste, invece che liste di dizionari
raw_datasets["train"][0:5]

{'sentence': ['hide new secretions from the parental units ',
  'contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature ',
  'remains utterly satisfied to remain the same throughout ',
  'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up '],
 'label': [0, 0, 1, 0, 0],
 'idx': [0, 1, 2, 3, 4]}

In [9]:
# Ci da informazioni importanti sulle colonne. In particolare risulta utile per le label a classi, per risalire al nome da idx
raw_datasets["train"].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [10]:
# distilbert si addestra prima. Volendo, possiamo usare Bert.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [11]:
# Test tokenizer, vediamo cosa tira fuori
tokenized_sentence = tokenizer(raw_datasets["train"][0:3]["sentence"])
pprint(tokenized_sentence)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102],
               [101,
                3397,
                2053,
                15966,
                1010,
                2069,
                4450,
                2098,
                18201,
                2015,
                102],
               [101,
                2008,
                7459,
                2049,
                3494,
                1998,
                10639,
                2015,
                2242,
                2738,
                3376,
                2055,
                2529,
                3267,
                102]]}


In [12]:
# Funzione custom per processare i batch dati
def tokenize_fn(batch):
    return tokenizer(batch["sentence"], truncation=True)

In [13]:
tokenized_dataset = raw_datasets.map(tokenize_fn, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████| 872/872 [00:00<00:00, 21925.61 examples/s]


In [14]:
training_args = TrainingArguments(
    "my_trainer",
    eval_strategy="epoch", # Uso del dataset di evaluation per calcolare l'andamento. Default no
    save_strategy="epoch", # Salva il modello al termine di ogni epoch. Il default è per ogni step.
    num_train_epochs=1
)

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [17]:
# Vediamo la struttura
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [19]:
# Summary non funziona un gran che con gli LLM di huggingface
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

In [27]:
# Siccome nel fine tuning noi vogliamo cambiare solo alcuni dei pesi, storiamo i parametri del modello prima 
# per fare il confronto con quelli dopo il fine tuning.
params_before = []
for name, p in model.named_parameters():
    params_before.append(p.detach().cpu().numpy())

In [28]:
metric = load_metric('glue', 'sst2')

In [30]:
# Testiamo la metrica. Ci restituisce un dizionario
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])

{'accuracy': 0.6666666666666666}

In [31]:
def compute_metrics(logit_and_labels):
    logit, labels = logit_and_labels
    predictions = np.argmax(logit, axis=-1) # axis = -1 -> Ultimo asse
    return metric.compute(predictions=predictions, references=labels)

In [32]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1968,0.335805,0.904817


TrainOutput(global_step=8419, training_loss=0.2635177383939392, metrics={'train_runtime': 580.638, 'train_samples_per_second': 115.991, 'train_steps_per_second': 14.5, 'total_flos': 517212489917652.0, 'train_loss': 0.2635177383939392, 'epoch': 1.0})

In [34]:
trainer.save_model("models/my_sentiment_analysis")

In [36]:
cls = pipeline("text-classification", model="models/my_sentiment_analysis", device=0)

In [38]:
cls("This moovie is great!")

[{'label': 'LABEL_1', 'score': 0.998356282711029}]

In [39]:
cls("This moovie sucks!")

[{'label': 'LABEL_0', 'score': 0.9986245632171631}]

In [41]:
# Correggiamo queste label generiche!
import json
cfg_path = "models/my_sentiment_analysis/config.json"
with open(cfg_path) as f:
    obj = json.load(f)

obj["id2label"] = {0: "negative", 1: "positive"}

with open(cfg_path, "w") as f:
    json.dump(obj, f, indent=2)

In [42]:
new_cls = pipeline("text-classification", model="models/my_sentiment_analysis", device=0)

In [44]:
new_cls("This moovie is great! suck!")

[{'label': 'positive', 'score': 0.9665664434432983}]

In [45]:
# Ora controlliamo i parametri
param_after = []
for name, p in model.named_parameters():
    param_after.append(p.detach().cpu().numpy())

In [47]:
# Qui scopriamo che abbiamo cambiato tutti i parametri con questo fine tuning
for bef, aft in zip(params_before, param_after):
    print(np.sum(np.abs(bef-aft)))

13525.799
85.55518
1.7041587
1.1174519
1310.0093
1.6430478
1291.5242
0.0034204703
1191.9945
1.0146875
1122.0751
0.85111535
1.6632833
0.84583247
4896.5825
5.7475758
4498.3555
0.7450188
1.5758862
0.7957795
1300.3873
1.4776052
1296.3428
0.0033661863
1105.2034
0.82875156
1056.7418
0.73670316
1.490978
0.71903646
4872.818
5.371088
4462.4985
0.68522507
1.4758179
0.7006481
1270.0234
1.632243
1277.5608
0.0026035346
1112.9858
0.7386063
1093.1621
0.70980954
1.452709
0.754694
4920.3975
5.6502533
4360.021
0.7104047
1.4351361
0.6181278
1289.69
1.4192636
1300.1702
0.003086572
1137.1788
0.69133025
1080.1112
0.7577317
1.4079797
0.7500416
4814.8315
5.3746986
4123.7964
0.72084594
1.3138576
0.66775125
1206.4757
1.3946418
1201.5515
0.002044647
1006.33295
0.78442913
1011.0608
0.955362
1.4613957
1.0006258
4416.8013
5.3369427
3413.3987
0.79335725
1.3267266
0.71263015
1103.8936
1.2300984
1112.4004
0.0011897419
938.73816
0.8849616
934.6699
0.96692544
1.361017
1.1861262
3736.7664
4.7800736
3169.5474
0.90498114
1

In [50]:
# Vediamo come si comporta con il test set!
test_data = raw_datasets["test"]

In [65]:
predictions = new_cls(test_data["sentence"])

In [66]:
numeric_predictions = [int(pred["label"]=="positive") for pred in predictions]

In [70]:
metric.compute(predictions=numeric_predictions, references=test_data["label"])

{'accuracy': 0.0}

In [71]:
predictions

[{'label': 'negative', 'score': 0.9986230134963989},
 {'label': 'negative', 'score': 0.9989619255065918},
 {'label': 'positive', 'score': 0.9781332015991211},
 {'label': 'positive', 'score': 0.9984196424484253},
 {'label': 'positive', 'score': 0.9993983507156372},
 {'label': 'positive', 'score': 0.9996474981307983},
 {'label': 'negative', 'score': 0.9933964014053345},
 {'label': 'positive', 'score': 0.9995680451393127},
 {'label': 'negative', 'score': 0.9607862234115601},
 {'label': 'negative', 'score': 0.9991262555122375},
 {'label': 'negative', 'score': 0.9940369129180908},
 {'label': 'positive', 'score': 0.5918056964874268},
 {'label': 'positive', 'score': 0.9902966618537903},
 {'label': 'positive', 'score': 0.9985684156417847},
 {'label': 'positive', 'score': 0.9992306232452393},
 {'label': 'positive', 'score': 0.9981826543807983},
 {'label': 'positive', 'score': 0.9976049661636353},
 {'label': 'positive', 'score': 0.9996306896209717},
 {'label': 'positive', 'score': 0.999234676361

In [78]:
test_data["sentence"][3]

'director rob marshall went out gunning to make a great one .'