In [1]:
import datasets
from datasets import load_dataset
# In the course this was `datasets.load_metric`; Hugging Face has since moved metrics into the separate `evaluate` package
from evaluate import load as load_metric
import numpy as np

# ATTENZIONE: per poter usare i TrainingArguments, conviene installare transformers[torch]
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
# Imported separately because it is only used for inference
from transformers import pipeline
from pprint import pprint
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "rte" configuration of the "glue" dataset; the result is a
# DatasetDict keyed by split name.
raw_datasets = load_dataset("glue", "rte")

In [3]:
# Show the available splits together with their columns and row counts.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [4]:
# Label encoding in this dataset: entailment = 0, not_entailment = 1
# (the position of each name inside the ClassLabel feature is its integer id).
raw_datasets["train"].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'not_entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [5]:
# Inspect one training example (index 1).
raw_datasets["train"][1]

{'sentence1': 'A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI.',
 'sentence2': 'Pope Benedict XVI is the new leader of the Roman Catholic Church.',
 'label': 0,
 'idx': 1}

In [6]:
# Distinct label values that occur in the training split.
set(raw_datasets["train"]["label"])

{0, 1}

In [7]:
# Inspect another training example (index 0).
raw_datasets["train"][0]

{'sentence1': 'No Weapons of Mass Destruction Found in Iraq Yet.',
 'sentence2': 'Weapons of Mass Destruction Found in Iraq.',
 'label': 1,
 'idx': 0}

In [8]:
# Name of the pretrained checkpoint; the tokenizer here and the config/model
# created further down are all loaded from this same identifier.
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [9]:
def tokenize_fn(batch):
    """Tokenize the two sentence columns of ``batch`` as sentence pairs.

    Args:
        batch: mapping that provides the ``"sentence1"`` and ``"sentence2"``
            entries to be encoded together.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair, with
        truncation enabled.
    """
    first_sentences = batch["sentence1"]
    second_sentences = batch["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

In [10]:
# Apply tokenize_fn to every split; batched=True hands the function batches
# of examples rather than one example at a time.
tokenized_dataset = raw_datasets.map(tokenize_fn, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████| 277/277 [00:00<00:00, 7602.06 examples/s]


In [11]:
# Training configuration. Evaluation and checkpointing both run once per epoch,
# which is what allows the best checkpoint to be restored when training ends.
training_args = TrainingArguments(
    output_dir="text_entailment_training",
    eval_strategy="epoch",  # evaluate on the validation split after every epoch (off by default)
    save_strategy="epoch",  # save a checkpoint after every epoch (the default saves every N steps)
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_steps=150,  # log before the first epoch ends, avoiding "No log" in the first epoch's row
    # In the recorded run the validation loss keeps rising while the training
    # loss falls, so the last epoch is not the best model. Reload the
    # checkpoint with the highest validation accuracy once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [12]:
# Human-readable label name -> integer class id used for the model config
# and for converting pipeline predictions back to ids.
target_map = {"entailed": 0, "not_entailed": 1}

In [13]:
# Start from the configuration of the pretrained checkpoint.
config = AutoConfig.from_pretrained(checkpoint)

In [14]:
# Inverse of target_map: integer class id -> label name.
config.id2label = {class_id: label_name for label_name, class_id in target_map.items()}

In [15]:
# Label name -> integer class id (target_map is already in that direction).
config.label2id = target_map

In [16]:
# Load the pretrained weights into a sequence-classification model that uses
# the customised config (and therefore the label names set above).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Print the layer tree and parameter counts of the model.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 65,783,042
Trainable params: 65,783,042
Non-trainable params: 0

In [18]:
# Metric object for the GLUE "rte" task; used by compute_metrics below.
metric = load_metric('glue', 'rte')

In [19]:
def compute_metrics(logit_and_labels):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        logit_and_labels: two-element sequence ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions.
    """
    scores, gold_labels = logit_and_labels
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

In [20]:
# Wire model, hyper-parameters, data splits, tokenizer and metric function
# together; every argument is passed by keyword so its role is explicit.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
# Run fine-tuning; the per-epoch table below comes from the evaluation
# performed at the end of each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6943,0.687087,0.548736
2,0.6503,0.723308,0.545126
3,0.4367,0.839845,0.577617
4,0.2163,1.520054,0.599278
5,0.1134,1.831276,0.581227


TrainOutput(global_step=780, training_loss=0.40945693896366997, metrics={'train_runtime': 350.5, 'train_samples_per_second': 35.521, 'train_steps_per_second': 2.225, 'total_flos': 542121276647352.0, 'train_loss': 0.40945693896366997, 'epoch': 5.0})

In [36]:
# Build an inference pipeline from a checkpoint saved during training.
# - The path is derived from training_args.output_dir so it cannot drift from
#   the directory the Trainer wrote to. checkpoint-156 is the end of epoch 1
#   in the recorded run (780 total steps over 5 epochs).
# - The device comes from training_args rather than a hard-coded GPU index 0,
#   so the cell also runs on machines without a CUDA device.
new_cls = pipeline(
    "text-classification",
    model=f"{training_args.output_dir}/checkpoint-156",
    device=training_args.device,
)

In [37]:
# Untokenized test split; the pipeline does its own tokenization.
test_data = raw_datasets["test"]

In [38]:
# Inspect the first test example.
test_data[0]

{'sentence1': "Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case.",
 'sentence2': 'Shukla is related to Mangla.',
 'label': -1,
 'idx': 0}

In [39]:
# Pair up the two sentence columns in the {"text", "text_pair"} form that the
# text-classification pipeline accepts for sentence-pair inputs.
test_input = [
    {"text": first, "text_pair": second}
    for first, second in zip(test_data["sentence1"], test_data["sentence2"])
]

In [40]:
# Classify every test pair. truncation=True matches the tokenization used for
# training (tokenize_fn), so a pair longer than the model's maximum input
# length is cut instead of failing at inference time.
predictions = new_cls(test_input, truncation=True)

In [41]:
# Peek at the first ten predictions (label name plus score).
predictions[:10]

[{'label': 'entailed', 'score': 0.516261875629425},
 {'label': 'entailed', 'score': 0.5346696376800537},
 {'label': 'entailed', 'score': 0.5143323540687561},
 {'label': 'entailed', 'score': 0.5035443305969238},
 {'label': 'entailed', 'score': 0.5439093112945557},
 {'label': 'not_entailed', 'score': 0.5467738509178162},
 {'label': 'entailed', 'score': 0.5089209675788879},
 {'label': 'not_entailed', 'score': 0.5052562952041626},
 {'label': 'entailed', 'score': 0.5177659392356873},
 {'label': 'entailed', 'score': 0.557256281375885}]

In [42]:
# Convert each predicted label name back to its integer class id.
predictions_idx = [target_map[prediction["label"]] for prediction in predictions]

In [43]:
# Ad-hoc check of the classifier on a hand-written sentence pair.
new_cls({"text": "I just bought a car" , "text_pair": "I have a dog"})

{'label': 'entailed', 'score': 0.5041528344154358}

In [44]:
# Inspect the test example at index 2.
test_data[2]

{'sentence1': 'A mercenary group faithful to the warmongering policy of former Somozist colonel Enrique Bermudez attacked an IFA truck belonging to the interior ministry at 0900 on 26 March in El Jicote, wounded and killed an interior ministry worker and wounded five others.',
 'sentence2': 'An interior ministry worker was killed by a mercenary group.',
 'label': -1,
 'idx': 2}