In [1]:
# Download the dataset from the Hub (cached locally) and build a smaller DatasetDict from it
from datasets import load_dataset, DatasetDict
# Full CNN/DailyMail corpus, configuration "3.0.0".
cnn_dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Keep only the first N examples of each split so experiments stay small.
small_cnn_dataset = DatasetDict({
    split_name: cnn_dataset[split_name].select(range(n_examples))
    for split_name, n_examples in (
        ("train", 5000),
        ("validation", 1000),
        ("test", 1000),
    )
})

In [2]:
import pandas as pd
# Take the train split (a Dataset)
train_split = small_cnn_dataset["train"]

# Convert it to a pandas DataFrame
df = train_split.to_pandas()

# Preview the first rows
print(df.head())

                                             article  \
0  LONDON, England (Reuters) -- Harry Potter star...   
1  Editor's note: In our Behind the Scenes series...   
2  MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...   
3  WASHINGTON (CNN) -- Doctors removed five small...   
4  (CNN)  -- The National Football League has ind...   

                                          highlights  \
0  Harry Potter star Daniel Radcliffe gets £20M f...   
1  Mentally ill inmates in Miami are housed on th...   
2  NEW: "I thought I was going to die," driver sa...   
3  Five small polyps found during procedure; "non...   
4  NEW: NFL chief, Atlanta Falcons owner critical...   

                                         id  
0  42c027e4ff9730fbb3de84c1af0d2c506e41c3e4  
1  ee8871b15c50d0db17b0179a6d2beab35065f1e9  
2  06352019a19ae31e527f37f7571c6dd7f0c5da37  
3  24521a2abb2e1f5e34e6824e0f9e56904a2b0e88  
4  7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a  


In [3]:
def preprocess(batch, tokenizer, prompt: str = "", max_input_len=512, max_target_len=128):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        batch: Mapping with an "article" and a "highlights" entry, each a
            list of strings of the same length.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            truncation=True)`` that returns a mapping containing
            "input_ids"; its ``pad_token_id`` attribute is also read.
        prompt: Text prepended verbatim to every article (empty by default).
        max_input_len: Truncation length passed to the tokenizer for articles.
        max_target_len: Truncation length passed to the tokenizer for summaries.

    Returns:
        The tokenizer output for the (prompted) articles with an extra
        "labels" entry: the summary token ids, where every id equal to
        ``tokenizer.pad_token_id`` is replaced by -100.
    """
    # Prepend the (possibly empty) prompt to every article.
    inputs = [f"{prompt}{doc}" for doc in batch["article"]]

    # Tokenize the model inputs.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_len,
        truncation=True
    )

    # Tokenize the targets (reference summaries).
    labels = tokenizer(
        batch["highlights"],
        max_length=max_target_len,
        truncation=True
    )

    # Padding positions in the labels are set to -100 so they are ignored in
    # the loss. The masked ids must be what is stored under "labels"; the
    # previous version built this list and then stored the unmasked ids.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(-100 if token_id == pad_id else token_id) for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


In [4]:
import evaluate 


# Load the "rouge" metric through the evaluate library; compute_metrics calls rouge_score.compute(...)
rouge_score = evaluate.load("rouge")

In [5]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainer
from nltk.tokenize import sent_tokenize
# Model configuration
model_checkpoint = "google-t5/t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize every split. The Hugging Face summarization guide prefixes T5
# inputs with "summarize: " so the model knows which task it is solving.
tokenized_dataset = small_cnn_dataset.map(
    lambda batch: preprocess(
        batch,
        tokenizer=tokenizer,
        prompt="summarize: ",
    ),
    batched=True
)

# Hyperparameters
batch_size = 2
num_train_epochs = 8
# logging_steps counts optimizer steps, not examples: dividing the number of
# training examples by the batch size logs the training loss once per epoch
# (the previous value, the raw example count, logged only every
# `batch_size` epochs).
logging_steps = max(1, len(tokenized_dataset["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir = f"{model_name}",
    eval_strategy = "epoch",
    learning_rate = 5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    # Evaluate on generated text so ROUGE can be computed in compute_metrics.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub = False,
    report_to="none"
)

# Data collator: builds the batches handed to the model from tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:

import numpy as np
def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of integer token-id arrays.
            Positions equal to -100 are treated as padding.

    Returns:
        Dict mapping each key returned by ``rouge_score.compute`` to its
        score multiplied by 100 and rounded to 4 decimals. When a score
        exposes a ``mid`` attribute (aggregate with low/mid/high), the value
        used is ``mid.fmeasure``; otherwise the score itself is used.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded, so map it back to the pad id. Labels always use
    # it for ignored positions; predictions may presumably contain it too when
    # batches of different lengths are concatenated, so both are handled.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode generated and reference summaries into text.
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # The score type depends on the metric version: either an aggregate with a
    # `.mid.fmeasure` field or a plain float. Use attribute detection; the
    # previous `'mid' in value` test raised TypeError on floats and tested
    # element membership (always False) on aggregates.
    percentages = {}
    for key, value in result.items():
        score = value.mid.fmeasure if hasattr(value, "mid") else value
        percentages[key] = round(score * 100, 4)
    return percentages

In [11]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on the trainer in recent transformers
    # releases; `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model.
trainer.train()

# Evaluate on the validation split; as the last expression of the cell the
# returned metrics are displayed.
trainer.evaluate()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import torch
# Report the installed PyTorch version and whether CUDA is available, one per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")


2.8.0+cu128
False
