In [1]:
import tensorflow as tf

from data.preprocess import preprocess_cnn_dailymail
from utils.helpers import create_tf_dataloader

# Preprocess the data
print("Caricamento e pre-elaborazione del dataset...")
preprocessed = preprocess_cnn_dailymail(fraction=0.25)
tokenized_dataset, tokenizer = preprocessed

# Build the DataLoader from the training split
print("Creazione del DataLoader...")
train_split = tokenized_dataset['train']
train_loader = create_tf_dataloader(train_split)

# Sanity-check the DataLoader: pull a single batch and report the tensor shapes
for batch in train_loader.take(1):
    features = batch[0]
    print("Batch di input:", features["input_ids"].shape)
    targets = batch[1]
    print("Batch di output:", targets.shape)


Caricamento e pre-elaborazione del dataset...
Creazione del DataLoader...
Batch di input: (32, 512)
Batch di output: (32, 150)


# Train Model

In [2]:
from data.preprocess import preprocess_cnn_dailymail
from utils.helpers import create_tf_dataloader
from model.encoder_decoder import train_model
from config import Config

# Load the training configuration (batch size, model dimensions, epochs, learning rate)
config = Config()

# Preprocess the data and batch the training split
print("Caricamento e pre-elaborazione del dataset...")
tokenized_dataset, tokenizer = preprocess_cnn_dailymail(fraction=0.25)
train_loader = create_tf_dataloader(tokenized_dataset['train'], batch_size=config.BATCH_SIZE)

# Train the model.
# The result is bound to `model` because the evaluation cells further down call
# `generate_summary(model, ...)` / `evaluate_model(model, ...)` and nothing else
# in the notebook defines that name.
# NOTE(review): assumes train_model returns the trained model — confirm against
# model/encoder_decoder.py.
print("Addestramento del modello...")
model = train_model(
    train_loader,
    vocab_size=len(tokenizer.vocab),
    embedding_dim=config.EMBEDDING_DIM,
    hidden_dim=config.HIDDEN_DIM,
    epochs=config.EPOCHS,
    learning_rate=config.LEARNING_RATE
)


Caricamento e pre-elaborazione del dataset...
Addestramento del modello...
Epoch 1/10
Batch 0: Loss = 10.377692222595215
Batch 10: Loss = 8.179736137390137
Batch 20: Loss = 4.484278678894043


KeyboardInterrupt: 

# Model Evaluation

In [None]:
from transformers import AutoTokenizer

# Use the test split of the tokenized dataset produced in the training cell.
# The loops below index every sample by "input_ids", so the tokenized data is
# what is needed here (the previous `dataset` name was never defined).
test_dataset = tokenized_dataset["test"]
test_loader = create_tf_dataloader(test_dataset, batch_size=config.BATCH_SIZE)

# Load the tokenizer used during training
# NOTE(review): assumes preprocess_cnn_dailymail also tokenizes with "t5-small" — confirm.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function that generates a summary
def generate_summary(model, input_ids, max_length=150):
    """Run the model on ``input_ids`` and decode the first sequence to text.

    Args:
        model: Object exposing ``predict(input_ids)``.
        input_ids: Batched token ids, passed unchanged to ``model.predict``.
        max_length: Maximum number of tokens kept from the prediction before
            decoding. ``None`` keeps every token.

    Returns:
        The decoded summary string for the first element of the batch, with
        special tokens skipped. Decoding uses the module-level ``tokenizer``.
    """
    # Imported locally so the function does not depend on another cell's imports.
    import numpy as np

    # Feed the input ids to the model to obtain the decoder output
    predictions = model.predict(input_ids)

    first = np.atleast_1d(np.asarray(predictions[0]))
    if first.ndim == 2:
        # NOTE(review): a 2-D result for one sample is treated as per-position
        # scores of shape (seq_len, vocab_size) and reduced greedily to token
        # ids — confirm this matches what the trained model emits.
        first = first.argmax(axis=-1)

    # Honour max_length (previously accepted but ignored) and hand the tokenizer
    # plain Python ints.
    token_ids = [int(token) for token in first[:max_length]]

    # Decode the generated tokens
    return tokenizer.decode(token_ids, skip_special_tokens=True)

# Print the article, the generated summary and the reference summary for each test sample
separator = "-" * 80
for sample in test_dataset:
    token_ids = sample["input_ids"]
    # Wrap the ids in a list so the tensor carries a leading batch dimension of 1
    input_ids = tf.convert_to_tensor([token_ids])
    generated_summary = generate_summary(model, input_ids)
    article_text = tokenizer.decode(token_ids, skip_special_tokens=True)
    print(f"Article: {article_text}")
    print(f"Generated Summary: {generated_summary}")
    print(f"Reference Summary: {sample['highlights']}")
    print(separator)


In [None]:
from rouge_score import rouge_scorer

# Initialise the ROUGE scorer for ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

# Compute the scores
def evaluate_model(model, test_dataset, tokenizer, max_length=150):
    """Average the ROUGE F-measures of generated summaries over a dataset.

    Args:
        model: Model forwarded to ``generate_summary``.
        test_dataset: Iterable of samples; each sample is indexed by
            ``"input_ids"`` (model input) and ``"highlights"`` (reference summary).
        tokenizer: Accepted for interface compatibility; not referenced in this
            function (``generate_summary`` decodes with the module-level tokenizer).
        max_length: Forwarded to ``generate_summary``.

    Returns:
        Dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` mapping to the
        mean F-measure across samples. Every value is ``0.0`` when
        ``test_dataset`` yields no samples.
    """
    rouge_scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for sample in test_dataset:
        # Wrap the ids in a list so the tensor carries a leading batch dimension of 1
        input_ids = tf.convert_to_tensor([sample["input_ids"]])
        generated_summary = generate_summary(model, input_ids, max_length)
        reference_summary = sample["highlights"]

        # Compute the ROUGE scores for this example (uses the module-level `scorer`)
        sample_scores = scorer.score(reference_summary, generated_summary)
        for key in sample_scores:
            rouge_scores[key].append(sample_scores[key].fmeasure)

    # Compute the means; an empty dataset yields 0.0 instead of a ZeroDivisionError
    return {
        key: (sum(values) / len(values) if values else 0.0)
        for key, values in rouge_scores.items()
    }

# Run the evaluation over the test set and report the averaged scores
scores = evaluate_model(
    model=model,
    test_dataset=test_dataset,
    tokenizer=tokenizer,
)
print(f"ROUGE scores: {scores}")


In [None]:
# Tokenize one free-text article, truncated to at most 512 tokens, as TensorFlow tensors
article = "Your article text here."
inputs = tokenizer(article, max_length=512, truncation=True, return_tensors="tf")
article_ids = inputs["input_ids"]

# Generate the summary
generated_summary = generate_summary(model, article_ids)
print(f"Generated Summary: {generated_summary}")
