# Libraries

In [4]:
# 📦 Library installation

# May cause trouble depending on the permissions on your OS
# %pip install numpy transformers datasets accelerate peft evaluate --user

# ⚙️ Library import

import os
import numpy as np
from tqdm.auto import tqdm

# Datasets management
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk

# HuggingFace Transformers
from transformers import (
    T5Tokenizer,
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

# PEFT (Parameter Efficient Fine Tuning - for LoRA)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from functools import partial

from evaluate import load

# Paths

In [None]:
# Scratch path read by the loader cells; starts empty and is overwritten by
# whichever selection cell is run.
data_path = ""
# Root output folder; each selection cell appends one sub-folder to it.
result_path = "Results"

# Score arrays (.npy) for the train and test splits
train_score_path, test_score_path = (
    "Scores/xenc_scores_train-stsb-distilroberta-base.npy",
    "Scores/xenc_scores_test-stsb-distilroberta-base.npy",
)

# Tokenized, preprocessed datasets (one folder per value of K)
dataset_path_K2, dataset_path_K3 = (
    "Processed Data/tokenized_data_K=2",
    "Processed Data/tokenized_data_K=3",
)

# Model folders
model_path_t5, model_path_flant5 = "Models/t5_small", "Models/flan_t5_small"

# Sub-folder names appended to result_path by the selection cells
t5, flant5 = "t5_small", "flan_t5_small"
full_ft, LoRA_ft = "full_fine_tuning", "LoRA_fine_tuning"
k_2, k_3 = "K=2", "K=3"

# Pretraining configurations

## Model Selection

*Run only one of the model cells (`T5-small` or `FLAN-T5-small`, found further down in this notebook) before loading the model.*

## Load Model

In [None]:
# Load the tokenizer and the seq2seq model from the folder stored in data_path
# by the model-selection cell.
if not data_path:
    # data_path is initialised to "" in the Paths cell; fail early with a
    # message that points at the missing selection step.
    raise ValueError(
        "data_path is empty: run one of the model selection cells "
        "(T5-small or FLAN-T5-small) before loading the model."
    )

tokenizer = AutoTokenizer.from_pretrained(data_path)
model = AutoModelForSeq2SeqLM.from_pretrained(data_path)

## Trainer Selection

*Run only one cell depending on the type of training you want to perform*

### 1. Full fine tuning trainer

In [None]:
# Full fine-tuning trains the loaded model as-is, so nothing has to be wrapped
# here; only record the training type in the output path.
result_path += "/" + full_ft

### 2. LoRA fine-tuning

In [None]:
# LoRA adapter settings
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],   # modules that LoRA is applied to
    r=8,                         # rank dimension
    lora_alpha=16,               # scaling factor
    lora_dropout=0.1,
    bias="none",
)

# Wrap the loaded model with the LoRA configuration and record the training
# type in the output path.
model = get_peft_model(model, lora_config)
result_path = f"{result_path}/{LoRA_ft}"

### 1. T5-small

In [None]:
# Select T5-small: record the choice in the output path and point data_path at
# the model folder.
result_path = f"{result_path}/{t5}"
data_path = model_path_t5

### 2. FLAN-T5-small

In [None]:
# Select FLAN-T5-small: record the choice in the output path and point
# data_path at the model folder.
result_path = f"{result_path}/{flant5}"
data_path = model_path_flant5

## Dataset Selection

*Run only one cell depending on the dataset you want to load*

### 1. K = 2 dataset

In [None]:
# Select the K=2 tokenized dataset: record the choice in the output path and
# point data_path at the dataset folder.
result_path = f"{result_path}/{k_2}"
data_path = dataset_path_K2

### 2. K = 3 dataset

In [None]:
# Select the K=3 tokenized dataset: record the choice in the output path and
# point data_path at the dataset folder.
result_path = f"{result_path}/{k_3}"
data_path = dataset_path_K3

## Load Dataset

In [None]:
# Load the tokenized train/test splits from the folder chosen in the
# dataset-selection cell.
# datasets.load_dataset(path, name) treats its first argument as the dataset
# identifier, so load_dataset('train', data_path) would look for a dataset
# called "train" rather than read data_path. Data saved to disk is read back
# with load_from_disk instead.
# NOTE(review): assumes the splits live in <data_path>/train and
# <data_path>/test (the save_to_disk layout) — confirm against the
# preprocessing step.
loaded_train_dataset = load_from_disk(os.path.join(data_path, "train"))
loaded_test_dataset = load_from_disk(os.path.join(data_path, "test"))

# Quick sanity check: column names and number of examples per split
print(loaded_train_dataset[0].keys())
print(len(loaded_train_dataset))

print(loaded_test_dataset[0].keys())
print(len(loaded_test_dataset))

## Metric function

In [None]:
# To be customized with the metrics we want to report
def compute_metrics(eval_preds):
    """Compute the sacreBLEU score between generated and reference texts.

    Relies on the notebook-level ``tokenizer`` for decoding.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is used as the token ids.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the sacreBLEU result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a placeholder for ignored positions, not a real token id, so it
    # must be swapped for the pad id before decoding. The labels need this as
    # much as the predictions do.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU is shown as an example; swap in or add ROUGE (or others) as needed.
    metric = load("sacrebleu")
    # sacreBLEU expects a list of references per prediction.
    bleu = metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": bleu["score"]}

## Training Arguments

In [None]:
# Training configuration shared by every model / trainer / dataset combination.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir=result_path,                   # Save location
    evaluation_strategy="epoch",              # Evaluate at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,                       # Cap on the number of checkpoints kept on disk
    num_train_epochs=3,
    predict_with_generate=True,               # Use generate() during evaluation
    # Keep logs next to the run's outputs instead of a hard-coded absolute
    # Kaggle path, so the notebook also runs on other machines.
    logging_dir=os.path.join(result_path, "logs"),
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="epoch",                    # Same cadence as evaluation
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

## Create Trainer

In [None]:
# Assemble the trainer from the model, training arguments, datasets, tokenizer
# and metric function defined in the earlier cells.
# NOTE(review): the test split is passed as eval_dataset, so it is also what
# load_best_model_at_end selects checkpoints on — confirm this is intended.
# NOTE(review): no data_collator is passed; presumably the tokenized examples
# are already padded to a common length — TODO confirm.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=loaded_train_dataset,
    eval_dataset=loaded_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training

## Start Training

In [None]:
# Run the training loop configured in training_args.
trainer.train()