# Libraries

In [None]:
# 📦 Library installation

# May cause trouble depending on the permissions on your OS
%pip install numpy transformers datasets accelerate peft --user

# ⚙️ Library import

import os
import numpy as np
from tqdm.auto import tqdm

# Datasets management
from datasets import load_dataset, Dataset, DatasetDict, load_from_disk

# HuggingFace Transformers
from transformers import (
    T5Tokenizer,
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments
)

# PEFT (Parameter Efficient Fine Tuning - for LoRA)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from functools import partial

# Paths

In [None]:
# Output directory for the tokenized datasets; it is reassigned further down
# once the value of K is chosen.
data_path = ""

# Score files, one per split
train_score_path, test_score_path = (
    f"Scores/xenc_scores_{split}-stsb-distilroberta-base.npy"
    for split in ("train", "test")
)

# Tokenized, preprocessed data, one directory per value of K
dataset_path_K3, dataset_path_K2 = (
    f"Processed Data/tokenized_data_K={k}" for k in (3, 2)
)

# Local directories the models are saved to
model_path_t5, model_path_flant5 = "Models/t5_small", "Models/flan_t5_small"

# Data load and preprocessing

## Data load

### Loading the documents' score

In [None]:
# Load the score arrays of both splits from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so only load
# score files that come from a trusted source.
document_score_train, document_score_test = (
    np.load(score_file, allow_pickle=True)
    for score_file in (train_score_path, test_score_path)
)

# Quick sanity check: show the first score of the first example
print(document_score_train.tolist()[0][0])

### Loading the Dataset

In [None]:
# Fetch the complete dataset (its "train" split)
dataset = load_dataset("FreedomIntelligence/RAG-Instruct", split="train")

# Reproducible 80% / 20% train/test partition (fixed seed)
train_test_dataset = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, test_dataset = (
    train_test_dataset[split_name] for split_name in ("train", "test")
)

# Show the summary of both splits, one per line
print(train_dataset, test_dataset, sep="\n")

## Preprocessing function

In [None]:
def preprocess(example, scores, idx, tokenizer, k=3):
    """Build one tokenized seq2seq example from a question/answer/documents row.

    The ``k`` highest-scoring documents of the row are appended to the
    question to form the model input; the answer becomes the target.

    Args:
        example: Mapping with the keys ``'question'``, ``'answer'`` and
            ``'documents'`` (a sequence of strings).
        scores: Sequence indexed by example position; ``scores[idx]`` holds
            one score per document of that example. Any array-like shape is
            accepted (1-D, row vector, column vector, plain list).
        idx: Position of ``example`` inside ``scores``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True)``; its result
            must expose ``'input_ids'``, support ``.get`` and accept item
            assignment.
        k: Number of top-scoring documents to keep.

    Returns:
        The tokenizer output for the input text, extended with ``'labels'``
        (the target token ids) and guaranteed to contain
        ``'attention_mask'``. When ``idx`` has no entry in ``scores`` a dict
        with three empty lists is returned so the row can be filtered out.
    """
    if idx >= len(scores):
        return {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }

    question = example['question']
    #print(question)
    answer = example['answer']
    #print(answer)
    all_documents = example['documents']
    #print(all_documents)

    # Flatten BEFORE ranking: argsort works along the last axis while [:k]
    # slices the first one, so a (n, 1) or (1, n) score array would otherwise
    # yield wrong indices (or ignore k). np.asarray also accepts plain lists.
    score = np.asarray(scores[idx]).ravel()
    # Negate so that argsort (ascending) returns the highest scores first.
    top_k_indices = [int(i) for i in np.argsort(-score)[:k]]

    selected_docs = [all_documents[i] for i in top_k_indices]

    input_text = f"question: {question} context: {' '.join(selected_docs)}"
    #print(input_text)
    target_text = answer

    model_inputs = tokenizer(input_text, max_length=1024, truncation=True)
    labels = tokenizer(target_text, max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # Fall back to an all-ones mask if the tokenizer did not provide one.
    model_inputs["attention_mask"] = model_inputs.get("attention_mask", [1] * len(model_inputs["input_ids"]))

    return model_inputs

### Test the preprocessing function

In [None]:
# Smoke test on the first training row. Uncomment the print calls inside
# preprocess() if you need to debug it.
print(preprocess(train_dataset[0], document_score_train,0, T5Tokenizer.from_pretrained('t5-base')))

### Class to apply the preprocess function

In [None]:
class PreprocessorWithScores:
    """Callable that applies ``preprocess`` inside ``Dataset.map(..., with_indices=True)``.

    It bundles the score table, the number of documents to keep and the
    tokenizer, so ``map`` only has to pass the row(s) and their index/indices.
    """

    def __init__(self, scores, k=3, tokenizer_name='t5-base'):
        """Store the configuration and load the tokenizer.

        Args:
            scores: Per-example document scores, indexed by dataset position.
            k: Number of top-scoring documents kept for each example.
            tokenizer_name: Name or path given to
                ``T5Tokenizer.from_pretrained``.
        """
        self.scores = scores
        self.k = k
        self.tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

    def __call__(self, examples, indices):
        """Preprocess a single row or a batch of rows.

        Args:
            examples: One row (column -> value) when ``indices`` is an int,
                otherwise a batch (column -> list of values).
            indices: Dataset position of the row, or list of positions of the
                rows in the batch.

        Returns:
            Dict mapping every output key of ``preprocess`` to the list of
            per-row values. For a single row this is a one-element list
            (e.g. ``input_ids == [[...]]``); that shape is kept on purpose
            because existing consumers index it with ``[0]``.
        """
        if isinstance(indices, int):
            # Non-batched call: `examples` already is one row.
            rows = [{key: examples[key] for key in examples}]
            indices = [indices]
        else:
            # Batched call: take the i-th value of every column for the i-th
            # row, instead of handing the whole batch to preprocess.
            columns = list(examples)
            rows = [
                {key: examples[key][i] for key in columns}
                for i in range(len(indices))
            ]

        outputs = [
            preprocess(row, self.scores, idx, self.tokenizer, self.k)
            for row, idx in zip(rows, indices)
        ]

        if not outputs:
            # Empty batch: return empty columns rather than failing on outputs[0].
            return {"input_ids": [], "attention_mask": [], "labels": []}

        return {key: [output[key] for output in outputs] for key in outputs[0]}

## Data preprocessing

### Preprocess dataset

In [None]:
# Number of top-scoring documents concatenated into each model input
K = 2
preprocessor_train = PreprocessorWithScores(document_score_train, k=K)
preprocessor_test = PreprocessorWithScores(document_score_test, k=K)


def _has_tokens(example):
    """Return True when the row contains at least one input token.

    With a non-batched ``map`` the preprocessor stores every field wrapped in
    a one-element list, so a skipped row is ``[[]]`` and its outer length is
    1. Checking only the outer length would keep such rows, so the wrapper is
    unwrapped first. Flat (unwrapped) lists are handled as well.
    """
    ids = example['input_ids']
    if ids and isinstance(ids[0], (list, tuple)):
        ids = ids[0]
    return len(ids) > 0


# Tokenize every row, then drop the rows the preprocessor had to skip.
tokenized_train = train_dataset.map(preprocessor_train, with_indices=True, num_proc=1).filter(_has_tokens)
tokenized_test = test_dataset.map(preprocessor_test, with_indices=True, num_proc=1).filter(_has_tokens)

### Verify that the tokenization didn't corrupt the data

In [None]:
# Take the first tokenized training row; each stored field carries an extra
# outer list level, hence the [0].
example = tokenized_train[0]
input_ids, labels_ids = (example[field][0] for field in ('input_ids', 'labels'))
tokenizer = T5Tokenizer.from_pretrained('t5-base')

# Decode input and target back to text
input_text, target_text = (
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in (input_ids, labels_ids)
)

print("Input text:", input_text)
print("Target text:", target_text)

### Save tokenized datasets

In [None]:
def save_dataset(dataset, prefix, save_dir):
    """Write ``dataset`` to ``save_dir/prefix`` and report the location.

    Args:
        dataset: Object exposing ``save_to_disk(path)``.
        prefix: Name of the sub-directory the dataset is stored in.
        save_dir: Parent directory; created if it does not exist yet.
    """
    destination = os.path.join(save_dir, f"{prefix}")
    os.makedirs(save_dir, exist_ok=True)
    dataset.save_to_disk(destination)
    print(f"Dataset salvato in {destination}")

# Pick the output directory that matches K. Any other value used to leave
# data_path empty (or holding the value of a previous run), which would save
# the datasets to the wrong place; fail fast with a clear message instead.
_dataset_paths_by_k = {3: dataset_path_K3, 2: dataset_path_K2}
if K not in _dataset_paths_by_k:
    raise ValueError(
        f"No dataset path configured for K={K}; "
        f"supported values: {sorted(_dataset_paths_by_k)}"
    )
data_path = _dataset_paths_by_k[K]

save_dataset(tokenized_train, 'train', data_path)
save_dataset(tokenized_test, 'test', data_path)

print(tokenized_test)

## Download and save the models

In [None]:
def download_and_save_model(model_name, save_dir):
    """Download a seq2seq model with its tokenizer and store both locally.

    Both are written to ``save_dir/<model_name with '/' replaced by '_'>``.

    Args:
        model_name: Identifier passed to ``from_pretrained``.
        save_dir: Parent directory of the model folder; created if missing.
    """
    # Replace "/" so an "org/name" identifier maps to a single folder name.
    target_dir = os.path.join(save_dir, model_name.replace("/", "_"))
    for directory in (save_dir, target_dir):
        os.makedirs(directory, exist_ok=True)

    print(f"Downloading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    print(f"Saving {model_name} to {target_dir}...")
    # Tokenizer first, then the model weights, both into the same folder.
    for artifact in (tokenizer, model):
        artifact.save_pretrained(target_dir)

    print(f"{model_name} saved at {target_dir}")

# Download the models one at a time
for _model_name, _save_dir in (
    ("t5-small", model_path_t5),
    ("google/flan-t5-small", model_path_flant5),
):
    download_and_save_model(_model_name, _save_dir)