# Libraries

In [1]:
import os
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import T5Tokenizer
from tqdm.auto import tqdm
from datasets import load_from_disk

# Data load and preprocessing

## Data load

### Loading the document scores

In [2]:
# Locations of the precomputed per-document score arrays (one file per split)
train_path = "/kaggle/input/train-document-score/xenc_scores_train-stsb-distilroberta-base.npy"
test_path = "/kaggle/input/test-score-npy/xenc_scores_test-stsb-distilroberta-base.npy"

# NOTE(review): allow_pickle=True unpickles arbitrary objects stored in the file;
# keep it only for score files you produced yourself.
document_score_train, document_score_test = (
    np.load(score_file, allow_pickle=True) for score_file in (train_path, test_path)
)

### Loading the Dataset

In [3]:
# Load the "train" split of the RAG-Instruct corpus
dataset = load_dataset("FreedomIntelligence/RAG-Instruct", split="train")

# Hold out 20% of the rows for testing; the fixed seed keeps the partition stable
train_test_dataset = dataset.train_test_split(seed=42, test_size=0.2)
train_dataset, test_dataset = (
    train_test_dataset[split_name] for split_name in ("train", "test")
)

print(train_dataset, test_dataset, sep="\n")

README.md:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

rag_instruct.json:   0%|          | 0.00/296M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40541 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 32432
})
Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 8109
})


## Preprocessing function

In [4]:
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer

# Module-level tokenizer loaded from the "t5-base" checkpoint; `preprocess`
# reads this global for both the prompt and the target text.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

def preprocess(example, scores, idx, k=3):
    """Tokenize one question together with its k best-scoring documents.

    Args:
        example: Mapping with 'question', 'answer' and 'documents' entries;
            'documents' is indexed by position.
        scores: Sequence indexed by example position. ``scores[idx]`` holds
            one score per document of the example; higher scores rank first.
        idx: Position of ``example`` inside ``scores``.
        k: Maximum number of documents to put into the context.

    Returns:
        The tokenizer output for the prompt (input truncated to 1024 tokens)
        with a 'labels' entry holding the tokenized answer (truncated to 256
        tokens). When ``idx`` has no score row, a dict with empty
        'input_ids', 'attention_mask' and 'labels' lists is returned instead.
    """
    if idx >= len(scores):
        # No score row for this example: emit an empty record for the caller to drop.
        return {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }

    question = example['question']
    answer = example['answer']
    all_documents = example['documents']

    # Flatten BEFORE ranking. Sorting a (n, 1) column would return only zeros and
    # slicing a (1, n) row with [:k] would keep every document, so `k` is only
    # honoured once the scores are one-dimensional.
    score = np.asarray(scores[idx]).ravel()
    top_k_indices = [int(i) for i in np.argsort(-score)[:k]]

    selected_docs = [all_documents[i] for i in top_k_indices]

    input_text = f"question: {question} context: {' '.join(selected_docs)}"
    target_text = answer

    model_inputs = tokenizer(input_text, max_length=1024, truncation=True)
    labels = tokenizer(target_text, max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # Fall back to an all-ones mask if the tokenizer did not provide one.
    model_inputs["attention_mask"] = model_inputs.get("attention_mask", [1] * len(model_inputs["input_ids"]))

    return model_inputs

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


### Class to apply the preprocess function

In [5]:
class PreprocessorWithScores:
    """Picklable callable for ``Dataset.map(..., with_indices=True)``.

    Binds a score table and ``k`` so that ``preprocess`` can be applied to
    rows by position. Works for both calling conventions of ``map``:

    * single row  -- ``examples`` is one row and ``indices`` is an ``int``;
      the row's preprocessed fields are returned as-is.
    * batch       -- ``examples`` maps column names to lists and ``indices``
      is a sequence; each returned field is a list with one entry per row.
    """

    def __init__(self, scores, k=3):
        """Store the score table (indexed by row position) and the top-k size."""
        self.scores = scores
        self.k = k

    def __call__(self, examples, indices):
        """Run ``preprocess`` on one row or on every row of a batch."""
        if isinstance(indices, int):
            # Single-row call: `examples` already IS the row. Indexing its
            # fields with [0] would keep only the first character of the
            # question/answer and the first document, so pass it through whole.
            output = preprocess(examples, self.scores, indices, self.k)
            return {key: output[key] for key in output}

        outputs = []
        for i, idx in enumerate(indices):
            # Rebuild row i from the column-oriented batch.
            example = {key: examples[key][i] for key in examples}
            outputs.append(preprocess(example, self.scores, idx, self.k))

        if not outputs:
            # Empty batch: return empty columns instead of failing on outputs[0].
            return {"input_ids": [], "attention_mask": [], "labels": []}

        return {key: [output[key] for output in outputs] for key in outputs[0]}

## Data preprocessing

### Preprocess dataset

In [10]:
# Number of top-scoring documents placed in each prompt
K = 2
dataset_path_K3 = "/kaggle/working/tokenized_data_K3"
dataset_path_K2 = "/kaggle/working/tokenized_data_K2"
preprocessor_train = PreprocessorWithScores(document_score_train, k=K)
preprocessor_test = PreprocessorWithScores(document_score_test, k=K)

# batched=True is required: PreprocessorWithScores rebuilds each row from a
# column-oriented batch (examples[key][i]). Without it, map() hands over a single
# row and that indexing keeps only the first character of the question/answer.
# Rows without a score row come back with empty fields and are filtered out.
tokenized_train = train_dataset.map(
    preprocessor_train, with_indices=True, batched=True, num_proc=4
).filter(lambda example: len(example['input_ids']) > 0)
tokenized_test = test_dataset.map(
    preprocessor_test, with_indices=True, batched=True, num_proc=4
).filter(lambda example: len(example['input_ids']) > 0)

Map (num_proc=4):   0%|          | 0/32432 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32432 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8109 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8109 [00:00<?, ? examples/s]

### Save tokenized datasets to disk

In [11]:
def save_dataset(dataset, prefix, save_dir="/kaggle/working/tokenized_data"):
    """Persist `dataset` under `<save_dir>/<prefix>`.

    Creates `save_dir` if needed, calls `dataset.save_to_disk` on the
    resulting path and prints where the data was written.
    """
    os.makedirs(save_dir, exist_ok=True)
    target = os.path.join(save_dir, format(prefix))
    dataset.save_to_disk(target)
    print(f"Dataset salvato in {target}")

# Pick the output folder for the current K. K=2 and K=3 keep their predefined
# locations; any other K gets a folder following the same naming scheme instead
# of an empty path (which would make os.makedirs fail inside save_dataset).
data_path = {
    3: dataset_path_K3,
    2: dataset_path_K2,
}.get(K, f"/kaggle/working/tokenized_data_K{K}")

save_dataset(tokenized_train, 'train', data_path)
save_dataset(tokenized_test, 'test', data_path)

Saving the dataset (0/1 shards):   0%|          | 0/32432 [00:00<?, ? examples/s]

Dataset salvato in /kaggle/working/tokenized_data_K2/train


Saving the dataset (0/1 shards):   0%|          | 0/8109 [00:00<?, ? examples/s]

Dataset salvato in /kaggle/working/tokenized_data_K2/test


### Reload tokenized datasets

In [13]:
def load_dataset(prefix, save_dir="/kaggle/working/tokenized_data"):
    """Read back the dataset stored at `<save_dir>/<prefix>` and return it.

    NOTE(review): this definition rebinds the name `load_dataset` that the
    imports cell takes from `datasets`; once this cell has run, re-running the
    hub-loading cell calls this function instead. Consider renaming it.
    """
    location = os.path.join(save_dir, prefix)
    restored = load_from_disk(location)
    print(f"Dataset {prefix} caricato da {location}")
    return restored

# Round-trip check: reload both splits from the folder they were saved to
loaded_train_dataset = load_dataset('train', data_path)
loaded_test_dataset = load_dataset('test', data_path)

# Show the column names of the first row and the row count of each split
print(loaded_train_dataset[0].keys(), len(loaded_train_dataset), sep="\n")
print(loaded_test_dataset[0].keys(), len(loaded_test_dataset), sep="\n")

Dataset train caricato da /kaggle/working/tokenized_data_K2/train
Dataset test caricato da /kaggle/working/tokenized_data_K2/test
dict_keys(['question', 'answer', 'documents', 'input_ids', 'attention_mask', 'labels'])
32432
dict_keys(['question', 'answer', 'documents', 'input_ids', 'attention_mask', 'labels'])
8109


# Training

## Download and save the models

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

def download_and_save_model(model_name, save_dir="/kaggle/working/models"):
    """Fetch a seq2seq checkpoint and its tokenizer and store both locally.

    The files go to `<save_dir>/<model_name>` with every "/" in the model
    name replaced by "_". Progress is reported with print statements.
    """
    folder_name = model_name.replace("/", "_")
    model_path = os.path.join(save_dir, folder_name)
    # Creating the leaf directory also creates `save_dir` when it is missing.
    os.makedirs(model_path, exist_ok=True)

    print(f"Downloading {model_name}...")
    pretrained_tokenizer = AutoTokenizer.from_pretrained(model_name)
    pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    print(f"Saving {model_name} to {model_path}...")
    for artifact in (pretrained_tokenizer, pretrained_model):
        artifact.save_pretrained(model_path)

    print(f"{model_name} saved at {model_path}")

# Download one checkpoint at a time.
# Alternative checkpoint: download_and_save_model("t5-small")
download_and_save_model(model_name="google/flan-t5-small")

Downloading google/flan-t5-small...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Saving google/flan-t5-small to /kaggle/working/models/google_flan-t5-small...
google/flan-t5-small saved at /kaggle/working/models/google_flan-t5-small
