# Libraries

In [2]:
import os
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import T5Tokenizer
from tqdm.auto import tqdm

# Data load and preprocessing

## Data load

### Loading the document scores

In [3]:
# Locations of the precomputed per-document score arrays (Kaggle input datasets).
# The file names suggest cross-encoder scores from stsb-distilroberta-base -- TODO confirm.
train_path = "/kaggle/input/train-document-score/xenc_scores_train-stsb-distilroberta-base.npy"
test_path = "/kaggle/input/test-score-npy/xenc_scores_test-stsb-distilroberta-base.npy"

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code from the file. Only keep it for score files from a trusted source.
document_score_train, document_score_test = (
    np.load(score_file, allow_pickle=True) for score_file in (train_path, test_path)
)

### Loading the Dataset

In [4]:
# Fetch the complete RAG-Instruct corpus (it is requested as a single "train" split).
dataset = load_dataset("FreedomIntelligence/RAG-Instruct", split="train")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = train_test_dataset['train'], train_test_dataset['test']

# Show the schema and row count of each split.
print(train_dataset, test_dataset, sep="\n")

README.md:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

rag_instruct.json:   0%|          | 0.00/296M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40541 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 32432
})
Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 8109
})


## Preprocessing function

In [5]:
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer

# Module-level T5 tokenizer loaded from the 't5-base' checkpoint; `preprocess`
# reads this global to encode both the model input and the target answer.
tokenizer = T5Tokenizer.from_pretrained('t5-base')

def preprocess(example, scores, idx, k=3):
    """Tokenize one example using its k highest-scoring documents as context.

    Args:
        example: Mapping with a 'question' string, an 'answer' string and a
            'documents' sequence of strings.
        scores: Indexable collection where ``scores[idx]`` holds the scores of
            the documents of the example at position ``idx``. It is assumed
            that position j scores ``documents[j]`` -- TODO confirm the score
            files follow the order of the split datasets.
        idx: Position of ``example`` inside ``scores``.
        k: Number of top-scoring documents concatenated into the context.

    Returns:
        The tokenizer output for the prompt
        ``"question: <question> context: <docs>"`` with an added ``labels``
        entry holding the token ids of the answer. When ``idx`` has no entry in
        ``scores`` a dict with empty ``input_ids``, ``attention_mask`` and
        ``labels`` lists is returned so the caller can filter the row out.
    """
    if idx >= len(scores):
        # No score row for this example: emit an empty placeholder.
        return {
            "input_ids": [],
            "attention_mask": [],
            "labels": []
        }

    question = example['question']
    answer = example['answer']
    all_documents = example['documents']

    # Flatten BEFORE ranking. argsort works along the last axis, so a column
    # vector (n, 1) would rank every row against itself (always index 0) and a
    # row vector (1, n) would ignore the [:k] cut. 1-D rows behave as before.
    score = np.asarray(scores[idx]).ravel()
    top_k_indices = [int(i) for i in np.argsort(-score)[:k]]

    selected_docs = [all_documents[i] for i in top_k_indices]

    input_text = f"question: {question} context: {' '.join(selected_docs)}"
    target_text = answer

    # Inputs are truncated to 1024 tokens, targets to 256 tokens.
    model_inputs = tokenizer(input_text, max_length=1024, truncation=True)
    labels = tokenizer(target_text, max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    # Fall back to an all-ones mask if the tokenizer did not provide one.
    model_inputs["attention_mask"] = model_inputs.get("attention_mask", [1] * len(model_inputs["input_ids"]))

    return model_inputs




### Class to apply the preprocess function

In [6]:
class PreprocessorWithScores:
    """Callable that binds a score table and ``k`` to ``preprocess``.

    Instances are meant to be passed to ``Dataset.map(..., with_indices=True)``
    and work in both map modes:

    * non-batched: called with one example (mapping of field -> value) and an
      integer index; returns the ``preprocess`` result for that example.
    * batched: called with a mapping of field -> list of values and a list of
      indices; returns a mapping of output field -> list of per-example values.
    """

    def __init__(self, scores, k=3):
        """Store the per-example score table and the number of documents to keep."""
        self.scores = scores
        self.k = k

    def __call__(self, examples, indices):
        """Apply ``preprocess`` to a single example or to a batch of examples."""
        # Single example: the fields are scalars (strings / one list of docs),
        # so they must NOT be indexed with [i]; doing so would take single
        # characters of the question/answer and treat one document as the list.
        if isinstance(indices, (int, np.integer)):
            example = {key: examples[key] for key in examples}
            return dict(preprocess(example, self.scores, int(indices), self.k))

        # Batch: rebuild each row from the column lists and process it.
        outputs = [
            preprocess({key: examples[key][i] for key in examples}, self.scores, idx, self.k)
            for i, idx in enumerate(indices)
        ]

        if not outputs:
            # Empty batch: return empty columns instead of failing on outputs[0].
            return {"input_ids": [], "attention_mask": [], "labels": []}

        return {key: [output[key] for output in outputs] for key in outputs[0]}

## Data preprocessing

### Preprocess dataset

In [8]:
# Number of top-scoring documents concatenated into each prompt.
K = 3
preprocessor_train = PreprocessorWithScores(document_score_train, k=K)
preprocessor_test = PreprocessorWithScores(document_score_test, k=K)

# Run the map in batched mode so the preprocessor receives column lists plus a
# list of indices, which is the layout its per-row unpacking (examples[key][i])
# is written for. Each row then gets a flat `input_ids` list, so the filter
# below can actually drop rows that had no score entry (empty `input_ids`).
tokenized_train = train_dataset.map(preprocessor_train, with_indices=True, batched=True, num_proc=4).filter(lambda example: len(example['input_ids']) > 0)
tokenized_test = test_dataset.map(preprocessor_test, with_indices=True, batched=True, num_proc=4).filter(lambda example: len(example['input_ids']) > 0)

Map (num_proc=4):   0%|          | 0/32432 [00:00<?, ? examples/s]

Filter:   0%|          | 0/32432 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8109 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8109 [00:00<?, ? examples/s]

### Save tokenized datasets in batches

In [10]:
def save_batches(dataset, prefix, batch_size=1000, save_dir="/kaggle/working/tokenized_data"):
    """Write ``dataset`` to disk as consecutive chunks of ``batch_size`` rows.

    Chunk number ``i`` is stored with ``save_to_disk`` under
    ``<save_dir>/<prefix>_batch_<i>.arrow``; ``save_dir`` is created when it
    does not exist. The last chunk may hold fewer rows. Returns nothing.
    """
    os.makedirs(save_dir, exist_ok=True)

    total_rows = len(dataset)
    # Ceiling division: a partial trailing chunk still counts as a batch.
    batch_count = (total_rows + batch_size - 1) // batch_size

    for batch_no in tqdm(range(batch_count), desc=f"Saving {prefix} batches"):
        first_row = batch_no * batch_size
        end_row = min((batch_no + 1) * batch_size, total_rows)
        chunk = dataset.select(range(first_row, end_row))
        chunk.save_to_disk(os.path.join(save_dir, f"{prefix}_batch_{batch_no}.arrow"))

# Persist both tokenized splits in chunks of 1000 rows under the default save_dir.
save_batches(dataset=tokenized_train, prefix='train', batch_size=1000)
save_batches(dataset=tokenized_test, prefix='test', batch_size=1000)

Saving train batches:   0%|          | 0/33 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/432 [00:00<?, ? examples/s]

Saving test batches:   0%|          | 0/9 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/109 [00:00<?, ? examples/s]

### Reload batches

In [11]:
def load_batches(prefix, save_dir="/kaggle/working/tokenized_data"):
    """Reload every saved batch whose name starts with ``prefix`` and merge them.

    Args:
        prefix: Leading part of the batch names to load (e.g. ``'train'``).
        save_dir: Directory that contains the saved batches.

    Returns:
        A single ``Dataset`` built from the columns of all matching batches,
        concatenated in ascending batch-number order.

    Raises:
        FileNotFoundError: If ``save_dir`` holds no entry starting with ``prefix``.
    """
    import itertools
    import re

    def _natural_key(name):
        # re.split with a capture group alternates text / digit runs, so odd
        # positions are always numbers. Comparing them as ints makes
        # "..._batch_10" sort after "..._batch_2" (plain string sort does not).
        return [int(tok) if pos % 2 else tok
                for pos, tok in enumerate(re.split(r"(\d+)", name))]

    batch_names = sorted(
        (f for f in os.listdir(save_dir) if f.startswith(prefix)),
        key=_natural_key,
    )
    if not batch_names:
        raise FileNotFoundError(
            f"No saved batches starting with {prefix!r} found in {save_dir!r}"
        )

    datasets_list = [Dataset.load_from_disk(os.path.join(save_dir, name))
                     for name in batch_names]

    # chain.from_iterable concatenates each column in linear time; summing
    # lists with sum(..., []) re-copies the accumulated rows for every batch.
    concatenated_dataset = Dataset.from_dict({
        k: list(itertools.chain.from_iterable(ds[k] for ds in datasets_list))
        for k in datasets_list[0].features
    })
    return concatenated_dataset

# Rebuild both splits from the batches written to disk.
loaded_train_dataset = load_batches(prefix='train')
loaded_test_dataset = load_batches(prefix='test')

# Sanity check: column names of the first row, then the row count, per split.
print(loaded_train_dataset[0].keys(), len(loaded_train_dataset), sep="\n")

print(loaded_test_dataset[0].keys(), len(loaded_test_dataset), sep="\n")

dict_keys(['question', 'answer', 'documents', 'input_ids', 'attention_mask', 'labels'])
32432
