# Libraries

In [1]:
!pip install hnswlib
!pip install pyterrier
!pip install transformers datasets

from datasets import load_dataset
from transformers import T5Tokenizer
import pandas as pd
import random
from datasets import load_dataset
from matplotlib import pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder
import hnswlib
import pyterrier as pt
from tqdm import tqdm
from gensim.models.word2vec import Word2Vec

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp311-cp311-linux_x86_64.whl size=2389209 sha256=a93e5198cf67aec80a3e0a019313dd7ad6e039979ca791e95c5a4f96749990be
  Stored in directory: /root/.cache/pip/wheels/ea/4e/27/39aebca9958719776e36fada290845a7ef10f053ad70e22ceb
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0
Collecting pyterrier
  Downloading pyterrier-0.1.5-py2.py3-none-any.whl.metadata (9.3 kB)
Downloading pyterrier-0.1.5-py2.py3-none-any.whl (22 kB)
Installing collected packages: pyterrier
Successfully installed pyterrier-0.1.5
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fss

# Data load and preprocessing

## Data load

### Loading the documents' scores

In [2]:
# Locations of the pre-computed per-document score arrays (one file per split).
# NOTE(review): file names suggest cross-encoder scores from stsb-distilroberta-base — confirm.
train_path = "/kaggle/input/train-document-score/xenc_scores_train-stsb-distilroberta-base.npy"
test_path = "/kaggle/input/test-score-npy/xenc_scores_test-stsb-distilroberta-base.npy"

# Load both arrays in the same order as before: train first, then test.
document_score_train, document_score_test = (
    np.load(score_path) for score_path in (train_path, test_path)
)

### Loading the Dataset

In [3]:
# Fetch the full RAG-Instruct corpus (it only ships a "train" split).
dataset = load_dataset("FreedomIntelligence/RAG-Instruct", split="train")

# Hold out 20% of the examples for testing; the fixed seed keeps the split reproducible.
train_test_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = (
    train_test_dataset['train'],
    train_test_dataset['test'],
)

# Show the schema and size of each split.
for split in (train_dataset, test_dataset):
    print(split)

README.md:   0%|          | 0.00/2.64k [00:00<?, ?B/s]

rag_instruct.json:   0%|          | 0.00/296M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/40541 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 32432
})
Dataset({
    features: ['question', 'answer', 'documents'],
    num_rows: 8109
})


## Preprocessing function

In [4]:
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer

# Checkpoint whose vocabulary is used to tokenize both model inputs and targets.
T5_CHECKPOINT = 't5-base'
# Module-level tokenizer shared by preprocess().
tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)

def preprocess(dataset, scores, k=10, max_input_length=1024, max_target_length=256):
    """Tokenize question/context/answer examples into a Hugging Face ``Dataset``.

    For every example, the first ``k`` documents (in their stored order) are
    concatenated into a context string, the model input is formatted as
    ``"question: <question> context: <documents>"`` and both the input and the
    answer are tokenized with the module-level ``tokenizer``.

    Args:
        dataset: Object exposing the columns ``'question'``, ``'answer'`` and
            ``'documents'`` via ``dataset[column]`` (e.g. a ``Dataset`` or the
            dict of lists obtained by slicing one).
        scores: Per-example document scores. Accepted for interface
            compatibility but not used by this function.
            NOTE(review): presumably meant to rank documents before taking
            the top ``k`` — confirm the intended behavior.
        k: Maximum number of documents to include in each context.
        max_input_length: Truncation length (in tokens) for the model input.
        max_target_length: Truncation length (in tokens) for the answer.

    Returns:
        A ``Dataset`` with one row per example containing ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    # Read each column once. Indexing ``dataset[column]`` inside the loop can
    # re-materialize the whole column on every iteration for a Hugging Face Dataset.
    questions = dataset['question']
    answers = dataset['answer']
    documents_per_example = dataset['documents']

    # A non-positive k selects no documents (a negative slice bound would not).
    top_k = max(k, 0)

    tokenized_dataset = []
    for question, answer, all_documents in zip(questions, answers, documents_per_example):
        # Each selected document is prefixed with a single space.
        documents = "".join(' ' + document for document in all_documents[:top_k])

        # Format the input for the model
        input_text = f"question: {question} context:{documents}"

        # Tokenize input and target separately; the target ids become the labels
        model_inputs = tokenizer(input_text, max_length=max_input_length, truncation=True)
        labels = tokenizer(answer, max_length=max_target_length, truncation=True)
        model_inputs["labels"] = labels["input_ids"]

        tokenized_dataset.append(model_inputs)

    # Convert the list of per-example encodings into a Hugging Face Dataset
    return Dataset.from_list(tokenized_dataset)


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Data preprocessing

### Small example to test the function

In [5]:
# Number of documents to concatenate into each context.
K = 3

# Smoke test: run the preprocessing on the first ten training examples only.
temp_dataset = train_dataset[:10]
tokenized_temp_dataset = preprocess(temp_dataset, document_score_train, k=K)

# Show which fields each tokenized example carries.
first_example = tokenized_temp_dataset[0]
print(first_example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


### Preprocess dataset

In [None]:
# Number of documents to concatenate into each context.
K = 3

# Tokenize the full train split first, then the test split, each paired with its own scores.
tokenized_train_dataset, tokenized_test_dataset = (
    preprocess(split, split_scores, K)
    for split, split_scores in (
        (train_dataset, document_score_train),
        (test_dataset, document_score_test),
    )
)

### Print tokenized datasets' keys

In [None]:
# Inspect the fields of the full tokenized splits (not the ten-example smoke-test subset).
print(tokenized_train_dataset[0].keys())
print(tokenized_test_dataset[0].keys())

### Save tokenized datasets

In [None]:
from datasets import DatasetDict

# Output directories for the tokenized splits.
tokenized_train_save_path = '/kaggle/working/tokenized_train_dataset'
tokenized_test_save_path = '/kaggle/working/tokenized_test_dataset'

# Persist each split to disk (train first, then test) and report where it was written.
for tokenized_split, save_path in (
    (tokenized_train_dataset, tokenized_train_save_path),
    (tokenized_test_dataset, tokenized_test_save_path),
):
    tokenized_split.save_to_disk(save_path)
    print(f"Dataset salvato in: {save_path}")