## Install Dependencies

In [1]:
!pip install -q transformers accelerate bitsandbytes datasets sentence-transformers pandas tqdm
!pip install -q langchain langchain-community langchain-huggingface langchain-chroma chromadb


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m119.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m64.1 MB/s[0m eta [36m0:00

In [2]:

import locale
locale.getpreferredencoding = lambda: "UTF-8"

import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from datasets import load_dataset


In [3]:
RETRIEVAL_K = 3
EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2"
SAMPLES_FOR_DB = 1000
SAMPLES_FOR_TEST = 100
MODEL_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct"

print(f"Using model: {MODEL_ID}")



Using model: HuggingFaceTB/SmolLM2-1.7B-Instruct


## Evaluation Technique

In [4]:

class Evaluator:

    def normalize_text(text):
        text = str(text).lower().strip()
        return ' '.join(text.split())

    def is_correct(prediction, ground_truth):
        pred = Evaluator.normalize_text(prediction)
        truth = Evaluator.normalize_text(ground_truth)
        return truth in pred


## Preparing the dataset

In [5]:
#
def prepare_data(dataset, start_idx, count):
    data = []
    subset = dataset.select(range(start_idx, start_idx + count))

    for row in subset:
        answer = row['answer']['aliases'][0]
        snippets = row['search_results']['search_context']
        context = " ||| ".join(snippets) if snippets else ""

        data.append({
            "question": row['question'],
            "answer": answer,
            "context": context
        })

    return pd.DataFrame(data)


## Loading model

In [6]:

def load_model_and_tokenizer():
    print(f"Loading model...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    return model, tokenizer


In [7]:

def generate_answer(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.5,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if prompt in full_text:
        answer = full_text[len(prompt):].strip()
    else:
        answer = full_text.strip()

    return answer[:300].strip()



## Building Vector Database

In [8]:

def build_vector_database(train_df):

    documents = []
    for _, row in train_df.iterrows():
        snippets = str(row['context']).split(" ||| ")
        for snippet in snippets:
            if len(snippet) > 20:
                documents.append(Document(page_content=snippet[:800]))

    documents = documents[:3000]

    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        encode_kwargs={'normalize_embeddings': True}
    )

    vectordb = Chroma.from_documents(
        documents,
        embeddings,
        collection_name="trivia_qa"
    )

    return vectordb


In [9]:

def create_prompt(question, context):
    return f"{context[:300]}\n\nQ: {question}\nA:"


def run_evaluation(model, tokenizer, test_df, retriever):
    evaluator = Evaluator()
    results = []

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Evaluating"):
        question = row['question']
        ground_truth = row['answer']

        query = f"query: {question}"
        retrieved_docs = retriever.invoke(query)
        context = "\n".join([doc.page_content[:250] for doc in retrieved_docs])

        prompt = create_prompt(question, context)
        prediction = generate_answer(model, tokenizer, prompt)

        correct = evaluator.is_correct(prediction, ground_truth)

        results.append({
            "Question": question,
            "Ground_Truth": ground_truth,
            "Prediction": prediction,
            "Correct": correct
        })

    return pd.DataFrame(results)



## Evaluation

In [10]:


def main():
    print("RAG TriviaQA Evaluation\n")

    # Load dataset
    print("Loading dataset...")
    dataset = load_dataset("mandarjoshi/trivia_qa", "rc", split="train")

    train_df = prepare_data(dataset, 0, SAMPLES_FOR_DB)
    test_df = prepare_data(dataset, 0, SAMPLES_FOR_TEST)
    print(f"Prepared {len(train_df)} training and {len(test_df)} test samples\n")

    # Load model
    model, tokenizer = load_model_and_tokenizer()

    # Build retrieval system
    vectordb = build_vector_database(train_df)
    retriever = vectordb.as_retriever(search_kwargs={"k": RETRIEVAL_K})

    # Run evaluation
    print("\nStarting evaluation...")
    results_df = run_evaluation(model, tokenizer, test_df, retriever)

    # Calculate and display accuracy
    accuracy = results_df['Correct'].mean() * 100

    # Save results
    results_df.to_csv("rag_triviaqa_results.csv", index=False)

    print(f"\nFinal Accuracy: {accuracy:.2f}%")


if __name__ == "__main__":
    main()

RAG TriviaQA Evaluation

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/26 [00:00<?, ?files/s]

rc/train-00000-of-00026.parquet:   0%|          | 0.00/308M [00:00<?, ?B/s]

rc/train-00001-of-00026.parquet:   0%|          | 0.00/298M [00:00<?, ?B/s]

rc/train-00002-of-00026.parquet:   0%|          | 0.00/290M [00:00<?, ?B/s]

rc/train-00003-of-00026.parquet:   0%|          | 0.00/444M [00:00<?, ?B/s]

rc/train-00004-of-00026.parquet:   0%|          | 0.00/461M [00:00<?, ?B/s]

rc/train-00005-of-00026.parquet:   0%|          | 0.00/474M [00:00<?, ?B/s]

rc/train-00006-of-00026.parquet:   0%|          | 0.00/404M [00:00<?, ?B/s]

rc/train-00007-of-00026.parquet:   0%|          | 0.00/324M [00:00<?, ?B/s]

rc/train-00008-of-00026.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

rc/train-00009-of-00026.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

rc/train-00010-of-00026.parquet:   0%|          | 0.00/400M [00:00<?, ?B/s]

rc/train-00011-of-00026.parquet:   0%|          | 0.00/370M [00:00<?, ?B/s]

rc/train-00012-of-00026.parquet:   0%|          | 0.00/341M [00:00<?, ?B/s]

rc/train-00013-of-00026.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

rc/train-00014-of-00026.parquet:   0%|          | 0.00/310M [00:00<?, ?B/s]

rc/train-00015-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

rc/train-00016-of-00026.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

rc/train-00017-of-00026.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

rc/train-00018-of-00026.parquet:   0%|          | 0.00/200M [00:00<?, ?B/s]

rc/train-00019-of-00026.parquet:   0%|          | 0.00/180M [00:00<?, ?B/s]

rc/train-00020-of-00026.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

rc/train-00021-of-00026.parquet:   0%|          | 0.00/153M [00:00<?, ?B/s]

rc/train-00022-of-00026.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

rc/train-00023-of-00026.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

rc/train-00024-of-00026.parquet:   0%|          | 0.00/154M [00:00<?, ?B/s]

rc/train-00025-of-00026.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

rc/validation-00000-of-00004.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

rc/validation-00001-of-00004.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

rc/validation-00002-of-00004.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

rc/validation-00003-of-00004.parquet:   0%|          | 0.00/129M [00:00<?, ?B/s]

rc/test-00000-of-00004.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

rc/test-00001-of-00004.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

rc/test-00002-of-00004.parquet:   0%|          | 0.00/171M [00:00<?, ?B/s]

rc/test-00003-of-00004.parquet:   0%|          | 0.00/128M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Prepared 1000 training and 100 test samples

Loading model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Building vector database...


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]


Starting evaluation...


Evaluating: 100%|██████████| 100/100 [03:00<00:00,  1.81s/it]


Final Accuracy: 14.00%



