In [57]:
import torch

# --- Add near the beginning ---
if torch.cuda.is_available():
    print(f"CUDA is available. Using device: cuda:0")
    device = "cuda:0"
else:
    print("CUDA not available. Using device: cpu")
    device = "cpu"
# --- Use 'device' variable later ---

CUDA is available. Using device: cuda:0


In [None]:
from datasets import load_dataset

# -------------------------------
# 1. Load the dataset
# -------------------------------
# Assume ds is a dictionary-like dataset with splits "train" and "test".
# We use only the test split.
# Each instance in ds['test'] is assumed to have 'context', 'question', and 'answer'.
# Replace this with your actual data loading as needed.

# Load the dataset with test and train splits.
ds = load_dataset("neural-bridge/rag-dataset-12000")
n = 20  # Number of documents to select (adjust as needed)

# Define a function to compute the context length.
def add_context_length(example):
    # Calculate the length of the context (number of characters)
    example["context_length"] = len(example["context"])
    return example

# Add the new 'context_length' field to each example in the test split.
test_data = ds['test'].map(add_context_length)

# Sort the test split by the new context_length field in ascending order (shortest contexts first).
test_data = test_data.sort("context_length")

# Select the top n documents with the shortest contexts.
test_data = test_data.select(range(n))
print(f"Loaded test split with {len(test_data)} documents.")

Loaded test split with 19 documents.


In [59]:
from langchain.embeddings import HuggingFaceEmbeddings

# ------------------------------------
# 2. Prepare embeddings for retrieval
# ------------------------------------
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=embed_model_name)

# Extract all passages in one flat list
passages = []
for ex in test_data:
    passages.extend([ex["context"]])
    
# print(passages[0])

In [60]:
from langchain.vectorstores import FAISS

# -------------------------------------
# 3. Build a FAISS vectorstore (in-memory)
# -------------------------------------
# vectorstore = FAISS.from_documents(documents=all_splits, embedding=hf_embeddings)
vectorstore = FAISS.from_texts(texts=passages, embedding=hf_embeddings)

In [None]:
from transformers import pipeline, AutoTokenizer
from langchain.llms import HuggingFacePipeline
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from langchain.chains import RetrievalQA
import torch

# --------------------------------------
# 3.1 Create text-gen pipeline
# --------------------------------------
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer_id = model_id # Usually the same

# Load tokenizer explicitly first to set pad token
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Make pipeline
hf_gen = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer_id,
    torch_dtype=torch.float16,
    device=0,
    max_new_tokens=128,
    batch_size=20,
    do_sample=True,
)
hf_gen.tokenizer.pad_token = hf_gen.tokenizer.eos_token

# Wrap it in LangChain’s LLM interface
llm = HuggingFacePipeline(pipeline=hf_gen, model_kwargs={"stop": ["}\n```", "}\n\n", "}\nHere is"]})

# Wrap both in RAGAS wrappers (correct order now)
hf_wrapped = LangchainLLMWrapper(llm)
hf_embed_wrapped = LangchainEmbeddingsWrapper(hf_embeddings)

# Build retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# 4. Now create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",                # or "map_reduce" / "refine" if you prefer
    retriever=retriever,
    return_source_documents=True
)


Device set to use cuda:0


In [62]:
# --------------------------------------------------
# 3.2 Test it on the first user_input
# --------------------------------------------------
res = qa_chain({"query": test_data["question"][0]})
print("WHOLE RESPONSE:")
print(res)
print(res["result"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


WHOLE RESPONSE:
{'query': 'What is the latest version of Scorpion Solitaire?', 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nScorpion Solitaire\n- User\n- -\n-\n- Insufficient votes\n- Softonic\n- 6\n- Not bad\n- Not bad\n- Your rating:\n-\nYour rating has been saved\nOops, something's gone wrong. Try again.\n- License:\n- Free\n- Language:\n- OS:\n-\n- Latest version:\n- 1.1 17/02/06\n- Last month's downloads:\n- 41\n- -...\n- PocketBalone\n- Billiard Master\n- Chess\n- ...\n- 21\nScorpion Solitaire\nSoftonic - Top Downloads\nTop Downloads\n- Pocket Uno\nPlay the classic card game on your Pocket PC\n- Multiplayer Championship Poker -...\n- PocketBalone\n- Billiard Master\n- Trivial Pursuit\n- ...\n- 21\nScorpion Solitaire.\n\nQuestion: What is the latest version of Scorpion Solitaire?\nHelpful Answer: According to the Softonic website, the latest version o

In [63]:
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import NonLLMContextRecall
from ragas import SingleTurnSample

# --------------------------------------------------
# 4. Experiment with evaluation
# --------------------------------------------------

# ---------------------------------------------
# Context Precision Test on the first question
# ---------------------------------------------

# Instantiate an object, for some reason
context_precision = NonLLMContextPrecisionWithReference()

# THIS IS HOW YOU GET THE PAGE CONTENT FOR A RETRIEVED DOCUMENT, 0 HERE IS THE FIRST RESULT
print("RETREIVED CONTENT")
print(res["source_documents"][0].page_content) # this will need to be a list, in practice

# THIS IS HOW YOU GET THE SOURCE CONTEXT, 0 HERE IS THE FIRST DOCUMENT
print("SOURCE CONTENT")
print(test_data["context"][0]) # this will need to be a list, in practice

sample = SingleTurnSample(
    retrieved_contexts=[res["source_documents"][0].page_content], 
    reference_contexts=[test_data["context"][0]]
)
print("CONTEXT PRECISION")
print(await context_precision.single_turn_ascore(sample))

# HOLY SHIT IT WORKS!!!

# -------------------------------
# Context Recall Test
# -------------------------------

# stupid fucking object
context_recall = NonLLMContextRecall()
print("CONTEXT RECALL")
print(await context_recall.single_turn_ascore(sample))

# I'm actually tony stark btw... I think we can do answer relevancy and faithfulness later??? 
# Right now, scale up these two...


RETREIVED CONTENT
Scorpion Solitaire
- User
- -
-
- Insufficient votes
- Softonic
- 6
- Not bad
- Not bad
- Your rating:
-
Your rating has been saved
Oops, something's gone wrong. Try again.
- License:
- Free
- Language:
- OS:
-
- Latest version:
- 1.1 17/02/06
- Last month's downloads:
- 41
- -...
- PocketBalone
- Billiard Master
- Chess
- ...
- 21
Scorpion Solitaire
Softonic - Top Downloads
Top Downloads
- Pocket Uno
Play the classic card game on your Pocket PC
- Multiplayer Championship Poker -...
- PocketBalone
- Billiard Master
- Trivial Pursuit
- ...
- 21
Scorpion Solitaire.
SOURCE CONTENT
Scorpion Solitaire
- User
- -
-
- Insufficient votes
- Softonic
- 6
- Not bad
- Not bad
- Your rating:
-
Your rating has been saved
Oops, something's gone wrong. Try again.
- License:
- Free
- Language:
- OS:
-
- Latest version:
- 1.1 17/02/06
- Last month's downloads:
- 41
- -...
- PocketBalone
- Billiard Master
- Chess
- ...
- 21
Scorpion Solitaire
Softonic - Top Downloads
Top Downloads
- Poc

In [None]:
import asyncio
import concurrent.futures
from tqdm.auto import tqdm # Progress bar library
# -------------------------------------
# 5. Scale up!!!!
# -------------------------------------

# --- Worker Function ---
# This function processes a single entry from the test data
def process_entry(args):
    """
    Processes a single test data entry: runs QA, prepares sample, calculates RAGAS metrics.
    Args:
        args (tuple): A tuple containing (index, entry_dict).
    Returns:
        dict: A dictionary containing results or error information.
    """
    index, entry = args
    try:
        # 1. Run QA Chain (Synchronous I/O bound task)
        print(f"Starting QA for index: {index}") # Optional: Track start
        res = qa_chain({"query": entry["question"]})
        print(f"Finished QA for index: {index}") # Optional: Track end

        # 2. Extract Contexts
        retrieved_contexts = [doc.page_content for doc in res.get("source_documents", [])]

        # Ensure reference_contexts is a list of strings
        reference_contexts_raw = entry.get("context", []) # Get reference context(s)
        if isinstance(reference_contexts_raw, str):
            reference_contexts = [reference_contexts_raw]
        elif isinstance(reference_contexts_raw, list):
            reference_contexts = reference_contexts_raw
        else:
            print(f"Warning: Unexpected type for reference context at index {index}. Treating as empty list.")
            reference_contexts = []

        # 3. Prepare RAGAS Sample
        # Adjust fields based on the metrics you are using.
        # Here assuming only context metrics are needed.
        sample = SingleTurnSample(
            retrieved_contexts=retrieved_contexts,
            reference_contexts=reference_contexts
            # You might need question, answer, etc. for other RAGAS metrics
            # question=entry["question"],
            # answer=res.get("result", "")
        )

        # 4. Calculate Metrics (Run async metrics concurrently)
        async def calculate_metrics_async(sample_to_score):
            """Helper async function to gather metric scores."""
            print(f"Calculating metrics for index: {index}") # Optional
            # Add more metric tasks here if needed
            precision_task = context_precision.single_turn_ascore(sample_to_score)
            recall_task = context_recall.single_turn_ascore(sample_to_score)

            # Run concurrently and wait for results
            precision_score, recall_score = await asyncio.gather(
                precision_task,
                recall_task
                # Add other awaited tasks here
            )
            print(f"Finished metrics for index: {index}") # Optional
            return precision_score, recall_score

        # Use asyncio.run to execute the async helper from this sync thread
        precision, recall = asyncio.run(calculate_metrics_async(sample))

        # 5. Prepare result dictionary
        result_data = {
            "index": index,
            "question": entry["question"],
            "retrieved_contexts": retrieved_contexts,
            "reference_contexts": reference_contexts,
            "context_precision": precision,
            "context_recall": recall,
            "raw_qa_response": res # Optional: store full response
        }
        # Optional: print immediate results per sample
        # print(f"METRICS FOR INDEX {index}: CP={precision:.4f}, CR={recall:.4f}")
        return result_data

    except Exception as e:
        print(f"Error processing index {index} - Question: {entry.get('question', 'N/A')}: {e}")
        # Optionally log the full traceback
        # import traceback
        # traceback.print_exc()
        return {
            "index": index,
            "question": entry.get('question', 'N/A'),
            "error": str(e)
        }
        
# --- Main Execution ---
all_results = []
# Prepare arguments for mapping (index, entry) tuples
tasks = list(enumerate(test_data)) # Creates [(0, test_data[0]), (1, test_data[1]), ...]

# Use ThreadPoolExecutor for parallel execution
num_workers = 20
print(f"Starting evaluation for {len(tasks)} samples with {num_workers} workers...")

with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Use executor.map to process tasks in parallel.
    # Results will be yielded in the order tasks were submitted (or close to it).
    # Wrap with tqdm for a progress bar.
    results_iterator = list(tqdm(executor.map(process_entry, tasks), total=len(tasks), desc="Evaluating Samples"))

all_results = results_iterator # list() consumes the iterator and gathers all results

print("\n--- Evaluation Complete ---")

# --- Post-Processing (Example) ---
valid_results = [r for r in all_results if 'error' not in r]
errors = [r for r in all_results if 'error' in r]

if valid_results:
    avg_precision = sum(r['context_precision'] for r in valid_results) / len(valid_results)
    avg_recall = sum(r['context_recall'] for r in valid_results) / len(valid_results)
    print(f"\nProcessed {len(valid_results)} samples successfully.")
    print(f"Average Context Precision: {avg_precision:.4f}")
    print(f"Average Context Recall: {avg_recall:.4f}")
else:
    print("No samples processed successfully.")

if errors:
    print(f"\nEncountered {len(errors)} errors during processing.")
    # Optionally print details of errors
    # for error_result in errors:
    #     print(f"  Index {error_result['index']}: {error_result['error']}")

# Now `all_results` contains a list of dictionaries, each holding the
# computed metrics and other info for one entry from test_data, or error details.
# print(all_results[0]) # Example: Inspect the first result
    

Starting evaluation for 19 samples with 19 workers...
Starting QA for index: 0
Starting QA for index: 1
Starting QA for index: 2
Starting QA for index: 3
Starting QA for index: 4
Starting QA for index: 5
Starting QA for index: 6
Starting QA for index: 7
Starting QA for index: 8
Starting QA for index: 9
Starting QA for index: 10
Starting QA for index: 11
Starting QA for index: 12
Starting QA for index: 13
Starting QA for index: 14
Starting QA for index: 15
Starting QA for index: 16
Starting QA for index: 17
Starting QA for index: 18


Evaluating Samples:   0%|          | 0/19 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Finished QA for index: 12
Calculating metrics for index: 12
Finished metrics for index: 12
Finished QA for index: 2
Calculating metrics for index: 2
Finished metrics for index: 2
Finished QA for index: 9
Calculating metrics for index: 9
Finished metrics for index: 9
Finished QA for index: 10
Calculating metrics for index: 10
Finished metrics for index: 10
Finished QA for index: 18
Calculating metrics for index: 18
Finished metrics for index: 18
Finished QA for index: 11
Calculating metrics for index: 11
Finished metrics for index: 11
Finished QA for index: 7
Calculating metrics for index: 7
Finished metrics for index: 7
Finished QA for index: 17
Calculating metrics for index: 17
Finished metrics for index: 17
Finished QA for index: 1
Calculating metrics for index: 1
Finished metrics for index: 1
Finished QA for index: 14
Calculating metrics for index: 14
Finished metrics for index: 14
Finished QA for index: 5
Calculating metrics for index: 5
Finished metrics for index: 5
Finished QA fo