In [1]:
import torch

# --- Add near the beginning ---
if torch.cuda.is_available():
    print(f"CUDA is available. Using device: cuda:0")
    device = "cuda:0"
else:
    print("CUDA not available. Using device: cpu")
    device = "cpu"
# --- Use 'device' variable later ---

CUDA is available. Using device: cuda:0


In [2]:
from datasets import load_dataset

# -------------------------------
# 1. Load the dataset
# -------------------------------
# Assume ds is a dictionary-like dataset with splits "train" and "test".
# We use only the test split.
# Each instance in ds['test'] is assumed to have 'context', 'question', and 'answer'.
# Replace this with your actual data loading as needed.

# Load the dataset with test and train splits.
ds = load_dataset("neural-bridge/rag-dataset-12000")
n = 1  # Number of documents to select (adjust as needed)

# Define a function to compute the context length.
def add_context_length(example):
    # Calculate the length of the context (number of characters)
    example["context_length"] = len(example["context"])
    return example

# Add the new 'context_length' field to each example in the test split.
test_data = ds['test'].map(add_context_length)

# Sort the test split by the new context_length field in ascending order (shortest contexts first).
test_data = test_data.sort("context_length")

# Select the top n documents with the shortest contexts.
test_data = test_data.select(range(n))
print(f"Loaded test split with {len(test_data)} documents.")

Loaded test split with 1 documents.


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings

# ------------------------------------
# 2. Prepare embeddings for retrieval
# ------------------------------------
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=embed_model_name)

# Extract all passages in one flat list
passages = []
for ex in test_data:
    passages.extend([ex["context"]])
    
# print(passages[0])

  hf_embeddings = HuggingFaceEmbeddings(model_name=embed_model_name)
2025-04-21 23:24:34.435391: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-21 23:24:34.502397: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-21 23:24:34.519003: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-21 23:24:34.523102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
20

In [4]:
from langchain.vectorstores import FAISS

# -------------------------------------
# 3. Build a FAISS vectorstore (in-memory)
# -------------------------------------
# vectorstore = FAISS.from_documents(documents=all_splits, embedding=hf_embeddings)
vectorstore = FAISS.from_texts(texts=passages, embedding=hf_embeddings)

In [63]:
from transformers import pipeline, AutoTokenizer
from langchain.llms import HuggingFacePipeline
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from langchain.chains import RetrievalQA
import torch

# --------------------------------------
# 3.1 Create text-gen pipeline
# --------------------------------------
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer_id = model_id # Usually the same

# Load tokenizer explicitly first to set pad token
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Make pipeline
hf_gen = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer_id,
    torch_dtype=torch.float16,
    device=0,
    max_new_tokens=1024,
    batch_size=20,
    temperature=0.01,
    do_sample=True,
)
hf_gen.tokenizer.pad_token = hf_gen.tokenizer.eos_token

# Wrap it in LangChain’s LLM interface
llm = HuggingFacePipeline(pipeline=hf_gen, model_kwargs={"stop": ["}\n```", "}\n\n", "}\nHere is"]})

# Wrap both in RAGAS wrappers (correct order now)
hf_wrapped = LangchainLLMWrapper(llm)
hf_embed_wrapped = LangchainEmbeddingsWrapper(hf_embeddings)

# Build retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

# 4. Now create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",                # or "map_reduce" / "refine" if you prefer
    retriever=retriever,
    return_source_documents=False
)


Device set to use cuda:0


In [68]:
# --------------------------------------------------
# 3.2 Test it on the first user_input
# --------------------------------------------------
res = qa_chain({"query": test_data["question"][0]})
print("WHOLE RESPONSE:")
print(res)
print(res["result"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


WHOLE RESPONSE:
{'query': 'What is the latest version of Scorpion Solitaire?', 'result': "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nScorpion Solitaire\n- User\n- -\n-\n- Insufficient votes\n- Softonic\n- 6\n- Not bad\n- Not bad\n- Your rating:\n-\nYour rating has been saved\nOops, something's gone wrong. Try again.\n- License:\n- Free\n- Language:\n- OS:\n-\n- Latest version:\n- 1.1 17/02/06\n- Last month's downloads:\n- 41\n- -...\n- PocketBalone\n- Billiard Master\n- Chess\n- ...\n- 21\nScorpion Solitaire\nSoftonic - Top Downloads\nTop Downloads\n- Pocket Uno\nPlay the classic card game on your Pocket PC\n- Multiplayer Championship Poker -...\n- PocketBalone\n- Billiard Master\n- Trivial Pursuit\n- ...\n- 21\nScorpion Solitaire.\n\nQuestion: What is the latest version of Scorpion Solitaire?\nHelpful Answer: According to the Softonic website, the latest version o

In [7]:
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics import NonLLMContextRecall
from ragas import SingleTurnSample

# --------------------------------------------------
# 4. Experiment with evaluation
# --------------------------------------------------

# ---------------------------------------------
# Context Precision Test on the first question
# ---------------------------------------------

# Instantiate an object, for some reason
context_precision = NonLLMContextPrecisionWithReference()

# THIS IS HOW YOU GET THE PAGE CONTENT FOR A RETRIEVED DOCUMENT, 0 HERE IS THE FIRST RESULT
print("RETREIVED CONTENT")
print(res["source_documents"][0].page_content) # this will need to be a list, in practice

# THIS IS HOW YOU GET THE SOURCE CONTEXT, 0 HERE IS THE FIRST DOCUMENT
print("SOURCE CONTENT")
print(test_data["context"][0]) # this will need to be a list, in practice

sample = SingleTurnSample(
    retrieved_contexts=[res["source_documents"][0].page_content], 
    reference_contexts=[test_data["context"][0]]
)
print("CONTEXT PRECISION")
print(await context_precision.single_turn_ascore(sample))

# HOLY SHIT IT WORKS!!!

# -------------------------------
# Context Recall Test
# -------------------------------

# stupid fucking object
context_recall = NonLLMContextRecall()
print("CONTEXT RECALL")
print(await context_recall.single_turn_ascore(sample))

# I'm actually tony stark btw... I think we can do answer relevancy and faithfulness later??? 
# Right now, scale up these two...


RETREIVED CONTENT
Scorpion Solitaire
- User
- -
-
- Insufficient votes
- Softonic
- 6
- Not bad
- Not bad
- Your rating:
-
Your rating has been saved
Oops, something's gone wrong. Try again.
- License:
- Free
- Language:
- OS:
-
- Latest version:
- 1.1 17/02/06
- Last month's downloads:
- 41
- -...
- PocketBalone
- Billiard Master
- Chess
- ...
- 21
Scorpion Solitaire
Softonic - Top Downloads
Top Downloads
- Pocket Uno
Play the classic card game on your Pocket PC
- Multiplayer Championship Poker -...
- PocketBalone
- Billiard Master
- Trivial Pursuit
- ...
- 21
Scorpion Solitaire.
SOURCE CONTENT
Scorpion Solitaire
- User
- -
-
- Insufficient votes
- Softonic
- 6
- Not bad
- Not bad
- Your rating:
-
Your rating has been saved
Oops, something's gone wrong. Try again.
- License:
- Free
- Language:
- OS:
-
- Latest version:
- 1.1 17/02/06
- Last month's downloads:
- 41
- -...
- PocketBalone
- Billiard Master
- Chess
- ...
- 21
Scorpion Solitaire
Softonic - Top Downloads
Top Downloads
- Poc

In [None]:
# --------------------------------------------------
# 4.1 Experiment with faithfulness
# --------------------------------------------------

# This prompt is pretty good for getting the number of claims
prompt_text = """
[SYSTEM]
    TEXT: Given a question, an answer, and sentences from the answer analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema and OpenAPI specification:
{'$defs': {'SentenceComponents': {'properties': {'sentence_index': {'description': 'The index of the sentence', 'title': 'Sentence Index', 'type': 'integer'}, 'simpler_statements': {'description': 'A list of simpler statements that can be directly inferred from the context', 'items': {'type': 'string'}, 'title': 'Simpler Statements', 'type': 'array'}}, 'required': ['sentence_index', 'simpler_statements'], 'title': 'SentenceComponents', 'type': 'object'}}, 'properties': {'sentences': {'description': 'A list of sentences and their simpler versions', 'items': {'$ref': '#/$defs/SentenceComponents'}, 'title': 'Sentences', 'type': 'array'}}, 'required': ['sentences'], 'title': 'SentencesSimplified', 'type': 'object'}
These are some examples to show how to perform the above instruction
Given a question, an answer, and sentences from the answer analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON.
input: {
    "question": "Who was Albert Einstein and what is he best known for?",
    "answer": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.",
    "sentences": {
        "0": "He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time.",
        "1": "He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics."
    }
}
output: {
    "sentences": [
        {
            "sentence_index": 0,
            "simpler_statements": [
                "Albert Einstein was a German-born theoretical physicist.",
                "Albert Einstein is recognized as one of the greatest and most influential physicists of all time."
            ]
        },
        {
            "sentence_index": 1,
            "simpler_statements": [
                "Albert Einstein was best known for developing the theory of relativity.",
                "Albert Einstein also made important contributions to the development of the theory of quantum mechanics."
            ]
        }
    ]
}
Now perform the above instruction with the following input
input: {
    "question": "When was the first super bowl?",
    "answer": "The first superbowl was held on Jan 15, 1967",
    "sentences": {}
}
Respond only with a valid JSON object that complies with the specified schema. **CRITICAL: Your entire response must be ONLY the single JSON object, starting with { and ending with }. Do not add any explanations, notes, or other text before or after the JSON object.**
output:
"""
raw = llm(prompt_text, return_full_text=False)
print(raw)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[SYSTEM]
    TEXT: Given a question, an answer, and sentences from the answer analyze the complexity of each sentence given under 'sentences' and break down each sentence into one or more fully understandable statements while also ensuring no pronouns are used in each statement. Format the outputs in JSON.
Please return the output in a JSON format that complies with the following schema as specified in JSON Schema and OpenAPI specification:
{'$defs': {'SentenceComponents': {'properties': {'sentence_index': {'description': 'The index of the sentence', 'title': 'Sentence Index', 'type': 'integer'}, 'simpler_statements': {'description': 'A list of simpler statements that can be directly inferred from the context', 'items': {'type': 'string'}, 'title': 'Simpler Statements', 'type': 'array'}}, 'required': ['sentence_index', 'simpler_statements'], 'title': 'SentenceComponents', 'type': 'object'}}, 'properties': {'sentences': {'description': 'A list of sentences and their simpler versions', 

In [None]:

# --------------------------------------------------
# 4.2 Extract the JSON object from the LLM's response
# --------------------------------------------------
import json
import re

def extract_json_from_output(text):
    """Extract the JSON object from the LLM's response."""
    # Find the last occurrence of a complete JSON object
    json_pattern = r'\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\}))*\}'
    matches = list(re.finditer(json_pattern, text))
    
    if matches:
        # Get the last match (which should be our output JSON)
        json_str = matches[-1].group(0)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return None
    return None

json_result = extract_json_from_output(raw)

# Print the extracted JSON in a readable format
if json_result:
    print("Extracted JSON:")
    print(json.dumps(json_result, indent=2))
else:
    print("OH SHIT")
    print("Failed to extract valid JSON")

In [59]:
# --------------------------------------------------
# 4.3 Test faithfulness prompts
# --------------------------------------------------

FaithfulnessPrompt = """
[SYSTEM]
TEXT: You are a precise analyzer of factual claims. Your task is to evaluate if claims made in a given 'response' are supported by a provided 'context', identify claims unique to the context, and detail your reasoning meticulously.

**CORE TASK:**
1.  Strictly identify atomic claims made *only* within the `response` text.
2.  Strictly identify atomic claims made *only* within the `context` text (i.e., not mentioned in the `response`).
3.  Verify if the claims from the `response` are supported by the `context`.
4.  Outline your step-by-step reasoning process clearly separating response analysis from context analysis.

**Follow these steps sequentially and precisely:**

1.  **Analyze Response ONLY:** Read *only* the `response` text. Break it down into the smallest possible distinct, atomic factual statements that are *directly stated*. List these identified statements internally. This forms the candidate pool for `response_claims`.
2.  **Finalize `response_claims`:** Review the statements identified in Step 1. Ensure they are truly atomic and directly from the `response`. Populate the `response_claims` list in the JSON with these finalized claims. **Do not proceed until this step is complete and accurate.**
3.  **Analyze Context ONLY:** Now, read the `context` text. Identify all distinct, atomic factual claims within it. List these internally.
4.  **Identify `context_only_claims`:** Compare the full list of context claims (from Step 3) with the finalized `response_claims` list (from Step 2). Create the `context_only_claims` list by including only those context claims that are *NOT* present in `response_claims`.
5.  **Verify `response_claims` against Context:** For *each* claim in the `response_claims` list (from Step 2), check if the `context` text fully supports it.
    *   "Supported" means *all* information in the response claim is explicitly stated or directly inferable from the context.
    *   "Unsupported" means the context is silent, contradicts, or only partially supports the response claim.
    *   Populate `supported_response_claims` and `unsupported_response_claims` based *only* on this verification of `response_claims`.
6.  **Generate Justifications for `response_claims`:** For *each* claim in `response_claims`, write a brief justification explaining *why* it is supported or unsupported, referencing the context comparison. Populate the `justifications` dictionary. **CRITICAL: Keys in this dictionary MUST exactly match the strings in the `response_claims` list.**
7.  **Count `response_claims`:** Count the total number of claims in the `response_claims` list (`num_total_response_claims`). Count the number of claims in the `supported_response_claims` list (`num_supported_response_claims`). Prepare the justification strings for these counts.
8.  **Outline Reasoning (Chain of Thought):** Describe the *exact* steps you took, emphasizing the separation of response analysis (Steps 1-2) from context analysis and comparison (Steps 3-7). Detail *what* claims were identified in the response *before* mentioning context analysis. Explain the verification process for *each* response claim. Populate the `chain_of_thought` field.
9.  **Format Output:** Assemble the final JSON object ensuring all fields are present and in the specified order.

**CRITICAL INSTRUCTIONS:**
*   `response_claims` MUST contain ONLY atomic statements directly extracted from the `response`. Absolutely NO information from the `context` should influence this list.
*   `context_only_claims` MUST contain ONLY atomic statements found in the `context` but demonstrably NOT in the `response`.
*   Verification, justifications, counts, and count justifications apply ONLY to `response_claims`.
*   Keys in the `justifications` dictionary MUST be exact string matches of claims found in the `response_claims` list.

Input will contain a 'response' (text to analyze) and 'context' (reference information).

Format your output as a single, valid JSON object with the following structure (Pay attention to the order):
{
  "response": "the original response text",
  "context": "the provided context",
  "chain_of_thought": "Detailed step-by-step reasoning, explicitly separating response analysis from context analysis and verification.",
  "response_claims": [
    "atomic claim 1 extracted ONLY from response",
    // ... more claims solely from response
  ],
  "context_only_claims": [
    "claim A found ONLY in context, not in response",
    // ... more claims solely from context
  ],
  "supported_response_claims": [
    "response claim 1 verified against context",
    // ... more supported response claims
  ],
  "unsupported_response_claims": [
    "response claim 2 verified against context",
    // ... more unsupported response claims
  ],
  "justifications": {
    // Keys MUST be claims from the 'response_claims' list
    "atomic claim 1 extracted ONLY from response": "Justification based on context",
    "atomic claim 2 extracted ONLY from response": "Justification based on context (e.g., 'not in context', 'contradicted by context')"
    // ... justifications ONLY for response_claims
  },
  "num_supported_response_claims_justification": "Explanation of how the supported count for response claims was derived.",
  "num_supported_response_claims": 1, // Integer count of supported response claims
  "num_total_response_claims_justification": "Explanation of how the total count for response claims was derived.",
  "num_total_response_claims": 2 // Integer count of total claims identified ONLY in the response
}

Be thorough. Your entire output must be ONLY the single JSON object, starting with { and ending with }. Do not add any explanations, notes, or other text before or after the JSON object.

**EXAMPLES:** Pay close attention to the detailed `chain_of_thought`, the strict separation of claim sources in `response_claims` vs `context_only_claims`, the exact matching required for `justifications` keys, and the field order.

input: {
  "response": "Marie Curie was a Polish-born physicist and chemist. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields, and the only person to win Nobel Prizes in multiple scientific fields."
}

output: {
  "response": "Marie Curie was a Polish-born physicist and chemist. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields, and the only person to win Nobel Prizes in multiple scientific fields.",
  "chain_of_thought": "1. Analyzed Response ONLY: Identified two atomic claims directly stated: 'Marie Curie was a Polish-born physicist and chemist' and 'Marie Curie was the first woman to win a Nobel Prize'. 2. Finalized `response_claims` with these two claims. 3. Analyzed Context: Identified claims about full name, dates, French naturalization, radioactivity research, winning Nobel in two fields, being the only person to win in multiple fields. 4. Identified `context_only_claims`: Compared context claims (Step 3) to response claims (Step 2). Found claims about full name, dates, French naturalization, radioactivity, winning in two fields, and being the only person to win multiple are unique to the context. 5. Verified `response_claims`: Claim 1 ('Polish-born physicist and chemist') is supported by context. Claim 2 ('first woman to win Nobel Prize') is supported by context. 6. Generated Justifications: Created justifications for the two response claims based on context verification. Keys exactly match `response_claims`. 7. Counted `response_claims`: Total=2, Supported=2. Prepared count justifications. 8. Formatted JSON output.",
  "response_claims": [
    "Marie Curie was a Polish-born physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize"
  ],
  "context_only_claims": [
    "Marie Curie's full name was Marie Skłodowska Curie",
    "Marie Curie lived from 7 November 1867 to 4 July 1934",
    "Marie Curie was also naturalized-French",
    "Marie Curie conducted pioneering research on radioactivity",
    "Marie Curie was the first person to win Nobel Prizes in two scientific fields",
    "Marie Curie was the only person to win Nobel Prizes in multiple scientific fields"
  ],
  "supported_response_claims": [
    "Marie Curie was a Polish-born physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize"
  ],
  "unsupported_response_claims": [],
  "justifications": {
    "Marie Curie was a Polish-born physicist and chemist": "in context - context confirms Polish physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize": "in context"
  },
  "num_supported_response_claims_justification": "Derived by comparing the 2 claims identified strictly from the response against the context. Both were found to be fully supported.",
  "num_supported_response_claims": 2,
  "num_total_response_claims_justification": "Derived by breaking down the response ONLY into distinct factual statements. 2 atomic claims were identified.",
  "num_total_response_claims": 2
}

input: {
  "response": "The Great Wall of China is 13,171 miles long and visible from space.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles (21,196 kilometers). Construction began as early as the 7th century BCE. It attracts millions of tourists each year."
}

output: {
  "response": "The Great Wall of China is 13,171 miles long and visible from space.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles (21,196 kilometers). Construction began as early as the 7th century BCE. It attracts millions of tourists each year.",
  "chain_of_thought": "1. Analyzed Response ONLY: Identified two atomic claims directly stated: 'The Great Wall of China is 13,171 miles long' and 'The Great Wall of China is visible from space'. 2. Finalized `response_claims` with these two claims. 3. Analyzed Context: Identified claims about being ancient walls/fortifications, km length, construction start date, attracting millions of tourists. 4. Identified `context_only_claims`: Compared context claims (Step 3) to response claims (Step 2). Found claims about ancient walls, km length, construction date, and tourism are unique to the context. 5. Verified `response_claims`: Claim 1 ('13,171 miles long') is supported by context ('approximately 13,171 miles'). Claim 2 ('visible from space') is unsupported as context does not mention it. 6. Generated Justifications: Created justifications for the two response claims based on context verification. Keys exactly match `response_claims`. 7. Counted `response_claims`: Total=2, Supported=1. Prepared count justifications. 8. Formatted JSON output.",
  "response_claims": [
    "The Great Wall of China is 13,171 miles long",
    "The Great Wall of China is visible from space"
  ],
  "context_only_claims": [
    "The Great Wall of China is an ancient series of walls and fortifications",
    "The Great Wall of China spans 21,196 kilometers",
    "Construction began as early as the 7th century BCE",
    "The Great Wall attracts millions of tourists each year"
  ],
  "supported_response_claims": [
    "The Great Wall of China is 13,171 miles long"
  ],
  "unsupported_response_claims": [
    "The Great Wall of China is visible from space"
  ],
  "justifications": {
    "The Great Wall of China is 13,171 miles long": "in context - context states length is approximately 13,171 miles",
    "The Great Wall of China is visible from space": "not in context - context does not mention visibility from space"
  },
  "num_supported_response_claims_justification": "Derived by comparing the 2 claims identified strictly from the response against the context. 1 claim was found to be fully supported.",
  "num_supported_response_claims": 1,
  "num_total_response_claims_justification": "Derived by breaking down the response ONLY into distinct factual statements. 2 atomic claims were identified.",
  "num_total_response_claims": 2
}

Now perform the above instruction with the following input:

input: {
    "response": "William Shakespeare wrote famous plays like 'Hamlet' and 'Romeo and Juliet'. He was born in Stratford-upon-Avon in 1564 and died in 1616. He also invented over 1000 new words.",
     "context": "William Shakespeare (bapt. 26 April 1564 – 23 April 1616) was an English playwright, poet and actor. He is widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist. He is often called England's national poet and the 'Bard of Avon'. His extant works consist of some 39 plays, 154 sonnets, three long narrative poems, and a few other verses. His plays include 'Hamlet', 'Othello', 'King Lear', and 'Macbeth', as well as 'Romeo and Juliet'. He was born and raised in Stratford-upon-Avon."
}

Respond only with a valid JSON object that complies with the specified schema.
**CRITICAL: Your entire response must be ONLY the single JSON object, starting with { and ending with }.
Do not add any explanations, notes, or other text before or after the JSON object.**

output:
"""

raw_faith = llm(FaithfulnessPrompt, return_full_text=False)
print(raw_faith)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[SYSTEM]
TEXT: You are a precise analyzer of factual claims. Your task is to evaluate if claims made in a given 'response' are supported by a provided 'context', identify claims unique to the context, and detail your reasoning meticulously.

**CORE TASK:**
1.  Strictly identify atomic claims made *only* within the `response` text.
2.  Strictly identify atomic claims made *only* within the `context` text (i.e., not mentioned in the `response`).
3.  Verify if the claims from the `response` are supported by the `context`.
4.  Outline your step-by-step reasoning process clearly separating response analysis from context analysis.

**Follow these steps sequentially and precisely:**

1.  **Analyze Response ONLY:** Read *only* the `response` text. Break it down into the smallest possible distinct, atomic factual statements that are *directly stated*. List these identified statements internally. This forms the candidate pool for `response_claims`.
2.  **Finalize `response_claims`:** Review the

In [54]:
# --------------------------------------------------
# 4.3 More Faithfulness work
# --------------------------------------------------

json_faith = extract_json_from_output(raw)

# Print the extracted JSON in a readable format
if json_faith:
    print("Extracted JSON:")
    print(json.dumps(json_faith, indent=2))
else:
    print("OH SHIT")
    print("Failed to extract valid JSON")

Extracted JSON:
{
  "response": "The first superbowl was held on Jan 15, 1967",
  "context": "The first Super Bowl, known as Super Bowl I, was played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California. The game featured the Green Bay Packers and the Kansas City Chiefs.",
  "chain_of_thought": "1. Analyzed Response ONLY: Identified one atomic claim directly stated: 'The first superbowl was held on Jan 15, 1967'. 2. Finalized `response_claims` with this claim. 3. Analyzed Context: Identified claims about date, location, featured teams, game. 4. Identified `context_only_claims`: Compared context claims (Step 3) to response claims (Step 2). Found claims about date and location are unique to the context. 5. Verified `response_claims`: Claim 1 ('The first superbowl was held on Jan 15, 1967') is supported by context ('played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles, California'). 6. Generated Justifications: Created justificati

In [60]:
# --------------------------------------------------
# 4.4 Develop Faithfulness Prompt
# --------------------------------------------------
def create_faithfulness_prompt(response: str, context: str) -> str:
    """
    Generates the faithfulness evaluation prompt with the provided
    response and context strings embedded.

    Args:
        response: The response string generated by the LLM.
        context: The context string used for RAG.

    Returns:
        The complete prompt string ready for the LLM.
    """
    # Escape the response and context for safe JSON embedding
    response_escaped = json.dumps(response)
    context_escaped = json.dumps(context)

    # Define the prompt template using an f-string
    # Note: Literal curly braces { } needed for the JSON examples within the
    # prompt must be escaped by doubling them {{ }}
    prompt = f"""
[SYSTEM]
TEXT: You are a precise analyzer of factual claims. Your task is to evaluate if claims made in a given 'response' are supported by a provided 'context', identify claims unique to the context, and detail your reasoning meticulously.

**CORE TASK:**
1.  Strictly identify atomic claims made *only* within the `response` text.
2.  Strictly identify atomic claims made *only* within the `context` text (i.e., not mentioned in the `response`).
3.  Verify if the claims from the `response` are supported by the `context`.
4.  Outline your step-by-step reasoning process clearly separating response analysis from context analysis.

**Follow these steps sequentially and precisely:**

1.  **Analyze Response ONLY:** Read *only* the `response` text. Break it down into the smallest possible distinct, atomic factual statements that are *directly stated*. List these identified statements internally. This forms the candidate pool for `response_claims`.
2.  **Finalize `response_claims`:** Review the statements identified in Step 1. Ensure they are truly atomic and directly from the `response`. Populate the `response_claims` list in the JSON with these finalized claims. **Do not proceed until this step is complete and accurate.**
3.  **Analyze Context ONLY:** Now, read the `context` text. Identify all distinct, atomic factual claims within it. List these internally.
4.  **Identify `context_only_claims`:** Compare the full list of context claims (from Step 3) with the finalized `response_claims` list (from Step 2). Create the `context_only_claims` list by including only those context claims that are *NOT* present in `response_claims`.
5.  **Verify `response_claims` against Context:** For *each* claim in the `response_claims` list (from Step 2), check if the `context` text fully supports it.
    *   "Supported" means *all* information in the response claim is explicitly stated or directly inferable from the context.
    *   "Unsupported" means the context is silent, contradicts, or only partially supports the response claim.
    *   Populate `supported_response_claims` and `unsupported_response_claims` based *only* on this verification of `response_claims`.
6.  **Generate Justifications for `response_claims`:** For *each* claim in `response_claims`, write a brief justification explaining *why* it is supported or unsupported, referencing the context comparison. Populate the `justifications` dictionary. **CRITICAL: Keys in this dictionary MUST exactly match the strings in the `response_claims` list.**
7.  **Count `response_claims`:** Count the total number of claims in the `response_claims` list (`num_total_response_claims`). Count the number of claims in the `supported_response_claims` list (`num_supported_response_claims`). Prepare the justification strings for these counts.
8.  **Outline Reasoning (Chain of Thought):** Describe the *exact* steps you took, emphasizing the separation of response analysis (Steps 1-2) from context analysis and comparison (Steps 3-7). Detail *what* claims were identified in the response *before* mentioning context analysis. Explain the verification process for *each* response claim. Populate the `chain_of_thought` field.
9.  **Format Output:** Assemble the final JSON object ensuring all fields are present and in the specified order.

**CRITICAL INSTRUCTIONS:**
*   `response_claims` MUST contain ONLY atomic statements directly extracted from the `response`. Absolutely NO information from the `context` should influence this list.
*   `context_only_claims` MUST contain ONLY atomic statements found in the `context` but demonstrably NOT in the `response`.
*   Verification, justifications, counts, and count justifications apply ONLY to `response_claims`.
*   Keys in the `justifications` dictionary MUST be exact string matches of claims found in the `response_claims` list.

Input will contain a 'response' (text to analyze) and 'context' (reference information).

Format your output as a single, valid JSON object with the following structure (Pay attention to the order):
{{
  "response": "the original response text",
  "context": "the provided context",
  "chain_of_thought": "Detailed step-by-step reasoning, explicitly separating response analysis from context analysis and verification.",
  "response_claims": [
    "atomic claim 1 extracted ONLY from response",
    // ... more claims solely from response
  ],
  "context_only_claims": [
    "claim A found ONLY in context, not in response",
    // ... more claims solely from context
  ],
  "supported_response_claims": [
    "response claim 1 verified against context",
    // ... more supported response claims
  ],
  "unsupported_response_claims": [
    "response claim 2 verified against context",
    // ... more unsupported response claims
  ],
  "justifications": {{
    // Keys MUST be claims from the 'response_claims' list
    "atomic claim 1 extracted ONLY from response": "Justification based on context",
    "atomic claim 2 extracted ONLY from response": "Justification based on context (e.g., 'not in context', 'contradicted by context')"
    // ... justifications ONLY for response_claims
  }},
  "num_supported_response_claims_justification": "Explanation of how the supported count for response claims was derived.",
  "num_supported_response_claims": 1, // Integer count of supported response claims
  "num_total_response_claims_justification": "Explanation of how the total count for response claims was derived.",
  "num_total_response_claims": 2 // Integer count of total claims identified ONLY in the response
}}

Be thorough. Your entire output must be ONLY the single JSON object, starting with {{ and ending with }}. Do not add any explanations, notes, or other text before or after the JSON object.

**EXAMPLES:** Pay close attention to the detailed `chain_of_thought`, the strict separation of claim sources in `response_claims` vs `context_only_claims`, the exact matching required for `justifications` keys, and the field order.

input: {{
  "response": "Marie Curie was a Polish-born physicist and chemist. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields, and the only person to win Nobel Prizes in multiple scientific fields."
}}

output: {{
  "response": "Marie Curie was a Polish-born physicist and chemist. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields, and the only person to win Nobel Prizes in multiple scientific fields.",
  "chain_of_thought": "1. Analyzed Response ONLY: Identified two atomic claims directly stated: 'Marie Curie was a Polish-born physicist and chemist' and 'Marie Curie was the first woman to win a Nobel Prize'. 2. Finalized `response_claims` with these two claims. 3. Analyzed Context: Identified claims about full name, dates, French naturalization, radioactivity research, winning Nobel in two fields, being the only person to win in multiple fields. 4. Identified `context_only_claims`: Compared context claims (Step 3) to response claims (Step 2). Found claims about full name, dates, French naturalization, radioactivity, winning in two fields, and being the only person to win multiple are unique to the context. 5. Verified `response_claims`: Claim 1 ('Polish-born physicist and chemist') is supported by context. Claim 2 ('first woman to win Nobel Prize') is supported by context. 6. Generated Justifications: Created justifications for the two response claims based on context verification. Keys exactly match `response_claims`. 7. Counted `response_claims`: Total=2, Supported=2. Prepared count justifications. 8. Formatted JSON output.",
  "response_claims": [
    "Marie Curie was a Polish-born physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize"
  ],
  "context_only_claims": [
    "Marie Curie's full name was Marie Skłodowska Curie",
    "Marie Curie lived from 7 November 1867 to 4 July 1934",
    "Marie Curie was also naturalized-French",
    "Marie Curie conducted pioneering research on radioactivity",
    "Marie Curie was the first person to win Nobel Prizes in two scientific fields",
    "Marie Curie was the only person to win Nobel Prizes in multiple scientific fields"
  ],
  "supported_response_claims": [
    "Marie Curie was a Polish-born physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize"
  ],
  "unsupported_response_claims": [],
  "justifications": {{
    "Marie Curie was a Polish-born physicist and chemist": "in context - context confirms Polish physicist and chemist",
    "Marie Curie was the first woman to win a Nobel Prize": "in context"
  }},
  "num_supported_response_claims_justification": "Derived by comparing the 2 claims identified strictly from the response against the context. Both were found to be fully supported.",
  "num_supported_response_claims": 2,
  "num_total_response_claims_justification": "Derived by breaking down the response ONLY into distinct factual statements. 2 atomic claims were identified.",
  "num_total_response_claims": 2
}}

input: {{
  "response": "The Great Wall of China is 13,171 miles long and visible from space.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles (21,196 kilometers). Construction began as early as the 7th century BCE. It attracts millions of tourists each year."
}}

output: {{
  "response": "The Great Wall of China is 13,171 miles long and visible from space.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles (21,196 kilometers). Construction began as early as the 7th century BCE. It attracts millions of tourists each year.",
  "chain_of_thought": "1. Analyzed Response ONLY: Identified two atomic claims directly stated: 'The Great Wall of China is 13,171 miles long' and 'The Great Wall of China is visible from space'. 2. Finalized `response_claims` with these two claims. 3. Analyzed Context: Identified claims about being ancient walls/fortifications, km length, construction start date, attracting millions of tourists. 4. Identified `context_only_claims`: Compared context claims (Step 3) to response claims (Step 2). Found claims about ancient walls, km length, construction date, and tourism are unique to the context. 5. Verified `response_claims`: Claim 1 ('13,171 miles long') is supported by context ('approximately 13,171 miles'). Claim 2 ('visible from space') is unsupported as context does not mention it. 6. Generated Justifications: Created justifications for the two response claims based on context verification. Keys exactly match `response_claims`. 7. Counted `response_claims`: Total=2, Supported=1. Prepared count justifications. 8. Formatted JSON output.",
  "response_claims": [
    "The Great Wall of China is 13,171 miles long",
    "The Great Wall of China is visible from space"
  ],
  "context_only_claims": [
    "The Great Wall of China is an ancient series of walls and fortifications",
    "The Great Wall of China spans 21,196 kilometers",
    "Construction began as early as the 7th century BCE",
    "The Great Wall attracts millions of tourists each year"
  ],
  "supported_response_claims": [
    "The Great Wall of China is 13,171 miles long"
  ],
  "unsupported_response_claims": [
    "The Great Wall of China is visible from space"
  ],
  "justifications": {{
    "The Great Wall of China is 13,171 miles long": "in context - context states length is approximately 13,171 miles",
    "The Great Wall of China is visible from space": "not in context - context does not mention visibility from space"
  }},
  "num_supported_response_claims_justification": "Derived by comparing the 2 claims identified strictly from the response against the context. 1 claim was found to be fully supported.",
  "num_supported_response_claims": 1,
  "num_total_response_claims_justification": "Derived by breaking down the response ONLY into distinct factual statements. 2 atomic claims were identified.",
  "num_total_response_claims": 2
}}

Now perform the above instruction with the following input:

input: {{
    "response": {response_escaped},
    "context": {context_escaped}
}}

Respond only with a valid JSON object that complies with the specified schema.
**CRITICAL: Your entire response must be ONLY the single JSON object, starting with {{ and ending with }}.
Do not add any explanations, notes, or other text before or after the JSON object.**

output:
"""
    return prompt

In [75]:
# --------------------------------------------------
# 4.4 Test Faithfulness Prompt and interactions
# --------------------------------------------------

# RES IS OUTPUT FROM CHAIN HERE
# --- Extract the part after "Helpful Answer:" ---
result_string = res.get("result", "") # Get the result string, default to empty if key missing
marker = "Helpful Answer:"

# Use partition to split the string based on the first occurrence of the marker
before_marker, found_marker, after_marker = result_string.partition(marker)

# Check if the marker was actually found
if found_marker:
    # If found, take the part after it and strip leading/trailing whitespace
    extracted_answer = after_marker.strip()
else:
    # If marker wasn't found, the whole result might be the answer,
    # or something went wrong. Handle as appropriate.
    # For now, let's assume the whole string is the fallback, but print a warning.
    print(f"\nWarning: Marker '{marker}' not found in result. Using full result string.")
    extracted_answer = result_string.strip() # Use the whole string

print("\nEXTRACTED ANSWER:")
print(extracted_answer)

##############################
response = extracted_answer
context = test_data["context"][0]
# print(context)

################################################################################
# JUDGE LLM????
model_id = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer_id = model_id # Usually the same

# Load tokenizer explicitly first to set pad token
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

# Make pipeline
hf_gen = pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer_id,
    torch_dtype=torch.float16,
    device=0,
    max_new_tokens=1024,
    batch_size=20,
    temperature=0.01,
    do_sample=True,
)
hf_gen.tokenizer.pad_token = hf_gen.tokenizer.eos_token

# Wrap it in LangChain’s LLM interface
llm_judge = HuggingFacePipeline(pipeline=hf_gen, model_kwargs={"stop": ["}\n```", "}\n\n", "}\nHere is"]})
################################################################################

modified_faithfulness_prompt = create_faithfulness_prompt(response, context)
example_faith = llm_judge(modified_faithfulness_prompt, return_full_text=False)
print(example_faith)


EXTRACTED ANSWER:
According to the Softonic website, the latest version of Scorpion Solitaire is 1.1, released on 17/02/06.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[SYSTEM]
TEXT: You are a precise analyzer of factual claims. Your task is to evaluate if claims made in a given 'response' are supported by a provided 'context', identify claims unique to the context, and detail your reasoning meticulously.

**CORE TASK:**
1.  Strictly identify atomic claims made *only* within the `response` text.
2.  Strictly identify atomic claims made *only* within the `context` text (i.e., not mentioned in the `response`).
3.  Verify if the claims from the `response` are supported by the `context`.
4.  Outline your step-by-step reasoning process clearly separating response analysis from context analysis.

**Follow these steps sequentially and precisely:**

1.  **Analyze Response ONLY:** Read *only* the `response` text. Break it down into the smallest possible distinct, atomic factual statements that are *directly stated*. List these identified statements internally. This forms the candidate pool for `response_claims`.
2.  **Finalize `response_claims`:** Review the

In [79]:
# --------------------------------------------------
# 4.6 mess with answer relevancy
# ----------------------------------------------------
def create_answer_relevancy_prompt(question: str, answer: str, context: str, num_questions: int = 3) -> str:
    """
    Generates the Answer Relevancy question generation prompt.

    Args:
        question: The original question asked.
        answer: The answer generated by the LLM.
        context: The context provided to the LLM (can inform question style).
        num_questions: The number of questions to generate (default 3).

    Returns:
        The complete prompt string ready for the LLM.
    """
    # Escape inputs for safe JSON embedding
    question_escaped = json.dumps(question)
    answer_escaped = json.dumps(answer)
    context_escaped = json.dumps(context) # Context might still be useful for style/scope

    # Define the prompt template
    prompt = f"""
[SYSTEM]
TEXT: You are an expert analytical tool. Your task is to generate plausible questions based on a given answer, considering the original question and context. The goal is to create questions that the provided answer directly and comprehensively addresses.

**CORE TASK:**
Generate {num_questions} distinct questions for which the provided `answer` is a relevant and informative response.

**Follow these steps sequentially and precisely:**

1.  **Analyze the `answer` ONLY:** Read the `answer` text carefully. Identify the key pieces of information, main points, and specific details presented in the answer. List these internally.
2.  **Analyze the original `question`:** Understand the topic, scope, and intent of the original `question` that led to the `answer`.
3.  **Analyze the `context` (Optional but Recommended):** Briefly review the `context` to understand the source material's style and scope, which might help in phrasing the generated questions appropriately. However, the generated questions MUST be answerable primarily by the provided `answer`.
4.  **Generate {num_questions} Questions:** Based *only* on the key information identified in the `answer` (Step 1), formulate {num_questions} distinct questions.
    *   Each generated question should be clearly and fully answerable using *only* the information present in the provided `answer`.
    *   The questions should be natural-sounding and relevant to the topic suggested by the original `question` and the `answer`.
    *   Avoid generating questions that require information *not* present in the `answer`.
5.  **Outline Reasoning (Chain of Thought):** Describe the exact steps you took. Explain:
    *   What key information you extracted from the `answer`.
    *   How you considered the original `question`'s topic.
    *   How each generated question specifically relates to and is answerable by the content of the `answer`.
    *   Populate the `chain_of_thought` field with this detailed reasoning.
6.  **Format Output:** Assemble the final JSON object ensuring all fields are present and in the specified order.

**CRITICAL INSTRUCTIONS:**
*   The `generated_questions` list MUST contain exactly {num_questions} distinct questions.
*   Each generated question MUST be answerable using *only* the information explicitly present in the provided `answer`.
*   The generated questions should reflect the core information conveyed by the `answer`.

Input will contain the original 'question', the generated 'answer', and the 'context'.

Format your output as a single, valid JSON object with the following structure (Pay attention to the order):
{{
  "question": "the original question text",
  "answer": "the provided answer text",
  "context": "the provided context text",
  "chain_of_thought": "Detailed step-by-step reasoning: 1. Analyzed answer for key info. 2. Considered original question topic. 3. (Optional) Considered context style. 4. Formulated N questions based *only* on answer content. 5. Verified questions are answerable by the answer.",
  "generated_questions": [
    "Generated question 1 based ONLY on the answer",
    "Generated question 2 based ONLY on the answer",
    "Generated question 3 based ONLY on the answer"
    // ... up to N questions
  ],
  "num_generated_questions": {num_questions} // Integer count of generated questions
}}

Be thorough. Your entire output must be ONLY the single JSON object, starting with {{ and ending with }}. Do not add any explanations, notes, or other text before or after the JSON object.

**EXAMPLES:** Pay close attention to the detailed `chain_of_thought` and how the `generated_questions` strictly derive from the `answer`.

input: {{
  "question": "Who was Marie Curie?",
  "answer": "Marie Curie was a Polish-born physicist and chemist, famous for her pioneering research on radioactivity. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields..."
}}

output: {{
  "question": "Who was Marie Curie?",
  "answer": "Marie Curie was a Polish-born physicist and chemist, famous for her pioneering research on radioactivity. She was the first woman to win a Nobel Prize.",
  "context": "Marie Skłodowska Curie (7 November 1867 – 4 July 1934) was a Polish and naturalized-French physicist and chemist who conducted pioneering research on radioactivity. She was the first woman to win a Nobel Prize, the first person to win Nobel Prizes in two scientific fields...",
  "chain_of_thought": "1. Analyzed Answer: Key info includes Polish-born physicist/chemist, research on radioactivity, first woman Nobel winner. 2. Considered Original Question: Broad 'Who was...' question. 3. Considered Context: Confirms details, adds more specifics not in answer. 4. Formulated 3 questions based *only* on answer content: Focused on nationality/profession, research area, and Nobel achievement mentioned. 5. Verified questions are answerable by the answer.",
  "generated_questions": [
    "What was Marie Curie's nationality and profession?",
    "What field did Marie Curie conduct pioneering research in?",
    "What significant 'first' did Marie Curie achieve regarding the Nobel Prize?"
  ],
  "num_generated_questions": 3
}}

input: {{
  "question": "What is the Great Wall of China?",
  "answer": "The Great Wall of China is a very long ancient wall in China.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles... Construction began as early as the 7th century BCE..."
}}

output: {{
  "question": "What is the Great Wall of China?",
  "answer": "The Great Wall of China is a very long ancient wall in China.",
  "context": "The Great Wall of China is an ancient series of walls and fortifications spanning approximately 13,171 miles... Construction began as early as the 7th century BCE...",
  "chain_of_thought": "1. Analyzed Answer: Key info is 'very long', 'ancient wall', 'in China'. 2. Considered Original Question: 'What is...' question. 3. Considered Context: Provides specifics like length, type, date. 4. Formulated 3 questions based *only* on the limited answer content: Focused on length description, age description, and location. 5. Verified questions are answerable by the answer.",
  "generated_questions": [
    "How is the length of the Great Wall of China described?",
    "What is stated about the age of the Great Wall of China?",
    "Where is the Great Wall located?"
  ],
  "num_generated_questions": 3
}}

Now perform the above instruction with the following input:

input: {{
    "question": {question_escaped},
    "answer": {answer_escaped},
    "context": {context_escaped}
}}

Respond only with a valid JSON object that complies with the specified schema.
**CRITICAL: Your entire response must be ONLY the single JSON object, starting with {{ and ending with }}.
Do not add any explanations, notes, or other text before or after the JSON object.**

output:
"""
    return prompt


In [84]:
################################################################################

# response = extracted_answer
context = test_data["context"][0]

# Use one of the previous test cases or define new ones
original_question = test_data["question"][0]
generated_answer = extracted_answer
context_info = context

# Generate the prompt
relevancy_prompt = create_answer_relevancy_prompt(original_question, generated_answer, context_info, num_questions=3)

# Print the generated prompt (optional, for verification)
# print("\n--- Generated Prompt for Answer Relevancy ---")
# print(relevancy_prompt)
# print("--- End of Prompt ---")

# Now you would pass 'relevancy_prompt' to your LLM
relevancy_out = llm_judge(relevancy_prompt, return_full_text=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [88]:
# print(relevancy_out)
# Parse the JSON output from the LLM
generated_questions = extract_json_from_output(relevancy_out)
print(generated_questions['generated_questions'])

# Then proceed with embedding comparison...

['What is the version number of Scorpion Solitaire?', 'When was the latest version of Scorpion Solitaire released?', 'Who is the source of the latest version information for Scorpion Solitaire?']


In [None]:
import asyncio
import concurrent.futures
from tqdm.auto import tqdm # Progress bar library
# -------------------------------------
# 5. Scale up!!!!
# -------------------------------------

# --- Worker Function ---
# This function processes a single entry from the test data
def process_entry(args):
    """
    Processes a single test data entry: runs QA, prepares sample, calculates RAGAS metrics.
    Args:
        args (tuple): A tuple containing (index, entry_dict).
    Returns:
        dict: A dictionary containing results or error information.
    """
    index, entry = args
    try:
        # 1. Run QA Chain (Synchronous I/O bound task)
        print(f"Starting QA for index: {index}") # Optional: Track start
        res = qa_chain({"query": entry["question"]})
        print(f"Finished QA for index: {index}") # Optional: Track end

        # 2. Extract Contexts
        retrieved_contexts = [doc.page_content for doc in res.get("source_documents", [])]

        # Ensure reference_contexts is a list of strings
        reference_contexts_raw = entry.get("context", []) # Get reference context(s)
        if isinstance(reference_contexts_raw, str):
            reference_contexts = [reference_contexts_raw]
        elif isinstance(reference_contexts_raw, list):
            reference_contexts = reference_contexts_raw
        else:
            print(f"Warning: Unexpected type for reference context at index {index}. Treating as empty list.")
            reference_contexts = []

        # 3. Prepare RAGAS Sample
        # Adjust fields based on the metrics you are using.
        # Here assuming only context metrics are needed.
        sample = SingleTurnSample(
            retrieved_contexts=retrieved_contexts,
            reference_contexts=reference_contexts
            # You might need question, answer, etc. for other RAGAS metrics
            # question=entry["question"],
            # answer=res.get("result", "")
        )

        # 4. Calculate Metrics (Run async metrics concurrently)
        async def calculate_metrics_async(sample_to_score):
            """Helper async function to gather metric scores."""
            print(f"Calculating metrics for index: {index}") # Optional
            # Add more metric tasks here if needed
            precision_task = context_precision.single_turn_ascore(sample_to_score)
            recall_task = context_recall.single_turn_ascore(sample_to_score)

            # Run concurrently and wait for results
            precision_score, recall_score = await asyncio.gather(
                precision_task,
                recall_task
                # Add other awaited tasks here
            )
            print(f"Finished metrics for index: {index}") # Optional
            return precision_score, recall_score

        # Use asyncio.run to execute the async helper from this sync thread
        precision, recall = asyncio.run(calculate_metrics_async(sample))

        # 5. Prepare result dictionary
        result_data = {
            "index": index,
            "question": entry["question"],
            "retrieved_contexts": retrieved_contexts,
            "reference_contexts": reference_contexts,
            "context_precision": precision,
            "context_recall": recall,
            "raw_qa_response": res # Optional: store full response
        }
        # Optional: print immediate results per sample
        # print(f"METRICS FOR INDEX {index}: CP={precision:.4f}, CR={recall:.4f}")
        return result_data

    except Exception as e:
        print(f"Error processing index {index} - Question: {entry.get('question', 'N/A')}: {e}")
        # Optionally log the full traceback
        # import traceback
        # traceback.print_exc()
        return {
            "index": index,
            "question": entry.get('question', 'N/A'),
            "error": str(e)
        }
        
# --- Main Execution ---
all_results = []
# Prepare arguments for mapping (index, entry) tuples
tasks = list(enumerate(test_data)) # Creates [(0, test_data[0]), (1, test_data[1]), ...]

# Use ThreadPoolExecutor for parallel execution
num_workers = 20
print(f"Starting evaluation for {len(tasks)} samples with {num_workers} workers...")

with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
    # Use executor.map to process tasks in parallel.
    # Results will be yielded in the order tasks were submitted (or close to it).
    # Wrap with tqdm for a progress bar.
    results_iterator = list(tqdm(executor.map(process_entry, tasks), total=len(tasks), desc="Evaluating Samples"))

all_results = results_iterator # list() consumes the iterator and gathers all results

print("\n--- Evaluation Complete ---")

# --- Post-Processing (Example) ---
valid_results = [r for r in all_results if 'error' not in r]
errors = [r for r in all_results if 'error' in r]

if valid_results:
    avg_precision = sum(r['context_precision'] for r in valid_results) / len(valid_results)
    avg_recall = sum(r['context_recall'] for r in valid_results) / len(valid_results)
    print(f"\nProcessed {len(valid_results)} samples successfully.")
    print(f"Average Context Precision: {avg_precision:.4f}")
    print(f"Average Context Recall: {avg_recall:.4f}")
else:
    print("No samples processed successfully.")

if errors:
    print(f"\nEncountered {len(errors)} errors during processing.")
    # Optionally print details of errors
    # for error_result in errors:
    #     print(f"  Index {error_result['index']}: {error_result['error']}")

# Now `all_results` contains a list of dictionaries, each holding the
# computed metrics and other info for one entry from test_data, or error details.
# print(all_results[0]) # Example: Inspect the first result
    

Starting evaluation for 19 samples with 19 workers...
Starting QA for index: 0
Starting QA for index: 1
Starting QA for index: 2
Starting QA for index: 3
Starting QA for index: 4
Starting QA for index: 5
Starting QA for index: 6
Starting QA for index: 7
Starting QA for index: 8
Starting QA for index: 9
Starting QA for index: 10
Starting QA for index: 11
Starting QA for index: 12
Starting QA for index: 13
Starting QA for index: 14
Starting QA for index: 15
Starting QA for index: 16
Starting QA for index: 17
Starting QA for index: 18


Evaluating Samples:   0%|          | 0/19 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Finished QA for index: 12
Calculating metrics for index: 12
Finished metrics for index: 12
Finished QA for index: 2
Calculating metrics for index: 2
Finished metrics for index: 2
Finished QA for index: 9
Calculating metrics for index: 9
Finished metrics for index: 9
Finished QA for index: 10
Calculating metrics for index: 10
Finished metrics for index: 10
Finished QA for index: 18
Calculating metrics for index: 18
Finished metrics for index: 18
Finished QA for index: 11
Calculating metrics for index: 11
Finished metrics for index: 11
Finished QA for index: 7
Calculating metrics for index: 7
Finished metrics for index: 7
Finished QA for index: 17
Calculating metrics for index: 17
Finished metrics for index: 17
Finished QA for index: 1
Calculating metrics for index: 1
Finished metrics for index: 1
Finished QA for index: 14
Calculating metrics for index: 14
Finished metrics for index: 14
Finished QA for index: 5
Calculating metrics for index: 5
Finished metrics for index: 5
Finished QA fo