## Evaluate RAG Quality
##### Evaluates the app by running an experiment in LangSmith with the following metrics:
-  Accuracy - Is the answer correct according to the ground truth answer?
-  Recall - How many of the relevant documents were retrieved?
-  Truthfulness - Did the response stray from the documents or hallucinate?

Do not add code to this notebook to run regular RAG inferences, or the traces may be recorded under the wrong tracing project name. Use inference_tester.ipynb instead.


In [12]:
from dotenv import load_dotenv
import os, sys
import streamlit as st

# Load environment variables from a .env file before anything reads os.environ.
# NOTE(review): the path is hard-coded to one developer's machine, so this
# cell will not find the file elsewhere. override=True presumably lets values
# in the file replace variables that are already set — confirm against the
# python-dotenv docs.
load_dotenv(
    '/Users/drew_wilkins/Drews_Files/Drew/Python/Localcode/.env', override=True)


# Add the parent directory to sys.path so you can import your modules from a subdirectory
sys.path.append(os.path.abspath('..'))
import rag
from rag import CONFIG

from langsmith.evaluation import evaluate
from langsmith import Client
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith import traceable
from langsmith.utils import ContextThreadPoolExecutor
from concurrent.futures import TimeoutError

#### **OPTIONAL:** Record traces of the RAG. Required for cost and token info

In [13]:
# Configure LangSmith observability if you want to see the traces for this notebook.
# This assumes you have traces left in your monthly usage allotment.
# Alternative source for the API key (Streamlit secrets), left disabled:
# os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# NOTE(review): this project name refers to inference_tester.ipynb, but this is
# the evaluation notebook — confirm that traces are meant to land there.
os.environ["LANGCHAIN_PROJECT"] = "inference_tester.ipynb_on_ASK_main"

#### Select your LangSmith account based on API key

In [14]:
# Choose the LangSmith account you want to use based on the API key.
# The key is read straight from the environment, so this raises KeyError
# if LANGCHAIN_API_KEY has not been set.
client = Client(api_key=os.environ["LANGCHAIN_API_KEY"])

# Model name passed to ChatOpenAI by each of the evaluators defined below.
eval_model = "gpt-4o-mini"

#### **OPTIONAL:** Debugging

In [4]:
import logging

# Configure logging at DEBUG level via basicConfig.
logging.basicConfig(level=logging.DEBUG)
# Handle to the "langsmith" logger; it is not used further in the visible cells.
logger = logging.getLogger("langsmith")

### Set up the Evaluators

JSON Helper function

In [15]:
import json
from typing import Dict, Any


def validate_and_fix_json(raw_output: Any, required_fields: Dict[str, Any]) -> Dict[str, Any]:
    """
    Validates and fixes JSON output, ensuring required fields are present.

    A dictionary is used as-is. Any other value is handed to ``json.loads``.
    If a string fails to parse, one lightweight repair is attempted (closing
    a truncated string value and/or the outer object) before giving up.
    Output that cannot be turned into a JSON object never raises; it simply
    yields the defaults.

    Args:
        raw_output (Any): The raw JSON string or dictionary from the LLM.
        required_fields (dict): A dictionary of required fields with their default values.

    Returns:
        dict: A new dictionary containing exactly the keys of
        ``required_fields``; each value comes from the parsed output when the
        key is present there, otherwise from the supplied default.
    """
    parsed_response: Any = {}

    if isinstance(raw_output, dict):
        # Already a dictionary, so no parsing is needed.
        parsed_response = raw_output
    else:
        try:
            parsed_response = json.loads(raw_output)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            print(f"Raw output: {raw_output}")

            if isinstance(raw_output, str):
                # Repair on the stripped text so trailing whitespace cannot
                # hide the real last character from the checks below.
                candidate = raw_output.strip()
                if candidate.endswith('"'):
                    # Normalize the trailing quote(s) and close the object.
                    candidate = candidate.rstrip('"') + '"}'
                elif not candidate.endswith('}'):
                    candidate += '}'

                try:
                    parsed_response = json.loads(candidate)
                except json.JSONDecodeError as final_e:
                    print(f"Failed to fix JSON: {final_e}")
                    parsed_response = {}
        except (TypeError, ValueError) as e:
            # json.loads rejects anything that is not str/bytes/bytearray with
            # TypeError (e.g. None); undecodable bytes raise a ValueError.
            print(f"Cannot parse output of type {type(raw_output).__name__}: {e}")
            parsed_response = {}

    if not isinstance(parsed_response, dict):
        # Valid JSON that is not an object (list, number, string, null) has
        # no fields to read, so fall back to the defaults.
        print(f"Parsed output is not a JSON object: {parsed_response!r}")
        parsed_response = {}

    # Ensure required fields are present with default values
    return {
        key: parsed_response.get(key, default)
        for key, default in required_fields.items()
    }

LLM Helper function

In [16]:
from langsmith.utils import ContextThreadPoolExecutor
from concurrent.futures import TimeoutError


def invoke_with_timeout(grader, input_data, timeout=60):
    """
    Invokes an LLM grader with a timeout.

    Args:
        grader: The structured LangChain prompt + LLM model; its ``invoke``
            method is called with ``input_data``.
        input_data (dict): The input dictionary for the LLM grader.
        timeout (int): The maximum time, in seconds, to wait for the grader.

    Returns:
        The grader's raw response, or None if the call did not finish within
        ``timeout`` seconds. Callers are expected to handle None themselves.
    """
    # The executor is deliberately NOT used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which blocks until the worker
    # finishes and would make the timeout below ineffective.
    executor = ContextThreadPoolExecutor()
    future = executor.submit(grader.invoke, input_data)
    try:
        return future.result(timeout=timeout)
    except TimeoutError:
        print("LLM grader call timed out. Cancelling future and returning None.")
        # cancel() cannot stop a call that is already running; it only
        # prevents the work from starting if it is still queued.
        future.cancel()
        return None
    finally:
        # Release the executor without waiting for a still-running call.
        executor.shutdown(wait=False)

In [17]:
# Pull the accuracy-grading prompt via hub.pull; `prompt` is bound to the
# same object as `grade_prompt_accuracy`.
grade_prompt_accuracy = prompt = hub.pull(
    "drew-wks/cot_qa")

# Prompt page:
# https://smith.langchain.com/hub/drew-wks/cot_qa?organizationId=adac21b1-016d-49e4-84f0-672bf1a6e7b1


def accuracy_evaluator(run, example) -> dict:
    """
    Grade the generated answer against the ground-truth answer with an LLM.

    Args:
        run: The run being evaluated; its ``outputs`` mapping is read for the
            "answer" key.
        example: The dataset example; ``inputs["question"]`` and
            ``outputs["ground_truth_answer"]`` are read.

    Returns:
        dict: ``{"key": "Accuracy", "score": ..., "comment": ...}``. The score
        is 0 when the run produced no answer or the grader call timed out;
        otherwise it is the grader's "correctness" value (default 0).
    """

    # Inputs to Evaluator from Eval set
    query = example.inputs["question"]
    print(f"Accuracy eval: retrieving result for question: {query}")
    ground_truth_answer = example.outputs["ground_truth_answer"]

    # Inputs to Evaluator from RAG output.
    # `run.outputs` may be None (presumably when the target failed), so fall
    # back to an empty mapping instead of crashing on `.get`.
    outputs = run.outputs or {}
    prediction = outputs.get("answer")
    if not prediction:
        # Nothing to grade: skip the LLM call, as hallucination_evaluator does.
        print(f"'answer' key is missing in outputs: {run.outputs}")
        return {
            "key": "Accuracy",
            "score": 0,
            "comment": "No answer provided to evaluate."
        }

    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["accuracy_evaluator"])

    # Define the grader
    answer_grader = grade_prompt_accuracy | llm

    # Get score by passing populated prompt to the evaluator.
    # The keys below must match the prompt template's input variables.
    # NOTE(review): an earlier comment named the third input "answer" while
    # the code sends "student_answer" — confirm against the hub prompt.
    print("Passing result to LLM to grade accuracy")
    grader_response = invoke_with_timeout(answer_grader, {
        "query": query,
        "ground_truth_answer": ground_truth_answer,
        "student_answer": prediction
    })

    if grader_response is None:
        return {
            "key": "Accuracy",
            "score": 0,
            "comment": "LLM evaluation timed out."
        }

    required_fields = {
        "correctness": 0,  # Default correctness value
        "explanation": "No explanation provided."  # Default explanation
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)

    return {
        "key": "Accuracy",
        # Numerical score expected by the evaluator
        "score": validated_response["correctness"],
        # Additional metadata
        "comment": validated_response["explanation"],
    }

In [18]:
# Pull the recall-grading prompt via hub.pull; `prompt` is rebound to the
# same object as `grade_prompt_recall`.
grade_prompt_recall = prompt = hub.pull(
    "drew-wks/recall_drew")


def recall_evaluator(run, example) -> dict:
    """
    Grade the retrieved documents against the question with an LLM.

    Args:
        run: The run being evaluated; its ``outputs`` mapping is read for the
            "context" (retrieved documents) and "sources" keys.
        example: The dataset example; ``inputs["question"]`` is read.

    Returns:
        dict: ``{"key": "Recall", "score": ..., "sources": ..., "comment": ...}``.
        The score is 0 when no documents were retrieved or the grader call
        timed out; otherwise it is the grader's "Score" value (default 0).
    """

    # Inputs to Evaluator from Eval set
    query = example.inputs["question"]
    print(f"Recall eval: retrieving docs for question: {query}")

    # Inputs to Evaluator from RAG output.
    # `run.outputs` may be None (presumably when the target failed), so fall
    # back to an empty mapping instead of crashing on `.get`.
    outputs = run.outputs or {}
    sources = outputs.get("sources")
    if sources is None:
        print(f"'sources' key is missing in outputs: {run.outputs}")

    documents = outputs.get("context")
    if not documents:
        # Nothing to grade: skip the LLM call, as hallucination_evaluator does.
        print(f"'context' key is missing in outputs: {run.outputs}")
        return {
            "key": "Recall",
            "score": 0,
            "sources": sources,
            "comment": "No documents were retrieved to evaluate."
        }

    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["recall_evaluator"])

    # Structured prompt
    answer_grader = grade_prompt_recall | llm

    # Get score by passing populated prompt to the evaluator.
    # The keys below must match the prompt template's input variables.
    # The evaluator returns "Score" (int) and "Explanation" (str) as output
    print("Passing result to LLM to grade recall")
    grader_response = invoke_with_timeout(answer_grader, {
        "query": query,
        "documents": documents
    })

    if grader_response is None:
        return {
            "key": "Recall",
            "score": 0,
            "sources": sources,
            "comment": "LLM evaluation timed out."
        }

    required_fields = {
        "Score": 0,  # Default value
        "Explanation": "No explanation provided."  # Default value
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)

    return {
        "key": "Recall",
        "score": validated_response["Score"],
        "sources": sources,
        "comment": validated_response["Explanation"],
    }

In [19]:
# Pull the hallucination-grading prompt via hub.pull; `prompt` is rebound to
# the same object as `grade_prompt_truthfulness`.
grade_prompt_truthfulness = prompt = hub.pull(
    "langchain-ai/rag-answer-hallucination")


def hallucination_evaluator(run, example) -> dict:
    """
    Grade whether the generated answer is supported by the retrieved documents.

    Args:
        run: The run being evaluated; its ``outputs`` mapping is read for the
            "context" (retrieved documents) and "answer" keys.
        example: The dataset example; ``inputs["question"]`` is read (for
            logging only).

    Returns:
        dict: ``{"key": "Truthfulness", "score": ..., "comment": ...}``. The
        score is 0 when there are no documents, no answer, or the grader call
        timed out; otherwise it is the grader's "Score" value, or None when
        the grader response has no "Score".
    """

    # Inputs to Evaluator from Eval set
    input_question = example.inputs["question"]
    print(
        f"Truthfulness eval: retrieving result for question: {input_question}")

    # Inputs to Evaluator from RAG output.
    # `run.outputs` may be None (presumably when the target failed), so fall
    # back to an empty mapping instead of crashing on `.get`.
    outputs = run.outputs or {}

    documents = outputs.get("context")
    if not documents:
        print(
            f"No documents retrieved. Skipping grading. Outputs: {run.outputs}")
        return {
            "key": "Truthfulness",
            "score": 0,  # Or any default score you'd prefer for empty context
            "comment": "No relevant documents were found to evaluate the answer."
        }

    prediction = outputs.get("answer")
    if not prediction:
        print(f"'answer' key is missing in outputs: {run.outputs}")
        return {
            "key": "Truthfulness",
            "score": 0,
            "comment": "No answer provided to evaluate."
        }

    print("Passing result to LLM to grade truth")
    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["hallucination_evaluator"])

    # Structured prompt
    answer_grader = grade_prompt_truthfulness | llm

    # Get score by passing populated prompt to the evaluator
    # The evaluator template expects "documents" and "student_answer" as inputs
    # The evaluator returns "Score" (int) and "Explanation" (str) as output
    grader_response = invoke_with_timeout(answer_grader, {
        "documents": documents,
        "student_answer": prediction
    })

    if grader_response is None:
        return {
            "key": "Truthfulness",
            "score": 0,
            "comment": "LLM evaluation timed out."
        }

    required_fields = {
        # NOTE(review): defaults to None here, whereas recall_evaluator
        # defaults its "Score" to 0 — confirm that the difference is intended.
        "Score": None,
        "Explanation": "No explanation provided."  # Default value
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)

    return {
        "key": "Truthfulness",
        "score": validated_response["Score"],
        "comment": validated_response["Explanation"],
    }

### Choose the evaluation dataset

**Option 1:** Use a LangSmith dataset

In [20]:
# Name of the dataset to evaluate against.
dataset_name = "ASK-groundtruth-v3"
# Other dataset names that have been used here:
# ASK-groundtruth-v3 ASK-groundtruth_v1   initial_EDA one_example_easy

# `data` is what gets passed to evaluate(data=...); for this option it is
# just the dataset name string.
data = dataset_name

**Option 2:** Use a single LangSmith example

In [21]:
# I don't think I need this one anymore
# data = client.list_examples(dataset_name=dataset_name, splits=["1_question"])
# data = client.list_examples(dataset_name=dataset_name, example_ids=[
#                            "a3326a11-5c24-4ceb-a04e-5bb708dd9b38"])

**Option 3:** Use JSONL file

NOTE: Be sure to set `upload_results=False` in the `evaluate()` call when using this option.

In [8]:
from langsmith.schemas import Example
import json
import uuid

# one_example  dataset_0b24ff94-f4f0-4197-89f3-765f835936c9
examples_file_path = "one_example.jsonl"


# Convert JSONL data to `schemas.Example` objects.
# The file is opened with an explicit UTF-8 encoding so the result does not
# depend on the platform default, and blank lines are skipped so that stray
# empty lines do not make json.loads fail.
with open(examples_file_path, "r", encoding="utf-8") as f:
    data = [
        Example(
            # Each example gets a freshly generated id.
            id=str(uuid.uuid4()),
            inputs={"question": entry["inputs"]["question"]},
            outputs={
                # Ground truth answer
                "ground_truth_answer": entry["outputs"]["ground_truth_answer"],
                # Ground truth sources
                "ground_truth_sources": entry["outputs"]["ground_truth_sources"]
            },
            metadata={
                # Dataset split information
                "dataset_split": entry["metadata"]["dataset_split"]
            }
        )
        for entry in (json.loads(line) for line in f if line.strip())
    ]

### Name the experiment

In [22]:
# Prefix passed to evaluate(experiment_prefix=...).
# Previously used prefix, kept for reference:
# experiment_prefix = "ASK_Eval_code_whichpromptisbroke"
experiment_prefix = "ASK_AT_without-AnswersWithSources-prompt"

# Free-text description passed to evaluate(description=...); it also records
# the experiment naming convention.
experiment_description = "Testing the rag on full set without AnswersWithSources using gpt-4o-mini. \n\nNAMING CONVENTION\nAppName_TestMetrics_TestVariables \nExample: ASK_ART_llm-gpt-4o-mini\nTest metrics are CLART = Cost, Latency, Accuracy, Recall, Truthfulness. Test Variable is gpt-4o-mini which we will compare against some other llm. Other example of TestMetrics could be Eval_cost, App_cost, App_time, etc."

### Run the Evaluation
 OpenAI API pricing is [here.](https://openai.com/api/pricing/)  
 Your billing is [here.](https://platform.openai.com/settings/organization/usage/activity)

In [23]:
# Run the experiment. Any exception raised by evaluate() is caught and only
# printed, so a failed run does not stop the notebook.
try:
    evaluate(
        # Adapter: each example's input is a single-field dictionary, while the
        # rag function under test accepts the question as a string.
        lambda input: rag.rag(input["question"]),
        data=data,
        client=client,  # Needed or it will use default API key
        # Available evaluators: accuracy_evaluator, recall_evaluator, hallucination_evaluator
        evaluators=[accuracy_evaluator, hallucination_evaluator],
        experiment_prefix=experiment_prefix,
        description=experiment_description,
        max_concurrency=1,  # Limit concurrency to avoid crashing
        upload_results=True,  # Set to false for local testing
        num_repetitions=1,
        metadata=CONFIG,
        # NOTE(review): a `# type: ignore` comment on a line of its own likely
        # has no effect here; it normally must share the line it applies to.
        # type: ignore    # This suppresses an error
    )
except Exception as e:
    print(f"Evaluation failed with error: {e}")

View the evaluation results for experiment: 'ASK_AT_without-AnswersWithSources-prompt-02b627da' at:
https://smith.langchain.com/o/adac21b1-016d-49e4-84f0-672bf1a6e7b1/datasets/e44c3abc-7871-49a3-a388-6197d7c2dcf3/compare?selectedSessions=5a505541-37aa-46d7-95b7-431eb31e99ae




0it [00:00, ?it/s]

Retrieved context: 5 documents.
LLM response received
Accuracy eval: retrieving result for question: Is it appropriate for junior officers to enter boats and vehicles first?
Passing result to LLM to grade accuracy
Retrieved context: 5 documents.
Truthfulness eval: retrieving result for question: Is it appropriate for junior officers to enter boats and vehicles first?
Passing result to LLM to grade truth
LLM response received
Accuracy eval: retrieving result for question: Who is eligible for flotilla elections?
Passing result to LLM to grade accuracy
Retrieved context: 5 documents.
Truthfulness eval: retrieving result for question: Who is eligible for flotilla elections?
Passing result to LLM to grade truth
LLM response received
Retrieved context: 5 documents.
Accuracy eval: retrieving result for question: On what occasion would you wear a Tropical Blue Uniform?
Passing result to LLM to grade accuracy
LLM response received
Truthfulness eval: retrieving result for question: On what occas