## Evaluate RAG Quality
##### Evaluates the app by running an experiment in Langsmith with the following metrics:
-  Accuracy- Is the answer correct according to the ground truth answer
-  Recall- How many of the relevant documents were retrieved
-  Truthfulness - Did the response stray from the documents or hallucinate?

Do not add code to this to run a regular rag inferences or it may put the wrong tracing project name. Use inference_tester.ipynb instead


In [1]:
from dotenv import load_dotenv
import os, sys
import streamlit as st

load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# Add the parent directory to sys.path so you can import your modules from a subdirectory
sys.path.append(os.path.abspath('..'))

import rag
from rag import CONFIG
from langsmith.evaluation import evaluate
from langsmith import Client
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith import traceable

In [2]:
# Config LangSmith if you also want the traces
os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "langchain_evaluator.ipynb on ASK main/local"

In [3]:
client = Client()

eval_model = "gpt-4o-mini"

### Set up the Evaluators

In [4]:
import json


def validate_and_fix_json(raw_output: str, required_fields: dict) -> dict:
    """
    Validates and fixes JSON output, ensuring required fields are present.

    Args:
        raw_output (str): The raw JSON string from the LLM.
        required_fields (dict): A dictionary of required fields with their default values.

    Returns:
        dict: Validated and fixed JSON output aligned with required fields.
    """
    try:
        parsed_response = json.loads(raw_output)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        print(f"Raw output: {raw_output}")

        # Attempt common fixes
        if raw_output.endswith('"'):
            raw_output += "}"  # Close the JSON object if improperly terminated
        elif not raw_output.endswith('}'):
            raw_output += '"}'  # Add missing closing braces

        # Retry parsing
        try:
            parsed_response = json.loads(raw_output)
        except json.JSONDecodeError as final_e:
            print(f"Failed to fix JSON: {final_e}")
            parsed_response = {}

    # Ensure required fields are present with default values
    validated_response = {key: parsed_response.get(
        key, default) for key, default in required_fields.items()}

    return validated_response

In [4]:
grade_prompt_accuracy = prompt = hub.pull(
    "cot_qa_drew")


def accuracy_evaluator(run, example) -> dict:
    """
    A simple evaluator for detecting generation accuracy
    """

    # Inputs to Evaluator from Eval set
    query = example.inputs["question"]
    ground_truth_answer = example.outputs["ground_truth_answer"]

    # Inputs to Evaluator from RAG output
    prediction = run.outputs.get("answer")
    if prediction is None:
        print(f"'answer' key is missing in outputs: {run.outputs}")

    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["accuracy_evaluator"])

    # Define the grader
    answer_grader = grade_prompt_accuracy | llm

    # Get score by passing populated prompt to the evaluator
    # The prompt template takes in "query", "ground_truth_answer", "answer" as inputs
    grader_response = answer_grader.invoke({"query": query,
                                           "ground_truth_answer": ground_truth_answer,
                                            "student_answer": prediction}
                                           )

    required_fields = {
        "correctness": None,  # Default correctness value
        "explanation": "No explanation provided."  # Default explanation
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)

    correctness = validated_response["correctness"]
    explanation = validated_response["explanation"]

    return {
        "key": "Accuracy",
        "score": correctness,  # Numerical score expected by the evaluator
        "value": "Correct" if correctness == 1 else "Incorrect",  # Optional categorical value
        "comment": explanation,  # Additional metadata
    }

In [6]:
grade_prompt_recall = prompt = hub.pull(
    "recall_drew")


def recall_evaluator(run, example) -> dict:
    """
    A simple evaluator for checing the retrieved documents against the question
    """

    # Inputs to Evaluator from Eval set
    query = example.inputs["question"]

    # Inputs to Evaluator from RAG output
    documents = run.outputs.get("context")
    if documents is None:
        print(f"'context' key is missing in outputs: {run.outputs}")
    sources = run.outputs.get("sources")
    if sources is None:
        print(f"'sources' key is missing in outputs: {run.outputs}")

    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["recall_evaluator"])

    # Structured prompt
    answer_grader = grade_prompt_recall | llm

    # Get score by passing populated prompt to the evaluator
    # The evaluator template expects "documents" as input
    # The evaluator returns "Score" (int) and "Explanation" (str) as output
    grader_response = answer_grader.invoke({"query": query,
                                            "documents": documents})

    required_fields = {
        "Score": 0,  # Default value
        "Explanation": "No explanation provided."  # Default value
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)
    score = validated_response["Score"]
    explanation = validated_response["Explanation"]

    return {"key": "Recall", "score": score, "sources": sources, "comment": explanation}

In [7]:
grade_prompt_truthfulness = prompt = hub.pull(
    "langchain-ai/rag-answer-hallucination")


def hallucination_evaluator(run, example) -> dict:
    """
    A simple evaluator for detecting generation hallucinations
    """

    # Inputs to Evaluator from Eval set
    input_question = example.inputs["question"]

    # Inputs to Evaluator from RAG output
    documents = run.outputs.get("context")
    if not documents:
        print(
            f"No documents retrieved. Skipping grading. Outputs: {run.outputs}")
        return {
            "key": "Truthfulness",
            "score": 0,  # Or any default score you'd prefer for empty context
            "comment": "No relevant documents were found to evaluate the answer."
        }

    prediction = run.outputs.get("answer")
    if not prediction:
        print(f"'answer' key is missing in outputs: {run.outputs}")
        return {
            "key": "Truthfulness",
            "score": 0,
            "comment": "No answer provided to evaluate."
        }

    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model=eval_model, temperature=0,
                     tags=["hallucination_evaluator"])

    # Structured prompt
    answer_grader = grade_prompt_truthfulness | llm

    # Get score by passing populated prompt to the evaluator
    # The evaluator template expects "documents" and "student_answer" as inputs
    # The evaluator returns "Score" (int) and "Explanation" (str) as output
    grader_response = answer_grader.invoke({"documents": documents,
                                            "student_answer": prediction})

    required_fields = {
        "Score": None,  # Default value
        "Explanation": "No explanation provided."  # Default value
    }

    validated_response = validate_and_fix_json(
        grader_response, required_fields)

    score = validated_response["Score"]
    explanation = validated_response["Explanation"]

    return {"key": "Truthfulness", "score": score, "comment": explanation}

### Config your Evaluation

In [8]:
dataset_name = "ASK-groundtruth-v2"
# ASK-groundtruth_v1   initial_EDA

data = dataset_name

# I don't think I need this one anymore
# data = client.list_examples(dataset_name=dataset_name, splits=["1_question"])
data = client.list_examples(dataset_name=dataset_name, example_ids=[
                            "2eea461c-3653-4c36-961f-256c70ee6268"])

# experiment_prefix = "ASK_Eval_code_whichpromptisbroke"
experiment_prefix = "ASK_CLART_ContextualCompressionRetriever-gpt-4o-mini"

experiment_description = "Testing ContextualCompressionRetriever. \n\nNAMING CONVENTION\nAppName_TestMetrics_TestVariables \nExample: ASK_ART_llm-gpt-4o-mini\nTest metrics are CLART = Cost, Latency, Accuracy, Recall, Truthfulness. Test Variable is gpt-4o-mini which we will compare against some other llm. Other example of TestMetrics could be Eval_cost, App_cost, App_time, etc."

### Run the Evaluation
 OpenAI API pricing is [here.](https://openai.com/api/pricing/)  
 Your billing is [here.](https://platform.openai.com/settings/organization/usage/activity)

In [9]:
evaluate(
    # maps the shape input from our example, which is a single-field dictionary, to the rag function we are testing, which accepts a string
    lambda input: rag.rag(input["question"]),
    data=data,
    # accuracy_evaluator, recall_evaluator, hallucination_evaluator
    evaluators=[accuracy_evaluator, recall_evaluator, hallucination_evaluator],
    experiment_prefix=experiment_prefix,
    description=experiment_description,
    num_repetitions=1,
    metadata=CONFIG,
)  # type: ignore    # This supresses an error

View the evaluation results for experiment: 'ASK_CLART_ContextualCompressionRetriever-gpt-4o-mini-8e4b21f1' at:
https://smith.langchain.com/o/3941ecea-6957-508c-9f4f-08ed62dc7d61/datasets/0b24ff94-f4f0-4197-89f3-765f835936c9/compare?selectedSessions=d6955dcd-2ab8-425f-ac27-777358b765f3




0it [00:00, ?it/s]

2025-01-17 19:59:25.669 
  command:

    streamlit run /Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


Retrieved context: 0 documents.


Error running evaluator <DynamicRunEvaluator accuracy_evaluator> on run 6e99bc68-6ad0-4531-8cf2-b4cac7de3616: TypeError('the JSON object must be str, bytes or bytearray, not dict')
Traceback (most recent call last):
  File "/Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/langsmith/evaluation/_runner.py", line 1573, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(
                         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/langsmith/evaluation/evaluator.py", line 331, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "/Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/langsmith/run_helpers.py", line 617, in wrapper
    raise e
  File "/Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/langsmith/run_he

No documents retrieved. Skipping grading. Outputs: {'answer': "To dispose of your old Operational Dress Uniforms (ODUs), you can follow these steps:\n\n1. **Check for Local Regulations**: Before disposal, check with your local waste management authority or recycling center for any specific guidelines on disposing of uniforms or textiles. Some areas have regulations regarding textile waste.\n\n2. **Donation**: If the ODUs are still in good condition, consider donating them to organizations that accept military uniforms. This can include local veterans' organizations, thrift stores, or charities that support military personnel.\n\n3. **Recycling**: If the uniforms are too worn for reuse, look for textile recycling programs in your area. Many municipalities offer textile recycling services that can help reduce waste.\n\n4. **Disposal**: If no other options are available, the uniforms can be disposed of in the regular trash. However, it's advisable to ensure they are cut up or altered in a

Unnamed: 0,inputs.question,outputs.answer,outputs.sources,outputs.user_question,outputs.enriched_question,outputs.context,outputs.llm_sources,error,reference.ground_truth_answer,reference.ground_truth_sources,feedback.Accuracy,feedback.Recall,feedback.Truthfulness,execution_time,example_id,id
0,How Do I Dispose of My Old ODUs?,To dispose of your old Operational Dress Unifo...,[],How Do I Dispose of My Old ODUs?,How Do I Dispose of My Old ODUs?,[],"[Local Waste Management Guidelines, Military U...",,"Uniform items that are no longer serviceable, ...",[],,,0,9.444729,2eea461c-3653-4c36-961f-256c70ee6268,6e99bc68-6ad0-4531-8cf2-b4cac7de3616
