## Evaluate RAG Quality
##### Evaluates the app by running an experiment in LangSmith with the following metrics:
-  Accuracy - Is the answer correct according to the ground-truth answer?
-  Recall - How many of the relevant documents were retrieved?
-  Truthfulness - Did the response stray from the documents or hallucinate?

Do not add code to this notebook to run regular RAG inference, or the traces may be logged under the wrong tracing project name. Use inference_tester.ipynb instead.


In [1]:
from dotenv import load_dotenv
import os, sys
import streamlit as st

# Load environment variables from the developer's local .env file.
# NOTE: this is an absolute, machine-specific path.
load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# This notebook runs from a subdirectory, so put the parent directory on
# sys.path to make the project's modules importable.
sys.path.append(os.path.abspath(os.pardir))

import rag
from rag import CONFIG

In [2]:
# LangSmith tracing configuration (only needed if you also want the traces).
# The project name is specific to this notebook so that its traces stay
# separate from those of other notebooks.
os.environ.update(
    {
        "LANGCHAIN_API_KEY": st.secrets["LANGCHAIN_API_KEY"],
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "langchain_evaluator.ipynb on ASK main/local",
    }
)

In [3]:
import logging
from langsmith.evaluation import evaluate
from langsmith import Client
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith import traceable

# Set up logging
# logging.basicConfig(level=logging.DEBUG)
# logger = logging.getLogger()
# logger.setLevel(logging.DEBUG)

# LangSmith client created with default arguments.
client = Client()

# Name of the OpenAI chat model that the LLM graders in the evaluators use.
eval_model: str = "gpt-4o-mini"

### Set up the Evaluators

In [4]:
# Grader prompt for the Accuracy metric, pulled from the LangChain Hub.
# `prompt` is kept as a second name for the same object.
grade_prompt_accuracy = hub.pull("cot_qa_drew")
prompt = grade_prompt_accuracy


def accuracy_evaluator(run, example) -> dict:
    """
    Grade the generated answer against the ground-truth answer using an LLM.

    Args:
        run: The run being evaluated; ``run.outputs["answer"]`` is read as the
            answer produced by the RAG app.
        example: The dataset example; ``example.inputs["question"]`` and
            ``example.outputs["ground_truth_answer"]`` are read.

    Returns:
        A feedback dict with key "Accuracy". ``score`` is the grader's
        "correctness" value, ``value`` is "Correct" when that score equals 1
        and "Incorrect" otherwise, and ``comment`` is the grader's explanation
        (or a placeholder when the grader did not return one).

    Raises:
        KeyError: If an expected input/output field or the grader's
            "correctness" field is missing.
    """

    # Inputs to Evaluator from Eval set
    query = example.inputs["question"]
    ground_truth_answer = example.outputs["ground_truth_answer"]

    # Inputs to Evaluator from RAG output
    prediction = run.outputs["answer"]

    llm = ChatOpenAI(model=eval_model, temperature=0, stream_usage=True)

    # Define the grader
    answer_grader = grade_prompt_accuracy | llm

    # Get score by passing the populated prompt to the grader.
    # The prompt is filled with "query", "ground_truth_answer" and
    # "student_answer".
    grader_response = answer_grader.invoke(
        {
            "query": query,
            "ground_truth_answer": ground_truth_answer,
            "student_answer": prediction,
        }
    )

    # The score is mandatory: without it there is nothing to report.
    correctness = grader_response["correctness"]
    # The explanation is optional, as in the Recall and Truthfulness
    # evaluators, so a missing explanation does not discard a valid score.
    explanation = grader_response.get("explanation", "No explanation provided")

    return {
        "key": "Accuracy",
        "score": correctness,  # Numerical score expected by the evaluator
        "value": "Correct" if correctness == 1 else "Incorrect",  # Optional categorical value
        "comment": explanation,  # Additional metadata
    }

In [5]:
# Grader prompt for the Recall metric, pulled from the LangChain Hub.
# `prompt` is kept as a second name for the same object.
grade_prompt_recall = hub.pull("recall_drew")
prompt = grade_prompt_recall


def recall_evaluator(run, example) -> dict:
    """
    Grade the retrieved documents against the question using an LLM.

    Args:
        run: The run being evaluated; ``run.outputs["context"]`` (the
            retrieved documents) and ``run.outputs["sources"]`` are read.
        example: The dataset example; ``example.inputs["question"]`` is read.

    Returns:
        A feedback dict with key "Recall" holding the grader's "Score", the
        run's sources, and the grader's "Explanation" (or a placeholder when
        the grader did not return one).
    """
    # Values handed to the grader prompt: the question from the eval set and
    # the documents retrieved by the RAG app.
    grader_inputs = {
        "query": example.inputs["question"],
        "documents": run.outputs["context"],
    }
    retrieved_sources = run.outputs["sources"]

    # LLM grader (other models tried: gpt-4-turbo, gpt-4o-mini)
    grader_llm = ChatOpenAI(model=eval_model, temperature=0, stream_usage=True)
    grader_chain = grade_prompt_recall | grader_llm

    # The grader returns "Score" (int) and "Explanation" (str)
    verdict = grader_chain.invoke(grader_inputs)

    return {
        "key": "Recall",
        "score": verdict["Score"],
        "sources": retrieved_sources,
        "comment": verdict.get("Explanation", "No explanation provided"),
    }

In [6]:
# Grader prompt for the Truthfulness metric, pulled from the LangChain Hub.
# `prompt` is kept as a second name for the same object.
grade_prompt_truthfulness = hub.pull("langchain-ai/rag-answer-hallucination")
prompt = grade_prompt_truthfulness


def hallucination_evaluator(run, example) -> dict:
    """
    Grade whether the generated answer is supported by the retrieved documents.

    Args:
        run: The run being evaluated; ``run.outputs["context"]`` (the
            retrieved documents) and ``run.outputs["answer"]`` are read.
        example: The dataset example. It is not used here because the grader
            prompt is filled only from the run outputs; the parameter is kept
            so the signature matches the other evaluators.

    Returns:
        A feedback dict with key "Truthfulness" holding the grader's "Score"
        and its "Explanation" (or a placeholder when the grader did not
        return one).

    Raises:
        KeyError: If an expected run output or the grader's "Score" field is
            missing.
    """

    # Inputs to Evaluator from RAG output
    contexts = run.outputs["context"]
    prediction = run.outputs["answer"]

    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model=eval_model, temperature=0, stream_usage=True)

    # Structured prompt
    answer_grader = grade_prompt_truthfulness | llm

    # Get score by passing populated prompt to the evaluator
    # The evaluator template expects "documents" and "student_answer" as inputs
    # The evaluator returns "Score" (int) and "Explanation" (str) as output
    grader_response = answer_grader.invoke(
        {
            "documents": contexts,
            "student_answer": prediction,
        }
    )
    score = grader_response["Score"]
    explanation = grader_response.get("Explanation", "No explanation provided")

    return {"key": "Truthfulness", "score": score, "comment": explanation}

### Config your Evaluation

In [7]:
# Dataset to evaluate against.
# Other datasets used previously: ASK-groundtruth_v1, initial_EDA
dataset_name = "ASK-groundtruth-v2"

# The whole dataset is evaluated by passing its name as the data argument.
data = dataset_name

# Alternatives for evaluating a subset (probably no longer needed):
# data = client.list_examples(dataset_name=dataset_name, splits=["1_question"])
# data = client.list_examples(dataset_name=dataset_name, example_ids=[
#                            "2eea461c-3653-4c36-961f-256c70ee6268"])

# Experiment name prefix, following AppName_TestMetrics_TestVariables.
# Previously used: "ASK_Eval_code_whichpromptisbroke"
experiment_prefix = "ASK_ART_AnswerWithSources-gpt-4o-mini"

# Free-text description stored with the experiment; it also records the
# naming convention used for experiment prefixes.
experiment_description = (
    "Baseline test with cleaner QA Eval Set. using gpt-4o-mini for Eval and gpt-4o-mini for RAG. \n\n"
    "NAMING CONVENTION\n"
    "AppName_TestMetrics_TestVariables \n"
    "Example: ASK_ART_llm-gpt-4o-mini\n"
    "Test metrics are ART = Accuracy, Recall, Truthfulness. "
    "Test Variable is gpt-4o-mini which we will compare against some other llm. "
    "Other example of TestMetrics could be Eval_cost, App_cost, App_time, etc."
)

### Run the Evaluation
 OpenAI API pricing is [here.](https://openai.com/api/pricing/)  
 Your billing is [here.](https://platform.openai.com/settings/organization/usage/activity)

In [8]:
# Run the experiment: the target callable below is evaluated against `data`
# and graded by the three evaluators defined earlier in this notebook.
evaluate(
    # Target adapter: each example's inputs arrive as a single-field dict
    # ({"question": ...}), while rag.rag accepts the question as a string.
    # NOTE(review): the lambda is left exactly as written because the
    # evaluation framework may inspect the target's signature - confirm
    # before renaming the parameter.
    lambda input: rag.rag(input["question"]),
    data=data,
    # Evaluators applied to every run: accuracy_evaluator, recall_evaluator,
    # hallucination_evaluator
    evaluators=[accuracy_evaluator, recall_evaluator, hallucination_evaluator],
    experiment_prefix=experiment_prefix,
    description=experiment_description,
    num_repetitions=1,
    # The RAG app's CONFIG is attached as experiment metadata.
    metadata=CONFIG,
)  # type: ignore    # This suppresses a type-checker error

View the evaluation results for experiment: 'ASK_ART_AnswerWithSources-gpt-4o-mini-5a95cd79' at:
https://smith.langchain.com/o/3941ecea-6957-508c-9f4f-08ed62dc7d61/datasets/0b24ff94-f4f0-4197-89f3-765f835936c9/compare?selectedSessions=50bb176e-4dfa-438d-92f6-c679f3275553




0it [00:00, ?it/s]

2025-01-15 19:25:33.339 
  command:

    streamlit run /Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/ASK/.venv-main/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


Unnamed: 0,inputs.question,outputs.answer,outputs.sources,outputs.user_question,outputs.enriched_question,outputs.context,outputs.llm_sources,error,reference.ground_truth_answer,reference.ground_truth_sources,feedback.Accuracy,feedback.Recall,feedback.Truthfulness,execution_time,example_id,id
0,What is the Auxiliary Chain of Leadership and ...,The Auxiliary Chain of Leadership and Manageme...,"[AUXILIARY NATIONAL STAFF GUIDE, Auxiliary Man...",What is the Auxiliary Chain of Leadership and ...,What is the Auxiliary Chain of Leadership and ...,[page_content='19 The Auxiliary Chain of Leade...,"[COMDTINST M16790.1G, Auxiliary Chain of Leade...",,The Auxiliary Chain of Leadership and Manageme...,[],1,10,1,12.068415,ab998c5d-bed3-41a5-9108-007848bd6658,4a9a08ad-2240-4948-939e-8e9e5b3e5535
1,Name one of the elected leader positions for t...,One of the elected leader positions for the Di...,"[Auxiliary Manual, COMDTINST M16790.1G, Auxili...",Name one of the elected leader positions for t...,Name one of the elected leader positions for t...,[page_content='COMDTINST M16790.1G \n \n \n \n...,[COMDTINST M16790.1G],,One of the elected leader positions for the Di...,[],1,10,1,2.358701,a9a0b5b7-d729-4c81-8bf3-17b70782179a,3e0dcdd4-092b-4ba5-b53f-d45d056baf8e
2,Describe the purpose of the flotilla and revie...,The purpose of a flotilla within the Auxiliary...,"[Auxiliary Flotilla Procedures Guide, Auxiliar...",Describe the purpose of the flotilla and revie...,Describe the purpose of the flotilla and revie...,[page_content=' \n1-2 A. The Flotilla \nThe...,"[Flotilla Administration Manual, Chapter 1, Fl...",,The flotilla is the basic organizational unit ...,"[Auxiliary Flotilla, Procedures Guide]",1,10,1,9.816504,25a2dbd2-04b9-4e70-8cb4-c2b0cb1ad766,0b70a401-3b3f-4dda-bfb9-6ee43975b019
3,When walking abreast and overtaking a senior o...,When walking abreast and overtaking a senior o...,"[Auxiliary Flotilla Procedures Guide, Navigati...",When walking abreast and overtaking a senior o...,When walking abreast and overtaking a senior o...,[page_content=' \n6-5 National Ensign at the...,"[6-5 National Ensign at the stern staff, B.2. ...",,"When walking and overtaking a senior, come abr...",[],0,0,0,3.174102,1f698eaa-02ed-466f-942b-9b466eb24f99,08a3130f-a15c-43b5-a0b0-66e69f8d3f93
4,What are the requirements to join the Auxiliary?,To join the United States Coast Guard Auxiliar...,"[Auxiliary Flotilla Procedures Guide, Auxiliar...",What are the requirements to join the Auxiliary?,What are the requirements to join the Auxiliary?,"[page_content='1-8 a. Must, by the date of ...","[Auxiliary Training Handbook – Aviation, COMDT...",,"To join the Auxiliary, all of these requiremen...",[],0,10,1,4.806933,cb558ae2-d7b1-429d-954a-d0d2d9d7bd15,bf5e8908-41ba-42dd-91f1-5b2d3eca1651
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,Is the Coast Guard Mutual Assistance Program (...,"Yes, the Coast Guard Mutual Assistance Program...",[AUXILIARY COAST GUARD MUTUAL ASSISTANCE AMBAS...,Is the Coast Guard Mutual Assistance Program (...,Is the Coast Guard Mutual Assistance Program (...,[page_content='United States Coast Guard Auxil...,"[COMDTINST M16790.1G, Section F.2. Coast Guard...",,"Yes, the Coast Guard Mutual Assistance Program...",[],1,10,1,3.282748,f37eb9cd-2446-49d4-a139-87bb7c3f71b6,37ec53f8-6fa1-414b-bd37-77dee86f934f
142,Can an Auxiliarist use the Coast Guard Exchange?,"Yes, Auxiliarists of appropriate age are autho...",[ALAUX 014/23 EXPANSION OF COAST GUARD EXCHANG...,Can an Auxiliarist use the Coast Guard Exchange?,Can an Auxiliarist use the Coast Guard Exchange?,[page_content='Page 1 of 1 \n07 APR 2023 \n...,[ALAUX 014/23],,"Yes, Auxiliarists are authorized to use the Co...",[],1,10,1,3.016256,f4e77283-b478-4feb-ac59-4c1601e778a8,b7046d38-b3ee-441e-a8da-b7b7a4360c56
143,On what occasion would you wear a Tropical Blu...,The Tropical Blue uniform is worn year-round o...,"[Auxiliary Manual, COMDTINST M16790.1G, AUXILI...",On what occasion would you wear a Tropical Blu...,On what occasion would you wear a Tropical Blu...,[page_content='COMDTINST M16790.1G \n \n \n \n...,"[COMDTINST M16790.1G, COMDTINST M1020.6K, AUX-...",,The Tropical Blue Uniform is typically worn fo...,[],1,10,1,5.227383,fd1632be-17f9-49a3-bf2c-83eb3693afb4,a166017f-c8d3-4438-92ea-9541e092f3d9
144,Who is eligible for flotilla elections?,"To be eligible for flotilla elections, a candi...","[Auxiliary Manual, COMDTINST M16790.1G, Auxili...",Who is eligible for flotilla elections?,Who is eligible for flotilla elections?,[page_content='COMDTINST M16790.1G \n \n \n \n...,[COMDTINST M16790.1G],,Eligible Auxiliarists are those who meet the e...,[],1,10,1,9.260633,fddb8c31-7c76-4e32-922a-f1d6af258195,9790997b-2c02-4d55-aca2-3e784ce5ae09
