In [11]:
from dotenv import load_dotenv
import os, sys

# Load environment variables from the .env file at this path. This runs before
# the project module below is imported.
# NOTE(review): this is an absolute, machine-specific path; the call will not
# find the file on any other machine. Consider making the location configurable.
load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# Add the parent directory to sys.path so you can import your modules from a subdirectory
sys.path.append(os.path.abspath('..'))

# Project module under evaluation and its CONFIG object (CONFIG is later sent
# as experiment metadata). Presumably resolved through the sys.path entry
# appended above, which is why these imports come last -- TODO confirm.
import rag
from rag import CONFIG

In [12]:
# Optional: switch on LangSmith tracing and name the project the traces go to.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": "langchain_evaluator.ipynb on ASK main/local",
})

In [13]:
from langsmith.evaluation import evaluate
from langsmith import Client

# LangSmith client built with default arguments; a later cell uses it to list
# the dataset examples that are evaluated.
client = Client()

In [14]:
from langsmith.evaluation import LangChainStringEvaluator
from langchain_openai import ChatOpenAI


def prepare_cot_qa_data(run, example):
    '''
    Map one evaluated run and its dataset example onto the evaluator's fields.

    run: the executed run; its outputs["answer"] becomes the prediction
    example: the dataset example; supplies inputs["question"] and
        outputs["ground_truth_answer"]

    Returns a dict with the keys "input", "reference" and "prediction".
    '''
    question = example.inputs["question"]
    ground_truth = example.outputs["ground_truth_answer"]
    generated = run.outputs["answer"]
    return dict(input=question, reference=ground_truth, prediction=generated)


# Grader LLM with the desired model and temperature. It must be handed to the
# evaluator explicitly through `config`; previously it was created but never
# passed on, so this model choice had no effect on the evaluation.
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

# cot_qa uses the CotQAEvalChain class which uses the prompt template here: https://smith.langchain.com/hub/wfh/cot_qa
# prepare_data maps each (run, example) pair onto the input / reference /
# prediction fields via prepare_cot_qa_data.
accuracy_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": llm},
    prepare_data=prepare_cot_qa_data,
)

In [8]:
from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI

# Grade output schema


# Schema for the grader's structured reply: a free-text explanation followed by
# a boolean verdict. It is passed to with_structured_output() when the grader
# LLM is built.
class CorrectnessGrade(TypedDict):
    # Note that the order in which the fields are defined is the order in which the model will generate them.
    # It is useful to put explanations before responses because it forces the model to think through
    # its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ...,
                       "True if the answer is correct, False otherwise."]


# Grade prompt: sent as the system message on every grading call. It tells the
# grader how to judge the student answer against the ground-truth answer.
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader model: gpt-4o at temperature 0, with replies constrained to the
# CorrectnessGrade schema (strict JSON-schema structured output).
grader_llm = (
    ChatOpenAI(model="gpt-4o", temperature=0)
    .with_structured_output(CorrectnessGrade, method="json_schema", strict=True)
)


def accuracy_evaluator(inputs: dict, outputs: dict,
                       reference_outputs: "dict | None" = None) -> bool:
    """Grade a RAG answer against the ground-truth answer using the grader LLM.

    Args:
        inputs: Example inputs; must contain "question".
        outputs: Outputs of the function under test; must contain "answer".
        reference_outputs: Reference outputs of the example, expected to hold
            "ground_truth_answer" (the other evaluators in this notebook read
            it from the example's outputs). Optional for backward
            compatibility: when it is omitted or lacks the key, the value is
            looked up in ``inputs`` as before.
            NOTE(review): assumes the evaluation harness fills this argument
            by name -- confirm for the installed LangSmith version.

    Returns:
        The "correct" field of the grader's structured reply.

    Raises:
        ValueError: If the question, the ground-truth answer, or the student
            answer is missing or empty.
    """
    question = inputs.get("question")
    # Prefer the reference outputs; fall back to the legacy location in inputs.
    ground_truth_answer = (
        (reference_outputs or {}).get("ground_truth_answer")
        or inputs.get("ground_truth_answer")
    )
    student_answer = outputs.get("answer")

    if not question or not ground_truth_answer or not student_answer:
        raise ValueError(
            "Required input, reference output, or student answer is missing")

    # Use the values validated above rather than re-indexing the dicts, so the
    # prompt matches exactly what was checked.
    answers = f"""      QUESTION: {question}
GROUND TRUTH ANSWER: {ground_truth_answer}
STUDENT ANSWER: {student_answer}"""

    # Run evaluator: system message carries the grading rubric, user message
    # carries the question and the two answers.
    grade = grader_llm.invoke([{"role": "system", "content": correctness_instructions}, {
                              "role": "user", "content": answers}])
    return grade["correct"]

Just testing what the AIMessage looks like

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI


# Pull the "cot_qa_drew" prompt from the hub; it is bound to both names.
prompt = hub.pull("cot_qa_drew")
grade_prompt_accuracy = prompt

llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

# Pipe the prompt into the chat model.
answer_grader = grade_prompt_accuracy | llm

# Invoke the chain once with sample values; the response is examined in the
# following cell.
response = answer_grader.invoke(
    dict(
        query="What is the Auxiliary",
        context="documents",
        result="The Auxiliary is a rock band",
    )
)

In [None]:
# Keep the `content` attribute of the sample response from the previous cell.
text = response.content

In [69]:
from langchain import hub
from langchain_openai import ChatOpenAI


# Fetch the "wfh/cot_qa" grading prompt from the hub; both names refer to it.
prompt = hub.pull("wfh/cot_qa")
grade_prompt_accuracy = prompt


def _parse_cot_grade(text: str) -> tuple:
    """Split a grader reply into (score, verdict, reasoning).

    The last "GRADE: CORRECT" / "GRADE: INCORRECT" marker in the text is taken
    as the verdict (score 1 or 0); everything before it is the reasoning. When
    no marker is found, score and verdict are None and the whole reply is
    returned as the reasoning.
    """
    import re  # local import: the notebook's import cells do not bring in `re`

    verdicts = list(re.finditer(r"GRADE:\s*(INCORRECT|CORRECT)", text, re.IGNORECASE))
    if not verdicts:
        return None, None, text.strip()
    final = verdicts[-1]
    verdict = final.group(1).upper()
    score = 1 if verdict == "CORRECT" else 0
    return score, verdict, text[:final.start()].strip()


def accuracy_evaluator(run, example) -> dict:
    """
    A simple evaluator for detecting generation accuracy.

    Fills the pulled grading prompt with the question, the ground-truth answer
    and the generated answer, sends it to the grader LLM, and converts the
    reply into a feedback dict.

    Args:
        run: the evaluated run; ``run.outputs["answer"]`` is the prediction.
        example: the dataset example; supplies ``inputs["question"]`` and
            ``outputs["ground_truth_answer"]``.

    Returns:
        ``{"key": "Accuracy", "score": ..., "comment": ...}`` where score is
        1 for a CORRECT verdict, 0 for INCORRECT, and None when the reply
        contains no verdict; comment holds the grader's reasoning text.

    Raises:
        KeyError: if any of the expected fields is missing.
    """

    # RAG inputs (named `question` so the builtin `input` is not shadowed)
    question = example.inputs["question"]
    reference = example.outputs["ground_truth_answer"]
    prediction = run.outputs["answer"]

    # LLM grader
    # other models gpt-4-turbo gpt-4o-mini
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Structured prompt
    answer_grader = grade_prompt_accuracy | llm

    # Get the grader's reply by passing the populated prompt to the chain
    score_response = answer_grader.invoke({"query": question,
                                           "context": reference,
                                           "result": prediction}
                                          )

    # A prompt piped into a chat model returns a message object, not a parsed
    # grade: the text lives in `.content`, and there are no `.reasoning`,
    # `.value` or `.score` attributes to read.
    # NOTE(review): assumes the pulled prompt asks the model to finish with a
    # "GRADE: CORRECT" / "GRADE: INCORRECT" line -- confirm against the prompt.
    reply_text = getattr(score_response, "content", score_response)
    score, _verdict, reasoning = _parse_cot_grade(str(reply_text))

    return {"key": "Accuracy", "score": score, "comment": reasoning}

In [15]:

# Dataset to evaluate against.
# (presumably other dataset names tried earlier: ASK-groundtruth_v1, initial_EDA)
dataset_name = "ASK-groundtruth-v2"

# Dataset split to evaluate; used in the query below instead of a repeated literal.
split_name = "1_question"

# Fetch only the examples of the chosen split. list_examples is expected to
# return a single-use iterator, so the result is materialized into a list:
# re-running the evaluate cell then sees the same examples instead of an
# exhausted iterator.
# The earlier `data = dataset_name` assignment was dropped: it was overwritten
# immediately and never took effect.
data = list(client.list_examples(dataset_name=dataset_name, splits=[split_name]))


experiment_prefix = "ASK_ART_eval-llm-gpt-4o-mini"

experiment_description = "Testing cost using gpt-4o-mini for Eval and gpt-4-turbo for RAG. This will run an eval over a signle question 1x. AppName-TestType-TestVariables. ART stands for Accuraacy, Recall, Truthfulness. oai= OpenAI model. accuracy, recall, truthfulness are the test variables."

In [16]:
def target_function(input: dict):
    '''Adapter between a dataset example and the function under test: the example arrives as a single-field dictionary, while rag.rag takes the question string itself.'''
    question = input["question"]
    return rag.rag(question)


# Run the experiment: every example in `data` is passed through
# target_function and the result is scored by accuracy_evaluator.
evaluate(
    target_function,
    data=data,
    evaluators=[accuracy_evaluator],
    experiment_prefix=experiment_prefix,
    # The description was defined in the configuration cell but never sent;
    # pass it so it is recorded with the experiment.
    description=experiment_description,
    num_repetitions=1,
    metadata=CONFIG,
)  # type: ignore    # This suppresses an error

View the evaluation results for experiment: 'ASK_ART_eval-llm-gpt-4o-mini-eab3607f' at:
https://smith.langchain.com/o/3941ecea-6957-508c-9f4f-08ed62dc7d61/datasets/0b24ff94-f4f0-4197-89f3-765f835936c9/compare?selectedSessions=4226d0f7-e62b-4f98-95d8-3170e1a5faef




0it [00:00, ?it/s]

Unnamed: 0,inputs.question,outputs.user_question,outputs.enriched_question,outputs.context,outputs.answer,outputs.llm_sources,error,reference.ground_truth_answer,reference.ground_truth_sources,feedback.COT Contextual Accuracy,execution_time,example_id,id
0,"How is harassment generally defined, and who i...","How is harassment generally defined, and who i...","How is harassment generally defined, and who i...",[page_content='COMDTINST M16790.1G \n \n \n \n...,Harassment is generally defined as unwelcome a...,[COMDTINST M16790.1G Section B. Anti-Discrimin...,,Harassment is generally defined as unwelcome a...,[],1,9.754796,a3326a11-5c24-4ceb-a04e-5bb708dd9b38,ab766cb2-b938-43d9-a57d-ab1a0adf0fef
