## Evaluate RAG Quality
##### Evaluates the app by running an experiment in Langsmith  
Do not add code to this notebook to run regular RAG inferences, or the traces may be logged under the wrong tracing project name. Use inference_tester.ipynb instead.

In [None]:
%pip install pip --upgrade

In [2]:
import os
import sys

from dotenv import load_dotenv

# Load environment variables from the local .env file before anything reads them.
load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# This notebook sits in a subdirectory: put the parent directory on sys.path
# so the `rag` module can be imported. This must run before the imports below.
sys.path.append(os.path.abspath('..'))

import rag
from rag import CONFIG

In [3]:
# LangSmith tracing setup: switch tracing on and pick the project that traces
# produced by this notebook are filed under.
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": "langchain_evaluator.ipynb on ASK main/local",
})

# Name of the dataset the experiments below are run against.
# Names noted by the author: ASK-groundtruth_v1, one_example, ASK-groundtruth-BQII
dataset_name = "one_example"

# Prefix passed to the accuracy experiment.
# Values noted by the author: ASK-groundtruth_v1, ASK-accuracy-test-BQII_QA_set, initial_EDA
experiment_prefix = "testing the tracing"

# Description passed to the accuracy experiment (currently empty).
# Descriptions noted by the author:
#   - Checking accuracy of RAG on BQII_QA_set
#   - Initial EDA check of dataset to ensure questions are in scope and groundtruth answers are correct
experiment_description = ""

### Evaluate for Accuracy

In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator


def prepare_cot_qa_data(run, example):
    """Build the dictionary the "cot_qa" evaluator consumes for one example.

    Args:
        run: The evaluated run; its ``outputs`` mapping must contain the RAG
            result under the key "answer".
        example: The dataset example; ``inputs`` must contain "Question" and
            ``outputs`` must contain "Ground Truth Answer".

    Returns:
        A dict with the keys "input" (the question), "reference" (the ground
        truth answer) and "prediction" (the RAG answer).
    """
    # Question and ground truth both come from the test dataset example.
    question = example.inputs["Question"]
    ground_truth = example.outputs["Ground Truth Answer"]
    # The prediction is whatever the RAG call returned for this example.
    rag_answer = run.outputs["answer"]
    return dict(input=question, reference=ground_truth, prediction=rag_answer)


# Wrap LangChain's "cot_qa" string evaluator; prepare_cot_qa_data maps each
# run/example pair onto the evaluator's input, reference and prediction keys.
cot_qa_evaluator = LangChainStringEvaluator(
    "cot_qa",
    prepare_data=prepare_cot_qa_data
)


# Run the accuracy experiment over `dataset_name`, scoring each RAG answer with
# the cot_qa evaluator. rag.CONFIG is attached as the experiment's metadata.
experiment_results = evaluate(
    # The target receives each example's inputs as a dict; unwrap it so that
    # rag.rag is called with the question string.
    lambda input: rag.rag(input["Question"]),
    data=dataset_name,
    evaluators=[cot_qa_evaluator],
    metadata=CONFIG,
    experiment_prefix=experiment_prefix,
    description=experiment_description,
    # NOTE(review): the directive below sits on a line of its own, so it likely
    # does not apply to any argument above - confirm which line it was meant for.
    # type: ignore
)

### Evaluate for Hallucination

In [None]:
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith.evaluation import evaluate


# Prompt used to grade hallucinations, pulled from the LangChain Hub.
# It is a LangChain object that takes two inputs: "documents" and "student_answer".
prompt = hub.pull("langchain-ai/rag-answer-hallucination")
grade_prompt_hallucinations = prompt


def answer_hallucination_evaluator(run, example) -> dict:
    """Grade one RAG answer for hallucination against its retrieved context.

    The module-level ``grade_prompt_hallucinations`` prompt is piped into a
    ``gpt-4-turbo`` chat model (temperature 0) and invoked with the run's
    context as "documents" and the run's answer as "student_answer".

    Args:
        run: The evaluated run; ``run.outputs`` must contain "context" and
            "answer".
        example: The dataset example. It is not read here; the parameter is
            kept so the function retains its ``(run, example)`` signature.

    Returns:
        A dict with "key" set to "answer_hallucination" and "score" set to the
        "Score" entry of the grader's response.
    """
    # RAG outputs to be graded
    contexts = run.outputs["context"]
    prediction = run.outputs["answer"]

    # LLM grader
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Structured prompt piped into the grader model
    answer_grader = grade_prompt_hallucinations | llm

    # Pass the populated prompt to the grader and read its score
    grade = answer_grader.invoke({"documents": contexts,
                                  "student_answer": prediction})

    return {"key": "answer_hallucination", "score": grade["Score"]}


# Run the hallucination experiment over `dataset_name`, scoring each run with
# answer_hallucination_evaluator. This rebinds `experiment_results`. The
# experiment prefix is hard-coded here rather than taken from
# `experiment_prefix`, and no description is passed.
experiment_results = evaluate(
    # The target receives each example's inputs as a dict; unwrap it so that
    # rag.rag is called with the question string.
    lambda input: rag.rag(input["Question"]),
    data=dataset_name,
    evaluators=[answer_hallucination_evaluator],
    experiment_prefix="rag-qa-oai-hallucination",
    # rag.CONFIG is attached as the experiment's metadata.
    metadata=CONFIG,
)