## Evaluate RAG Quality
##### Evaluates the app by running an experiment in Langsmith  
Do not add code to this notebook to run regular RAG inference, or the traces may be filed under the wrong tracing project name. Use inference_tester.ipynb instead.

Tests 
-  Accuracy (COT Answer Accuracy)
-  Recall - How many of the relevant documents were retrieved?
-  Precision - How well did the response answer the question given the retrieved documents?
-  Truthfulness - Did the response stray from the documents or hallucinate?


In [None]:
# Upgrade pip itself; %pip (rather than !pip) targets the environment of the running kernel
%pip install pip --upgrade

In [17]:
# Environment and import setup for the evaluation notebook.
from dotenv import load_dotenv
import os, sys

# Load environment variables from a .env file before `rag` is imported below.
# NOTE(review): this is an absolute, machine-specific path; it will most likely not
# exist on another machine -- consider making the location configurable.
load_dotenv('/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/.env')

# Put the parent of the current working directory on sys.path so the `rag` module can
# be imported from this subdirectory. This has to run before `import rag`.
sys.path.append(os.path.abspath('..'))

import rag
from rag import CONFIG

In [18]:
# Turn on LangSmith tracing and pick the project name the traces are filed under
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_PROJECT": "langchain_evaluator.ipynb on ASK main/local",
})

In [19]:
from langsmith.evaluation import evaluate
from langsmith import Client

# LangSmith client built with default arguments; used below to fetch dataset examples.
# NOTE(review): presumably picks up its API key/endpoint from the environment loaded
# earlier in the notebook -- confirm.
client = Client()

### Choose your dataset

In [4]:

# Name of the LangSmith dataset to evaluate against.
# Other dataset names noted by the author: ASK-groundtruth_v1, initial_EDA
dataset_name = "ASK-groundtruth_with_sources"


# To evaluate against the whole dataset, pass the name itself instead:
# data = dataset_name
# list_examples() hands back a one-shot iterator. Materialize it into a list so the same
# `data` can be passed to several evaluate() calls; otherwise every call after the first
# would receive an already-exhausted iterator and evaluate zero examples.
data = list(client.list_examples(dataset_name=dataset_name, example_ids=[
    "096d9dfb-1835-4b42-99a5-bf4985ca25c6"]))


# Prefix and description for experiments that reference these variables
experiment_prefix = "testing the tracing"

experiment_description = ""

### Evaluate for Answer Correctness (Accuracy)

In [5]:
from langsmith.evaluation import LangChainStringEvaluator


def prepare_cot_qa_data(run, example):
    '''
    Build the field mapping handed to the accuracy evaluator.

    run: the traced RAG run; its outputs must contain "answer"
    example: the dataset example; its inputs must contain "question" and its
        outputs must contain "ground_truth_answer"

    Returns a dict with the keys "input", "reference" and "prediction".
    '''
    # Question and ground-truth answer come from the test dataset
    question = example.inputs["question"]
    ground_truth = example.outputs["ground_truth_answer"]
    # The answer produced by the RAG run being graded
    generated = run.outputs["answer"]

    return dict(input=question, reference=ground_truth, prediction=generated)


# "cot_qa" selects the chain-of-thought QA grader (the CotQAEvalChain class), whose prompt
# template is published here: https://smith.langchain.com/hub/wfh/cot_qa
accuracy_evaluator = LangChainStringEvaluator("cot_qa", prepare_data=prepare_cot_qa_data)

In [None]:
# Run the accuracy experiment. The lambda unwraps the "question" field so that rag.rag
# receives a plain string rather than the whole inputs dict.
evaluate(
    lambda input: rag.rag(input["question"]),
    data=data,
    evaluators=[accuracy_evaluator],
    experiment_prefix=experiment_prefix,
    description=experiment_description,
    metadata=CONFIG,
)  # type: ignore    # This suppresses an error

### Evaluate for Truthfulness (detect Hallucinations)

In [22]:
from langchain import hub
from langchain_openai import ChatOpenAI


# Grading prompt for Truthfulness, pulled from the LangChain hub. It is a LangChain object
# that takes two inputs: "documents" and "student_answer".
grade_prompt_truthfulness = hub.pull("langchain-ai/rag-answer-hallucination")
# `prompt` is a second name bound to the same object
prompt = grade_prompt_truthfulness


def hallucination_evaluator(run, example) -> dict:
    """
    Grade whether the generated answer is supported by the retrieved documents.

    Args:
        run: the traced RAG run; its outputs must contain "context" (the retrieved
            documents) and "answer" (the generated response).
        example: the dataset example. Not read here -- the grading prompt only takes
            the documents and the answer -- but kept so the signature stays (run, example).

    Returns:
        A dict of the form {"key": "Truthfulness", "score": <the grader's "Score" value>}.
    """

    # Only the fields the grading prompt consumes are read from the run
    grader_inputs = {
        "documents": run.outputs["context"],
        "student_answer": run.outputs["answer"],
    }

    # LLM grader; temperature 0 to keep grading as repeatable as possible
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Pipe the hub grading prompt into the grader model
    answer_grader = grade_prompt_truthfulness | llm

    # NOTE(review): indexing with "Score" assumes the hub prompt makes the chain return a
    # dict-like structured result -- confirm against the prompt definition.
    grade = answer_grader.invoke(grader_inputs)

    return {"key": "Truthfulness", "score": grade["Score"]}

In [None]:
# Show the pulled grading prompt object for inspection
grade_prompt_truthfulness

In [None]:
# Run the RAG app over `data` and grade every answer with the hallucination evaluator.
evaluate(
    lambda input: rag.rag(input["question"]),
    data=data,
    evaluators=[hallucination_evaluator],
    # Prefix names the metric actually scored in this run ("Truthfulness"), in line with
    # the "rag-qa-oai-recall" and "...-accuracy-recall-truthfulness" experiments.
    experiment_prefix="rag-qa-oai-truthfulness",
    metadata=CONFIG,
    blocking=True  # This shows the results in the notebook
)

### Retrieval Relevance (Recall)

In [None]:
from langsmith.evaluation import LangChainStringEvaluator


# Scores how relevant the retrieved documents are to the question. The rubric asks for a
# 0-10 score; "normalize_by" presumably rescales that by 10 -- confirm.
# NOTE(review): the rubric text contains typos ("relevnat", "menas", "answre"). It is kept
# exactly as written because it is runtime text passed to the evaluator.
recall_evaluator = LangChainStringEvaluator(
    "score_string",
    prepare_data=lambda run, example: dict(
        # The retrieved documents are graded in place of a model answer
        prediction=run.outputs["context"],
        input=example.inputs["question"],
    ),
    config=dict(
        criteria={
            "Recall": """The Assistant's Answer is a set of documents retrieved from a vectorstore. The input is a question used for retrieval. You will score whether the Assistant's Answer (retrieved documents) are relevant to the input question. The score should be between 0 and 10. A score of [[0]] means that the Assistant answer contains documents that are not at all relevnat to the input question. A score of [[5]] menas that the Assistant answre contains some documents are relevant to the input question. A score of [[10]] means that all of the Assistant answer documents are all relevant to the input question. """,
        },
        normalize_by=10,
    ),
)

In [None]:
# Run the retrieval-relevance experiment on its own
evaluate(
    lambda input: rag.rag(input["question"]),
    metadata=CONFIG,
    experiment_prefix="rag-qa-oai-recall",
    evaluators=[recall_evaluator],
    data=data,
)

### Combo Evaluation

In [20]:

# Name of the LangSmith dataset to evaluate against.
# Other dataset names noted by the author: ASK-groundtruth_v1, initial_EDA
dataset_name = "ASK-groundtruth_with_sources"


# To evaluate against the whole dataset, pass the name itself instead:
# data = dataset_name
# list_examples() hands back a one-shot iterator. Materialize it into a list so `data`
# is still populated if the evaluation cell is run more than once.
data = list(client.list_examples(dataset_name=dataset_name, example_ids=[
    "096d9dfb-1835-4b42-99a5-bf4985ca25c6"]))


# Prefix and description for experiments that reference these variables
experiment_prefix = "setting up the evaluators"

experiment_description = ""

In [None]:
# Single experiment that applies all three evaluators to each RAG answer
evaluate(
    lambda input: rag.rag(input["question"]),
    data=data,
    evaluators=[accuracy_evaluator, recall_evaluator, hallucination_evaluator],
    metadata=CONFIG,
    experiment_prefix="rag-qa-oai-accuracy-recall-truthfulness",
    blocking=True,  # This shows the results in the notebook
)  # type: ignore    # This suppresses an error