In [None]:
from langsmith.evaluation import LangChainStringEvaluator, SingleEvaluatorInput
from langchain.chat_models import ChatOpenAI


def prepare_cot_qa_data(run, example):
    """Map a run/example pair onto the fields the string evaluator consumes.

    Args:
        run: Object whose ``outputs`` mapping holds the generated text under
            the ``"answer"`` key.
        example: Dataset example whose ``inputs`` mapping holds the
            ``"question"`` and whose ``outputs`` mapping holds the
            ``"ground_truth_answer"``.

    Returns:
        A ``SingleEvaluatorInput`` with ``input``, ``reference`` and
        ``prediction`` populated from those three values.
    """
    # Pull the three values out first so the constructor call reads as a
    # plain field mapping.
    question = example.inputs["question"]
    expected_answer = example.outputs["ground_truth_answer"]
    generated_answer = run.outputs["answer"]

    return SingleEvaluatorInput(
        input=question,
        reference=expected_answer,
        prediction=generated_answer,
    )


# Initialize the LLM with the desired model and temperature
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

# cot_qa uses the CotQAEvalChain class which uses the prompt template here: https://smith.langchain.com/hub/wfh/cot_qa
# LangChainStringEvaluator only accepts `config` and `prepare_data` as keyword
# arguments; `config` is forwarded to the underlying evaluator loader, so the
# grading LLM must be supplied through it (a bare `llm=` kwarg is rejected
# with a TypeError).
accuracy_evaluator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": llm},  # Pass the LLM instance with the desired model and temperature
    prepare_data=prepare_cot_qa_data,
)

# Example usage of the evaluator
from types import SimpleNamespace

input_data = {
    "question": "What is the capital of France?",
    "ground_truth_answer": "Paris",
    "answer": "Paris",
}

# Simulate a run object. prepare_cot_qa_data reads `run.outputs` through
# attribute access, so a plain dict would fail with AttributeError; a
# SimpleNamespace provides the attribute while keeping `outputs` a dict.
run = SimpleNamespace(
    outputs={"answer": input_data["answer"]},
)

# Simulate an example object (same reasoning: `.inputs` / `.outputs` must be
# attributes holding dicts).
example = SimpleNamespace(
    inputs={"question": input_data["question"]},
    outputs={"ground_truth_answer": input_data["ground_truth_answer"]},
)

# Prepare the data for evaluation
evaluator_input = prepare_cot_qa_data(run, example)

# Evaluate the input data. LangChainStringEvaluator exposes no `evaluate`
# method, so grade the prepared fields directly with the LangChain string
# evaluator it wraps; the prepared keys (input / reference / prediction)
# are exactly the keyword arguments `evaluate_strings` takes.
grader_response = accuracy_evaluator.evaluator.evaluate_strings(**evaluator_input)

# Print the grader response
print(grader_response)