In [15]:
import sys
import os
import pandas as pd
from langchain_core.messages import SystemMessage

# Wildcard import, resolved before the sys.path change below, so eval_prompts
# must already be importable (e.g. from the notebook's working directory).
# Later cells call create_factual_accuracy_system_prompt and
# create_successful_response_system_prompt — presumably provided here; confirm.
from eval_prompts import *

# Put the parent of the current working directory at the front of sys.path so
# the imports below are looked up there first. This depends on where the
# kernel was started (os.getcwd()).
evals_dir = os.path.abspath(os.path.join(os.getcwd(), '../'))
sys.path.insert(0, evals_dir)

# These imports must stay after the sys.path.insert above.
# get_golden_dataset (used in a later cell) presumably comes from eval_utils — confirm.
from eval_utils import *  # Import all functions from eval_utils
from src.evaluator import Evaluator
from src.traces_etl import TracesETL

# Shared instances; `evaluator.auto_annotate` is called by the annotation loop
# further down. `trace_etl` is not referenced in the cells visible here.
evaluator = Evaluator()
trace_etl = TracesETL()

In [16]:
# Load the golden dataset once and keep two views of it: the DataFrame for
# column-wise work, and a list of per-row dicts for the annotation loop.
golden_df = get_golden_dataset()
golden_lod = golden_df.to_dict(orient="records")

✅ Success with encoding: cp1252


In [18]:
# Lookup tables from thread_id to the golden label of each eval.
# If a thread_id appears more than once, the last row's label is kept.
thread_id_to_factual_accuracy = {
    tid: label
    for tid, label in zip(
        golden_df["thread_id"].tolist(),
        golden_df["eval_factual_accuracy"].tolist(),
    )
}
thread_id_to_successful_response = {
    tid: label
    for tid, label in zip(
        golden_df["thread_id"].tolist(),
        golden_df["eval_answer_user_question"].tolist(),
    )
}


In [3]:
def _annotate_thread(system_prompt, eval_name, thread_id):
    """Run one automated annotation and tag the result.

    Passes ``system_prompt`` to ``evaluator.auto_annotate``, then stores
    ``eval_name`` and ``thread_id`` on the returned object via item
    assignment and returns that object.
    """
    annotation = evaluator.auto_annotate(system_prompt)
    annotation["eval_name"] = eval_name
    annotation["thread_id"] = thread_id
    return annotation


factual_accuracy_response_lod = []
successful_response_response_lod = []

total_records = len(golden_lod)

# The loop variable is named `record` rather than `dict`: rebinding `dict`
# would shadow the builtin for every cell run afterwards in this kernel.
for i, record in enumerate(golden_lod):
    # Coarse progress indicator: fraction completed, printed every 10 records.
    if i % 10 == 0:
        print(round(i / total_records, 2))

    human_question = record["human_question"]
    ai_answer = record["ai_answer"]
    # The row's system prompt is what gets passed as the reference text.
    reference_text = record["system_prompt"]
    tool_used = record["tool_used"]
    thread_id = record["thread_id"]

    factual_accuracy_system_prompt = create_factual_accuracy_system_prompt(human_question, reference_text, ai_answer)
    successful_response_system_prompt = create_successful_response_system_prompt(human_question, ai_answer, tool_used)

    factual_accuracy_response_lod.append(
        _annotate_thread(factual_accuracy_system_prompt, "factual_accuracy", thread_id)
    )

    # NOTE(review): this label is "complete_response" while the surrounding
    # names (and the saved summary output) say "successful_response" —
    # confirm which label downstream consumers expect.
    successful_response_response_lod.append(
        _annotate_thread(successful_response_system_prompt, "complete_response", thread_id)
    )

0.0
0.21
0.43
0.64
0.85


In [30]:
# One row per annotated thread for each eval.
factual_accuracy_df = pd.DataFrame(factual_accuracy_response_lod)
successful_response_df = pd.DataFrame(successful_response_response_lod)

# Look the golden label up from each row's own thread_id. Mapping
# golden_df["thread_id"] instead and assigning the result relies on golden_df
# and the response frames sharing the same row index and order, which silently
# mislabels rows if golden_df was filtered, sorted, or re-indexed.
factual_accuracy_df["golden_response"] = factual_accuracy_df["thread_id"].map(thread_id_to_factual_accuracy)
successful_response_df["golden_response"] = successful_response_df["thread_id"].map(thread_id_to_successful_response)

# Stack both evals into one long frame; ignore_index avoids duplicate index
# labels (each input frame starts at 0).
eval_results_df = pd.concat([factual_accuracy_df, successful_response_df], ignore_index=True)

In [31]:
# Number of rows per (eval, automated verdict) pair.
(
    eval_results_df
    .groupby(["eval_name", "response"])["thread_id"]
    .count()
    .reset_index()
)

Unnamed: 0,eval_name,response,thread_id
0,factual_accuracy,fail,44
1,factual_accuracy,pass,3
2,successful_response,fail,8
3,successful_response,pass,39


In [34]:
# Flag rows where the automated verdict equals the golden label.
eval_results_df["eval_match"] = (
    eval_results_df["golden_response"] == eval_results_df["response"]
)

# Agreement / disagreement counts per eval.
(
    eval_results_df
    .groupby(["eval_name", "eval_match"])["thread_id"]
    .count()
    .reset_index()
)

Unnamed: 0,eval_name,eval_match,thread_id
0,factual_accuracy,False,38
1,factual_accuracy,True,9
2,successful_response,False,4
3,successful_response,True,43


In [42]:
# Attach the original conversation context from the golden dataset to every
# eval result row, keyed on thread_id (left join keeps all result rows).
golden_context_columns = ["thread_id", "ai_answer", "system_prompt", "tool_used", "human_question"]
final_eval_df = pd.merge(
    eval_results_df,
    golden_df[golden_context_columns],
    how="left",
    on="thread_id",
)

In [43]:
# Destination for the merged results. Defaults to the original location; set
# the EVAL_RESULTS_CSV environment variable to write elsewhere without editing
# the notebook (the default is an absolute path on one developer's machine).
eval_results_csv_path = os.environ.get(
    "EVAL_RESULTS_CSV",
    "/Users/ewashington/Desktop/github/resume-bot/data/evals/eval_results_df.csv",
)
final_eval_df.to_csv(eval_results_csv_path, index=False)