In [243]:
from langsmith import Client
import os
from datetime import datetime, timedelta
import pandas as pd

In [254]:
def get_runs_list(lookback_days: int = 1):
    """Fetch recent LLM runs from the configured LangSmith project.

    The project name is read from the ``LANGSMITH_PROJECT`` environment
    variable. Only runs of type ``"llm"`` with a start time no older than
    ``lookback_days`` days before ``datetime.now()`` are requested.

    Args:
        lookback_days: Size of the look-back window, in days.

    Returns:
        A list containing every run yielded by ``Client.list_runs``.
    """
    langsmith_client = Client()
    query = {
        "project_name": os.getenv("LANGSMITH_PROJECT"),
        "start_time": datetime.now() - timedelta(days=lookback_days),
        "run_type": "llm",
    }
    # list_runs is consumed eagerly so callers get a concrete list.
    return list(langsmith_client.list_runs(**query))

def extract_question(trace_lod):
    """Return the human question recorded in the last trace of a thread.

    Looks at the final entry of ``trace_lod`` and reads the ``kwargs`` of the
    last message in the first message list of its inputs.

    Args:
        trace_lod: List of trace dicts belonging to one thread.

    Returns:
        The ``content`` of that message.

    Raises:
        ValueError: If the message ``type`` is not ``"human"``.
    """
    last_trace = trace_lod[-1]
    message_kwargs = last_trace["inputs"]["messages"][0][-1]["kwargs"]
    if message_kwargs["type"] != "human":
        raise ValueError("Input is not a human message")
    return message_kwargs["content"]
    
def extract_ai_answer(trace_lod):
    """Return the generated text of the first trace in a thread.

    Args:
        trace_lod: List of trace dicts belonging to one thread.

    Returns:
        The ``text`` of the first generation in the first generation list of
        ``trace_lod[0]``'s outputs.
    """
    first_trace = trace_lod[0]
    first_generation = first_trace["outputs"]["generations"][0][0]
    return first_generation["text"]

def extract_classifications(trace_lod):
    """Collect the output text of every classification node in a thread.

    A trace counts as a classification when the ``langgraph_node`` value in
    its ``extra`` metadata contains the substring ``"classification"``.

    Args:
        trace_lod: List of trace dicts belonging to one thread.

    Returns:
        Dict mapping each matching node name to the text of that trace's
        first generation. If a node name occurs more than once, the last
        occurrence in ``trace_lod`` wins.
    """
    labels_by_node = {}
    for run in trace_lod:
        node_name = run["extra"]["metadata"]["langgraph_node"]
        if "classification" not in node_name:
            continue
        labels_by_node[node_name] = run["outputs"]["generations"][0][0]["text"]
    return labels_by_node

def get_filtered_traces_and_thread_id_list(is_local_testing: bool, runs_list):
    """Select runs by their ``is_local_testing`` flag and list their threads.

    Args:
        is_local_testing: Value the run's ``metadata['is_local_testing']``
            must equal for the run to be kept.
        runs_list: Iterable of run objects exposing a ``metadata`` mapping.

    Returns:
        A ``(filtered_traces, thread_id_list)`` tuple. ``filtered_traces``
        keeps the matching runs in their original order. ``thread_id_list``
        holds each distinct ``metadata['thread_id']`` of those runs exactly
        once, in order of first appearance.

    Raises:
        KeyError: If a matching run has no ``thread_id`` in its metadata.
    """
    # Runs that never recorded the flag cannot match either value, so they
    # are skipped instead of aborting the whole export with a KeyError.
    filtered_traces = [
        trace
        for trace in runs_list
        if "is_local_testing" in trace.metadata
        and trace.metadata["is_local_testing"] == is_local_testing
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # yield an order that changes between interpreter sessions, making the
    # exported rows non-reproducible.
    thread_id_list = list(
        dict.fromkeys(trace.metadata["thread_id"] for trace in filtered_traces)
    )
    return filtered_traces, thread_id_list

def create_eval_lod(is_local_testing: bool = True, lookback_days: int = 1):
    """Build one evaluation record per conversation thread.

    Recent LLM runs are fetched, filtered by the ``is_local_testing`` metadata
    flag and grouped by ``thread_id``. Each thread yields one dict.

    Args:
        is_local_testing: Keep only runs whose metadata flag equals this value.
        lookback_days: How many days back to fetch runs.

    Returns:
        A list of dicts with the keys ``thread_id``, ``human_question``,
        ``ai_answer`` and ``tool_used``, plus one key per classification node
        found in the thread.

    Raises:
        ValueError: Propagated from ``extract_question`` when the message it
            reads is not a human message.
    """
    runs_list = get_runs_list(lookback_days=lookback_days)
    filtered_traces, thread_id_list = get_filtered_traces_and_thread_id_list(
        is_local_testing=is_local_testing, runs_list=runs_list
    )

    # Bucket the traces by thread in a single pass; rescanning every trace for
    # each thread would be quadratic. Order within a thread is preserved.
    traces_by_thread = {}
    for trace in filtered_traces:
        traces_by_thread.setdefault(trace.metadata["thread_id"], []).append(trace)

    output_lod = []
    for thread_id in thread_id_list:
        trace_lod_i = [trace.dict() for trace in traces_by_thread.get(thread_id, [])]
        human_question = extract_question(trace_lod_i)
        ai_answer = extract_ai_answer(trace_lod_i)
        classifications = extract_classifications(trace_lod_i)
        output_dict_i = {
            "thread_id": thread_id,
            "human_question": human_question,
            "ai_answer": ai_answer,
            # True when the first trace's invocation params carry a "tools"
            # entry. NOTE(review): presumably this means tools were offered to
            # the model, not necessarily called -- confirm.
            "tool_used": "tools" in trace_lod_i[0]["extra"]["invocation_params"],
        }
        output_dict_i.update(classifications)
        output_lod.append(output_dict_i)
    return output_lod

In [255]:
# Build one evaluation record per thread from the last two days of runs flagged
# as local testing, then load the records into a DataFrame.
eval_lod = create_eval_lod(is_local_testing=True, lookback_days=2)
eval_df = pd.DataFrame(eval_lod)
# Report (rows, columns), then preview the first rows as the cell output.
print(eval_df.shape)
eval_df.head()


(47, 6)


Unnamed: 0,thread_id,human_question,ai_answer,tool_used,intent_classification,github_stats_classification
0,f9e45a14-9936-4a68-a3ca-74192000655d,What features or interfaces did you build to e...,To empower analysts to create and apply tags e...,False,article tagging,False
1,ead49e51-d1b9-43bc-a202-e7cb8521a2cb,Can you describe how you handle ambiguity or o...,To handle ambiguity or overlap between differe...,False,medical taxonomy,False
2,9a61356d-5b14-48a1-abb4-02461f7f2bf6,What principles or frameworks did you follow w...,When designing the system architectures for ev...,False,ds-lead,False
3,817d30b0-31a8-425c-a6c7-05cea7a9d221,Can you describe the process you used for prod...,In leading my team of data scientists for prod...,False,ds-lead,False
4,623d845c-e2c4-408d-ab9e-443cff3851e5,Can you elaborate on how you integrated the ta...,The integration of the tagging product with Go...,False,article tagging,False


In [256]:
# Destination of the exported evaluation set. The path can be overridden with
# the GOLDEN_DF_PATH environment variable so the notebook also runs on other
# machines; the default is the original location.
GOLDEN_DF_PATH = os.getenv(
    "GOLDEN_DF_PATH",
    "/Users/ewashington/Desktop/github/resume-bot/data/evals/golden_df.csv",
)
# index=False leaves the DataFrame's row index out of the file.
eval_df.to_csv(GOLDEN_DF_PATH, index=False)