# Evaluate results of architectures

In [None]:
import os
import json
from pydantic import BaseModel
from openai import OpenAI

from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment
# (python-dotenv); presumably this is where the OpenAI credentials used by the
# later cells come from — confirm against your .env.
load_dotenv()

In [None]:
# Load the sample submission
# Name of the submission file (without the ".json" extension) located in
# ../data/submissions/.
# NOTE(review): this is empty here, presumably a placeholder to fill in before
# running the cell; with "" the path resolves to "../data/submissions/.json".
sample_submission = ""

# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the
# result independent of the platform's locale encoding (e.g. cp1252 on Windows).
with open(f"../data/submissions/{sample_submission}.json", "r", encoding="utf-8") as file:
    submission = json.load(file)

In [None]:
# Load ground truth
# File name of the reference answers inside ../data/. Kept under its own name so
# that `ground_truth` only ever refers to the parsed JSON content.
ground_truth_file = "answers_round1.json"

# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps the
# result independent of the platform's locale encoding.
with open(os.path.join("../data/", ground_truth_file), "r", encoding="utf-8") as file:
    ground_truth = json.load(file)

In [None]:
# Using Phoenix Tracing
# Quickstart tutorial (for Notebooks, no persistent trace storing!): https://docs.arize.com/phoenix/tracing/llm-traces-1

# pip install arize-phoenix
import phoenix as px
from phoenix.otel import register

# pip install -q openinference-instrumentation-openai
from openinference.instrumentation.openai import OpenAIInstrumentor

# Start the Phoenix app from within this notebook session. Per the quickstart
# linked above, traces are not stored persistently in this mode.
px.launch_app()

# Create the tracer provider that exports traces to the endpoint given below.
tracer_provider = register(
    project_name="my-llm-app",  # Default is 'default'
    endpoint="http://localhost:4317",  # Sends traces using gRPC
)

# Attach the OpenAI instrumentation to that tracer provider; presumably OpenAI
# client calls made after this point are traced — confirm against the Phoenix docs.
OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)

In [None]:
# pip install nest_asyncio
import nest_asyncio
from phoenix.evals import HallucinationEvaluator, OpenAIModel, QAEvaluator, run_evals

# Required so the evaluators can run concurrently inside a notebook environment.
nest_asyncio.apply()

# LLM used as the judge by both evaluators.
# NOTE(review): no API key is passed here; presumably it is picked up from the
# environment populated by load_dotenv() — confirm.
eval_model = OpenAIModel(model="gpt-4o")

# One evaluator per metric, both backed by the same judge model.
hallucination_evaluator = HallucinationEvaluator(eval_model)
qa_evaluator = QAEvaluator(eval_model)

# Column names the evaluators expect in the input dataframe:
#   `hallucination_evaluator` -> 'output', 'input', 'context'
#   `qa_evaluator`            -> 'output', 'input', 'reference'
# NOTE(review): `df` is not defined in the visible cells; it must already exist
# with 'query', 'response' and 'reference' columns before this cell runs.
required_columns = ("output", "input", "context", "reference")

df = df.copy()
df["context"] = df["reference"]  # the reference text doubles as the context
df = df.rename(columns={"query": "input", "response": "output"})
assert all(column in df.columns for column in required_columns)

# Run both evaluators; each one returns its own dataframe of evaluation results,
# in the same order as the evaluators list.
# We upload the evaluation results to Phoenix in the next step
evaluators = [hallucination_evaluator, qa_evaluator]
hallucination_eval_df, qa_eval_df = run_evals(
    dataframe=df,
    evaluators=evaluators,
    provide_explanation=True,
)

In [None]:
# Combine the evaluated rows with the label and explanation produced by each
# evaluator. `assign` returns a new dataframe, so `df` itself is left untouched.
results_df = df.assign(
    hallucination_eval=hallucination_eval_df["label"],
    hallucination_explanation=hallucination_eval_df["explanation"],
    qa_eval=qa_eval_df["label"],
    qa_explanation=qa_eval_df["explanation"],
)
# Last expression of the cell: shows the first rows as the cell output.
results_df.head()