# Deep Eval Exercise

In [None]:
import requests
from io import StringIO
from dotenv import load_dotenv
import os, subprocess

# Fetch the lab settings once and cache them in a local file, then export the
# LLM credentials as the OPENAI_* environment variables and register the same
# endpoint with the deepeval CLI.
env_file = '.env.instruqt'
if not os.path.exists(env_file):
    # A timeout keeps the cell from hanging forever if the host is unreachable.
    response = requests.get('http://kubernetes-vm:9000/env', timeout=30)
    # Fail here instead of caching an HTTP error page as the settings file:
    # once the file exists it is never downloaded again.
    response.raise_for_status()
    env_text = response.text
    with open(env_file, 'w', encoding='utf-8') as f:
        f.write(env_text)
load_dotenv(env_file)

openai_api_key = os.environ.get("LLM_APIKEY")
url = os.environ.get("LLM_PROXY_URL")

# Without this check a missing key surfaces as an opaque TypeError from
# os.environ, and a missing URL silently becomes "https://None".
missing = [
    var_name
    for var_name, value in (("LLM_APIKEY", openai_api_key), ("LLM_PROXY_URL", url))
    if not value
]
if missing:
    raise RuntimeError(
        f"Missing required setting(s) {', '.join(missing)}; "
        f"check the contents of {env_file}"
    )

openai_api_base = f"https://{url}"

os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_BASE_URL"] = openai_api_base

# Keep the result in a variable: it is needed for the return-code check below,
# and if this runs as the last expression of a notebook cell a bare call would
# echo the argument list (including the API key) into the cell output.
set_model_result = subprocess.run([
    "deepeval", "set-local-model",
    "--model-name=eval-gpt-4o",  # needs azure 2024-11-20 +
    f"--base-url={openai_api_base}",
    f"--api-key={openai_api_key}",
])
if set_model_result.returncode != 0:
    print(
        "Warning: 'deepeval set-local-model' exited with code "
        f"{set_model_result.returncode}"
    )

# Uncomment the following line if you want to use a .env file to control settings
# load_dotenv()   


In [None]:
import json
from utility.util_deep_eval import generateLLMTestCase, evaluateTestCases
from rich.console import Console
import textwrap
# Silence rich console output: Console.print is replaced on the class itself
# with a function that accepts any arguments and does nothing.
def _discard_console_output(*args, **kwargs):
    return None


Console.print = _discard_console_output

# Hand-written evaluation scenarios. Every entry carries the same five keys,
# which are read by name when the cases are converted for the evaluator:
#   name                  - label used when reporting the result
#   question              - the question being asked
#   rag_answer            - the answer under evaluation
#   top_context_citations - the numbered context snippets for that question
#   correct_answer        - the reference answer
test_cases = [
    {
        "name": "1: Simple Barcelona",
        "question": "What is a good time of year to avoid the crowds in Barcelona",
        "rag_answer": "A good time to avoid the crowds in Barcelona is during the off-season, particularly in the winter months of January and February, as long as the possibility of rain is low. These months are considered lovely despite the cold weather [1].",
        "top_context_citations": [
            "[1] Barcelona can be visited off-season and despite the cold weather, is a lovely city even in the winter months of January and February.",
            "[2] sunscreen is essential in summer months",
        ],
        # Reference answer mirrors citation [1]; the stray trailing "s" on
        # "February" was a typo in the reference text.
        "correct_answer": "Barcelona can be visited off-season and despite the cold weather, is a lovely city even in the winter months of January and February",
    },
    {
        "name": "2: Example missing citations",
        "question": "What is the answer to the ultimate question of life, the universe, and everything? What book is this from?",
        "rag_answer": "The answer is 42.",
        "top_context_citations": [
            "[1] After 7.5 million years, Deep Thought solemnly reveals that the answer is: Forty-two.",
            "[2] The Hitchhiker's Guide to the Galaxy, by Douglas Adams identifies the answer is 42.",
        ],
        "correct_answer": "The answer to the ultimate question of life, the universe, and everything is 42. This comes from a book called The Hitchhiker's Guide to the Galaxy by Douglas Adams",
    },
    {
        "name": "3: Model ignores RAG context and gives public answer",
        "question": "Which city offers card games for money, lots of alcohol, and hotels for tech company events all on the same street?",
        "rag_answer": "The context does not provide the answer to this question. However, based on my knowledge, Las Vegas is known for its casinos, nightlife, and hotels that cater to tech company events.",
        "top_context_citations": [
            "[1] city 1 has great card games and no hotels",
            "[2] city 2 is great for tech company events but no gambling or alcohol",
        ],
        "correct_answer": "Las Vegas, Nevada",
    },
]


# Convert each scenario dict into an evaluator test case. The fields are
# passed positionally, in exactly this order.
_case_fields = (
    "name",
    "question",
    "rag_answer",
    "top_context_citations",
    "correct_answer",
)

deep_eval_test_cases = [
    generateLLMTestCase(*(case[field] for field in _case_fields))
    for case in test_cases
]


# Run every collected test case through the evaluator, then print a short
# report: one section per test, one line per metric, and the metric's reason
# wrapped to 60 columns underneath it.
rag_evaluation = evaluateTestCases(deep_eval_test_cases)

## If you just want to print out the full return
# print(json.dumps(rag_evaluation.model_dump(), indent=4))


# Sort test results by name before printing
sorted_results = sorted(rag_evaluation.test_results, key=lambda x: x.name)

for result in sorted_results:
    print(f"Test {result.name}\n\tsuccess: {result.success}")
    # NOTE(review): metrics_data and reason are treated as possibly None
    # (e.g. when a metric errored) - confirm against the installed deepeval
    # version. textwrap raises on None, so fall back to empty values.
    for metric in result.metrics_data or []:
        print(f"\t{metric.name}: {metric.score}")
        for line in textwrap.wrap(metric.reason or "", width=60):
            print(f"\t\t{line}")
    print("\n")


