# Deep Eval Exercise

In [None]:
import requests
from io import StringIO
from dotenv import load_dotenv
import os, subprocess

# Fetch the workshop environment file once, cache it as `.env.instruqt`, and
# load its variables into this process.
if not os.path.exists('.env.instruqt'):
    # The timeout stops the cell from hanging if the host is unreachable.
    # raise_for_status() prevents an HTTP error page from being written to
    # disk, where the exists() guard above would keep it cached forever.
    env_response = requests.get('http://kubernetes-vm:9000/env', timeout=30)
    env_response.raise_for_status()
    env_text = env_response.text
    with open('.env.instruqt', 'w') as f:
        f.write(env_text)
load_dotenv('.env.instruqt')

openai_api_key = os.environ.get("LLM_APIKEY")
url = os.environ.get("LLM_PROXY_URL")

# Fail early with a readable message. Without this check a missing key
# surfaces as a TypeError on the os.environ assignment below, and a missing
# proxy URL silently becomes "https://None".
missing_settings = [
    setting_name
    for setting_name, setting_value in (("LLM_APIKEY", openai_api_key), ("LLM_PROXY_URL", url))
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_settings)
        + ". Check the contents of .env.instruqt."
    )

openai_api_base = f"https://{url}"

os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_BASE_URL"] = openai_api_base

## needed for the G-Eval test type
# The result is assigned rather than left as the cell's final expression:
# a CompletedProcess repr includes the full argument list, which would render
# the API key into the saved notebook output.
set_model_result = subprocess.run([
    "deepeval", "set-local-model",
    "--model-name=eval-gpt-4o",  ## needs azure 2024-11-20 +
    f"--base-url={openai_api_base}",
    f"--api-key={openai_api_key}",
])
if set_model_result.returncode != 0:
    # Report only the exit code; the command line contains the API key.
    print(f"WARNING: 'deepeval set-local-model' exited with code {set_model_result.returncode}")

# Uncomment the following line if you want to use a .env file to control settings
# load_dotenv()   


True

In [2]:
import json
from utility.util_deep_eval import generateLLMTestCase, evaluateTestCases
from rich.console import Console
import textwrap
# Monkey patch: swap rich's Console.print for a no-op so that every Console
# instance in this kernel prints nothing.
def _discard_console_output(*args, **kwargs):
    """Accept any arguments and do nothing."""
    return None


Console.print = _discard_console_output

# Cases 2 and 3 ask the same question and share the same reference answer,
# so both strings are defined once and reused.
HITCHHIKER_QUESTION = "What is the answer to the ultimate question of life, the universe, and everything? What book is this from?"
HITCHHIKER_ANSWER = "The answer to the ultimate question of life, the universe, and everything is 42. This comes from a book called The Hitchhiker's Guide to the Galaxy by Douglas Adams"

# Each entry describes one evaluation scenario:
#   name                  - label used to match results back to this entry
#   testShouldPass        - whether the evaluation is expected to succeed
#   idealTestScore        - threshold: score must be >= this when the test
#                           should pass, and <= this when it should fail
#   question              - the user question
#   rag_answer            - the answer being evaluated
#   top_context_citations - the numbered context snippets
#   correct_answer        - the reference answer
test_cases = [
    dict(
        name="1: Simple Barcelona (Expected to Pass)",
        testShouldPass=True,
        idealTestScore=0.6,  # lower bound for the score
        question="What is a good time of year to avoid the crowds in Barcelona",
        rag_answer="A good time to avoid the crowds in Barcelona is during the off-season, particularly in the winter months of January and February, as long as the possibility of rain is low. These months are considered lovely despite the cold weather [1].",
        top_context_citations=[
            "[1] Barcelona can be visited off-season and despite the cold weather, is a lovely city even in the winter months of January and February.",
            "[2] sunscreen is essential in summer months",
        ],
        # NOTE(review): "Februarys" looks like a typo; kept as-is because this
        # text is evaluation input — confirm before changing.
        correct_answer="Barcelona can be visited off-season and despite the cold weather, is a lovely city even in the winter months of January and Februarys",
    ),
    dict(
        name="2: Example missing citations (Expected to FAIL)",
        testShouldPass=False,
        idealTestScore=0.0,  # upper bound for the score
        question=HITCHHIKER_QUESTION,
        rag_answer="42.",
        top_context_citations=[
            "[1] After 7.5 million years, Deep Thought solemnly reveals that the answer is: Forty-two.",
            "[2] The Hitchhiker's Guide to the Galaxy, by Douglas Adams identifies the answer is 42.",
        ],
        correct_answer=HITCHHIKER_ANSWER,
    ),
    dict(
        name="3: Incorrect answer with a correctly referenced citation (Expected to FAIL)",
        testShouldPass=False,
        idealTestScore=0.3,  # upper bound for the score
        question=HITCHHIKER_QUESTION,
        rag_answer="The friends you make along the way. [1]",
        top_context_citations=[
            "[1] It's not winning that's the goal, it's the friends you make along the way.",
        ],
        correct_answer=HITCHHIKER_ANSWER,
    ),
    dict(
        name="4: Model ignores RAG context and gives public answer (Expected to FAIL)",
        testShouldPass=False,
        idealTestScore=0.0,  # upper bound for the score
        question="Which city offers card games for money, lots of alcohol, and hotels for tech company events all on the same street?",
        rag_answer="The context does not provide the answer to this question. However, based on my knowledge, Las Vegas is known for its casinos, nightlife, and hotels that cater to tech company events.",
        top_context_citations=[
            "[1] city 1 has great card games and no hotels",
            "[2] city 2 is great for tech company events but no gambling or alcohol",
        ],
        correct_answer="Las Vegas, Nevada",
    ),
]


# Keys read from each test-case dict, in the positional order that
# generateLLMTestCase is called with.
CASE_FIELDS = ("name", "question", "rag_answer", "top_context_citations", "correct_answer")

# Convert every dict in `test_cases` into a test-case object via the helper.
deep_eval_test_cases = []
for case in test_cases:
    name, question, rag_answer, top_context_citations, correct_answer = (
        case[field] for field in CASE_FIELDS
    )
    deep_eval_test_cases.append(
        generateLLMTestCase(name, question, rag_answer, top_context_citations, correct_answer)
    )

# Run the evaluation over all generated cases with caching turned off.
rag_evaluation = evaluateTestCases(deep_eval_test_cases, use_cache=False)

## If you just want to print out the full return
# print(json.dumps(rag_evaluation.model_dump(), indent=4))


# Sort test results by name before printing so the report order is stable
sorted_results = sorted(rag_evaluation.test_results, key=lambda x: x.name)

# Index the expectations by test name once instead of rescanning `test_cases`
# for every result. setdefault keeps the first entry when names repeat, which
# matches a first-match linear scan.
expectations_by_name = {}
for test_case in test_cases:
    expectations_by_name.setdefault(test_case["name"], test_case)


def _score_matches_expectation(test_should_pass, test_score, ideal_test_score):
    """Return True when the score falls on the expected side of the threshold.

    Args:
        test_should_pass: True if the test is expected to succeed.
        test_score: The metric score, or None when no score is available.
        ideal_test_score: Threshold; the score must be >= it for a test that
            should pass and <= it for a test that should fail.

    Returns:
        bool: False when the score is missing, since nothing can be confirmed.
    """
    if test_score is None:
        return False
    if test_should_pass:
        return test_score >= ideal_test_score
    return test_score <= ideal_test_score


for result in sorted_results:
    # Look up the expectations for this result; fall back to "should fail at
    # 0.0" when no test case carries the same name.
    expectation = expectations_by_name.get(result.name, {})
    test_should_pass = expectation.get("testShouldPass", False)
    ideal_test_score = expectation.get("idealTestScore", 0.0)

    print(f"Test {result.name}\n\tDeepEval reports success: {result.success}")

    # Only the first metric is reported. Everything printed below comes from
    # these locals, so a result without metrics can neither raise NameError
    # nor reuse the previous result's metric.
    first_metric = next(iter(result.metrics_data or []), None)
    if first_metric is None:
        test_score = None
        test_print_value = "No metric data returned"
        test_reason = ""
    else:
        test_score = first_metric.score
        test_print_value = f"{first_metric.name}: {first_metric.score}"
        test_reason = first_metric.reason or ""  # reason may be missing

    # Determine if the test had the correct result
    score_is_as_expected = _score_matches_expectation(test_should_pass, test_score, ideal_test_score)
    correct_result_emoji = "✅" if result.success == test_should_pass and score_is_as_expected else "❌"

    # The verdict line is skipped when the reason contains 'TODO'
    # (presumably a placeholder from an unfinished metric — confirm against
    # utility.util_deep_eval).
    if 'TODO' not in test_reason:
        print(f"\tCorrect Test Result: {correct_result_emoji}")

    print(f"\t{test_print_value}")
    wrapped_reason = textwrap.fill(test_reason, width=60)
    for line in wrapped_reason.splitlines():
        print(f"\t\t{line}")

    print("\n")






Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 4 test case(s) in parallel: |██████████|100% (4/4) [Time Taken: 00:00, 239.26test case/s]

Test 1: Simple Barcelona (Expected to Pass)
	DeepEval reports success: True
	Correct Test Result: ✅
	Citation Correctness (DAG): 0.8366997233899609
		Actual Output aligns well with Expected Output, stating
		January and February as off-season months with lovely
		conditions despite cold weather. The only minor omission is
		the lack of explicit mention of 'Barcelona can be visited
		off-season,' which is implied but not directly stated.


Test 2: Example missing citations (Expected to FAIL)
	DeepEval reports success: False
	Correct Test Result: ✅
	Citation Correctness (DAG): 0.0
		The score is 0.0 because the Deterministic Decision Tree
		traversal evaluated the path starting at 'TaskNode', where
		no citation annotation in the format [#] was extracted,
		resulting in 'null'. Proceeding to the
		'BinaryJudgementNode', the presence test for a citation
		annotation format [#] returned 'False' as the format was
		absent and stated as null. This led to the 'VerdictNode'
		delivering a 'Fal


