# Learning DeepEval


### Setup

In [None]:
import requests
from io import StringIO
from dotenv import load_dotenv
import os, subprocess

# Fetch the lab's environment settings once and cache them in .env.instruqt,
# so re-running this cell does not hit the settings endpoint again.
if not os.path.exists('.env.instruqt'):
    # Bound the wait so an unreachable host fails the cell instead of hanging it.
    response = requests.get('http://kubernetes-vm:9000/env', timeout=10)
    # Stop here on an HTTP error; otherwise the error body would be cached
    # as the .env file and reused on every later run.
    response.raise_for_status()
    with open('.env.instruqt', 'w') as f:
        f.write(response.text)
load_dotenv('.env.instruqt')

openai_api_key = os.environ.get("LLM_APIKEY")
url = os.environ.get("LLM_PROXY_URL")

# os.environ only accepts strings: a missing key would raise an opaque
# TypeError below, and a missing URL would silently become "https://None".
missing_settings = [
    name
    for name, value in (("LLM_APIKEY", openai_api_key), ("LLM_PROXY_URL", url))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required setting(s): " + ", ".join(missing_settings)
        + ". Check the contents of .env.instruqt."
    )

openai_api_base = f"https://{url}"

# Re-export the settings under the OPENAI_* environment variable names.
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_BASE_URL"] = openai_api_base


# Uncomment the following line if you want to use a .env file to control settings
# load_dotenv()   


True

### Let's See A Passing Test

In [3]:
from deepeval import evaluate
from deepeval.test_case import LLMTestCase
from deepeval.metrics import FaithfulnessMetric


# Stand-in for the passage a RAG retriever would return for the question.
retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]

# The customer's question.
user_input = "What if these shoes don't fit?"

# Stand-in for the generated answer; it restates the refund policy found in
# the retrieved passage.
actual_output = "We offer a 30-day full refund at no extra cost."

# Bundle question, answer and retrieved passage into one test case.
test_case = LLMTestCase(input=user_input, actual_output=actual_output, retrieval_context=retrieval_context)

# Faithfulness metric with a 0.7 threshold, using gpt-4o as the evaluation
# model; include_reason asks for the textual explanation that appears in the
# `reason` field of the results.
metric = FaithfulnessMetric(threshold=0.7, model="gpt-4o", include_reason=True)


# Left as the cell's last expression so the returned result is displayed.
evaluate(test_cases=[test_case], metrics=[metric])



Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:02,  2.57s/test case]



Metrics Summary

  - ✅ Faithfulness (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the actual output perfectly aligns with the retrieval context, showcasing consistency and accuracy., error: None)

For test case:

  - input: What if these shoes don't fit?
  - actual output: We offer a 30-day full refund at no extra cost.
  - expected output: None
  - context: None
  - retrieval context: ['All customers are eligible for a 30 day full refund at no extra cost.']


Overall Metric Pass Rates

Faithfulness: 100.00% pass rate







EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Faithfulness', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because the actual output perfectly aligns with the retrieval context, showcasing consistency and accuracy.', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.005110000000000001, verbose_logs='Truths (limit=None):\n[\n    "All customers are eligible for a full refund.",\n    "The refund period is 30 days.",\n    "No extra cost is associated with the refund."\n] \n \nClaims:\n[\n    "We offer a 30-day full refund at no extra cost."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input="What if these shoes don't fit?", actual_output='We offer a 30-day full refund at no extra cost.', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at n

### Let's See A Failing Test

In [4]:
# Uses evaluate, LLMTestCase and FaithfulnessMetric imported in the
# passing-test cell.

# Stand-in for the passage a RAG retriever would return for the question.
retrieval_context = ["All customers are eligible for a 30 day full refund at no extra cost."]

# The customer's question.
user_input = "What if these shoes don't fit?"

# Stand-in for the generated answer; this one denies refunds, which
# contradicts the retrieved passage.
actual_output = "You will never get your money back. No refunds. No Soup for you!"

# Bundle question, answer and retrieved passage into one test case.
test_case = LLMTestCase(input=user_input, actual_output=actual_output, retrieval_context=retrieval_context)

# Faithfulness metric with a 0.7 threshold, using gpt-4o as the evaluation
# model; include_reason asks for the textual explanation that appears in the
# `reason` field of the results.
metric = FaithfulnessMetric(threshold=0.7, model="gpt-4o", include_reason=True)


# Left as the cell's last expression so the returned result is displayed.
evaluate(test_cases=[test_case], metrics=[metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:03,  3.23s/test case]



Metrics Summary

  - ❌ Faithfulness (score: 0.3333333333333333, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 0.33 because the claims in the actual output regarding refunds directly contradict the retrieval context, which assures customers of a 30-day full refund policy. These discrepancies indicate a low level of alignment with the provided information., error: None)

For test case:

  - input: What if these shoes don't fit?
  - actual output: You will never get your money back. No refunds. No Soup for you!
  - expected output: None
  - context: None
  - retrieval context: ['All customers are eligible for a 30 day full refund at no extra cost.']


Overall Metric Pass Rates

Faithfulness: 0.00% pass rate







EvaluationResult(test_results=[TestResult(name='test_case_0', success=False, metrics_data=[MetricData(name='Faithfulness', threshold=0.7, success=False, score=0.3333333333333333, reason='The score is 0.33 because the claims in the actual output regarding refunds directly contradict the retrieval context, which assures customers of a 30-day full refund policy. These discrepancies indicate a low level of alignment with the provided information.', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.006295000000000001, verbose_logs='Truths (limit=None):\n[\n    "All customers are eligible for a 30-day full refund.",\n    "There is no extra cost for the 30-day full refund."\n] \n \nClaims:\n[\n    "You will never get your money back.",\n    "No refunds.",\n    "No Soup for you!"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "The claim states \'You will never get your money back,\' contradicting the retrieval context, which states that all c