### Log in to Confident AI

In [4]:
import os
from dotenv import load_dotenv
import deepeval

# Populate the process environment from a .env file (if one is found) so the
# API key does not have to be hard-coded in the notebook.
load_dotenv()

# os.getenv returns None when the variable is missing. Fail fast with an
# actionable message rather than handing None to the login call.
api_key = os.getenv("CONFIDENT_API_KEY")
if not api_key:
    raise RuntimeError(
        "CONFIDENT_API_KEY is not set; define it in the environment or in a .env file."
    )
deepeval.login_with_confident_api_key(api_key)

In [5]:
from dotenv import load_dotenv

# Load environment variables from the relative path '.env'; the return value
# of load_dotenv is kept in `load`.
load = load_dotenv(dotenv_path='.env')

### Answer Relevancy

In [6]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric

# Single example to grade.
# NOTE(review): actual_output repeats the Spanish word instead of giving a
# translation; confirm this is the intended sample.
test_case = LLMTestCase(
    input="Convert hola to english",
    actual_output="hola",
    expected_output="Hi or Hello",
    retrieval_context=["Hi or Hello"],
)

# Metric built with its default settings.
answer_relevancy_metric = AnswerRelevancyMetric()

# measure() populates .score and .reason on the metric, which are then printed.
answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score, answer_relevancy_metric.reason)

Output()

1.0 The score is 1.00 because the output perfectly translates 'hola' to 'hello' in English, addressing the input directly and accurately without any irrelevant information.


### Contextual Precision

In [7]:
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.evaluate import evaluate

# --- Sample data: swap each value for the real one from your own pipeline ---
# Answer produced by the LLM application.
actual_output = "You need to buy other shoes"
# Answer the RAG generator is expected to produce.
expected_output = "You are eligible for a 30 day full refund at no extra cost."
# Context retrieved by the RAG pipeline for this query.
retrieval_context = [
    "All customers are eligible for a 30 day full refund at no extra cost."
]

test_case = LLMTestCase(
    input="What if these shoes don't fit?",
    actual_output=actual_output,
    expected_output=expected_output,
    retrieval_context=retrieval_context,
)

# Contextual precision evaluated with gpt-4o, a 0.7 threshold, and a reason
# requested alongside the score.
metric = ContextualPrecisionMetric(threshold=0.7, model="gpt-4o", include_reason=True)

# Kept as the cell's final expression so the notebook displays the result.
evaluate(test_cases=[test_case], metrics=[metric])

Output()



Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.7, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant node in the retrieval context is perfectly aligned with the input, ensuring that the most pertinent information is prioritized. Great job on achieving the highest precision!, error: None)

For test case:

  - input: What if these shoes don't fit?
  - actual output: You need to buy other shoes
  - expected output: You are eligible for a 30 day full refund at no extra cost.
  - context: None
  - retrieval context: ['All customers are eligible for a 30 day full refund at no extra cost.']


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Precision', threshold=0.7, success=True, score=1.0, reason='The score is 1.00 because the relevant node in the retrieval context is perfectly aligned with the input, ensuring that the most pertinent information is prioritized. Great job on achieving the highest precision!', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0033025000000000003, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context directly states \'All customers are eligible for a 30 day full refund at no extra cost,\' which aligns perfectly with the expected output."\n    }\n]')], conversational=False, multimodal=False, input="What if these shoes don't fit?", actual_output='You need to buy other shoes', expected_output='You are eligible for a 30 day full refund at no extra cost.', context=None, retrieval_context=['All customers are eligibl

### Correctness (G-Eval)

In [None]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval import assert_test

def test_correctness():
    """Grade a sample refund answer against its expected answer with GEval.

    Builds a "Correctness" GEval metric (threshold 0.8) that compares the
    actual output with the expected output, prints the metric's score and
    reason, and then runs ``assert_test`` on the same test case and metric.

    NOTE(review): a later cell in this notebook defines ``test_correctness``
    again, which rebinds this name.
    """
    llm_answer = "We offer a 30-day full refund at no extra cost."
    # Alternative answer to try instead:
    # llm_answer = "You should pay for another shoes"

    # Replace llm_answer with the actual output from your LLM application.
    case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output=llm_answer,
        expected_output="You are eligible for a 30 day full refund at no extra cost.",
    )

    judge = GEval(
        name="Correctness",
        criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
        evaluation_params=[
            LLMTestCaseParams.ACTUAL_OUTPUT,
            LLMTestCaseParams.EXPECTED_OUTPUT,
        ],
        threshold=0.8,
    )

    # Score once explicitly so the score and reason can be printed.
    judge.measure(case)
    print(judge.score, judge.reason)

    assert_test(case, [judge])


In [9]:
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.metrics import GEval
from deepeval import assert_test

def test_correctness(actual_output="We offer a 30-day full refund at no extra cost."):
    """Check an LLM answer for correctness against the expected refund answer.

    Builds a "Correctness" GEval metric (threshold 0.8) that compares the
    actual output with the expected output, prints the metric's score and
    reason, and then runs ``assert_test`` on the same test case and metric.

    Args:
        actual_output: The answer produced by your LLM application. Defaults
            to the sample answer used originally, so calling the function
            with no arguments behaves as before. Pass a different string
            (e.g. "You should pay for another shoes") to grade another answer
            without editing the function body.

    Raises:
        Whatever ``assert_test`` raises for this test case and metric
        (presumably an assertion failure when the score is below the
        threshold; confirm against the deepeval documentation).
    """
    correctness_metric = GEval(
        name="Correctness",
        criteria="Determine if the 'actual output' is correct based on the 'expected output'.",
        evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT,
                           LLMTestCaseParams.EXPECTED_OUTPUT],
        threshold=0.8
    )
    test_case = LLMTestCase(
        input="What if these shoes don't fit?",
        actual_output=actual_output,
        expected_output="You are eligible for a 30 day full refund at no extra cost."
    )

    # Score once explicitly so the score and reason can be printed.
    correctness_metric.measure(test_case)
    print(correctness_metric.score, correctness_metric.reason)

    assert_test(test_case, [correctness_metric])
  

### Datasets

In [8]:
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import ContextualPrecisionMetric

# One hand-written example; actual_output holds a placeholder string.
test_case = LLMTestCase(
    input="Who is the current president of the United States of America?",
    actual_output="not evaluated",
    expected_output="Donald Trump",
    retrieval_context=["Donald Trump serves as the current president of America."],
)

# Both metrics use default settings. Only context_precision_metric is passed
# to evaluate() below; answer_relevancy_metric is created but not used here.
answer_relevancy_metric = AnswerRelevancyMetric()
context_precision_metric = ContextualPrecisionMetric()

dataset = EvaluationDataset(test_cases=[test_case])

# Kept as the cell's final expression so the notebook displays the result.
dataset.evaluate([context_precision_metric])

Evaluating 1 test case(s) in parallel: |██████████|100% (1/1) [Time Taken: 00:02,  2.80s/test case]




Metrics Summary

  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The score is 1.00 because the relevant node in the retrieval contexts is perfectly aligned with the input query, ensuring that the correct information is prioritized. Great job on achieving the highest precision!, error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output: not evaluated
  - expected output: Donald Trump
  - context: None
  - retrieval context: ['Donald Trump serves as the current president of America.']


Overall Metric Pass Rates

Contextual Precision: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Contextual Precision', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the relevant node in the retrieval contexts is perfectly aligned with the input query, ensuring that the correct information is prioritized. Great job on achieving the highest precision!', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0031950000000000004, verbose_logs='Verdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": "The context directly states \'Donald Trump serves as the current president of America,\' which aligns with the expected output."\n    }\n]')], conversational=False, multimodal=False, input='Who is the current president of the United States of America?', actual_output='not evaluated', expected_output='Donald Trump', context=None, retrieval_context=['Donald Trump serves as the current president of America.'], additional_metadat

### Set local LLM for evaluation

In [None]:
# Shell escape: run the deepeval CLI to select the local Ollama model
# deepseek-r1:latest for evals that require an LLM.
!deepeval set-ollama deepseek-r1:latest

🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [12]:
# Pull the dataset stored under the alias "test" into the existing `dataset`
# object (presumably from Confident AI; confirm), then print what it holds.
dataset.pull(alias="test")
print(dataset)

Output()

EvaluationDataset(test_cases=[], goldens=[Golden(input="What if these shoes don't fit?", actual_output=None, expected_output='You are eligible for a 30 day full refund at no extra cost.', context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra cost.'], additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values={})], conversational_goldens=[], _alias=test, _id=cmbiukhu906baowa2bu2zub7q)
