### Installation of DeepEval

In [None]:
!pip install -U deepeval

### Creating Confident AI login

In [1]:
import os
from getpass import getpass

import deepeval

# SECURITY: never commit the Confident AI API key in the notebook. The key that
# used to be hardcoded in this cell must be treated as leaked and rotated.
# Read the key from the CONFIDENT_API_KEY environment variable; when it is not
# set, fall back to an interactive prompt that does not echo the value.
confident_api_key = os.environ.get("CONFIDENT_API_KEY") or getpass("Confident AI API key: ")

deepeval.login(confident_api_key)


In [3]:
from dotenv import load_dotenv

# Load variables from the .env file that sits one directory above this notebook.
# NOTE(review): presumably this supplies the LLM provider credentials used by the
# metrics below -- confirm against the .env contents.
# `load` keeps whatever load_dotenv returns.
load = load_dotenv(dotenv_path='./../.env')

### Writing a simple DeepEval test

### Answer Relevancy Metrics - Standalone

In [4]:
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# One hand-written test case: the question, the answer under test, and the
# passage supplied as retrieval context.
test_case = LLMTestCase(
    input="Who is the current president of the United States of America?",
    actual_output="Joe Biden",
    retrieval_context=["Joe Biden serves as the current president of America."],
)

# Standalone usage: call measure() on the metric itself, then read the score off it.
answer_relevancy_metric = AnswerRelevancyMetric()
answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score)

1.0


### Test using Contextual Precision Metrics - Standalone

In [None]:
from deepeval.metrics import ContextualPrecisionMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Who is the current president of USA in 2024",
    # In a real application this value comes from the LLM, agent, or RAG pipeline under test.
    actual_output="Donald Trump",
    # Where the context comes from: the vector DB for RAG, the agent tools for an
    # AI agent, or the invoke response for a plain LLM.
    retrieval_context=["Donald Trump serves as the current president of America."],
    expected_output="Donald Trump is the current president of America.",
)

# Standalone usage: measure the single test case directly on the metric.
contextual_precision_metrics = ContextualPrecisionMetric()
contextual_precision_metrics.measure(test_case=test_case)

# Print the score, the pass/fail flag, and the score breakdown, one per line.
print(contextual_precision_metrics.score)
print(contextual_precision_metrics.success)
print(contextual_precision_metrics.score_breakdown)



### Evaluate our tests with `evaluate()` instead of standalone `measure()`

In [6]:
from deepeval.evaluate import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="Who is the current president of the United States of America?",
    actual_output="Joe Biden",
    retrieval_context=["Joe Biden serves as the current president of America."],
)
answer_relevancy_metric = AnswerRelevancyMetric()

# Instead of calling measure() on the metric, hand the test cases and metrics to
# evaluate(). It is the last expression, so the notebook displays its return value.
evaluate(
    test_cases=[test_case],
    metrics=[answer_relevancy_metric],
)



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question with no irrelevant information. Great job!, error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output: Joe Biden
  - expected output: None
  - context: None
  - retrieval context: ['Joe Biden serves as the current president of America.']


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and directly addressed the question with no irrelevant information. Great job!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.0030800000000000003, verbose_logs='Statements:\n[\n    "Joe Biden"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='Who is the current president of the United States of America?', actual_output='Joe Biden', expected_output=None, context=None, retrieval_context=['Joe Biden serves as the current president of America.'], additional_metadata=None)], confident_link='https://app.confident-ai.com/project/cmekcjvxy00e7e82whpp81o98/evaluation/test-runs/cmekks4zj048f2jlogm82esc1/test-cases')

In [12]:
from deepeval.evaluate import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# First case: the answer addresses the question.
test_case1 = LLMTestCase(
    input="Who is the current president of the United States of America?",
    actual_output="Joe Biden",
    retrieval_context=["Joe Biden serves as the current president of America."],
)

# Second case: the actual output differs from the expected output.
test_case2 = LLMTestCase(
    input="Who built the Claude Models?",
    actual_output="OpenAI",
    expected_output="Claude Anthrophic",
    retrieval_context=["Claude Anthrophic built the GPT models."],
)

answer_relevancy_metric = AnswerRelevancyMetric()

# Evaluate both cases in one call with the same metric instance.
evaluate(
    test_cases=[test_case1, test_case2],
    metrics=[answer_relevancy_metric],
)



Metrics Summary

  - ❌ Answer Relevancy (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 0.00 because the actual output did not answer the question and instead provided irrelevant information about OpenAI, which is not related to who built the Claude Models., error: None)

For test case:

  - input: Who built the Claude Models?
  - actual output: OpenAI
  - expected output: Claude Anthrophic
  - context: None
  - retrieval context: ['Claude Anthrophic built the GPT models.']


Overall Metric Pass Rates

Answer Relevancy: 50.00% pass rate




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question with no irrelevant information. Great job staying focused and concise!, error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output

EvaluationResult(test_results=[TestResult(name='test_case_1', success=False, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=False, score=0.0, reason='The score is 0.00 because the actual output did not answer the question and instead provided irrelevant information about OpenAI, which is not related to who built the Claude Models.', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.0033099999999999996, verbose_logs='Statements:\n[\n    "OpenAI"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "no",\n        "reason": "OpenAI did not build the Claude Models; they were built by Anthropic. The statement is not relevant to answering the question."\n    }\n]')], conversational=False, multimodal=False, input='Who built the Claude Models?', actual_output='OpenAI', expected_output='Claude Anthrophic', context=None, retrieval_context=['Claude Anthrophic built the GPT models.'], additional_metadata=None), TestResult(name='test_case_0', success=

### Evaluate with Goldens and an EvaluationDataset

In [21]:
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.evaluate import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Describe the example as a Golden rather than a test case: it carries the
# input, the expected output, and the context, but no actual output is given here.
golden = Golden(
    input="Who is the current president of the United States of America?",
    expected_output="Joe Biden",
    context=["Joe Biden serves as the current president of America."],
)

# Start with an empty dataset and register the golden on it.
dataset = EvaluationDataset()
dataset.add_golden(golden)



In [22]:
# Inspect the dataset: at this point it holds the golden and no test cases.
dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='Who is the current president of the United States of America?', actual_output=None, expected_output='Joe Biden', context=['Joe Biden serves as the current president of America.'], retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None)], _alias=None, _id=None, _multi_turn=False)

#### Creating Test Case from Golden

In [23]:
# Turn every golden into a concrete test case: the input and expected output are
# copied over, the golden's context is passed as the retrieval context, and the
# actual output is hard-coded here in place of a real application response.
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output="Joe Biden",
        expected_output=golden.expected_output,
        retrieval_context=golden.context,
    )
    dataset.add_test_case(test_case)

# Evaluate the test cases now stored on the dataset with a fresh metric instance.
evaluate(
    test_cases=dataset.test_cases,
    metrics=[AnswerRelevancyMetric()],
)



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and concise!, error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output: Joe Biden
  - expected output: Joe Biden
  - context: None
  - retrieval context: ['Joe Biden serves as the current president of America.']


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and concise!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.006103999999999999, verbose_logs='Statements:\n[\n    "Joe Biden"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='Who is the current president of the United States of America?', actual_output='Joe Biden', expected_output='Joe Biden', context=None, retrieval_context=['Joe Biden serves as the current president of America.'], additional_metadata=None)], confident_link='https://app.confident-ai.com/project/cmekcjvxy00e7e82whpp81o98/evaluation/test-runs/cmekpsq0i026cvpvcb8d5fa2l/test-case

#### Creating Evaluation Dataset as Goldens in Confident AI 

###### Creating Data

In [24]:
# Raw question / expected-answer records; each one is turned into a Golden later.
test_data = [
    dict(
        input="Who is the current president of the United States of America?",
        expected_output="Joe Biden",
    ),
    dict(
        input="Who introducted the GPT Model?",
        expected_output="Open AI",
    ),
]

In [28]:
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.evaluate import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Build one Golden per raw record, keeping the records' order.
goldens = [
    Golden(input=data['input'], expected_output=data['expected_output'])
    for data in test_data
]

# Wrap the goldens in a dataset and display it as the cell's output.
new_dataset = EvaluationDataset(goldens=goldens)
new_dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='Who is the current president of the United States of America?', actual_output=None, expected_output='Joe Biden', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None), Golden(input='Who introducted the GPT Model?', actual_output=None, expected_output='Open AI', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None)], _alias=None, _id=None, _multi_turn=False)

#### Push the Dataset to Confident AI

In [30]:
# Upload the dataset to Confident AI under a fixed alias.
# NOTE(review): overwrite=True presumably replaces an existing dataset that has
# the same alias -- confirm against the DeepEval documentation.
new_dataset.push(
    alias="TestGoldenDataSet",
    overwrite=True,
)

In [31]:
# Re-display the local dataset after the push; the recorded repr still shows
# _alias=None and _id=None on the local object.
new_dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='Who is the current president of the United States of America?', actual_output=None, expected_output='Joe Biden', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None), Golden(input='Who introducted the GPT Model?', actual_output=None, expected_output='Open AI', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None)], _alias=None, _id=None, _multi_turn=False)

#### Pull the Dataset from Confident AI


In [37]:
# Start from an empty dataset and fill it from Confident AI using the alias
# the dataset was pushed under.
cloudDataSet = EvaluationDataset()
cloudDataSet.pull(alias="TestGoldenDataSet")
# Display the pulled dataset; the recorded repr shows the goldens plus the
# alias and id now set on the object.
# NOTE(review): the recorded output has expected_output='Donald Trump' for the
# first golden, which differs from the pushed value -- presumably the dataset
# was edited on the platform between push and pull; confirm.
cloudDataSet

EvaluationDataset(test_cases=[], goldens=[Golden(input='Who is the current president of the United States of America?', actual_output=None, expected_output='Donald Trump', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None), Golden(input='Who introducted the GPT Model?', actual_output=None, expected_output='Open AI', context=None, retrieval_context=None, turns=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None)], _alias=TestGoldenDataSet, _id=cmekqc1lx026xvpvc36zgupts, _multi_turn=False)

#### Prepare test cases to evaluate our records

In [38]:
def mock_llms_app(input):
    """Stand-in for a real LLM application that returns a canned answer.

    Args:
        input: Selector for the canned answer; compared with ``==`` against
            1 and 2, in that order.

    Returns:
        "Joe Biden" when ``input == 1``, "Open AI" when ``input == 2``,
        and ``None`` for anything else.
    """
    canned_answers = ((1, "Joe Biden"), (2, "Open AI"))
    for selector, answer in canned_answers:
        if input == selector:
            return answer
    return None

In [39]:
from deepeval.test_case import LLMTestCase

# Pair every pulled golden with the mock app's canned answer. The mock is keyed
# by 1-based position, so the enumeration starts at 1.
for counter, golden in enumerate(cloudDataSet.goldens, start=1):
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=mock_llms_app(counter),
        expected_output=golden.expected_output,
    )
    cloudDataSet.add_test_case(test_case)

In [35]:
# Show the test cases that were added to the pulled dataset.
# NOTE(review): this cell's execution count (35) is lower than that of the cells
# above it, so the recorded output presumably comes from an earlier run --
# re-run the notebook top to bottom to refresh it.
print(cloudDataSet.test_cases)

[LLMTestCase(input='Who is the current president of the United States of America?', actual_output='Joe Biden', expected_output='Joe Biden', context=None, retrieval_context=None, additional_metadata=None, tools_called=None, comments=None, expected_tools=None, token_cost=None, completion_time=None, name=None, tags=None, mcp_servers=None, mcp_tools_called=None, mcp_resources_called=None, mcp_prompts_called=None), LLMTestCase(input='Who introducted the GPT Model?', actual_output='Open AI', expected_output='Open AI', context=None, retrieval_context=None, additional_metadata=None, tools_called=None, comments=None, expected_tools=None, token_cost=None, completion_time=None, name=None, tags=None, mcp_servers=None, mcp_tools_called=None, mcp_resources_called=None, mcp_prompts_called=None)]


In [42]:
# Score the test cases stored on the pulled dataset with a fresh answer-relevancy metric.
evaluate(
    test_cases=cloudDataSet.test_cases,
    metrics=[AnswerRelevancyMetric()],
)



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and concise!, error: None)

For test case:

  - input: Who introducted the GPT Model?
  - actual output: Open AI
  - expected output: Open AI
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question with no irrelevant information. Great job staying focused and concise!, error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output: Joe Biden
  - expected output: Donald Trump
  - context: No

EvaluationResult(test_results=[TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and concise!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.003092, verbose_logs='Statements:\n[\n    "Open AI"\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='Who introducted the GPT Model?', actual_output='Open AI', expected_output='Open AI', context=None, retrieval_context=None, additional_metadata=None), TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and directly addressed the question wit

### Using Local LLM for Evaluation

In [43]:
# Switch DeepEval to the local Ollama model deepseek-r1:8b; per the CLI's own
# message below, it is then used for all evals that require an LLM.
!deepeval set-ollama deepseek-r1:8b

🙌 Congratulations! You're now using a local Ollama model for all evals that 
require an LLM.


In [45]:
from deepeval.evaluate import evaluate
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# First case: the answer addresses the question.
test_case1 = LLMTestCase(
    input="Who is the current president of the United States of America?",
    actual_output="Joe Biden",
    retrieval_context=["Joe Biden serves as the current president of America."],
)

# Second case: the actual output differs from the expected output.
test_case2 = LLMTestCase(
    input="Who built the Claude Models?",
    actual_output="OpenAI",
    expected_output="Claude Anthrophic",
    retrieval_context=["Claude Anthrophic built the GPT models."],
)

answer_relevancy_metric = AnswerRelevancyMetric()

# Same two cases and metric as the earlier two-case evaluation; the recorded
# output below reports deepseek-r1:8b (Ollama) as the evaluation model.
evaluate(
    test_cases=[test_case1, test_case2],
    metrics=[answer_relevancy_metric],
)



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: deepseek-r1:8b (Ollama), reason: The score is 1.00 because the answer correctly identifies that the Claude Models were built by Anthropic, a company known for its advanced AI technologies., error: None)

For test case:

  - input: Who built the Claude Models?
  - actual output: OpenAI
  - expected output: Claude Anthrophic
  - context: None
  - retrieval context: ['Claude Anthrophic built the GPT models.']


Overall Metric Pass Rates

Answer Relevancy: 100.00% pass rate




Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: deepseek-r1:8b (Ollama), reason: The score is 1.00 because the response directly addresses the question by identifying the current president, ensuring relevance and accuracy., error: None)

For test case:

  - input: Who is the current president of the United States of America?
  - actual output: Joe Biden
  -

EvaluationResult(test_results=[TestResult(name='test_case_1', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer correctly identifies that the Claude Models were built by Anthropic, a company known for its advanced AI technologies.', strict_mode=False, evaluation_model='deepseek-r1:8b (Ollama)', error=None, evaluation_cost=0.0, verbose_logs='Statements:\n[\n    "OpenAI is an AI company.",\n    "It specializes in developing advanced AI technologies."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]')], conversational=False, multimodal=False, input='Who built the Claude Models?', actual_output='OpenAI', expected_output='Claude Anthrophic', context=None, retrieval_context=['Claude Anthrophic built the GPT models.'], additional_metadata=None), TestResult(name='test_case_0', success=True, m