## Testing RAGs with DeepEval

### Setting up dotenv & LLM

In [1]:
from dotenv import load_dotenv
from langchain_ollama import ChatOllama

# Load environment variables. The first call uses python-dotenv's default
# .env lookup and overrides variables already set in the process; the second
# reads the parent directory's .env without overriding anything already set.
load_dotenv(override=True)
load_dotenv('./../.env')

# Local Ollama chat model used as the generator in the RAG chain.
llm = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.1:8b",
    temperature=0.5,
    # ChatOllama caps generation length with `num_predict`; `max_tokens` is
    # not one of its fields, so the intended 300-token limit was not applied.
    num_predict=300
)

### Connect with DeepEval

In [None]:
# One-time setup: uncomment to install or upgrade DeepEval in the kernel's environment.
# %pip install -U deepeval

In [2]:
import os

import deepeval

# The Confident AI key is a credential and must not live in the notebook.
# It is read from the environment (for example the .env file loaded in the
# first cell). Any key that was previously committed in this cell should be
# treated as compromised and rotated.
confident_api_key = os.environ.get("CONFIDENT_API_KEY")
if not confident_api_key:
    raise RuntimeError(
        "CONFIDENT_API_KEY is not set; add it to your .env file or environment."
    )

deepeval.login_with_confident_api_key(confident_api_key)

### Creating Golden Dataset

In [3]:
# Reference (question, expected answer) pairs that make up the golden dataset.
_qa_pairs = [
    (
        "What is Playwright and what browsers does it support?",
        "Playwright is a modern automation library supporting Chromium, Firefox, and WebKit.",
    ),
    (
        "What is Selenium and what programming languages does it support?",
        "Selenium is an open-source framework supporting multiple programming languages like Python, Java, and C#.",
    ),
    (
        "What is network interception in Playwright?",
        "Network interception in Playwright allows users to modify, block, or inspect network requests and responses, enabling better control over API calls and testing.",
    ),
    (
        "Does Playwright have a native test runner unlike Selenium?",
        "Yes, Playwright has a native test runner called Playwright Test, which handles test creation and execution, unlike Selenium, which relies on JUnit, NUnit, or XUnit for test execution.",
    ),
    (
        "What are the advantages of using Cypress for front-end testing?",
        "Cypress provides fast feedback loops, automatic waiting, and time-travel debugging, making it highly efficient for front-end testing.",
    ),
    (
        "How does Selenium WebDriver enable browser automation?",
        "Selenium WebDriver enables automated browser testing by providing bindings in multiple languages, such as Python, Java, and C#, to interact with web elements programmatically.",
    ),
    (
        "How does Playwright handle debugging for web applications?",
        "Playwright supports debugging through tracing, network interception, and headless execution, making it easier to identify and fix issues in web applications.",
    ),
]

# Each record keeps the two keys the later cells read: "question" and "expected_answer".
golden_dataset = [
    {"question": question, "expected_answer": expected_answer}
    for question, expected_answer in _qa_pairs
]

golden_dataset

[{'question': 'What is Playwright and what browsers does it support?',
  'expected_answer': 'Playwright is a modern automation library supporting Chromium, Firefox, and WebKit.'},
 {'question': 'What is Selenium and what programming languages does it support?',
  'expected_answer': 'Selenium is an open-source framework supporting multiple programming languages like Python, Java, and C#.'},
 {'question': 'What is network interception in Playwright?',
  'expected_answer': 'Network interception in Playwright allows users to modify, block, or inspect network requests and responses, enabling better control over API calls and testing.'},
 {'question': 'Does Playwright have a native test runner unlike Selenium?',
  'expected_answer': 'Yes, Playwright has a native test runner called Playwright Test, which handles test creation and execution, unlike Selenium, which relies on JUnit, NUnit, or XUnit for test execution.'},
 {'question': 'What are the advantages of using Cypress for front-end testi

### Upload the Golden Dataset

In [4]:
from deepeval.dataset import EvaluationDataset, Golden

# Turn every question/answer record into a DeepEval Golden: the question is
# the input and the reference answer is the expected output.
goldens = [
    Golden(
        input=record['question'],
        expected_output=record['expected_answer'],
    )
    for record in golden_dataset
]

# Wrap the goldens in a dataset object used by the upload and evaluation cells.
eval_dataset = EvaluationDataset(goldens=goldens)

In [5]:
# Show the dataset's repr to confirm the goldens are attached (no test cases have been added yet).
eval_dataset

EvaluationDataset(test_cases=[], goldens=[Golden(input='What is Playwright and what browsers does it support?', actual_output=None, expected_output='Playwright is a modern automation library supporting Chromium, Firefox, and WebKit.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None), Golden(input='What is Selenium and what programming languages does it support?', actual_output=None, expected_output='Selenium is an open-source framework supporting multiple programming languages like Python, Java, and C#.', context=None, retrieval_context=None, additional_metadata=None, comments=None, tools_called=None, expected_tools=None, source_file=None, name=None, custom_column_key_values=None), Golden(input='What is network interception in Playwright?', actual_output=None, expected_output='Network interception in Playwright allows users to modify, block, or inspect netwo

In [6]:
# Upload the goldens under the alias "TestingTool Dataset"; a later cell pulls by the same alias.
# NOTE(review): the saved output of this cell is "Aborted.", so the upload may not have
# completed in this run. Confirm the dataset exists remotely before relying on the pull.
eval_dataset.push("TestingTool Dataset")

Aborted.


### RAG Application with the Following Internal Dataset

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Hugging Face model identifier for the embedding model handed to the vector store.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [8]:
from langchain.docstore.document import Document

# Raw text of the small in-notebook knowledge base. The last entry is
# deliberately unrelated to the three tools covered by the golden questions.
knowledge_base_texts = [
    "Playwright is a modern automation library for end-to-end testing. It supports multiple browsers like Chromium, Firefox, and WebKit.",
    "Selenium is a widely used open-source framework for web automation, supporting multiple programming languages and browsers.",
    "Playwright comes with native test runner called Playwright Test Runner, which handles test creation and execution",
    "Playwright supports debugging through tracing, network interception, and headless execution, making it easier to identify and fix issues in web applications.",
    "Cypress is a JavaScript-based testing tool primarily used for front-end testing. It runs in the browser and provides fast feedback loops.",
    "Playwright allows network interception, headless execution, and tracing for debugging complex web applications.",
    "Selenium WebDriver enables automated browser testing using various bindings such as Python, Java, and C#.",
    "Cypress has built-in support for retries, time-travel debugging, and automatic waiting, making it easy to test dynamic web pages.",
    "This document talks about REST API testing tools, which are unrelated to Playwright, Selenium, or Cypress.",
]

# One Document per text, in the same order as the list above.
docs = [Document(page_content=text) for text in knowledge_base_texts]

# Embed the documents and index them in a Chroma vector store.
vector_store = Chroma.from_documents(docs, embeddings)

### Set Up the Retriever

In [9]:
from langchain.chains import RetrievalQA

# Ask the vector store for the top 3 matches per query.
retriever_search_kwargs = {'k': 3}
retriever = vector_store.as_retriever(search_kwargs=retriever_search_kwargs)

# Retrieval QA chain combining the retriever with the local LLM.
qa_chain = RetrievalQA.from_chain_type(retriever=retriever, llm=llm)

# Smoke test: run one query through the chain, then query the retriever
# directly so the retrieved documents can be inspected next to the answer.
query = "What does PlayWright do ?"
response = qa_chain.invoke(query)
retrieved_docs = retriever.invoke(query)

(response, retrieved_docs)

({'query': 'What does PlayWright do ?',
  'result': "According to the context, Playwright is a tool that:\n\n1. Handles test creation and execution (through its native test runner)\n2. Allows for debugging complex web applications through:\n\t* Network interception\n\t* Headless execution\n\t* Tracing\n\nIt's essentially a tool designed to help developers identify and fix issues in web applications."},
 [Document(id='95e080dd-44d1-4cb2-ba3b-21e9e4c59773', metadata={}, page_content='Playwright comes with native test runner called Playwright Test Runner, which handles test creation and execution'),
  Document(id='04172508-f474-4724-ab92-cb705a87ef4d', metadata={}, page_content='Playwright allows network interception, headless execution, and tracing for debugging complex web applications.'),
  Document(id='233645d5-9542-45e4-9a9d-9cd6457f4e73', metadata={}, page_content='Playwright supports debugging through tracing, network interception, and headless execution, making it easier to identi

### Querying the Context & Generating Actual Output from the LLM (for setting up LLMTestCase)

In [10]:
def get_response_with_context(question):
    """Answer ``question`` with the QA chain and report the retrieved context.

    Args:
        question: Query passed to both the retriever and the QA chain.

    Returns:
        A ``(answer, contexts)`` tuple: the chain output's ``'result'`` value
        and a list holding the ``page_content`` of each document returned by
        the retriever.
    """
    # The retriever is queried separately from the chain so the context texts
    # can be returned alongside the answer.
    context_texts = [
        document.page_content for document in retriever.invoke(question)
    ]

    chain_output = qa_chain.invoke(question)

    return chain_output['result'], context_texts

In [11]:
from deepeval.test_case import LLMTestCase

# Pull the dataset registered under this alias into `eval_dataset`.
eval_dataset.pull("TestingTool Dataset")

# NOTE(review): the loop iterates the local `goldens` list built earlier, not
# `eval_dataset.goldens`, so the pulled data does not drive the test cases.
# Confirm which source is intended.
for golden in goldens:
    # Named `question` rather than `input` so the built-in input() is not
    # shadowed in the notebook's global namespace.
    question = golden.input
    actual_output, retrieval_context = get_response_with_context(question)

    # Pair the generated answer and its retrieved context with the golden's
    # reference answer so the metrics can compare them.
    test_case = LLMTestCase(
        input=question,
        actual_output=actual_output,
        expected_output=golden.expected_output,
        retrieval_context=retrieval_context
    )

    eval_dataset.add_test_case(test_case)

eval_dataset.test_cases

Output()

[LLMTestCase(input='What is Playwright and what browsers does it support?', actual_output='According to the provided context, Playwright is a modern automation library for end-to-end testing that supports multiple browsers, specifically:\n\n1. Chromium\n2. Firefox\n3. WebKit', expected_output='Playwright is a modern automation library supporting Chromium, Firefox, and WebKit.', context=None, retrieval_context=['Playwright is a modern automation library for end-to-end testing. It supports multiple browsers like Chromium, Firefox, and WebKit.', 'Playwright supports debugging through tracing, network interception, and headless execution, making it easier to identify and fix issues in web applications.', 'Playwright allows network interception, headless execution, and tracing for debugging complex web applications.'], additional_metadata=None, tools_called=None, comments=None, expected_tools=None, token_cost=None, completion_time=None, name=None, tags=None),
 LLMTestCase(input='What is Sel

### Evaluation with DeepEval

In [12]:
import deepeval.metrics

# Metrics applied to every test case: two judge the generated answer
# (relevancy, faithfulness) and two judge the retrieved context
# (relevancy, precision).
rag_metrics = [
    deepeval.metrics.AnswerRelevancyMetric(),
    deepeval.metrics.FaithfulnessMetric(),
    deepeval.metrics.ContextualRelevancyMetric(),
    deepeval.metrics.ContextualPrecisionMetric(),
]

# Kept as the cell's last expression so the evaluation result is displayed.
deepeval.evaluate(test_cases=eval_dataset.test_cases, metrics=rag_metrics)

Output()



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and clear!, error: None)
  - ✅ Faithfulness (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: Great job! There are no contradictions, so the actual output is fully faithful to the retrieval context., error: None)
  - ✅ Contextual Relevancy (score: 0.6666666666666666, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 0.67 because while the relevant statements confirm that Playwright has a native test runner, part of the context discusses unrelated REST API testing tools, which does not address the input question., error: None)
  - ✅ Contextual Precision (score: 1.0, threshold: 0.5, strict: False, evaluation model: gpt-4.1, reason: The score is 1.00 because the

EvaluationResult(test_results=[TestResult(name='test_case_3', success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because the answer was fully relevant and directly addressed the question without any irrelevant information. Great job staying focused and clear!', strict_mode=False, evaluation_model='gpt-4.1', error=None, evaluation_cost=0.0035499999999999998, verbose_logs='Statements:\n[\n    "Playwright comes with a native test runner called Playwright Test Runner.",\n    "Selenium does not have a native test runner."\n] \n \nVerdicts:\n[\n    {\n        "verdict": "yes",\n        "reason": null\n    },\n    {\n        "verdict": "yes",\n        "reason": null\n    }\n]'), MetricData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='Great job! There are no contradictions, so the actual output is fully faithful to the retrieval context.', strict_mode=False, evaluation_model='gpt-4.1', error=