# Scoring Different Prompting Techniques with DeepEval

Three different prompting techniques are evaluated in this notebook:

1. Base RAG Prompt 
2. Advanced RAG Prompt (role-based)
3. Optimized RAG Prompt (based on DSPy)

In [6]:
import pandas as pd
import deepeval
import dspy
import requests
import json
import boto3

from dotenv import load_dotenv
load_dotenv()

True

## Base/Advanced RAG Prompt

In [25]:
# Load data
with open("synthetics/testset.json", "r") as f:
	dataset = json.load(f)
print(f"Test set has {len(dataset)} samples")
print(f"Test set keys: {dataset[0].keys()}")

Test set has 12 samples
Test set keys: dict_keys(['question', 'response'])


In [None]:
# Setting constants for all the models
TOP_N = 20
MAX_TOKENS = 1024
TOP_P = 0.7
TEMPERATURE = 0.3

In [None]:
# Base Prompt
from typing import Literal

def rag_call(prompt: str, prompt_type: Literal["cite", "base"]) -> str:
    request_body = {
        "body": prompt,
        "max_tokens": 1024,
		"prompt": prompt_type,
		"top_n": TOP_N,
		"top_p": TOP_P,
        "temperature": TEMPERATURE
	}
    response = requests.post(
        url="http://greencompute-1575332443.us-east-1.elb.amazonaws.com/api/llm/rag",
        json=request_body
	)
    return response.json()["response"]

In [28]:
from tqdm.notebook import tqdm

for prompt_type in ["cite", "base"]:
	print(f"Prompt Type: {prompt_type}")
	for record in tqdm(dataset):
		generated_text = rag_call(record["question"], prompt_type)
		record[f"generated_{prompt_type}"] = generated_text

Prompt Type: cite


  0%|          | 0/12 [00:00<?, ?it/s]

Prompt Type: base


  0%|          | 0/12 [00:00<?, ?it/s]

## Optimized RAG Prompt

In [None]:
def search(query: str, top_k: int) -> list[str]:
    url = "http://greencompute-1575332443.us-east-1.elb.amazonaws.com/api/llm/retrieval"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {
        "query": query,
        "top_k": top_k
    }

    documents = requests.post(url, headers=headers, json=data).json()["documents"]
    return [f"[{i}]" + doc["doc_title"] + doc["url"] + "\n\n" + doc["content"] for i, doc in enumerate(documents)]

class TitanLM(dspy.LM):
    def __init__(self, model: str, client, max_tokens: int = 1024, temperature: float = 0.3, top_p: float = 0.7, **kwargs):
        self.client = client
        self.history = []
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p

        super().__init__(model, **kwargs)
        self.model = model
    
    def _format_message(self, prompt: str):
        body = json.dumps(
            {
                "inputText": prompt,
                "textGenerationConfig": {
                    "maxTokenCount": self.max_tokens,
                    "stopSequences": [],
                    "temperature": self.temperature,
                    "topP": self.top_p,
                },
            }
        )
        return body

    def generate_content(self, prompt: str) -> str:
        body = self._format_message(prompt)
        response = self.client.invoke_model(
            body=body,
            modelId=self.model,
            accept="application/json",
            contentType="application/json",
        )
        response_body = json.loads(response.get("body").read())
        return response_body.get("results")

    def __call__(self, prompt=None, messages=None, **kwargs):
        # Custom chat model working for text completion model
        prompt = '\n\n'.join([x['content'] for x in messages] + ['BEGIN RESPONSE:'])

        completions = self.generate_content(prompt)
        self.history.append({"prompt": prompt, "completions": completions})

        # Must return a list of strings
        return [completions[0].get("outputText")]

    def inspect_history(self):
        for interaction in self.history:
            print(f"Prompt: {interaction['prompt']} -> Completions: {interaction['completions']}")

In [30]:
class RAG(dspy.Module):
    def __init__(self, num_docs=20):
        self.num_docs = num_docs
        self.respond = dspy.ChainOfThought('context, question -> response')

    def forward(self, question):
        context = search(question, top_k=self.num_docs)
        return self.respond(context=context, question=question)

In [31]:
# Load the optimized RAG model
lm = TitanLM("amazon.titan-text-premier-v1:0", client=boto3.client("bedrock-runtime"))
dspy.configure(lm=lm)
rag = RAG()
rag.load("output/optimized_rag_v2.json")

In [32]:
rag("How can I increase my data center efficiency?")

Prediction(
    reasoning='To answer the question, consider the following: By focusing cooling resources on individual racks, rack-level cooling systems can minimize energy waste and optimize cooling performance. This targeted approach leads to significant energy savings and reduced cooling costs.',
    response='One way to increase data center efficiency is to use rack-level cooling systems.'
)

In [None]:
for record in dataset:
	record["optimized_rag"] = rag(record["question"])

In [40]:
for record in dataset:
    record["optimized_rag"] = record["optimized_rag"].response

In [43]:
# Save the results
import pathlib

pathlib.Path("output").mkdir(parents=True, exist_ok=True)

with open("output/results.json", "w") as f:
	json.dump(dataset, f)

## Evaluation

In [3]:
# Load the results from the saved file
with open("output/results.json", "r") as f:
	results = json.load(f)

In [7]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    # NOTE: you can only provide either criteria or evaluation_steps, and not both
    evaluation_steps=[
        "Check whether the facts in 'actual output' contradicts any facts in 'expected output'",
        "You should also heavily penalize omission of detail",
        "Vague language, or contradicting OPINIONS, are OK"
    ],
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [8]:
from deepeval.test_case import LLMTestCase

test_case = LLMTestCase(
    input="The dog chased the cat up the tree, who ran up the tree?",
    actual_output="It depends, some might consider the cat, while others might argue the dog.",
    expected_output="The cat."
)

correctness_metric.measure(test_case)
print(correctness_metric.score)
print(correctness_metric.reason)

Output()

0.2322884834150058
The actual output is vague and does not clearly identify the cat as the one who ran up the tree, omitting a crucial detail from the input.


In [9]:
from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()

# Add as test cases
dataset.add_test_cases_from_json_file(
    # file_path is the absolute path to you .json file
    file_path="output/results.json",
    input_key_name="question",
    actual_output_key_name="generated_cite",
    expected_output_key_name="response",
)

In [12]:
correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [13]:
dataset.evaluate([correctness_metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 12 test case(s) in parallel: |██████████|100% (12/12) [Time Taken: 00:10,  1.14test case/s]



Metrics Summary

  - ✅ Correctness (GEval) (score: 0.7775845021643252, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output accurately describes methods such as closed-loop chilled water, evaporative, and DX cooling systems, aligning with the question's focus on enhancing energy efficiency. It also accurately mentions advantages like improved temperature control, reduced energy consumption, and increased reliability. However, it assumes a data center context not specified in the input, which slightly affects relevance., error: None)

For test case:

  - input: What methods and advantages arise from installing dedicated cooling systems to enhance energy efficiency?
  - actual output: There are several methods and advantages to installing dedicated cooling systems to enhance energy efficiency in a data center. One method is to use a closed-loop chilled water system, which can provide high efficiency and flexibility in cooling distribution. Another method i




EvaluationResult(test_results=[TestResult(success=True, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=True, score=0.7775845021643252, reason="The actual output accurately describes methods such as closed-loop chilled water, evaporative, and DX cooling systems, aligning with the question's focus on enhancing energy efficiency. It also accurately mentions advantages like improved temperature control, reduced energy consumption, and increased reliability. However, it assumes a data center context not specified in the input, which slightly affects relevance.", strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0029575, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Compare the factual accuracy of the actual output with the expected output.",\n    "Identify discrepancies between facts stated in the actual output and those in the expected out