In [None]:
from typing import List, Optional
from langchain_huggingface import HuggingFaceEmbeddings
from deepeval.models import DeepEvalBaseEmbeddingModel

from greencompute_backend.services.llm.svc import EMBEDDINGS_MODEL

class CustomEmbeddingModel(DeepEvalBaseEmbeddingModel):
    def __init__(self):
        pass

    def load_model(self):
        return HuggingFaceEmbeddings(
			model_name=EMBEDDINGS_MODEL
		)

    def embed_text(self, text: str) -> List[float]:
        embedding_model = self.load_model()
        return embedding_model.embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        embedding_model = self.load_model()
        return embedding_model.embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        embedding_model = self.load_model()
        return await embedding_model.aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        embedding_model = self.load_model()
        return await embedding_model.aembed_documents(texts)

    def get_model_name(self):
        "Custom HugginFace Embeddings Model"

In [21]:
import pathlib

data_dir = pathlib.Path("../extraction")
data_paths = [p.as_posix() for p in data_dir.glob("*.txt")]
data_paths

['../extraction/DCEP Process Manual v3.2 (2-2-2021).txt',
 '../extraction/Improving Energy Efficiency for Server Rooms and Closets.txt',
 '../extraction/EE in Small Server Rooms ACEEE_2014.txt',
 '../extraction/DCEE_Actions_Master_List_090920_final.txt',
 '../extraction/DOE_LBNL Data Center Energy Assessment Process Manual DOE v3_032222.txt',
 '../extraction/Computer Server Selection Guidelines 12-22 (4).txt',
 '../extraction/NRDC_ServerRooms2012.txt',
 '../extraction/Building the business case for energy efficiency in data centers.txt',
 '../extraction/SmallServerRooms_Final Report Task 2.13_2013.txt',
 '../extraction/USER GUIDE FOR IMPLEMENTING ECBC_v9.2_06_May 2021_0.txt',
 '../extraction/Small Data Centers, Big Energy Savings.txt']

In [1]:
from deepeval.synthesizer import Synthesizer
from dotenv import load_dotenv; load_dotenv()

synthesizer = Synthesizer(
	model="gpt-4o-mini",
	embedder=CustomEmbeddingModel(),
)
synthesizer.generate_goldens_from_docs(
    document_paths=['output.txt'],
    chunk_size=500, 
    chunk_overlap=0,
)

Event loop is already running. Applying nest_asyncio patch to allow async execution...


✨ 🚀 ✨ Loading Documents: 100%|██████████| 1/1 [00:00<00:00,  1.25it/s]
✨ 📚 ✨ Chunking Documents: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]
✨ 🧩 ✨ Generating Contexts: 100%|██████████| 9/9 [00:04<00:00,  1.86it/s]


✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-embedding-3-small, use case=QA, method=docs):  50%|█████     | 3/6 [00:14<00:10,  3.60s/it]ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
ERROR:root:OpenAI rate limit exceeded. Retrying: 1 time(s)...
✨ Generating up to 6 goldens using DeepEval (using gpt-4o and text-embedding-3-small, use case=QA, method=docs): 100%|██████████| 6/6 [00:35<00:00,  5.89s/it]


[Golden(input='Investigate how elevated RH levels mitigate ESD threats in data center environments.', actual_output=None, expected_output='Elevated relative humidity (RH) levels, such as 40% RH at the IT equipment intake, help mitigate electrostatic discharge (ESD) threats in data center environments. Maintaining higher humidity reduces the likelihood of ESD, which can damage sensitive electronic components. This is because dry air increases the potential for static electricity buildup. However, maintaining high RH levels can be energy-intensive, especially if electric humidifiers are used. Alternatives to manage ESD include using conductive flooring materials, implementing good cable grounding methods, and providing grounded wrist straps for technicians.', context=[' centers is often set relatively high (40% RH at the IT equipment intake is common) to guard against damage to the equipment due to electrostatic discharge (ESD). Maintaining this level of humidity is energy intensive if t

In [2]:
synthesizer.to_pandas().to_csv('output.csv', index=False)

In [8]:
# Load the dataset
import pandas as pd
import requests
df = pd.read_csv('output.csv')
records = df.to_dict(orient="records")

In [9]:
# Populate each row's actual_output column based on the row's input column
for record in records:
	response = requests.post(
		"http://localhost:8000/llm/rag",
		json={"body": record["input"]}
	)
	record["actual_output"] = response.json()["body"]

In [11]:
from deepeval.test_case import LLMTestCase

test_cases = []
for record in records:
	test_cases.append(
		LLMTestCase(
			input=record["input"],
			expected_output=record["expected_output"],
			actual_output=record["actual_output"]
		)
	)

In [12]:
test_cases

[LLMTestCase(input='Investigate how elevated RH levels mitigate ESD threats in data center environments.', actual_output="The question is not directly addressed in the provided context. However, I can offer some insights on the topic.\n\nElevated relative humidity (RH) levels can mitigate electrostatic discharge (ESD) threats in data center environments. ESD occurs when an electrical charge builds up on a surface and is discharged, potentially damaging sensitive electronic equipment. \n\nAccording to the American Society of Heating, Refrigerating, and Air-Conditioning Engineers (ASHRAE), maintaining an RH level of 40-55% can help reduce the risk of ESD. This is because higher RH levels increase the conductivity of the air, allowing charges to dissipate more quickly and reducing the likelihood of a discharge.\n\nHowever, it's important to note that maintaining the correct RH level is just one aspect of preventing ESD in data centers. Other measures, such as using anti-static mats and wr

In [13]:
from deepeval import evaluate
from deepeval.metrics import HallucinationMetric, AnswerRelevancyMetric
from deepeval.dataset import EvaluationDataset

eval_dataset = EvaluationDataset(test_cases)

In [14]:
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

correctness_metric = GEval(
    name="Correctness",
    criteria="Determine whether the actual output is factually correct based on the expected output.",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT],
)

In [16]:
results = eval_dataset.evaluate([correctness_metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 6 test case(s) in parallel: |██████████|100% (6/6) [Time Taken: 00:03,  1.72test case/s]



Metrics Summary

  - ❌ Correctness (GEval) (score: 0.0, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The actual output is unrelated to power distribution, load balancing, power factor, or total harmonic distortion management, which are the focus of the input question., error: None)

For test case:

  - input: What steps ensure efficient power distribution through balanced loads, PF, and THD management?
  - actual output: Answer: Replace Fan Belt Sheaves
  - expected output: To ensure efficient power distribution, implement the following steps: 

1. Maintain balanced loads between phases to reduce waste heat and optimize transformer operation.
2. Keep the Total Harmonic Distortion (THD) at the main feeder panel at 5% or less, which is linked to maintaining a high power factor (PF).
3. Ensure the power factor at the main feeder panel is 0.90 or higher to avoid high utility tariffs and improve efficiency.
4. Retrofit IT equipment to sustain a high power factor and lo




In [18]:
results.test_results

[TestResult(success=False, metrics_data=[MetricData(name='Correctness (GEval)', threshold=0.5, success=False, score=0.0, reason='The actual output is unrelated to power distribution, load balancing, power factor, or total harmonic distortion management, which are the focus of the input question.', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.00224, verbose_logs='Criteria:\nDetermine whether the actual output is factually correct based on the expected output. \n \nEvaluation Steps:\n[\n    "Compare the actual output with the expected output to ensure factual accuracy.",\n    "Identify any discrepancies between the actual output and the expected facts.",\n    "Verify the sources or references used in the actual output for credibility and correctness.",\n    "Assess whether the actual output logically follows from the input and aligns with the expected factual information."\n]')], conversational=False, multimodal=False, input='What steps ensure efficient pow