This notebook is a fallback and is usually not needed. If the regular notebook hangs at any point, it usually means there's a timeout error; in that case you can run this notebook in `Google Colab` instead. Set up Ollama first with the following commands:

- `curl -fsSL https://ollama.com/install.sh | sh`
- `OLLAMA_KEEP_ALIVE="3h" OLLAMA_CONTEXT_LENGTH="${LLM_CONTEXT_WINDOW_TOKENS:-16000}" ollama serve &`
- `ollama pull mxbai-embed-large && ollama pull llama3.1:8b-instruct-q4_1`

In [None]:
# Clone the repository; the later cells import the custom prompts and read
# the datasets and the env file from this checkout
!git clone https://github.com/danielpetrov18/Evaluation-Approaches-for-Retrieval-Augmented-Generation-RAG-.git

In [None]:
# Switch into the deepeval evaluation folder, so that the relative paths used
# further down (`prompts`, `../datasets`, `../../env/rag.env`) resolve
%cd Evaluation-Approaches-for-Retrieval-Augmented-Generation-RAG-/evaluation/deepeval_eval/

In [None]:
# Install dependencies (both packages are pinned to an exact version)
!pip3 install deepeval==3.2.1 python-dotenv==1.1.0

In [None]:
import os
import json
from typing import (
    Final, List, Dict, Any, Union, Optional
)
from multiprocessing import Process, Queue

from pydantic import BaseModel
from dotenv import load_dotenv
from google.colab import userdata

from deepeval import evaluate
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    ContextualPrecisionMetric,
    ContextualRecallMetric,
    ContextualRelevancyMetric
)
from deepeval.test_case import LLMTestCase
from deepeval.models.llms import OllamaModel
from deepeval.dataset import EvaluationDataset
from deepeval import login_with_confident_api_key
from deepeval.evaluate.configs import (
    AsyncConfig, CacheConfig, DisplayConfig, ErrorConfig
)
from deepeval.evaluate.evaluate import EvaluationResult

# Custom prompts for the DeepEval metrics
from prompts.custom_faithfulness_prompt import MyFaithfulnessTemplate
from prompts.custom_answer_relevancy_prompt import MyAnswerRelevancyTemplate
from prompts.custom_contextual_recall_prompt import MyContextualRecallTemplate
from prompts.custom_contextual_precision_prompt import MyContextualPrecisionTemplate
from prompts.custom_contextual_relevancy_prompt import MyContextualRelevancyTemplate

# Loads all the RAG parameters like chunk size, chunk overlap, temperature, etc.
# The path is relative to the current working directory, i.e. it only resolves
# after the `%cd` into `evaluation/deepeval_eval/` has been executed.
load_dotenv("../../env/rag.env")

In [None]:
# Log in to Confident AI if an API key is available.
#
# The key is looked up in the Colab secrets first. If the secret is missing or
# the notebook has no access to it, fall back to a `DEEPEVAL_API_KEY` that is
# already present in the environment, so that the login stays optional instead
# of aborting the cell.
try:
    DEEPEVAL_API_KEY: Optional[str] = userdata.get("DEEPEVAL_API_KEY")
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    DEEPEVAL_API_KEY = os.getenv("DEEPEVAL_API_KEY")

if DEEPEVAL_API_KEY:
    # Export the key as an environment variable as well
    os.environ["DEEPEVAL_API_KEY"] = DEEPEVAL_API_KEY
    deepeval_api_key: str = DEEPEVAL_API_KEY

    # You should get a message letting you know you are logged-in.
    login_with_confident_api_key(deepeval_api_key)
else:
    print("DEEPEVAL_API_KEY is not available - skipping the Confident AI login.")

In [None]:
# By default DeepEval doesn't provide a way to set a timeout
# Certain operations can block indefinitely
# To avoid this I provide this custom solution

class TimeoutOllamaModel(OllamaModel):
    """`OllamaModel` whose synchronous `generate` call is bounded by a timeout.

    Every generation runs in a separate process, so a request that never
    returns can be terminated from the outside once `timeout` seconds have
    elapsed.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        base_url: Optional[str] = None,
        temperature: float = 0,
        timeout: Optional[int] = 1800,  # seconds
        **kwargs,
    ):
        """Create the model.

        Args:
            model: Forwarded to `OllamaModel`.
            base_url: Forwarded to `OllamaModel`.
            temperature: Forwarded to `OllamaModel`.
            timeout: Maximum number of seconds to wait for a single
                generation. `None` waits without a limit.
            **kwargs: Forwarded to `OllamaModel`.
        """
        super().__init__(model, base_url, temperature, **kwargs)
        self.timeout = timeout

    def _run_chat(self, prompt: str, schema: Optional[BaseModel], q: Queue):
        """Worker entry point: run one chat request and report back through `q`.

        Puts the content of the response message on `q`. Any exception raised
        along the way is put on `q` instead, so the parent can re-raise it.
        """
        try:
            chat_model = self.load_model()
            response = chat_model.chat(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                # When a schema is given, its JSON schema is passed as the
                # requested output format
                format=schema.model_json_schema() if schema else None,
                options={"temperature": self.temperature},
            )
            q.put(response.message.content)
        except Exception as e:
            q.put(e)

    def generate(self, prompt: str, schema: Optional[BaseModel] = None):
        """Generate a response for `prompt`, giving up after `self.timeout` seconds.

        Args:
            prompt: The prompt sent as a single user message.
            schema: Optional pydantic model class; when given, the raw
                response is validated against it.

        Returns:
            A tuple `(output, 0.0)`, where `output` is the validated schema
            instance if `schema` is given and the raw response text otherwise.
            NOTE(review): the constant `0.0` is presumably the cost slot
            DeepEval expects - confirm.

        Raises:
            TimeoutError: If no result arrived within `self.timeout` seconds.
            RuntimeError: If the worker process exited without a result.
            Exception: Any exception raised inside the worker is re-raised.
        """
        from queue import Empty

        q = Queue()
        p = Process(target=self._run_chat, args=(prompt, schema, q))
        p.start()

        try:
            # Read the result BEFORE joining: a process that has put data on a
            # queue does not exit until that data has been flushed to the
            # pipe, so joining first can stall on large responses and turn a
            # finished generation into a spurious timeout.
            result = q.get(timeout=self.timeout)
        except Empty:
            if p.is_alive():
                p.terminate()
                p.join()
                print(f"Ollama generation exceeded timeout of {self.timeout} seconds")
                raise TimeoutError(
                    f"Ollama generation exceeded timeout of {self.timeout} seconds"
                ) from None

            # The worker is gone. Give a result that was flushed right at the
            # deadline one last chance, but never block without a bound.
            p.join()
            try:
                result = q.get(timeout=1)
            except Empty:
                raise RuntimeError(
                    f"Ollama worker process exited (exit code {p.exitcode}) "
                    "without returning a result"
                ) from None
        else:
            p.join()

        if isinstance(result, Exception):
            raise result

        return (
            schema.model_validate_json(result) if schema else result,
            0.0,
        )

# Name of the model used as the judge, read from the environment
# (presumably provided through `../../env/rag.env` - verify)
EVALUATION_MODEL: Final[str] = os.environ.get("EVALUATION_MODEL")

# Evaluation LLM whose generations are aborted after 600 seconds (10 minutes)
eval_model = TimeoutOllamaModel(model=EVALUATION_MODEL, timeout=600)

In [None]:
# The datasets are located under `../datasets`
experiment_id: int = int(input("Please specify experiment_id (Ex. 1): "))

# Your filepath may vary
# Alternatively, you can pull a dataset from ConfidentAI
dataset_path: str = f"../datasets/{experiment_id}_dataset.jsonl"

# One JSON object per non-empty line of the JSONL file
ragas_samples: List[Dict[str, Any]] = []
try:
    with open(file=dataset_path, mode="r", encoding="utf-8") as dataset_file:
        ragas_samples.extend(
            json.loads(line)
            for line in dataset_file
            if line.strip()  # Skip empty lines
        )
except FileNotFoundError as e:
    # Chain the original error so the traceback keeps the root cause
    raise Exception(
        f"File: `{dataset_path}` containing test cases not found!"
    ) from e
except json.JSONDecodeError as e:
    raise Exception(
        f"Error parsing JSONL file: {str(e)}"
    ) from e

# Convert from JSON to LLMTestCase.
# Have in mind that RAGAs and DeepEval use similar names for the same parameters.
# We are mapping the RAGAs parameters to DeepEval ones.
test_cases: List[LLMTestCase] = [
    LLMTestCase(
        input=ragas_sample["user_input"],
        actual_output=ragas_sample["response"],
        expected_output=ragas_sample["reference"],
        retrieval_context=ragas_sample["retrieved_contexts"],
        context=ragas_sample["reference_contexts"],
    )
    for ragas_sample in ragas_samples
]

# The fully loaded and ready for evaluation dataset
evaluation_dataset = EvaluationDataset(test_cases=test_cases)

In [None]:
# Settings every metric has in common: the same judge model, and scores only
# (no textual reason is requested)
shared_metric_kwargs: Dict[str, Any] = {
    "model": eval_model,
    "include_reason": False,
}

# Each metric is paired with its own custom prompt template
answer_relevancy = AnswerRelevancyMetric(
    evaluation_template=MyAnswerRelevancyTemplate, **shared_metric_kwargs
)
faithfulness = FaithfulnessMetric(
    evaluation_template=MyFaithfulnessTemplate, **shared_metric_kwargs
)
contextual_precision = ContextualPrecisionMetric(
    evaluation_template=MyContextualPrecisionTemplate, **shared_metric_kwargs
)
contextual_recall = ContextualRecallMetric(
    evaluation_template=MyContextualRecallTemplate, **shared_metric_kwargs
)
contextual_relevancy = ContextualRelevancyMetric(
    evaluation_template=MyContextualRelevancyTemplate, **shared_metric_kwargs
)

In [None]:
# Evaluation settings, see:
# https://www.deepeval.com/docs/evaluation-flags-and-configs

# Asynchronous execution is switched off
async_conf = AsyncConfig(run_async=False)

# Both reading from and writing to the cache are enabled
cache_conf = CacheConfig(use_cache=True, write_cache=True)

# Progress indicator and printed results, without verbose output
display_conf = DisplayConfig(show_indicator=True, print_results=True, verbose_mode=False)

# Errors are ignored, but test cases with missing parameters are not skipped
error_conf = ErrorConfig(ignore_errors=True, skip_on_missing_params=False)

In [None]:
# All five metrics are evaluated on every test case
metrics_to_run = [
    answer_relevancy,
    faithfulness,
    contextual_precision,
    contextual_recall,
    contextual_relevancy,
]

# Run the evaluation; the run is labelled with the experiment id.
# (No `hyperparameters` are passed.)
results: EvaluationResult = evaluate(
    test_cases=evaluation_dataset.test_cases,
    metrics=metrics_to_run,
    identifier=f"{experiment_id}_experiment",
    async_config=async_conf,
    cache_config=cache_conf,
    display_config=display_conf,
    error_config=error_conf,
)

In [None]:
# Extract the scores
# Metric name -> list of {"test_case": <index>, "score": <score>}
# NOTE(review): a score is presumably `None` when a metric failed - confirm
scores: Dict[str, List[Dict[str, Union[int, Optional[float]]]]] = {}

for test_result in results.test_results: # Iterate over the test cases
    # The test case number is the last `_`-separated part of the name.
    # Parsed once per test case and converted to int, so we can sort it properly
    test_case: int = int(test_result.name.split("_")[-1])
    for metric in test_result.metrics_data: # Iterate over the metrics
        scores.setdefault(metric.name, []).append(
            {"test_case": test_case, "score": metric.score}
        )

# Sort properly, since DeepEval doesn't evaluate in the original order
for metric_scores in scores.values():
    metric_scores.sort(key=lambda entry: entry["test_case"])

# Make sure the directory exists
os.makedirs("./res", exist_ok=True)

# Store on disk: one JSON record per metric, each on its own line
with open(f"./res/{experiment_id}_eval.jsonl", "w", encoding="utf-8") as f:
    for metric_name, metric_scores in scores.items():
        record = {
            "metric": metric_name,
            "scores": metric_scores  # This is the sorted list of {"test_case": ..., "score": ...}
        }
        f.write(json.dumps(record) + "\n")