This notebook is meant to be run on [Kaggle](https://www.kaggle.com/); it relies on Kaggle-specific features such as `kaggle_secrets`.

In [None]:
# Clone my repository
# Later cells change into this checkout and read `./datasets` and import from `prompts` relative to it.
# NOTE: `git clone` refuses to clone into an existing non-empty directory, so re-running this cell
# in the same session reports an error; that is harmless if the checkout is already present.
!git clone https://github.com/danielpetrov18/Evaluation-Approaches-for-Retrieval-Augmented-Generation-RAG-.git

In [None]:
# Switch into the ragas folder
# All relative paths used below (`../../env/rag.env`, `./datasets`, the `prompts` package) are
# resolved from this directory. The path is relative to the directory the clone was made in,
# so run this cell exactly once per session: a second run starts from inside the repo and fails.
%cd Evaluation-Approaches-for-Retrieval-Augmented-Generation-RAG-/evaluation/ragas_eval

In [None]:
# Install dependencies
!pip3 install -U ragas==0.2.15 rapidfuzz==3.13.0 langchain-ollama==0.3.2 python-dotenv==1.1.0

In [None]:
# Download and install Ollama
# The install script served by ollama.com is piped straight into `sh`.
# curl flags: -f fail on HTTP errors, -sS silent but still print errors, -L follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
import os
import time
import subprocess
from dotenv import load_dotenv

# Load the project's settings file into os.environ (path is relative to the ragas_eval folder).
load_dotenv("../../env/rag.env")

# Environment for the Ollama server: a copy of the current environment plus Ollama overrides.
env = os.environ.copy()  # Start with current environment
env["OLLAMA_KEEP_ALIVE"] = "1h"
# Falls back to 16000 when LLM_CONTEXT_WINDOW_TOKENS is not defined.
env["OLLAMA_CONTEXT_LENGTH"] = os.environ.get("LLM_CONTEXT_WINDOW_TOKENS", "16000")

# Start Ollama server in the background; the handle is kept so it can be inspected/stopped later.
ollama_process = subprocess.Popen(
    ["ollama", "serve"],
    env=env,
    # Uncomment the following lines if you want to capture the output
    # stdout=subprocess.PIPE,
    # stderr=subprocess.PIPE
)

# Give it a moment to start up
time.sleep(2)

# Check if the process is running: poll() returns None while the child is still alive.
if ollama_process.poll() is None:
    print("Ollama server started successfully")
else:
    print("Failed to start Ollama server")
    print(f"Exit code: {ollama_process.returncode}")
    # `Popen.stderr` always exists as an attribute but is None unless `stderr=subprocess.PIPE`
    # was passed above, so test the value itself instead of using hasattr().
    if ollama_process.stderr is not None:
        print(ollama_process.stderr.read().decode())

# To stop the process if needed
# ollama_process.terminate()

In [None]:
embedding_model: str = os.getenv("EMBEDDING_MODEL")
evaluation_model: str = os.getenv("EVALUATION_MODEL")

# Download models
! ollama pull {embedding_model} && ollama pull {evaluation_model}

In [None]:
# Retrieve sensitive data (OPTIONAL STEP)
from kaggle_secrets import UserSecretsClient

# Read the token from the Kaggle secret store and publish it through the process
# environment under the same name.
user_secrets = UserSecretsClient()
ragas_app_token = user_secrets.get_secret("RAGAS_APP_TOKEN")
os.environ.update(RAGAS_APP_TOKEN=ragas_app_token)

In [None]:
import json
from typing import List, Dict
from ragas import EvaluationDataset, SingleTurnSample

# Load the dataset corresponding to the experiment you want to test

# They are located under `./datasets`
# Surrounding whitespace is stripped and a typed ".jsonl" extension is tolerated, so
# `filepath` always holds the bare dataset name (it is reused for cache and output names).
filepath: str = input("Please specify which dataset to evaluate (only the file name): ").strip()
if filepath.endswith(".jsonl"):
    filepath = filepath[: -len(".jsonl")]

# One JSON object per non-empty line; each object holds the keyword arguments of one sample.
goldens: List[Dict] = []
try:
    with open(file=f"./datasets/{filepath}.jsonl", mode="r", encoding="utf-8") as file:
        for line in file:
            if line.strip():  # Skip empty lines
                goldens.append(json.loads(line))
except FileNotFoundError:
    print(f"File: `./datasets/{filepath}.jsonl` containing goldens not found!")
    # Re-raise: continuing would leave `evaluation_dataset` undefined and make a later
    # cell fail with an unrelated NameError.
    raise
except json.JSONDecodeError as e:
    print(f"Error parsing JSONL file: {e}")
    raise

samples: List[SingleTurnSample] = [SingleTurnSample(**golden) for golden in goldens]

evaluation_dataset = EvaluationDataset(samples)

Please specify which dataset to evaluate (only the file name):  test_id_1-dataset


In [None]:
import os
from langchain_ollama import ChatOllama, OllamaEmbeddings

from ragas import RunConfig, DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Instantiating required objects

# Address of the local Ollama server, shared by the chat model and the embedding model.
OLLAMA_BASE_URL: str = "http://localhost:11434"

run_config = RunConfig(
    timeout=86400,    # 24 hours on waiting for a single operation
    max_retries=20,   # Max retries before giving up
    max_wait=600,     # Max wait between retries
    max_workers=4,    # Concurrent requests
    log_tenacity=True # Print retry attempts
)

# This stores data generation and evaluation results locally on disk
# When using it for the first time, it will create a `.cache-<dataset name>` folder
# When using it again, it will read from that folder and finish almost instantly
cacher = DiskCacheBackend(cache_dir=f".cache-{filepath}")

ragas_llm = LangchainLLMWrapper(
    langchain_llm=ChatOllama(
        model=evaluation_model,
        base_url=OLLAMA_BASE_URL,
        # Required setting: indexing os.environ raises a KeyError naming the variable
        # when it is missing, instead of an opaque TypeError from float(None).
        temperature=float(os.environ["EVALUATION_TEMPERATURE"]),
        # Same fallback (16000) that is used for OLLAMA_CONTEXT_LENGTH when the server is
        # started, so client and server agree when the variable is not set.
        num_ctx=int(os.getenv("LLM_CONTEXT_WINDOW_TOKENS", "16000")),
        format="json"
    ),
    run_config=run_config,
    cache=cacher
)

ragas_embeddings = LangchainEmbeddingsWrapper(
    embeddings=OllamaEmbeddings(
        model=embedding_model,
        base_url=OLLAMA_BASE_URL
    ),
    run_config=run_config,
    cache=cacher
)

In [6]:
from ragas.metrics import LLMContextPrecisionWithReference
from prompts.metrics.custom_context_precision_prompt import MyContextPrecisionPrompt

# Metric 1: LLM-based context precision ("WithReference" variant — presumably judged against
# the sample's reference answer; confirm in the ragas docs).
# The project's own prompt from `prompts/metrics` is passed in place of the library default.
context_precision = LLMContextPrecisionWithReference(
    name="context_precision",  # explicit metric name (presumably the results column — confirm)
    context_precision_prompt=MyContextPrecisionPrompt(),
    max_retries=20
)

In [7]:
from ragas.metrics import LLMContextRecall   
from prompts.metrics.custom_context_recall_prompt import MyContextRecallPrompt

# Metric 2: LLM-based context recall, configured with the project's own prompt
# from `prompts/metrics` in place of the library default.
context_recall = LLMContextRecall(
    name="context_recall",  # explicit metric name (presumably the results column — confirm)
    context_recall_prompt=MyContextRecallPrompt(),
    max_retries=20
)

In [8]:
from ragas.metrics import ContextEntityRecall
from prompts.metrics.custom_context_entities_recall_prompt import MyContextEntitiesRecallPrompt

# Metric 3: context entity recall, configured with the project's own prompt.
# No `name` is passed here, so the library's default metric name applies.
context_entity_recall = ContextEntityRecall(
    context_entity_recall_prompt=MyContextEntitiesRecallPrompt(),
    max_retries=20
)

In [9]:
from ragas.metrics import NoiseSensitivity
from prompts.metrics.faithfulness.custom_nli_generator_prompt import MyNLIStatementPrompt
from prompts.metrics.faithfulness.custom_statement_generator_prompt import MyStatementGeneratorPrompt

# Metric 4: noise sensitivity. It is configured with the two custom prompts that live under
# `prompts/metrics/faithfulness` (statement generation and NLI verdicts).
noise_sensitivity = NoiseSensitivity(
    nli_statements_prompt=MyNLIStatementPrompt(),
    statement_generator_prompt=MyStatementGeneratorPrompt(),
    max_retries=20
)

In [10]:
from ragas.metrics import ResponseRelevancy
from prompts.metrics.custom_response_relevance_prompt import MyResponseRelevancePrompt

# Metric 5: response relevancy, with the project's own prompt supplied as `question_generation`.
# NOTE(review): unlike most of the other metrics in this notebook, no `max_retries` is passed
# here — confirm whether the constructor accepts it and whether the omission is intentional.
response_relevancy = ResponseRelevancy(
    question_generation=MyResponseRelevancePrompt()
)

In [11]:
from ragas.metrics import Faithfulness
from prompts.metrics.faithfulness.custom_nli_generator_prompt import MyNLIStatementPrompt
from prompts.metrics.faithfulness.custom_statement_generator_prompt import MyStatementGeneratorPrompt

# Metric 6: faithfulness, configured with the project's custom statement-generation and
# NLI prompts from `prompts/metrics/faithfulness`.
faithfulness = Faithfulness(
    nli_statements_prompt=MyNLIStatementPrompt(),
    statement_generator_prompt=MyStatementGeneratorPrompt(),
    max_retries=20,
)

In [12]:
from ragas.metrics import FactualCorrectness
from prompts.metrics.faithfulness.custom_nli_generator_prompt import MyNLIStatementPrompt

# Metric 7: factual correctness. It reuses the custom NLI prompt from the faithfulness
# prompt package as its `nli_prompt`.
# NOTE(review): no `max_retries` is passed here, unlike most of the other metrics in this
# notebook — confirm whether the constructor accepts it and whether the omission is intentional.
factual_correctness = FactualCorrectness(
    nli_prompt=MyNLIStatementPrompt()
)

In [13]:
from ragas.metrics import SemanticSimilarity

# Metric 8: semantic similarity. No prompt is passed to this metric.
# NOTE(review): the inline claim about the default threshold should be verified against
# ragas 0.2.15 — the library may default to no threshold (raw similarity score) and
# binarise the score only when a threshold is given.
semantic_similarity = SemanticSimilarity(
    threshold=0.7, # Default is 0.5 = 50%
)

In [None]:
from ragas.evaluation import evaluate, EvaluationResult

# Run all eight metrics over the loaded dataset. This cell depends on objects created by
# earlier cells: `evaluation_dataset`, the metric instances, `ragas_llm`, `ragas_embeddings`,
# `run_config` and `filepath`.
results: EvaluationResult = evaluate(
    dataset=evaluation_dataset,
    metrics=[
        context_precision,           # Metric 1
        context_recall,              # Metric 2
        context_entity_recall,       # Metric 3
        noise_sensitivity,           # Metric 4
        response_relevancy,          # Metric 5
        faithfulness,                # Metric 6
        factual_correctness,         # Metric 7
        semantic_similarity          # Metric 8
    ],
    llm=ragas_llm,                # Ollama chat model wrapped for ragas
    embeddings=ragas_embeddings,  # Ollama embedding model wrapped for ragas
    experiment_name=f"{filepath}-evaluation",  # named after the chosen dataset
    run_config=run_config,        # same timeout/retry/worker settings as the wrappers
    show_progress=True
)

In [None]:
import pandas as pd

# Save results locally (optional)
# The evaluation result is converted to a DataFrame (presumably one row per sample — confirm)
# and written to the current working directory as `<dataset name>-eval-results.csv`.
result_df: pd.DataFrame = results.to_pandas()
result_df.to_csv(f'./{filepath}-eval-results.csv', index=False)

# Display metric scores
# A bare last expression lets the notebook render the result object's repr.
results

In [None]:
# Upload the evaluation results (OPTIONAL STEP).
# Presumably requires the RAGAS_APP_TOKEN environment variable set from the Kaggle secret
# in the optional cell earlier — confirm against the ragas documentation.
results.upload()

In [None]:
# List the current working directory, e.g. to check that the results CSV and the cache
# folder were written.
! ls