In [6]:
from datasets import load_dataset

# Fetch the "english_v3" configuration of the Amnesty QA dataset from the Hugging Face Hub.
# NOTE(review): trust_remote_code=True allows the dataset repository's own loading
# script to execute locally - only keep it for repositories you trust.
dataset = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v3",
    trust_remote_code=True,
)

Repo card metadata block was not found. Setting CardData to empty.


In [7]:
from ragas import EvaluationDataset

# Wrap the "eval" split of the loaded Hugging Face dataset in a ragas EvaluationDataset.
hf_eval_split = dataset["eval"]
eval_dataset = EvaluationDataset.from_hf_dataset(hf_eval_split)

In [8]:
import os
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_ollama import OllamaLLM, OllamaEmbeddings

# Load OLLAMA_* settings from a local .env file (if one exists) into the environment.
load_dotenv()

CHAT_MODEL = os.getenv("OLLAMA_CHAT_MODEL")
OLLAMA_API_BASE = os.getenv("OLLAMA_API_BASE")
EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL")

# Fail fast with an actionable message: os.getenv() returns None for an unset
# variable, and that None would otherwise be handed to the Ollama clients as the
# model name and only surface later as a much less obvious error.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("OLLAMA_CHAT_MODEL", CHAT_MODEL),
        ("OLLAMA_EMBEDDING_MODEL", EMBEDDING_MODEL),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# OLLAMA_API_BASE is passed through unchanged.
# NOTE(review): presumably the clients fall back to their default endpoint when
# base_url is None - confirm against the installed langchain_ollama version.

# Chat/completion model, wrapped so ragas metrics can use it as their judge LLM.
ollama_llm = OllamaLLM(
    model=CHAT_MODEL,
    base_url=OLLAMA_API_BASE,
)
ollama_llm_wrapper = LangchainLLMWrapper(ollama_llm)

# Embedding model, wrapped for the ragas metrics that need embeddings.
ollama_embedding = OllamaEmbeddings(
    model=EMBEDDING_MODEL,
    base_url=OLLAMA_API_BASE,
)
ollama_embedding_wrapper = LangchainEmbeddingsWrapper(ollama_embedding)

In [9]:
from ragas.metrics import LLMContextPrecisionWithReference

# Context Precision looks at the chunks that were retrieved and measures what
# proportion of them is actually relevant, rewarding relevant chunks that are
# ranked near the top.
# The emphasis is on keeping irrelevant material out of the retrieved context
# (retrieving *all* relevant chunks is what Context Recall measures).
# Irrelevant data/chunks in the retrieved context reduce the score.
#
# This variant is used because the eval dataset already contains both the ground
# truth and the retrieved contexts; an LLM compares the retrieved context chunks
# with the ground truth.
context_precision_metric_with_reference = LLMContextPrecisionWithReference(llm=ollama_llm_wrapper)

In [10]:
from ragas.metrics import LLMContextRecall

# Context Recall primarily tests whether all relevant context/chunks were retrieved.
# - Missing relevant chunks are penalised and reduce the score.
# - The emphasis is on not missing anything relevant.
# - Redundant or irrelevant data does not reduce the score.
#
# The LLM-based variant is used instead of NonLLMContextRecall since the retrieved
# context cannot be accessed from the R2R framework.
context_recall_metric = LLMContextRecall(llm=ollama_llm_wrapper)

In [11]:
from ragas.metrics import ContextEntityRecall

# Context Entity Recall checks whether the entities mentioned in the ground truth /
# reference also appear in the retrieved context.
# When all of them are present the score is 1, indicating a perfect match.
# Useful wherever entities matter, for example a tourism help chatbot.
context_entity_recall_metric = ContextEntityRecall(llm=ollama_llm_wrapper)

In [12]:
from ragas.metrics import NoiseSensitivity

# Noise Sensitivity checks how often the LLM provides invalid/incorrect responses
# when working from good or bad (noisy / redundant) retrieved context.
# Lower is better: the lower the score, the more robust the RAG system is to noisy data.
#
# Inputs: the reference / ground truth, the retrieved context, the user_input and
# the actual LLM response.
# The response is split into claims, and each claim is verified against both the
# ground truth and the retrieved context. If either of them does not support the
# claim, the claim is marked as wrong.
#
# focus="irrelevant" evaluates the effect of irrelevant contexts on the generated
# response, i.e. whether irrelevant contexts introduce incorrect claims into it.
# NOTE(review): the name of this keyword may differ between ragas releases -
# confirm against the installed version.
noise_sensitivity_metric = NoiseSensitivity(focus="irrelevant", llm=ollama_llm_wrapper)

In [13]:
from ragas.metrics import ResponseRelevancy

# Response Relevancy measures how relevant/comprehensive the answer is with respect
# to the user input/query.
# Incomplete, irrelevant or redundant answers reduce the score.
# Factuality is not considered in this metric.
#
# How it is computed:
#   1. n hypothetical/artificial questions are derived from the answer.
#   2. Each generated question is compared with the original user_input using
#      cosine similarity of their embeddings.
#   3. The score is the average/mean of those similarity scores.
# The metric works from the user_input and the LLM response.
# NOTE(review): whether retrieved_context is also required depends on the ragas
# version - confirm against the installed release.
#
# Strictness refers to the number of questions generated per answer.
response_relevancy_metric = ResponseRelevancy(
    strictness=3,
    llm=ollama_llm_wrapper,
    embeddings=ollama_embedding_wrapper,
)

In [14]:
from ragas.metrics import Faithfulness

# Faithfulness measures how factually consistent the generated answer is with the
# retrieved context.
# The generated answer is split into claims, and each claim is checked for whether
# it can be inferred from the retrieved context.
# Score = (number of claims that can be inferred) / (total number of claims in the answer).
faithfulness_metric = Faithfulness(llm=ollama_llm_wrapper)

In [15]:
from ragas.metrics import FactualCorrectness

# Factual Correctness works like a fact-checker: it compares the generated answer
# with the ground truth/reference.
# Both the answer and the ground truth are split into claims, and natural language
# inference is used to determine the factual overlap.
#
#   TP (true positive)  - number of claims in the response that are present in the reference.
#   FP (false positive) - number of claims in the response not part of the reference.
#   FN (false negative) - number of claims in the reference not present in the response.
#
#   Precision = TP / (TP + FP)
#   Recall    = TP / (TP + FN)
#   F1        = 2 * Precision * Recall / (Precision + Recall)
#
# Options:
#   mode      - selects which of F1, precision or recall is reported.
#   atomicity - granularity used when splitting the answer and the reference into
#               claims ("low" / "high"). The HIGHER the value, the more a sentence
#               is broken apart into its smallest, meaningful components; "low"
#               keeps sentences largely intact.
#   coverage  - either generalizes the content / focuses on its most important
#               parts ("low") or tries to capture all of it ("high").
factual_correctness_metric = FactualCorrectness(
    llm=ollama_llm_wrapper,
    mode="f1",
    coverage="low",
    atomicity="low",
)

In [16]:
from ragas import evaluate, RunConfig

# Metrics to score in this run. Only uncommented entries are evaluated;
# re-enable any of the others by uncommenting its line.
metrics = [
    # context_precision_metric_with_reference,
    # context_recall_metric,
    # context_entity_recall_metric,
    # noise_sensitivity_metric,
    # response_relevancy_metric,
    faithfulness_metric,
    # factual_correctness_metric,
]

# The timeout is raised to 600 because timeout exceptions are thrown otherwise:
# the model is too weak.
# NOTE(review): if the Ollama server still drops connections under load, lowering
# RunConfig's max_workers and/or batch_size below may help - confirm.
config = RunConfig(timeout=600)

results = evaluate(
    dataset=eval_dataset,
    metrics=metrics,
    run_config=config,
    batch_size=16,
    llm=ollama_llm_wrapper,
    embeddings=ollama_embedding_wrapper,
)

Evaluating:   0%|          | 0/20 [01:58<?, ?it/s]


KeyboardInterrupt: 

Exception raised in Job[4]: RemoteProtocolError(Server disconnected without sending a response.)
Exception raised in Job[5]: ConnectError(All connection attempts failed)
Exception raised in Job[0]: ConnectError(All connection attempts failed)
Exception raised in Job[13]: ConnectError(All connection attempts failed)
Exception raised in Job[12]: ConnectError(All connection attempts failed)
Exception raised in Job[15]: ConnectError(All connection attempts failed)
Exception raised in Job[14]: ConnectError(All connection attempts failed)
Exception raised in Job[8]: ConnectError(All connection attempts failed)
Exception raised in Job[1]: ConnectError(All connection attempts failed)
Exception raised in Job[9]: ConnectError(All connection attempts failed)
Exception raised in Job[2]: ConnectError(All connection attempts failed)
Exception raised in Job[3]: ConnectError(All connection attempts failed)
Exception raised in Job[10]: ReadError()
Exception raised in Job[11]: ReadError()
Exception rais