In [1]:
import nest_asyncio
from dotenv import load_dotenv

# Patch asyncio so that an event loop can be re-entered -- presumably needed because the
# notebook kernel already runs a loop while ragas evaluates asynchronously (confirm).
nest_asyncio.apply()

"""
Loads the ragas app token which enables one to upload results from evaluations for later reference
Additionally, I have a Langsmith api key which enables one to track the evaluation in real time:
    https://docs.ragas.io/en/latest/howtos/integrations/langsmith/#tracing-ragas-metrics
"""
# Last expression of the cell: the `True` shown as cell output is load_dotenv()'s return value.
# The tokens themselves stay out of the notebook; they are expected in a local .env file.
load_dotenv()

True

In [2]:
from ragas import EvaluationDataset

def load_dataset(filepath: str = "dataset.jsonl") -> EvaluationDataset:
    """Read a JSON Lines file into a ragas ``EvaluationDataset``.

    Args:
        filepath: Path of the ``.jsonl`` file to read. Defaults to
            ``dataset.jsonl``.

    Returns:
        Whatever ``EvaluationDataset.from_jsonl`` builds from that file.
    """
    dataset = EvaluationDataset.from_jsonl(filepath)
    return dataset


# Loaded once with the default path; the evaluate(...) call further down consumes it.
eval_dataset = load_dataset()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

from ragas.evaluation import evaluate
from ragas.run_config import RunConfig
from ragas.cache import DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Single source of truth for the local Ollama server, so the LLM client and the
# embedding client cannot end up pointing at different hosts by accident.
OLLAMA_BASE_URL = "http://localhost:11434"

# Judge model that the LLM-based ragas metrics call.
ollama_llm = OllamaLLM(
    model="llama3.1",
    base_url=OLLAMA_BASE_URL,
    temperature=0.1,  # low temperature to keep judgements close to deterministic
    num_ctx=24000,    # enlarged context window (presumably in tokens -- confirm in the Ollama docs)
    format="json",    # ask Ollama for JSON output; presumably so ragas can parse the replies (confirm)
)

# Embedding model; the evaluate(...) call receives it through the wrapper defined below.
ollama_embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
    base_url=OLLAMA_BASE_URL,
)

# Execution settings shared by both ragas wrappers and by the evaluate(...) call below.
run_config = RunConfig(
    timeout = 14400, # 14400 s = four hours; depends on GPU, model, test size, etc. -> worth experimenting with
    max_wait = 30, # presumably the upper bound (in seconds) on the wait between retries -- confirm in the RunConfig docs
    log_tenacity = True # presumably logs retry attempts made via tenacity -- confirm
)

# Disk-backed cache (constructed with ".cache") that both wrappers share.
cacher = DiskCacheBackend(".cache")

# The LLM wrapper and the embeddings wrapper take the same run configuration and cache,
# so the common keyword arguments are spelled out once.
_shared_wrapper_kwargs = {"run_config": run_config, "cache": cacher}

llm = LangchainLLMWrapper(langchain_llm=ollama_llm, **_shared_wrapper_kwargs)

embeddings = LangchainEmbeddingsWrapper(embeddings=ollama_embeddings, **_shared_wrapper_kwargs)

In [4]:
from ragas.metrics import LLMContextPrecisionWithReference

"""
Measures the number of relevant chunks with respect to the number of all chunks at a given rank.

Example:
    We have 4 chunks in total that were retrieved by RAG and 2 of those were deemed relevant
    for answering the question of the user. For each rank k (1, 2, 3, 4), we calculate the 
    precision as the number of relevant chunks divided by the number of chunks at that rank.
    
    Assuming chunks at rank 1 and 3 were relevant it would look like this:
        precision @ 1 => 1/1 = 1 (since the chunk is relevant and we have only 1 chunk at rank 1)
        precision @ 2 => 1/2 = 0.5 (since there's only one relevant chunk, but 2 chunks at rank 2)
        precision @ 3 => 2/3 = 0.67 (since 2 chunks were relevant at rank 3 where we have 3 in total)
        precision @ 4 => 2/4 = 0.5 (since 2 out of all chunks were deemed relevant at rank 4)
        
        Final score in this case would be:
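        Context precision @ (K = 4) => (precision @ 1 * v1 + precision @ 2 * v2 + precision @ 3 * v3 + precision @ 4 * v4) / # relevant chunks
            where vk is 1 if the chunk at rank k is relevant and 0 otherwise
            => (1*1 + 0.5*0 + 0.67*1 + 0.5*0) / 2 = 0.835 (≈ 0.83 when 2/3 is not rounded to 0.67)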
            
    Abstract formula:
        precision @ k = (true positives @ k) / (true positives @ k + false positives @ k)
        context precision @ (K = n) = (precision @ 1 * v1 + ... + precision @ n * vn) / # relevant chunks
            where v1, ..., vn are in {0,1} => so either a chunk is relevant or not

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/
"""
# No LLM is bound here; the evaluate(...) call further down is given llm=..., which is
# presumably what this metric ends up using -- confirm against the ragas docs.
context_precision = LLMContextPrecisionWithReference()

In [5]:
from ragas.metrics import LLMContextRecall

"""
Measures how much of the relevant documents / pieces of information were retrieved, where the focus
lies on not missing any relevant / important data. The previous metric focuses more on how 
relevant the retrieved chunks are. This metric is all about making sure that we retrieve 
all the necessary information, without missing important data.
Higher value for this metric means no missed or very few missed chunks.

Abstract formula:
    Context Recall = number of claims in the reference that are supported by the retrieved context / total number of claims in the reference

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_recall/
"""
# Default-configured metric; its score shows up as 'context_recall' in the evaluation result.
llm_context_recall = LLMContextRecall()

In [6]:
from ragas.metrics import ResponseRelevancy

"""
This metric measures how well the answer addresses the user's query; redundant or missing information lowers the score.
The idea is to start from the LLM's response: depending on the strictness value (default 3), we use the LLM to
create that many artificial questions (Q1-Q3 by default) and compute the vector similarity between the original query
the user submitted and the questions we were able to infer from the answer of the LLM. A high score means that the
answer is relevant with respect to the question.

NOTE: This doesn't measure factuality, since no reference is used.
"""

# Default-configured metric; its score shows up as 'answer_relevancy' in the evaluation result.
response_relevancy = ResponseRelevancy()

In [7]:
from ragas.metrics import FaithfulnesswithHHEM

"""
The Faithfulness metric measures how factually consistent a response is with the retrieved context. 
It ranges from 0 to 1, with higher scores indicating better consistency.

A response is considered faithful if all its claims can be supported by the retrieved context.

To calculate this:
1. Identify all the claims in the response.
2. Check each claim to see if it can be inferred from the retrieved context.
3. Compute the faithfulness score using the formula:

Faithfulness Score = Number of claims supported by the retrieved context / Total number of claims in the response

This metric uses a particular model (HHEM) specifically trained to detect hallucinations.
It will be used in the second step, when the claims from the response are compared to the retrieved context.
"""

# Run the HHEM classifier on the first GPU when CUDA is available and fall back to the
# CPU otherwise, so this cell no longer fails on machines without a usable GPU.
import torch

hhem_device = "cuda:0" if torch.cuda.is_available() else "cpu"
faithfulness = FaithfulnesswithHHEM(device=hhem_device)

You are using a model of type HHEMv2Config to instantiate a model of type HHEMv2. This is not supported for all configurations of models and can yield errors.


In [8]:
from ragas.metrics import FactualCorrectness

"""
Measures the factual consistency between the reference and the actual response by the LLM.

It uses true positives, false positives, false negatives.
TP = claims that appear in the response and are supported by the reference
FP = claims that appear in the response but are not supported by the reference
FN = claims that appear in the reference but are missing from the response

Precision, Recall, and F1 modes

Precision = TP / (TP + FP) => share of the response's claims that the reference supports (penalizes redundant/unsupported claims)
Recall = TP / (TP + FN) => share of the reference's claims that the response covers (penalizes missing claims)
F1 = 2 * Precision * Recall / (Precision + Recall)

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/factual_correctness/
"""

# atomicity and coverage are both set to "high" -- presumably they control how finely the
# texts are split into claims and how much of the text those claims must cover (confirm in
# the linked docs). The mode is not set here; the evaluation result reports it as mode=f1.
factual_correctness = FactualCorrectness(atomicity="high", coverage="high")

In [9]:
# Metrics to score, in the order in which they appear in the result.
selected_metrics = [
    context_precision,
    llm_context_recall,
    response_relevancy,
    faithfulness,
    factual_correctness,
]

# Long-running cell: the recorded run needed roughly 3h15m for 260 evaluation jobs.
result = evaluate(
    dataset=eval_dataset,
    metrics=selected_metrics,
    llm=llm,
    embeddings=embeddings,
    run_config=run_config,
)

Evaluating: 100%|██████████| 260/260 [3:15:39<00:00, 45.15s/it]   


In [10]:
# Convert the evaluation result to a DataFrame and persist it as CSV (index column
# omitted) so the scores can be inspected without re-running the evaluation.
result_df = result.to_pandas()
result_df.to_csv('metrics_evaluation.csv', index=False)
# Trailing expression: makes the notebook display the aggregate score per metric.
result

{'llm_context_precision_with_reference': 0.7500, 'context_recall': 0.5730, 'answer_relevancy': 0.9423, 'faithfulness_with_hhem': 0.4200, 'factual_correctness(mode=f1)': 0.4952}

In [11]:
# Upload the results to the ragas dashboard (relies on the ragas app token loaded via
# load_dotenv() in the first cell); the call prints and returns the dashboard URL.
result.upload()

Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/4cd1785f-206c-474d-a92e-f3a90538aa51


'https://app.ragas.io/dashboard/alignment/evaluation/4cd1785f-206c-474d-a92e-f3a90538aa51'