In [1]:
import os
import nest_asyncio
from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment
# before anything looks up configuration.
load_dotenv()

# Patch asyncio so an event loop can be re-entered; presumably needed because
# the notebook kernel already runs its own loop -- TODO confirm.
nest_asyncio.apply()

# Token for the ragas app; None when RAGAS_APP_TOKEN is not set.
app_token = os.environ.get('RAGAS_APP_TOKEN')

In [2]:
import ast
from typing import Optional

import pandas as pd
from ragas import SingleTurnSample, EvaluationDataset

def load_dataset(path: str = 'dataset.csv', limit: Optional[int] = 2) -> EvaluationDataset:
    """Build a ragas ``EvaluationDataset`` from a CSV file of single-turn samples.

    The CSV must contain the columns ``user_input``, ``retrieved_contexts``,
    ``response`` and ``reference``. ``retrieved_contexts`` holds the string
    representation of a Python list and is parsed back into a real list.

    Args:
        path: Location of the CSV file. Defaults to ``'dataset.csv'``.
        limit: Maximum number of leading rows to convert into samples.
            Defaults to 2 (the value that used to be hard-coded); pass
            ``None`` to use every row. A file with fewer rows than ``limit``
            simply yields fewer samples instead of raising ``IndexError``.

    Returns:
        An ``EvaluationDataset`` holding one ``SingleTurnSample`` per
        selected row, in file order.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If ``limit`` is negative.
    """
    required_columns = ['user_input', 'retrieved_contexts', 'response', 'reference']

    df = pd.read_csv(path)

    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(f"{path} is missing required column(s): {missing_columns}")

    if limit is None:
        rows = df
    elif limit < 0:
        # DataFrame.head() would interpret a negative value as "all but the
        # last n rows", which is never what a sample limit means.
        raise ValueError(f"limit must be non-negative or None, got {limit}")
    else:
        rows = df.head(limit)

    samples = [
        SingleTurnSample(
            user_input=user_input,
            # Convert the string representation of the list to an actual
            # Python list; only the selected rows are parsed.
            retrieved_contexts=ast.literal_eval(raw_contexts),
            response=response,
            reference=reference,
        )
        for user_input, raw_contexts, response, reference
        in rows[required_columns].itertuples(index=False, name=None)
    ]

    return EvaluationDataset(samples)

# Shared evaluation input for every metric cell below: with the default
# arguments this holds the first two rows of dataset.csv.
eval_dataset = load_dataset()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

from ragas.evaluation import evaluate
from ragas.run_config import RunConfig
from ragas.cache import DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Connection and tuning values are named once so the LLM and the embedding
# model cannot silently end up pointing at different Ollama servers, and so
# a reader can tune the run from a single place.
OLLAMA_BASE_URL = "http://localhost:11434"
LLM_MODEL_NAME = "llama3.1"
EMBEDDING_MODEL_NAME = "mxbai-embed-large"
LLM_TEMPERATURE = 0.1
LLM_CONTEXT_WINDOW = 24000  # forwarded to Ollama as num_ctx
EVAL_TIMEOUT_SECONDS = 3600  # One hour
EVAL_MAX_WAIT_SECONDS = 30  # presumably the cap on wait between retries -- confirm in RunConfig docs
CACHE_DIR = ".cache"

# Generation model used by the LLM-based metrics.
llm_model = OllamaLLM(
    model=LLM_MODEL_NAME,
    base_url=OLLAMA_BASE_URL,
    temperature=LLM_TEMPERATURE,
    num_ctx=LLM_CONTEXT_WINDOW,
)

# Embedding model used by the embedding-based metrics.
embeddings_model = OllamaEmbeddings(
    model=EMBEDDING_MODEL_NAME,
    base_url=OLLAMA_BASE_URL,
)

# Execution settings shared by every evaluate() call in this notebook.
run_config = RunConfig(
    timeout=EVAL_TIMEOUT_SECONDS,
    max_wait=EVAL_MAX_WAIT_SECONDS,
    log_tenacity=True,
)

# A single cache backend is handed to both wrappers below.
cacher = DiskCacheBackend(CACHE_DIR)

# ragas-facing wrappers around the LangChain model objects.
llm = LangchainLLMWrapper(
    langchain_llm=llm_model,
    cache=cacher,
)

embeddings = LangchainEmbeddingsWrapper(
    embeddings=embeddings_model,
    cache=cacher,
)

In [None]:
from ragas.metrics import LLMContextPrecisionWithoutReference

"""
Measures the number of relevant chunks with respect to the number of all chunks at a given rank.

Example:
    We have 4 chunks in total that were retrieved by RAG and 2 of those were deemed relevant
    for answering the question of the user. For each rank k (1, 2, 3, 4), we calculate the 
    precision as the number of relevant chunks divided by the number of chunks at that rank.
    
    Assuming chunks at rank 1 and 3 were relevant it would look like this:
        precision @ 1 => 1/1 = 1 (since the chunk is relevant and we have only 1 chunk at rank 1)
        precision @ 2 => 1/2 = 0.5 (since there's only one relevant chunk, but 2 chunks at rank 2)
        precision @ 3 => 2/3 = 0.67 (since 2 chunks were relevant at rank 3, where we have 3 in total)
        precision @ 4 => 2/4 = 0.5 (since 2 out of all chunks were deemed relevant at rank 4)
        
        Final score in this case would be:
        Context precision @ (K = 4) => (precision @ 1 * v1 + precision @ 2 * v2 + precision @ 3 * v3 + precision @ 4 * v4) / # relevant chunks
            => (1*1 + 0.5*0 + 0.67*1 + 0.5*0) / 2 = 0.835
            
    Abstract formula:
        precision @ k = (true positives @ k) / (true positives @ k + false positives @ k)
        context precision @ (K = n) = (precision @ 1 * v1 + ... + precision @ n * vn) / # relevant chunks
            where v1, ..., vn are in {0,1} => so either a chunk is relevant or not

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_precision/
"""
# Metric instance for the context precision score described above.
context_precision = LLMContextPrecisionWithoutReference()

# Run only this metric over the loaded samples, using the wrapped Ollama
# models and the shared run configuration.
context_precision_results = evaluate(
    eval_dataset,
    metrics=[context_precision],
    llm=llm,
    embeddings=embeddings,
    run_config=run_config,
)

In [None]:
from ragas.metrics import ContextRecall

"""
Measures how much of the relevant documents / pieces of information were retrieved, where the focus
lies on not missing any relevant / important data. The previous metric focuses more on how 
relevant the retrieved chunks are. This metric is all about making sure that we retrieve 
all the necessary information, without missing important data.
Higher value for this metric means no missed or very few missed chunks.

Abstract formula:
    Context Recall = (Number of claims in the reference supported by the retrieved context) / (Total number of claims in the reference)

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/context_recall/
"""
# Metric instance for the context recall score described above.
context_recall = ContextRecall()

# Run only this metric over the loaded samples, using the wrapped Ollama
# models and the shared run configuration.
context_recall_results = evaluate(
    eval_dataset,
    metrics=[context_recall],
    llm=llm,
    embeddings=embeddings,
    run_config=run_config,
)

In [6]:
from ragas.metrics import Faithfulness, AnswerRelevancy
from ragas.evaluation import evaluate
from ragas.run_config import RunConfig

# `run_config` is intentionally not rebuilt here: this cell already depends on
# `llm` and `embeddings` from the model-setup cell, which defines a RunConfig
# with the same timeout / max_wait / log_tenacity values. Reusing it keeps a
# single source of truth for the run settings.
answer_relevancy = AnswerRelevancy()

# Run only the answer relevancy metric over the loaded samples.
result = evaluate(
    dataset=eval_dataset,
    metrics=[answer_relevancy],
    llm=llm,
    embeddings=embeddings,
    run_config=run_config,
)

Evaluating: 100%|██████████| 2/2 [01:08<00:00, 34.50s/it]


In [6]:
# `result` holds the AnswerRelevancy scores computed above, so the file is
# named after that metric; writing it to faithfulness.csv would mislabel the
# data and overwrite any real faithfulness results.
# Create the output directory first: to_csv() does not create missing
# parent directories.
os.makedirs('eval_results', exist_ok=True)

result_df = result.to_pandas()
result_df.to_csv('eval_results/answer_relevancy.csv', index=False)

In [7]:
# Publish the evaluation to the hosted ragas dashboard; the call returns the
# URL of the uploaded run (see the cell output below).
# NOTE(review): presumably authenticated through RAGAS_APP_TOKEN loaded from
# .env in the first cell -- confirm.
result.upload()

Evaluation results uploaded! View at https://app.ragas.io/dashboard/alignment/evaluation/b53b5f82-d241-4905-a5c4-57d93dfc1a0f


'https://app.ragas.io/dashboard/alignment/evaluation/b53b5f82-d241-4905-a5c4-57d93dfc1a0f'

In [None]:
from ragas.exceptions import RagasOutputParserException