# Prepare dataset


In [14]:
from datasets import load_dataset

from ragas.run_config import RunConfig

# Load the "english_v2" configuration of the explodinggradients/amnesty_qa dataset.
# The returned object is indexed by split name further down (the "eval" split is used).
amnesty_qa = load_dataset(
    path="explodinggradients/amnesty_qa",
    name="english_v2",
)

Repo card metadata block was not found. Setting CardData to empty.


In [2]:
# Number of rows from the "eval" split to score. Presumably kept small so the
# evaluation finishes quickly -- raise it for a more representative run.
N_EVAL_SAMPLES = 2

# Take the first N_EVAL_SAMPLES rows of the "eval" split. The count is clamped to
# the split length so a larger N_EVAL_SAMPLES cannot request out-of-range indices.
eval_split = amnesty_qa["eval"]
amnesty_subset = eval_split.select(range(min(N_EVAL_SAMPLES, len(eval_split))))

In [3]:
# Display the selected rows as a DataFrame to inspect the columns
# (question, ground_truth, answer, contexts) before evaluating.
amnesty_subset.to_pandas()

Unnamed: 0,question,ground_truth,answer,contexts
0,What are the global implications of the USA Su...,The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,"[- In 2022, the USA Supreme Court handed down ..."
1,Which companies are the main contributors to G...,"According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...","[In recent years, there has been increasing pr..."


# Initialize metrics and models


In [10]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity,
)

In [5]:
from langchain_ollama import ChatOllama, OllamaEmbeddings
from ragas import evaluate

In [6]:
import os

# Ollama connection settings. Both values can be overridden through environment
# variables so the notebook is not tied to a single hard-coded host; the defaults
# reproduce the original configuration.
model_name = os.environ.get("OLLAMA_MODEL", "llama3.1:8b")
base_url = os.environ.get("OLLAMA_BASE_URL", "http://61.28.230.60:11434")

# Chat model handed to evaluate() as `llm`.
langchain_llm = ChatOllama(model=model_name, base_url=base_url)

# Embeddings client handed to evaluate() as `embeddings`.
# NOTE(review): this reuses the chat model for embeddings; a dedicated embedding
# model may be more appropriate for the similarity-based metrics -- confirm.
langchain_embeddings = OllamaEmbeddings(model=model_name, base_url=base_url)

# Evaluate


In [15]:
# Metrics to compute, in the order they are passed to evaluate().
eval_metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    answer_correctness,
    answer_similarity,
]

# Execution limits for the evaluation jobs.
# NOTE(review): the saved output of this cell shows TimeoutError for several jobs
# and NaN scores in the result table, so these limits may need tuning for the
# target Ollama server -- confirm by re-running.
eval_run_config = RunConfig(max_workers=3, timeout=240)

# Score the subset with the Ollama-backed LLM and embeddings.
result = evaluate(
    dataset=amnesty_subset,
    metrics=eval_metrics,
    llm=langchain_llm,
    embeddings=langchain_embeddings,
    run_config=eval_run_config,
)

Evaluating:  17%|█▋        | 2/12 [00:15<01:22,  8.24s/it]Exception raised in Job[0]: TimeoutError()
Evaluating:  25%|██▌       | 3/12 [04:00<16:02, 106.94s/it]Exception raised in Job[1]: TimeoutError()
  embedding_1_normalized = embedding_1 / norms_1
  embedding_2_normalized = embedding_2 / norms_2
Evaluating:  42%|████▏     | 5/12 [04:14<05:52, 50.34s/it] Exception raised in Job[4]: TimeoutError()
  np.dot(gen_question_vec, question_vec.T).reshape(
  embedding_1_normalized = embedding_1 / norms_1
Evaluating:  83%|████████▎ | 10/12 [04:30<00:22, 11.44s/it]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt correctness_classifier failed to parse output: The output parser failed to parse the output

In [8]:
# Print the scores held by `result`.
# NOTE(review): this cell's execution count (In [8]) is lower than that of the
# evaluate cell (In [15]), and the saved output lists only four metrics, so it
# appears to come from an earlier run -- re-run this cell after evaluate().
print(result)

{'context_precision': 1.0000, 'faithfulness': 0.2143, 'answer_relevancy': 0.3890, 'context_recall': 0.9286}


In [16]:
# Display the per-row scores as a DataFrame (one column per metric).
# NaN entries in the saved output coincide with the timeouts and parse failures
# logged by the evaluate cell.
result.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,response,reference,faithfulness,answer_relevancy,context_precision,context_recall,answer_correctness,semantic_similarity
0,What are the global implications of the USA Su...,"[- In 2022, the USA Supreme Court handed down ...",The global implications of the USA Supreme Cou...,The global implications of the USA Supreme Cou...,,,1.0,0.857143,,
1,Which companies are the main contributors to G...,"[In recent years, there has been increasing pr...","According to the Carbon Majors database, the m...","According to the Carbon Majors database, the m...",,,1.0,1.0,,
