In [4]:
import pandas as pd
from ragas import evaluate as rag_eval
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)
from datasets import Dataset
import evaluate as eval

# Finding out the Basic QA Metrics (F1 score, EM score)

In [8]:
squad_metric = eval.load("squad")

df = pd.read_csv('../results/rag_generated_answers.csv')

predictions = []
references = []

# Loop through each row of the DataFrame
for index, row in df.iterrows():
    # Create a unique ID for each row. Using the index is a simple way.
    unique_id = str(index)

    # Format the prediction dictionary
    predictions.append({
        'id': unique_id,
        'prediction_text': row['generated_answer']
    })

    # Format the reference dictionary
    answer = row['answer']
    context = row['top_1_context']

    # Calculate the start index of the answer in the context
    answer_start = context.find(answer)

    references.append({
        'id': unique_id,
        'answers': {
            'text': [answer],                   # Must be a list of strings
            'answer_start': [answer_start]      # Must be a list of integers
        }
    })

squad_metric = eval.load("squad")
results = squad_metric.compute(predictions=predictions, references=references)

print("Evaluation Results:")
print(f"EM: {results['exact_match']:.2f}\nF1: {results['f1']:.2f}")

Evaluation Results:
EM: 34.86
F1: 43.13


# Advanced Evaluation using RAGAs

In [20]:
data = {
    "question": df['question'].tolist(),           # List of questions from the dataset
    "answer": df['generated_answer'].tolist(),     # List of generated answers
    "retrieved_contexts": df['top_1_context'].apply(lambda x: [x]).tolist(),      # List of top-1 contexts
    "reference": df['answer'].tolist()         # List of ground truth answers (human-annotated)
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [None]:
import os
os.environ["OPENAI_API_KEY"] = ""

In [None]:
# Run the evaluation
result = rag_eval(
    dataset=dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall
    ]
)

df_results = result.to_pandas()
print(df_results.head())

Evaluating:   0%|          | 9/3672 [02:45<20:18:50, 19.96s/it]Exception raised in Job[3]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[9]: TimeoutError()
Exception raised in Job[10]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[0]: TimeoutError()
Evaluating:   0%|          | 10/3672 [03:00<18:57:06, 18.63s/it]Exception raised in Job[4]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Evaluating:   1%|          | 22/3672 [04:06<9:05:53,  8.97s/it] Exception raised in Job[20]: TimeoutError()
Evaluating:   1%|          | 23/3672 [04:24<10:45:18, 10.61s/it]Exception raised in Job[21]: TimeoutError()
Evaluating:   1%|          | 24/3672 [04:49<13:50:53, 13.67s/it]Exception raised in Job[22]: TimeoutError()
Evaluating:   1%|          | 25/3672 [04:51<11:11:06, 11.04s/it]Exception raised in Job[23]: TimeoutError()
Excepti

                                          user_input  \
0  Was Abraham Lincoln the sixteenth President of...   
1  Did Lincoln sign the National Banking Act of 1...   
2                   Did his mother die of pneumonia?   
3      How many long was Lincoln's formal education?   
4       When did Lincoln begin his political career?   

                                  retrieved_contexts  \
0                            [Young Abraham Lincoln]   
1  [Lincoln believed in the Whig theory of the pr...   
2  [An autopsy performed after his death revealed...   
3  [Lincoln's formal education consisted of about...   
4  [Lincoln began his political career in 1832, a...   

                    response  reference  faithfulness  answer_relevancy  \
0                       yes.        yes           NaN          0.783770   
1  House of Representatives.        yes           NaN          0.739514   
2                        No.         no           NaN               NaN   
3                 18 month

In [None]:
df_results.to_csv("../results/rag_evaluation_results.csv", index=False)

In [28]:
mean_scores = df_results[["faithfulness", "answer_relevancy", "context_precision", "context_recall"]].mean()
print("Mean Scores:")
print(mean_scores)

Mean Scores:
faithfulness         0.502273
answer_relevancy     0.696658
context_precision    0.634369
context_recall       0.541591
dtype: float64
