In [1]:
import copy
import os
from concurrent.futures import ThreadPoolExecutor
from getpass import getpass

import evaluate as eval
import pandas as pd
from datasets import Dataset
from ragas import evaluate as rag_eval
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

  from .autonotebook import tqdm as notebook_tqdm


# Computing Basic QA Metrics (F1 and Exact Match Scores)

In [2]:
# Score the generated answers against the reference answers with the SQuAD
# metric. The metric is loaded once and reused for the single compute() call.
squad_metric = eval.load("squad")

enhanced_df = pd.read_csv('../results/enhanced_rag_answers.csv')

# read_csv turns blank cells into float NaN, and str.find() below raises
# TypeError on a float. Work on a string-typed copy of the three text columns
# (NaN -> "") so one missing value cannot abort the whole evaluation.
qa_text = (
    enhanced_df[['generated_answer', 'answer', 'combined_context']]
    .fillna('')
    .astype(str)
)

predictions = []
references = []

# Iterate the columns directly instead of iterrows(); the DataFrame index is
# the id that pairs each prediction with its reference.
for index, generated_answer, answer, context in zip(
    qa_text.index,
    qa_text['generated_answer'],
    qa_text['answer'],
    qa_text['combined_context'],
):
    unique_id = str(index)

    predictions.append({
        'id': unique_id,
        'prediction_text': generated_answer
    })

    references.append({
        'id': unique_id,
        'answers': {
            'text': [answer],                         # Must be a list of strings
            # str.find() yields -1 when the answer is not a verbatim
            # substring of the context.
            'answer_start': [context.find(answer)]    # Must be a list of integers
        }
    })

results = squad_metric.compute(predictions=predictions, references=references)

print("Evaluation Results:")
print(f"EM: {results['exact_match']:.2f}\nF1: {results['f1']:.2f}")

Evaluation Results:
EM: 42.59
F1: 54.12


# Advanced Evaluation Using RAGAS

In [3]:
# Only the first 200 queries are sent through the RAGAS evaluation
enhanced_df_subset = enhanced_df.iloc[:200]

# Column-oriented payload: one list per field, all of equal length.
enhanced_data = dict(
    # Questions from the dataset
    question=enhanced_df_subset['question'].tolist(),
    # Answers produced by the pipeline
    answer=enhanced_df_subset['generated_answer'].tolist(),
    # Each combined context string is wrapped in its own one-element list
    retrieved_contexts=[[context] for context in enhanced_df_subset['combined_context']],
    # Ground-truth (human-annotated) answers
    reference=enhanced_df_subset['answer'].tolist(),
)

# Build the Dataset object from the column dict and report how many rows it holds
enhanced_dataset = Dataset.from_dict(enhanced_data)
print(f"Dataset size: {len(enhanced_dataset)} samples")

Dataset size: 200 samples


In [None]:
# Use the key already present in the environment. Only when it is missing,
# prompt for it without echo, so the secret is never written into the notebook
# source or its saved outputs and a configured key is never overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Function to evaluate a subset of the dataset
def evaluate_subset(subset_dataset, metrics):
    """Evaluate a subset of the dataset using RAGAS metrics.

    Args:
        subset_dataset: Dataset chunk handed to ``rag_eval`` unchanged.
        metrics: Metric objects to score with. Each one is shallow-copied
            before use, so the caller's objects are never passed to
            ``rag_eval`` and are not modified by it.

    Returns:
        ``result.to_pandas()`` on success, or ``None`` when anything raised;
        the error is printed rather than re-raised (best-effort per chunk).
    """
    try:
        # This function is run from several threads with the SAME metric
        # objects. NOTE(review): the recorded run logged "LLM is not set"
        # assertion failures, which looks like one evaluation changing state on
        # a metric another evaluation was still using. Confirm against the
        # ragas evaluate() source. Private copies per call remove the sharing.
        private_metrics = [copy.copy(metric) for metric in metrics]
        result = rag_eval(
            dataset=subset_dataset,
            metrics=private_metrics
        )
        return result.to_pandas()
    except Exception as e:
        print(f"Error in evaluation: {e}")
        return None

# Break the dataset into fixed-size pieces that can be evaluated independently
def split_dataset_into_chunks(dataset, chunk_size=50):
    """Return consecutive ``chunk_size``-row slices of ``dataset`` as new Datasets.

    Only the "question", "answer", "retrieved_contexts" and "reference"
    columns are carried over. The final chunk holds whatever rows remain, so
    it may be shorter than ``chunk_size``.
    """
    column_names = ("question", "answer", "retrieved_contexts", "reference")
    return [
        Dataset.from_dict(
            {name: dataset[name][offset:offset + chunk_size] for name in column_names}
        )
        for offset in range(0, len(dataset), chunk_size)
    ]

# Define metrics
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall
]

# Split dataset into chunks (50 samples per chunk for 200 total samples = 4 chunks)
dataset_chunks = split_dataset_into_chunks(enhanced_dataset, chunk_size=50)
print(f"Split dataset into {len(dataset_chunks)} chunks of ~50 samples each")

# Use ThreadPoolExecutor for parallel evaluation
n_workers = 4  # Adjust based on your system capabilities
print(f"Using {n_workers} workers for parallel evaluation")

results_list = []
failed_chunks = []  # 1-based numbers of the chunks that produced no result

with ThreadPoolExecutor(max_workers=n_workers) as executor:
    # Submit all evaluation tasks
    future_results = [
        executor.submit(evaluate_subset, chunk, metrics)
        for chunk in dataset_chunks
    ]

    # Collect results in submission order so the combined rows keep the
    # dataset's order. No timeout is passed to result(): a timed-out future
    # cannot be cancelled once running, and leaving this `with` block waits for
    # every worker anyway, so a timeout here would only discard a result that
    # is still going to be computed.
    for chunk_number, future in enumerate(future_results, start=1):
        try:
            result = future.result()
        except Exception as e:
            print(f"Error in chunk {chunk_number}: {e}")
            failed_chunks.append(chunk_number)
        else:
            if result is not None:
                results_list.append(result)
                print(f"Completed evaluation for chunk {chunk_number}")
            else:
                print(f"Failed evaluation for chunk {chunk_number}")
                failed_chunks.append(chunk_number)

# Make partial coverage explicit: every score computed from the combined frame
# silently excludes the rows of any chunk listed here.
if failed_chunks:
    print(f"Warning: no results for chunk(s) {failed_chunks}; combined results are incomplete")

# Combine all results
if results_list:
    enhanced_df_results = pd.concat(results_list, ignore_index=True)
    print(f"Combined results shape: {enhanced_df_results.shape}")
    print(enhanced_df_results.head())
else:
    print("No successful evaluations completed")

Split dataset into 4 chunks of ~50 samples each
Using 4 workers for parallel evaluation



Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

[A[AException raised in Job[7]: APIConnectionError(Connection error.)
Exception raised in Job[0]: APIConnectionError(Connection error.)
Evaluating:   0%|          | 1/200 [00:11<36:47, 11.10s/it]Exception raised in Job[6]: APIConnectionError(Connection error.)
Exception raised in Job[15]: APIConnectionError(Connection error.)


[A[AException raised in Job[2]: APIConnectionError(Connection error.)
Exception raised in Job[14]: APIConnectionError(Connection error.)
Exception raised in Job[14]: APIConnectionError(Connection error.)
Exception raised in Job[13]: APIConnectionError(Connection error.)

[AException raised in Job[6]: APIConnectionError(Connection error.)
Exception raised in Job[7]: APIConnectionError(Connection error.)
Exception raised in Job[14]: APIConnectionError(Connection error.)
Evaluating:   1%|          | 2/200 [00:11<15:24,  4.67s/it]Exception raised in Job[6]: APIConnectionError(Connection error.)
Evaluating:  

Completed evaluation for chunk 1


Exception raised in Job[105]: TimeoutError()


[A[AException raised in Job[198]: AssertionError(LLM is not set)
Exception raised in Job[199]: AssertionError(set LLM before use)
Exception raised in Job[129]: TimeoutError()


[A[AException raised in Job[160]: TimeoutError()
Evaluating: 100%|██████████| 200/200 [04:21<00:00,  1.31s/it]


Completed evaluation for chunk 2


Exception raised in Job[180]: APIConnectionError(Connection error.)

Evaluating: 100%|██████████| 200/200 [04:24<00:00,  1.32s/it]
Exception raised in Job[145]: TimeoutError()


[A[AException raised in Job[157]: TimeoutError()


[A[AException raised in Job[181]: TimeoutError()


[A[AException raised in Job[182]: TimeoutError()


[A[AException raised in Job[185]: TimeoutError()


[A[AException raised in Job[186]: TimeoutError()


[A[AException raised in Job[189]: TimeoutError()


[A[AException raised in Job[190]: TimeoutError()


[A[AException raised in Job[191]: TimeoutError()


[A[AException raised in Job[193]: TimeoutError()


[A[AException raised in Job[194]: TimeoutError()


[A[AException raised in Job[195]: TimeoutError()


[A[AException raised in Job[196]: TimeoutError()


[A[AException raised in Job[197]: TimeoutError()


Evaluating: 100%|██████████| 200/200 [06:45<00:00,  2.03s/it]


Completed evaluation for chunk 3
Completed evaluation for chunk 4
Combined results shape: (200, 8)
                                          user_input  \
0  Was Abraham Lincoln the sixteenth President of...   
1  Did Lincoln sign the National Banking Act of 1...   
2                   Did his mother die of pneumonia?   
3      How many long was Lincoln's formal education?   
4       When did Lincoln begin his political career?   

                                  retrieved_contexts                response  \
0  [Young Abraham Lincoln\n\nAbraham Lincoln (Feb...                    yes.   
1  [Lincoln believed in the Whig theory of the pr...                    Yes.   
2  [An autopsy performed after his death revealed...  not enough information   
3  [Lincoln's formal education consisted of about...              18 months.   
4  [Lincoln began his political career in 1832, a...                   1832.   

   reference  faithfulness  answer_relevancy  context_precision  \
0        yes    

In [None]:
# Write the per-sample evaluation table to disk, leaving out the row index column
enhanced_df_results.to_csv(
    path_or_buf="../results/enhanced_rag_evaluation.csv",
    index=False,
)

In [None]:
# Column-wise mean of the four RAGAS metric columns
mean_scores = enhanced_df_results.loc[
    :, ["faithfulness", "answer_relevancy", "context_precision", "context_recall"]
].mean()
print("Mean Enhanced Scores:", mean_scores, sep="\n")

Mean Naive Scores:
faithfulness        NaN
answer_relevancy    NaN
context_precision   NaN
context_recall      NaN
dtype: float64
