In [1]:
import os
from dotenv import load_dotenv
# Read settings from the project's .env file (decoded as UTF-8) so later cells
# can pick them up with os.getenv / os.environ.
load_dotenv(encoding='utf-8')

True

# Pre-Production Evaluation

1. Load Synthetic Data: Load the synthetic question–ground_truth pairs generated by the 1-ragas-synthetic-test-data-generation.ipynb notebook.
2. Feed Data to System: Input these synthetic questions into your RAG system.
3. Extract Responses: Use the 2-chat-history-extraction.ipynb notebook to extract the system's generated responses for each question.
4. Match Pairs: Pair each question-ground_truth with its corresponding question-answer-context to create a question-contexts-answer-ground_truth dataframe.
5. Run Evaluation: Use RAGAS to evaluate the RAG system's performance by comparing the generated responses with the ground truths; the evaluation can optionally be stored on LangSmith.

# Prepare testing data

**When saving the test data in the notebook 2-chat-history-extraction.ipynb, we serialized the contexts column. To load that file into a DataFrame, we need a function to deserialize it.**

In [2]:
import json
import pandas as pd

def serialize_list(value):
    """Encode ``value`` (typically a list) as a JSON-formatted string."""
    encoded = json.dumps(value)
    return encoded

def deserialize_list(value):
    """Decode a JSON-formatted string back into the Python object it encodes."""
    decoded = json.loads(value)
    return decoded

def save_dataframe_with_list_column(df, filename):
    """Saves a DataFrame with a list column to a CSV file, preserving the list structure.

    The ``contexts`` column is JSON-encoded on a copy of ``df``, so the
    caller's DataFrame still holds the original list values afterwards and
    the function can be called repeatedly without double-encoding the column.

    Args:
        df: The DataFrame to save. It must contain a ``contexts`` column whose
            values are JSON-serializable.
        filename: The name of the output CSV file.

    Raises:
        KeyError: If ``df`` has no ``contexts`` column.
    """

    # ``assign`` returns a new DataFrame, leaving the caller's ``df`` untouched.
    serialized = df.assign(contexts=df['contexts'].apply(serialize_list))

    # Save the DataFrame to CSV without the index column.
    serialized.to_csv(filename, index=False)

def load_dataframe_with_list_column(filename):
    """Read a CSV file whose ``contexts`` column holds JSON-encoded values.

    Args:
        filename: The name of the input CSV file.

    Returns:
        The loaded DataFrame, with every ``contexts`` value decoded by
        ``deserialize_list``.
    """
    loaded = pd.read_csv(filename)

    # Decode the JSON strings first, then put the decoded values back in place.
    decoded_contexts = loaded['contexts'].apply(deserialize_list)
    loaded['contexts'] = decoded_contexts

    return loaded

## Load the test data from the chat history extraction process

In [22]:
import pandas as pd
from from_root import from_root

# CSV with the question/answer/contexts records; `contexts` is stored as JSON
# text, so it is read back through load_dataframe_with_list_column.
file_name = "test_dataset_it_openai_deployment_test.csv"
test_dataset_path = os.path.join(from_root(), "data-test/test-dataset/", file_name)
df_question_answer_contexts = load_dataframe_with_list_column(test_dataset_path)

In [24]:
# Keep only the records of conversation 119128da-543a-4e36-b7f3-41fa2bebb6cb,
# restricted to the question, answer and contexts columns.
in_target_conversation = df_question_answer_contexts['conversation_id'] == '119128da-543a-4e36-b7f3-41fa2bebb6cb'
df_question_answer_contexts = df_question_answer_contexts.loc[
    in_target_conversation, ['question', 'answer', 'contexts']
]

## Adding ground truth

In [5]:
# Read the ground truth that accompanies the test question set.
ground_truth_path = os.path.join(from_root(), "data-test/test-dataset/", "test_dataset_it.json")
df_ground_truth = pd.read_json(ground_truth_path)

In [9]:
# Attach the ground truth to each extracted record by matching on the question
# text; the left join keeps every extracted record even without a match.
data_to_test = df_question_answer_contexts.merge(df_ground_truth, how='left', on='question')

## Convert to RAGAS data format

In [13]:
# Build the RAGAS evaluation Dataset from the merged testing data.
from datasets import Dataset

# One plain Python list per column, in the order the dataset dict is built.
question, answer, contexts, ground_truth = (
    list(data_to_test[column])
    for column in ('question', 'answer', 'contexts', 'ground_truth')
)

data = dict(
    question=question,
    answer=answer,
    contexts=contexts,
    ground_truth=ground_truth,
)

dataset = Dataset.from_dict(data)

# Evaluation

In [11]:
# Uncomment this block of code if you want to store the evaluation on LangSmith

# from langsmith import Client
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_PROJECT"] = os.getenv('LANGCHAIN_PROJECT')
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
# client = Client()

In [15]:
from ragas import evaluate
# from ragas.integrations.langsmith import evaluate
from ragas.metrics import answer_relevancy, faithfulness, context_recall, context_precision

# Score the dataset on the four imported RAGAS metrics.
ragas_metrics = [answer_relevancy, faithfulness, context_recall, context_precision]
result = evaluate(dataset, metrics=ragas_metrics)

result

Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]

{'answer_relevancy': 0.9684, 'faithfulness': 0.3209, 'context_recall': 0.3813, 'context_precision': 0.6534}

In [18]:
# Expand the evaluation result into a DataFrame with one row per question and
# one column per metric score, then display it.
df = result.to_pandas()
df

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,faithfulness,context_recall,context_precision
0,How does Tech Innovators Inc. promote employee...,[openai]: Tech Innovators Inc. promotes employ...,"[IntroductionAt Tech Innovators Inc., we belie...",Tech Innovators Inc. promotes employee engagem...,0.980843,0.0,0.0,0.0
1,What steps are needed to extract data from Con...,[openai]: To extract data from Confluence and ...,[IntroductionThis guide provides a step-by-ste...,To extract data from Confluence and create a R...,0.945037,0.666667,0.181818,1.0
2,How do employee engagement and disengagement d...,[openai]: Employee engagement and disengagemen...,"[are motivated and committed, disengaged emplo...",Employee engagement and disengagement differ i...,0.931132,0.466667,1.0,0.880258
3,What mechanisms are in place for reporting vio...,"[openai]: At Tech Innovators Inc., there are s...",[and identify areas for improvement.5.3 Report...,Employees can report violations of labor laws ...,0.980979,0.117647,0.25,1.0
4,What is the purpose of the orientation session...,[openai]: The purpose of the orientation sessi...,[to help you get started. Company OverviewTech...,The purpose of the orientation session at Tech...,1.0,0.111111,0.0,0.0
5,What is the significance of emotional and aest...,[openai]: Emotional and aesthetic labor play a...,[LabourEmotional and aesthetic labor involves ...,Emotional and aesthetic labor in the workplace...,0.974615,0.5,0.666667,1.0
6,What forms of unethical behavior are strictly ...,"[openai]: In the recruitment process of Inc., ...",[Inc. upholds the highest ethical standards in...,Favoritism or nepotism,0.987251,0.333333,0.0,1.0
7,What is the importance of identifying and addr...,[openai]: Identifying and addressing growth ar...,[to identify strengths and areas for improveme...,Identifying and addressing growth areas in sel...,0.990183,0.307692,0.333333,0.0
8,How does the role of the Senior Director respo...,[openai]: The role of the Senior Director resp...,[the Senior Director responsible for Analytics...,The role of the Senior Director responsible fo...,0.925206,0.384615,1.0,1.0


In [None]:
# Write the evaluation result table to a CSV in the test-dataset folder.
file_name = "eval_result_dataset_it_openai_deployment_1.csv"
eval_result_path = os.path.join(from_root(), "data-test/test-dataset/", file_name)
result.to_pandas().to_csv(eval_result_path, index=False)