To simulate a real-world scenario, we fed the system a diverse set of questions that would typically arise from multiple users in different conversations. We then utilized the 2-chat-history-extraction.ipynb notebook to extract the system's responses.

Since these questions were generated organically and don't have predefined ground truths, we focused our evaluation on two key RAGAS metrics: answer relevancy and faithfulness. This approach provides insights into how well the system can address user queries and maintain consistency in its responses.

In [2]:
# Load the .env file so that later cells can read their settings via os.getenv.
# Requires the python-dotenv package:
#pip install -U python-dotenv
import os
from dotenv import load_dotenv
# Read the .env file as UTF-8.
load_dotenv(encoding='utf-8')
from from_root import from_root

# Evaluation

In [1]:
import json
import pandas as pd

def serialize_list(value):
    """Return the JSON string representation of ``value`` (expected to be a list)."""
    encoded = json.dumps(value)
    return encoded

def deserialize_list(value):
    """Parse the JSON string ``value`` and return the decoded list."""
    decoded = json.loads(value)
    return decoded

def save_dataframe_with_list_column(df, filename):
    """Saves a DataFrame with a list column to a CSV file, preserving the list structure.

    The 'contexts' column is JSON-encoded only in the data written to disk.
    The DataFrame passed in is left untouched, so it can still be used (or
    saved again without double-encoding) after this call.

    Args:
        df: The DataFrame to save. Must contain a 'contexts' column.
        filename: The name of the output CSV file.
    """

    # Serialize on a derived frame: assigning back into ``df`` would replace
    # the caller's lists with JSON strings as a hidden side effect.
    serialized = df.assign(contexts=df['contexts'].apply(serialize_list))

    # Save the DataFrame to CSV
    serialized.to_csv(filename, index=False)

def load_dataframe_with_list_column(filename):
    """Read a CSV file and decode its JSON-encoded 'contexts' column.

    Args:
        filename: The name of the input CSV file.

    Returns:
        The DataFrame read from ``filename``, with every 'contexts' cell
        passed through ``deserialize_list``.
    """
    frame = pd.read_csv(filename)

    # Each cell holds a JSON string; turn it back into a Python object.
    decoded_contexts = frame['contexts'].apply(deserialize_list)
    frame['contexts'] = decoded_contexts

    return frame

## Load the test data from the chat history extraction process

In [3]:
import pandas as pd
from from_root import from_root
# CSV holding the question/answer/contexts rows to evaluate.
file_name = "test_dataset_it_openai_deployment_test.csv"

# Build the path under the directory returned by from_root() and load the
# file, decoding the 'contexts' column along the way.
df_question_answer_contexts = load_dataframe_with_list_column(
    os.path.join(from_root(), "data-test/test-dataset/", file_name)
)

In [9]:
# Keep only the rows of one conversation and the three columns used for the
# evaluation. Note: this rebinds the variable to a frame without the
# 'conversation_id' column, so the cell cannot be re-run without reloading.
df_question_answer_contexts = df_question_answer_contexts.loc[
    df_question_answer_contexts['conversation_id'] == '8117578e-d06a-4bfd-988f-b2eee28121f1',
    ['question', 'answer', 'contexts'],
]

## Convert to RAGAS format

**Let's evaluate the first 10 system responses.**

In [12]:
from datasets import Dataset

# Pull each column out of the DataFrame as a plain Python list.
question, answer, contexts = (
    list(df_question_answer_contexts[column])
    for column in ('question', 'answer', 'contexts')
)

# Column-oriented mapping handed to Dataset.from_dict.
data_samples = dict(question=question, answer=answer, contexts=contexts)

dataset = Dataset.from_dict(data_samples)

In [79]:
from langsmith import Client

# Tracing configuration set explicitly by this notebook.
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# LANGCHAIN_PROJECT and LANGSMITH_API_KEY are taken from the existing
# environment. Copying them back with os.environ[k] = os.getenv(k) does nothing
# when they are set and fails with an opaque "str expected, not NoneType"
# TypeError when they are not, so check for them explicitly instead.
for _required_var in ("LANGCHAIN_PROJECT", "LANGSMITH_API_KEY"):
    if os.getenv(_required_var) is None:
        raise RuntimeError(
            f"Environment variable {_required_var} is not set; "
            "define it in the environment or the .env file before running this cell."
        )

client = Client()

## Run RAGAS evaluation

In [14]:
from ragas import evaluate
# from ragas.integrations.langsmith import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    #context_recall,
    #context_precision,
)

In [15]:
# Score the dataset on answer relevancy and faithfulness only.
# context_recall and context_precision stay disabled, matching the
# commented-out imports.
result = evaluate(dataset, metrics=[answer_relevancy, faithfulness])

Evaluating:   0%|          | 0/14 [00:00<?, ?it/s]

In [16]:
# Output file for the per-sample evaluation scores.
file_name = "eval_result_post_prod_test_dataset_it_openai_deployment.csv"

# Despite its name, json_file_path points at a CSV file.
json_file_path = os.path.join(
    from_root(),
    "data-test/eval-result/",
    file_name,
)

# Convert the evaluation result to a DataFrame and write it without the index.
result.to_pandas().to_csv(path_or_buf=json_file_path, index=False)