# EVAL
Evaluation of RAG systems from the ground up.

In [1]:
import os
import json
from dotenv import load_dotenv
from datetime import timedelta
import importlib
import src

from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from langchain_openai import OpenAIEmbeddings



load_dotenv()

True

## Simple RAG using Couchbase vectorstore

In [2]:
# Function to confgure the couchbase cluster and vector store
class CouchbaseConfig:
    def __init__(self):
        self.connection_string = os.environ.get("cluster_url")
        self.db_username = os.environ.get("cb_username")
        self.db_password = os.environ.get("cb_password")

    def connect(self):
        '''Connect to the couchbase cluster'''
        auth = PasswordAuthenticator(self.db_username, self.db_password)
        options = ClusterOptions(authenticator=auth)
        cluster = Cluster(self.connection_string, options)
        cluster.wait_until_ready(timedelta(seconds=5))
        return cluster
    
    def get_vectorstore(self, db_bucket, db_scope, db_collection, _embedding, index_name):
        '''Get the vectorstore instance'''
        _cluster = self.connect()
        return CouchbaseVectorStore(
            cluster=_cluster,
            bucket_name=db_bucket,
            scope_name=db_scope,
            collection_name=db_collection,
            embedding=_embedding,
            index_name=index_name
        )
       
# Function to split and chunk the text from the dataset 
def load_python_list(content, save_path=None):
    ''''Process and load a python list into a text file and split into docs'''
    if save_path:
        # Check if the folder mentioned in the save path exists
        folder = os.path.dirname(save_path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        
        # Write the content to the file
        if os.path.exists(save_path):
            print(f"File already exists: {save_path}. Overwrite the file? (y/n): ")
            overwrite = input()
            if overwrite.lower() != "y":
                return load_text(save_path, doc_separators=["<break>"])

        with open(save_path, "w") as f:
            for item in content:
                f.write(item + "<break>")
        
        print(f"Content in list written to the file: {save_path}")
        return load_text(save_path, doc_separators=["<break>"])
    
def load_text(file_path, doc_separators=None):
    '''Load a text file and split it into documents based on the separators'''
    with open(file_path, "r") as f:
        text = f.read()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 300,
        chunk_overlap = 50,
        # separators = doc_separators,
    )
    docs = text_splitter.create_documents([text])
    return docs

# Couchbase configuration
BUCKET_NAME = os.environ.get("bucket")
SCOPE_NAME = os.environ.get("scope")
COLLECTION_NAME = "rag-vectors"
SEARCH_INDEX_NAME = "vector-index"
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

couchbase_config = CouchbaseConfig()
vector_store = couchbase_config.get_vectorstore(BUCKET_NAME, SCOPE_NAME, COLLECTION_NAME, embedding, SEARCH_INDEX_NAME)

retriever = vector_store.as_retriever()

# Using the opanai api for the llm
llm_client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY"))
llm_model = "gpt-4o"

def generate_rag_response(query):
    semantic_results = retriever.invoke(query)
    context = "".join(doc.page_content for doc in semantic_results)

    system_prompt = "You are a question answering system. You answer the user query based on the given context only. If the answer cannot be deduced from the given context, just reply I dont know."
    contextual_prompt = system_prompt + "\nUser query: " + query + ". " + "\nContext: " + context

    ## Generating the response from the llm using the context generated
    stream = llm_client.chat.completions.create(
        model = llm_model,
        messages = [
            {
                "role": "user",
                "content": contextual_prompt,
            }
        ],
        stream = True,
    )
    llm_response = ""
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            llm_response += chunk.choices[0].delta.content

    # LLM Response for the given query
    return context, llm_response

## Case 1: Ground truth dataset is present
Structure of a ground truth dataset:
- Questions
- Answers: Ground truth answers for the question
- Responses: The response from the LLM for the question and the context.
- Reference Contexts: The ground truth contexts
- Retrieved Contexts: The documents retrieved from the vector store

In [3]:
from datasets import load_dataset, Dataset

# Load the dataset from huggingface
ds = load_dataset("chloedh0228/rag-dataset-12000")
reference_contexts = ds["test"]["extracted_sentences"][:30]
raw_contexts = ds['test']['context'][:30]
questions = ds['test']['question'][:30]
answers = ds['test']['answer'][:30]

print("Dataset Description: \n")
print(ds)

Repo card metadata block was not found. Setting CardData to empty.


Dataset Description: 

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'extracted_sentences', 'logical_relationship'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer', 'extracted_sentences', 'logical_relationship'],
        num_rows: 2400
    })
})


In [4]:
# Vectorize the cotexts and store in the vector db
context_docs = load_python_list(raw_contexts, save_path="input_data/context_docs.txt")
print("Numberof documents to index: ", len(context_docs))
vector_store.add_documents(context_docs)

Content in list written to the file: input_data/context_docs.txt
Numberof documents to index:  499


['9dfce30d3a034d24b26ce1b63fa6be75',
 '98cd03b99caa4b19bb9a8b3e00ad1de8',
 'e56c0a9bda8d4fa694be5ac1b5d8be45',
 '53afdec090994af384220f9b3ac1c66d',
 'ebe97a5684ac4deb9726bd982804436a',
 'f37896515b3044c586a8e2c0250b35e6',
 '63b0711ff3d24f08be014863471088c0',
 '33e028bdd95746c582f4cbb919bf85a7',
 '3a1df08040664eaa86eec92259fcff7b',
 '832c391e94b0462cb1a3d5b3b92fadbe',
 'a79bcc02844e43509f34da33eec18bb9',
 '84f66802c94d4abc9b9b49d52576acc5',
 '24e01ec23832478ca40c3b03c1200552',
 'a6830a63654c410c97fd582c04ac7d97',
 'e6a8edbf7c334fe08abac306c52fe6dc',
 '177e13eaf7304e4695816063eef083f3',
 'f7230ca0f7cf43e1bb29991f501eaf30',
 'd283c89931b14388a05ffcc761fbc24b',
 '7242a539bba24a60acce01ea0b79b40b',
 'b1e02e0db8194885ba4702a5ea60d959',
 '7a3bea16a6724a51b6ef2f5f47b4cdf4',
 '1b493fec5e414f33823d8ceab09795e5',
 '1f34508cb3b44ed99ff1fa5cfa8c649a',
 '9b70253402fd4f56ae85218a92c536b1',
 '905c28717f4a468a9ba2c01b0e4f4ef5',
 '8726fe3829f24187bf5ec23bff62160e',
 'd8d3f254bd3e4df2ab38fbb817ef0ae1',
 

In [5]:
# Generate the responses and retrieve documents
responses, retrieved_contexts = [], []
retriever = vector_store.as_retriever()
for question in questions:
    context, llm_response = generate_rag_response(question)
    responses.append(llm_response)
    retrieved_contexts.append(context)
    
lol_answers = []
lol_ret_contexts = []
for answer in answers:
    lol_answers.append([answer])
for context in retrieved_contexts:
    lol_ret_contexts.append([context])
    
golden_data = {
    "questions": questions,
    "answers": lol_answers,
    "responses": responses,
    "reference_contexts": reference_contexts,
    "retrieved_contexts": lol_ret_contexts
}

In [6]:
# Create an EvalDataset object and save the ground truth 
# dataset as a json file
from src.data.dataset import EvalDataset

eval_dataset = EvalDataset(**golden_data)
json_data = eval_dataset.to_json(filename="ground.json")
print(json_data)

[{'question': 'Who is the music director of the Quebec Symphony Orchestra?', 'answers': ['The music director of the Quebec Symphony Orchestra is Fabien Gabel.'], 'response': 'The music director of the Quebec Symphony Orchestra is Fabien Gabel.', 'reference_context': "['Fabien Gabel, music director of the Quebec Symphony Orchestra, returns to Houston to lead the Houston Symphony in Ravel’s Daphnis and Chloé on Feb. 2 and 3 at 8 p.m. and Feb. 4 at 2:30 p.m. in Jones Hall.']", 'retrieved_contexts': ['HOUSTON (Jan. 23, 2018) – Fabien Gabel, music director of the Quebec Symphony Orchestra, returns to Houston to lead the Houston Symphony in Ravel’s Daphnis and Chloé on Feb. 2 and 3 at 8 p.m. and Feb. 4 at 2:30 p.m. in Jones Hall.Recognized internationally as one of the stars of the new generation, Fabien Gabel is a regular guest of the Houston Symphony and an audience favorite. Known for conducting music with French influences, Gabel leads the Symphony in a program of French and American cla

In [7]:
## Load the ground truth data into the Couchbase cluster
from src.data.load import LoadOperator

loader = LoadOperator(dataset=eval_dataset, dataset_description="Test dataset")
loader.load_docs()

Documents loaded successfully with dataset id: 89260603-2fae-44a2-967a-a999b5c06c07


In [12]:
# Retrieve the golden data saved on the couchbase cluster
from src.data.load import LoadOperator

retriever = LoadOperator()
retrieved_dataset = retriever.retrieve_docs(dataset_id=eval_dataset.dataset_id)

In [13]:
print(retrieved_dataset.dataset_id)

89260603-2fae-44a2-967a-a999b5c06c07


## Experiment based evaluation of the system

In [10]:
importlib.reload(src.data.load)

<module 'src.data.load' from '/Users/goutham.krishnan/Documents/Work/eval/src/data/load.py'>

In [14]:
from src.controller.options import ExperimentOptions
from src.controller.manager import Experiment
from src.evaluator.metrics import context_precision, context_recall, answer_relevancy, faithfulness, avg_chunk_size, answer_correctness

# Define experiment configuration using ExperimentOptions
experiment_options = ExperimentOptions(
    dataset_id = eval_dataset.dataset_id, # Pulls the data if it has been loaded into couchbase
    metrics = [context_precision, context_recall, answer_relevancy, faithfulness, avg_chunk_size, answer_correctness],
    chunk_size = 200,
    chunk_overlap = 40,
    embedding_model = "text-embedding-3-large",
    embedding_dimension = 3072,
    llm_model = "gpt-4o",
    experiment_description = "Experiment to evaluate the performance of the RAG system",
    custom_field = "custom value"
)

print(experiment_options.metrics)

# Create and perform an experiment
experiment = Experiment(options = experiment_options)

# Results are saved to `.results-<experiment_id>` folder

# Load the experiment results to the couchbase kv store
experiment.load_to_couchbase(collection="experiment")# If collection is not specified, the default collection in the .env file is used

[ContextPrecision(_required_columns={<MetricType.SINGLE_TURN: 'single_turn'>: {'reference', 'user_input', 'retrieved_contexts'}}, name='context_precision', llm=None, output_type=None, context_precision_prompt=ContextPrecisionPrompt(instruction=Given question, answer and context verify if the context was useful in arriving at the given answer. Give verdict as "1" if useful and "0" if not with json output., examples=[(QAC(question='What can you tell me about Albert Einstein?', context="Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, 

Processing chunks: 100%|██████████| 30/30 [00:00<00:00, 1039910.08it/s]


Evaluating:   0%|          | 0/150 [00:00<?, ?it/s]

Exception raised in Job[39]: TimeoutError()


Results saved to .results/results.csv and .results/results.json
Experiment data loaded successfully with experiment id: aae71ef6-e6b6-4461-b683-f609209fbf4e


In [None]:
from src.controller.manager import Experiment

# Retrieving an experiment stored in couchbase
experiment = Experiment() # init experiment as a retriever object

experiment.retrieve(experiment_id=experiment_options.experiment_id, collection="experiment")
# Results are saved to `.results-<experiment_id>` folder

Retrieved experiment data saved to .results-47a341f7-9e73-400d-92e3-45cbe7fa4ad8
