# EVAL
Evaluation of RAG systems from the ground up.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset, Dataset
import os
import json
from dotenv import load_dotenv
from datetime import timedelta
import importlib
import src

from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI
from couchbase.auth import PasswordAuthenticator
from couchbase.cluster import Cluster
from couchbase.options import ClusterOptions
from langchain_couchbase.vectorstores import CouchbaseVectorStore
from langchain_openai import OpenAIEmbeddings

from src.data.dataset import EvalDataset
from src.data.load import LoadOperator
from src.evaluator.validation import ValidationEngine
from src.evaluator.options import ValidationOptions
from src.controller.options import ExperimentOptions
from src.controller.manager import Experiment

load_dotenv()

True

## Simple RAG using Couchbase vectorstore

In [3]:
# Class to configure the Couchbase cluster and vector store
class CouchbaseConfig:
    '''Couchbase connection settings read from the environment, plus client factories.'''

    def __init__(self):
        # Each setting comes from an environment variable; an unset variable yields None.
        env = os.environ
        self.connection_string = env.get("cluster_url")
        self.db_username = env.get("cb_username")
        self.db_password = env.get("cb_password")

    def connect(self):
        '''Open a connection to the cluster and wait (up to 5 seconds) until it is ready.

        Returns:
            The connected ``Cluster`` object.
        '''
        credentials = PasswordAuthenticator(self.db_username, self.db_password)
        cluster = Cluster(
            self.connection_string,
            ClusterOptions(authenticator=credentials),
        )
        cluster.wait_until_ready(timedelta(seconds=5))
        return cluster

    def get_vectorstore(self, db_bucket, db_scope, db_collection, _embedding, index_name):
        '''Build a ``CouchbaseVectorStore`` for the given bucket/scope/collection.

        A new cluster connection is opened on every call.

        Args:
            db_bucket: Bucket name.
            db_scope: Scope name.
            db_collection: Collection name.
            _embedding: Embedding object handed to the vector store.
            index_name: Name of the search index to use.

        Returns:
            The configured ``CouchbaseVectorStore`` instance.
        '''
        store_settings = dict(
            cluster=self.connect(),
            bucket_name=db_bucket,
            scope_name=db_scope,
            collection_name=db_collection,
            embedding=_embedding,
            index_name=index_name,
        )
        return CouchbaseVectorStore(**store_settings)
       
# Function to split and chunk the text from the dataset 
def load_python_list(content, save_path=None):
    '''Write a list of strings to a text file and split that file into documents.

    Every item is written followed by the ``<break>`` marker, and the file is
    then passed to ``load_text`` with ``<break>`` as the only separator.

    Args:
        content: Iterable of strings to persist and split.
        save_path: Destination file. Missing parent folders are created. If the
            file already exists, the user is asked through ``input()`` whether
            to overwrite it; any answer other than ``y`` reuses the existing
            file unchanged. When omitted, the content is staged in a temporary
            file that is removed once splitting is done (previously the
            function silently returned ``None`` in that case).

    Returns:
        Whatever ``load_text`` returns for the written (or reused) file.
    '''
    import tempfile  # local import: only needed for the no-save_path fallback

    separator = "<break>"

    def _write_items(path):
        # One marker after every item, including the last one.
        with open(path, "w") as f:
            f.writelines(item + separator for item in content)

    if not save_path:
        # No destination requested: stage the content in a throwaway file so
        # the caller still gets documents back.
        with tempfile.TemporaryDirectory() as scratch_dir:
            scratch_path = os.path.join(scratch_dir, "content.txt")
            _write_items(scratch_path)
            return load_text(scratch_path, doc_separators=[separator])

    # A bare file name has an empty dirname, and os.makedirs("") raises,
    # so only create the folder when the path actually names one.
    folder = os.path.dirname(save_path)
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Ask before clobbering an existing file
    if os.path.exists(save_path):
        print(f"File already exists: {save_path}. Overwrite the file? (y/n): ")
        overwrite = input()
        if overwrite.strip().lower() != "y":
            return load_text(save_path, doc_separators=[separator])

    _write_items(save_path)

    print(f"Content in list written to the file: {save_path}")
    return load_text(save_path, doc_separators=[separator])
    
def load_text(file_path, doc_separators=None):
    '''Read a text file and split its contents into documents.

    Args:
        file_path: Path of the text file to read.
        doc_separators: Separator list forwarded to
            ``RecursiveCharacterTextSplitter``; ``None`` is passed through as is.

    Returns:
        The documents produced by the splitter's ``create_documents``.
    '''
    with open(file_path, "r") as source_file:
        raw_text = source_file.read()

    # NOTE(review): no chunk size/overlap is passed, so the splitter's defaults
    # apply; pieces between separators are presumably not guaranteed to map
    # one-to-one to output documents — confirm against the splitter docs.
    splitter = RecursiveCharacterTextSplitter(separators=doc_separators)
    return splitter.create_documents([raw_text])

# Couchbase configuration: bucket and scope come from the environment,
# the collection and search index names are fixed for this notebook.
BUCKET_NAME = os.environ.get("bucket")
SCOPE_NAME = os.environ.get("scope")
COLLECTION_NAME = "rag-vectors"
SEARCH_INDEX_NAME = "vector-index"

# Embedding model handed to the vector store
embedding = OpenAIEmbeddings(model="text-embedding-3-large")

# Connect to the cluster and expose the target collection as a vector store
couchbase_config = CouchbaseConfig()
vector_store = couchbase_config.get_vectorstore(
    db_bucket=BUCKET_NAME,
    db_scope=SCOPE_NAME,
    db_collection=COLLECTION_NAME,
    _embedding=embedding,
    index_name=SEARCH_INDEX_NAME,
)

# Retriever read by generate_rag_response at call time
retriever = vector_store.as_retriever()

# Using the OpenAI API for the LLM
llm_model = "gpt-4o"
llm_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def generate_rag_response(query):
    '''Answer ``query`` with the LLM, using documents fetched by the module-level retriever.

    Reads the module-level ``retriever``, ``llm_client`` and ``llm_model``.

    Args:
        query: The user question as a string.

    Returns:
        A ``(context, llm_response)`` tuple: the ``page_content`` of the
        retrieved documents concatenated without a separator, and the full
        text streamed back by the model.
    '''
    semantic_results = retriever.invoke(query)
    context = "".join(doc.page_content for doc in semantic_results)

    system_prompt = "You are a question answering system. You answer the user query based on the given context only. If the answer cannot be deduced from the given context, just reply I dont know."
    contextual_prompt = system_prompt + "\nUser query: " + query + ". " + "\nContext: " + context

    # Instructions, query and context are sent together as a single user message
    stream = llm_client.chat.completions.create(
        model = llm_model,
        messages = [
            {
                "role": "user",
                "content": contextual_prompt,
            }
        ],
        stream = True,
    )

    # Collect the streamed pieces and join once at the end
    response_parts = []
    for chunk in stream:
        # A stream chunk may carry an empty choices list; indexing it would
        # raise IndexError, so skip such chunks.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        if delta_text is not None:
            response_parts.append(delta_text)

    # Retrieved context and LLM response for the given query
    return context, "".join(response_parts)

## Case 1: Ground truth dataset is present
Structure of a ground truth dataset:
- Questions
- Answers: Ground truth answers for the question
- Responses: The response from the LLM for the question and the context.
- Reference Contexts: The ground truth contexts
- Retrieved Contexts: The documents retrieved from the vector store

In [4]:
# Pull the RAG dataset from the Hugging Face hub
ds = load_dataset("neural-bridge/rag-dataset-12000")

# Keep only the first 30 rows of each column of the test split
contexts, questions, answers = (
    ds['test'][column][:30] for column in ('context', 'question', 'answer')
)

print("Dataset Description: \n")
print(ds)

Dataset Description: 

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9600
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 2400
    })
})


In [5]:
# Vectorize the contexts and store them in the vector db.
# The contexts are first written to a text file and split into documents.
context_docs = load_python_list(content=contexts, save_path="input_data/context_docs.txt")
print("Numberof documents to index: ", len(context_docs))
vector_store.add_documents(context_docs)

File already exists: input_data/context_docs.txt. Overwrite the file? (y/n): 
Numberof documents to index:  27


['3156b6d2db664ce983331369330f684c',
 '930436c43f4845d7822ba0f8b159d1e4',
 'bd22dddf603d40b1a99fa6199c295e1a',
 '8a55439cef284235b4b7221540024a68',
 '22fd3bdd2f104567bb6cb10ab306821c',
 'f5f96f80197b471f8c6db83820693cb7',
 '1acd71a087fe4680a9c9d007ccc26c59',
 '255dc2427e594c6fbd4a95cfe4e1ee6e',
 '390ac769f22c493d8817c5cb65623f60',
 'e94e08cc79f244da903faa82456c1054',
 '889434a0bf0b4d59a4b6e80a4ff5b2c7',
 'af7deff10ad6422d870d992299d54448',
 'ee8a5fb0f1c74c69969f1f3cd8b9ac4c',
 '86d01b8c3d9d4f48b6366942cac3874f',
 '08c1ddbcab87404ca251d5793d521625',
 '67f723bbf58943feaecfaa7a58916f81',
 '9f985af0a6754d60bd70a76a298977e4',
 '1095a8eecd1445e5a023d6da01706a4c',
 '1bb36a4ede424c9ba4676d5b625d998c',
 'bde23cf4593344578c0b480f7ab750e0',
 'f16ba070b658449293465e65586c47b9',
 'd02568127c6d4224a046b1299638fab0',
 '17a0bdfe5d594df99d1696e3e06dee4e',
 'f083eb8e6ea040bbb3ffdcd419e0699a',
 'e2d6028205c3430fb0a1e87257478b03',
 '3948871b0cd34dafb7167fe1d8a8e3a8',
 'da3e1a6378504695abf9ea259e51b2f9']

In [6]:
# Run every question through the RAG pipeline, keeping both the
# LLM response and the context that was retrieved for it
retriever = vector_store.as_retriever()
rag_outputs = [generate_rag_response(question) for question in questions]
retrieved_contexts = [retrieved for retrieved, _ in rag_outputs]
responses = [answer_text for _, answer_text in rag_outputs]

# Column-oriented ground truth record set
golden_data = dict(
    questions=questions,
    answers=answers,
    responses=responses,
    reference_contexts=contexts,
    retrieved_contexts=retrieved_contexts,
)

In [7]:
# Create an EvalDataset object and save the ground truth
# dataset as a json file.
# golden_data supplies questions, answers, responses, reference_contexts and
# retrieved_contexts as keyword arguments.
eval_dataset = EvalDataset(**golden_data)
# NOTE(review): to_json presumably writes "ground_truth.json" and returns the
# serialized records — confirm against EvalDataset.
json_data = eval_dataset.to_json("ground_truth.json")
print(json_data)

[{'question': 'Who is the music director of the Quebec Symphony Orchestra?', 'answer': 'The music director of the Quebec Symphony Orchestra is Fabien Gabel.', 'response': 'The music director of the Quebec Symphony Orchestra is Fabien Gabel.', 'reference_context': 'HOUSTON (Jan. 23, 2018) – Fabien Gabel, music director of the Quebec Symphony Orchestra, returns to Houston to lead the Houston Symphony in Ravel’s Daphnis and Chloé on Feb. 2 and 3 at 8 p.m. and Feb. 4 at 2:30 p.m. in Jones Hall.\nRecognized internationally as one of the stars of the new generation, Fabien Gabel is a regular guest of the Houston Symphony and an audience favorite. Known for conducting music with French influences, Gabel leads the Symphony in a program of French and American classics, including the breathtaking musical sunrise from Ravel’s Daphnis and Chloé and Bernstein’s comic operetta Overture to Candide as the Symphony joins other orchestras around the world for Leonard Bernstein at 100, a worldwide celebr

In [8]:
## Load the ground truth data into the Couchbase cluster
# The loader is kept around because its dataset_id attribute is read later
# to fetch the same dataset back.
loader = LoadOperator(data=eval_dataset, dataset_description="Test dataset")
loader.load_docs()

Documents loaded successfully with dataset id: a61e363a-3550-4de8-ac4e-abb36aaca09b


In [9]:
# Retrieve the golden data saved on the couchbase cluster.
# A dedicated name is used for the LoadOperator: `retriever` is the
# vector-store retriever that generate_rag_response reads at call time,
# so rebinding it here would break any later RAG call.
dataset_loader = LoadOperator()
eval_dataset = dataset_loader.retrieve_docs(dataset_id=loader.dataset_id)

## Evaluation of the system

In [10]:
# Define the evaluation options: the metrics to compute, with report generation switched off
_options = ValidationOptions(
    metrics=[
        "avg_chunk_size", "context_score", "embedding_similarity", "named_entity_score", "retrieval_accuracy", "bleu_score", "rouge_score", "faithfulness", "response_similarity"
    ],
    generateReport=False,
)

# Create a validation engine and evaluate the model.
# The engine is not called `eval`, which would shadow the Python builtin
# for the rest of the kernel session.
validation_engine = ValidationEngine(dataset=eval_dataset, options=_options)
result = validation_engine.evaluate()

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


## Adding the results as an experiment

In [11]:
# Experiment is already imported in the import cell at the top of the notebook.

# Describe the configuration that produced `result`
# NOTE(review): the splitter in load_text is created without chunk_size /
# chunk_overlap, so 100 / 20 may not describe the indexed chunks — confirm.
experiment_options = ExperimentOptions(
    experiment_id="321",
    chunk_size = 100,
    chunk_overlap = 20,
    embedding_model = "text-embedding-3-large",
    # text-embedding-3-large returns 3072-dimensional vectors by default
    embedding_dimension = 3072,
    llm_model = "gpt-4o"
)

# Create an experiment
experiment = Experiment(experiment_options, result)

# Save the experiment to the couchbase cluster, linked to the dataset that was
# loaded and evaluated above rather than to a hard-coded id from an earlier run
experiment.add(dataset_id=loader.dataset_id, load=True, collection="experiment")

Experiment data loaded successfully with experiment id: 321


In [12]:
# Development aid: force a reload of the experiment manager module.
# NOTE(review): %autoreload 2 is enabled in the first cell, so this is
# presumably redundant — confirm and remove before sharing the notebook.
importlib.reload(src.controller.manager)

<module 'src.controller.manager' from '/Users/goutham.krishnan/Documents/Work/eval/src/controller/manager.py'>

In [14]:
# Retrieve the experiment from the couchbase cluster.
# The retrieved record gets its own name so the evaluation `result` is not
# overwritten; re-running the experiment cell still sees the metrics.
experiment_object = Experiment()
retrieved_experiment = experiment_object.retrieve("321", collection="experiment")
dump_result = json.dumps(retrieved_experiment, indent=4)

# Save the retrieved experiment to a json file
with open("result.json", "w") as f:
    f.write(dump_result)

In [None]:
# Required implementation of the experiment (target API, not executed yet).
# NOTE(review): context_precision, context_recall, answer_relevancy and
# faithfulness are not defined or imported anywhere in the visible notebook,
# so this cell raises NameError as written — confirm where they should come from.
experiment_options = ExperimentOptions(
    dataset_id="7070211e-74e0-41f8-8281-893c6692fd7d",
    metrics = [context_precision, context_recall, answer_relevancy, faithfulness],
    chunk_size = 100,
    chunk_overlap = 20,
    embedding_model = "text-embedding-3-large",
    # text-embedding-3-large returns 3072-dimensional vectors by default
    embedding_dimension = 3072,
    llm_model = "gpt-4o"
)

experiment = Experiment(experiment_options)

experiment.perform(single_turn=True, load=True)