# Ragas evaluation
Tests batch-mode question answering and Ragas evaluation capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [34]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

# Import local packages
# The aerospace_chatbot sources are put on the import path using a path that
# is relative to the current working directory, so `queries` and
# `data_processing` below only resolve when the notebook runs from its own folder.
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta, get_docs_df

# Set environment variables with .env
# override=True makes values from the .env file take precedence over
# variables that are already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

## Nifty functions

In [35]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file holding one dictionary per line.
    """
    with open(filename, "a") as handle:
        # print() terminates the serialized record with the newline.
        print(json.dumps(data_dict), file=handle)

def read_dicts_from_file(filename):
    """Yield one dictionary per JSON line of ``filename``.

    The file is read lazily, line by line; wrap the call in ``list(...)`` to
    load every record at once. Whitespace-only lines are skipped so that a
    stray blank line in a cache file does not abort the whole read.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filename, "r") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a copy of ``df`` with ``column`` filled from a JSON-lines cache.

    Args:
        df: DataFrame to extend; it is not modified.
        file_name: Path of the cache file (one JSON dictionary per line).
        merge_on: Key present both in ``df`` and in the cached records.
        column: Name of the cached field to add as a column.

    Returns:
        A new DataFrame. When the cache holds usable records, ``column`` is
        left-merged onto ``df`` via ``merge_on`` (rows without a cached value
        get NaN). When the file is missing, empty, or lacks the requested
        keys, ``column`` is added as the last column and filled with ``None``.
    """
    cached = None
    if Path(file_name).exists():
        records = pd.DataFrame(list(read_dicts_from_file(file_name)))
        # An empty cache file, or one written without the requested keys,
        # gives a frame lacking these columns; treat it as "no cache".
        if {column, merge_on} <= set(records.columns):
            cached = (
                records.drop_duplicates(subset=[merge_on])[[column, merge_on]]
                .dropna()
                .reset_index(drop=True)
            )

    if cached is not None:
        return df.merge(cached, on=merge_on, how="left").reset_index(drop=True)

    # No usable cache: add an empty column and make sure it sits at the end.
    df_out = df.copy()
    df_out[column] = None
    ordered = [name for name in df_out.columns if name != column] + [column]
    return df_out[ordered]

## Connect to database

In [36]:
# Open the on-disk Chroma store found under LOCAL_DB_PATH/chromadb and list
# the collections it contains.
persistent_client = PersistentClient(
    path=os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-ams-nomerge-400),
 Collection(name=mixedbread-ai-mxbai-embed-large-v1-ams-nomerge-400),
 Collection(name=text-embedding-3-small-ams-2merge-none),
 Collection(name=text-embedding-3-large-ams-nomerge-400-parent-child),
 Collection(name=text-embedding-3-large-ams-2merge-none)]

In [37]:
# Each entry pairs a Chroma collection name with the embedding model used to
# query that collection.
dbs=[{'index_name':'text-embedding-3-large-ams-2merge-none',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-large-ams-nomerge-400',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-large-ams-nomerge-400-parent-child',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     {'index_name':'text-embedding-3-small-ams-2merge-none',
     'query_model':OpenAIEmbeddings(model='text-embedding-3-small',openai_api_key=os.getenv('OPENAI_API_KEY'))},
     # The Hugging Face repository id is written with a slash
     # ('mixedbread-ai/mxbai-embed-large-v1'); only the collection name
     # replaces the slash with a dash.
     {'index_name':'mixedbread-ai-mxbai-embed-large-v1-ams-nomerge-400',
     'query_model':HuggingFaceInferenceAPIEmbeddings(model_name='mixedbread-ai/mxbai-embed-large-v1',api_key=os.getenv('HUGGINGFACEHUB_API_TOKEN'))}]

# Collection used by the cells that follow.
db=dbs[0]
# TODO: add Voyage 

In [38]:
# Wrap the selected collection in a LangChain Chroma vector store.
# (Per the original note this collection holds unchunked, full PDF pages -
# TODO confirm for the collection currently selected in `db`.)
docs_vectorstore = Chroma(
    collection_name=db['index_name'],
    embedding_function=db['query_model'],
    client=persistent_client,
)

In [39]:
# Fetch the metadata, text and embedding of every entry stored in the collection.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])

In [40]:
# Rebuild LangChain Document objects by pairing each stored text with its metadata.
lcdocs = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

In [41]:
# Number of Document objects built from the collection.
len(lcdocs)

2696

In [42]:
# Show the first Document as a sanity check of the loaded content.
lcdocs[0]

Document(page_content='Figure 6. Bearing Race-Ball Dent Fixture.  Results and Discussions The single ball versus race static load capacity of a hybrid Si 3N4-60NiTi bearing was evaluated and the  results are shown in Table 2. In the table, the dent depth, dent depth normalized by the ball diameter, the  peak stress, and the mean stress are listed. The normalized dent depth as a function of mean stress is plotted in Figure 7. The dent depth data for the ball-race contact agrees reasonably well with the more generalized ball-flat  plate dent depth data collected previously (Ref. 8). This agreement, despite differences in contact geometry, further support the argument that the Hertz stress relations for contacting bodies can be  confidently applied to bearing design. Using the generally accepted dent depth criteria (dp/D~0.00003 to  0.00010), the data indicate that the damage threshold for 60NiTi is around 2.8 to 3.3 GPa. This is slightly lower than the value estimated from flat plate den

## Export all collections to pickles to store them before doing anything heavy duty

In [47]:
# Pickle the contents of every collection as a backup before running the
# expensive steps below.
# The loop variable is deliberately not named `db`, so the collection selected
# earlier with `db=dbs[0]` is not overwritten by the last entry of `dbs`.
local_db_path = os.getenv('LOCAL_DB_PATH')
for db_to_export in dbs:
    df_temp=get_docs_df(local_db_path, db_to_export['index_name'], db_to_export['query_model'])
    archive_path = os.path.join(local_db_path,
                                f"archive_chromadb_{db_to_export['index_name']}.pickle")
    with open(archive_path, "wb") as f:
        pickle.dump(df_temp, f)

## Generate synthetic dataset

In [None]:
# Model names used to build the synthetic test set.
generator_model = "gpt-3.5-turbo-16k"
critic_model = 'gpt-4o'
embedding_model = "text-embedding-ada-002"

# Each chat model is tagged with its own model name.
synthetic_generator_llm = ChatOpenAI(
    model=generator_model,
    tags=[generator_model],
)
synthetic_critic_llm = ChatOpenAI(
    model=critic_model,
    tags=[critic_model],
)
synthetic_embeddings = OpenAIEmbeddings(
    model=embedding_model,
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [None]:
# Test-set generator built from the generator LLM, the critic LLM and the
# embedding model configured above.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings
)

# 500 is the max size before you'll hit rate limits with a tier 3 openai account
# The sample is capped at the number of available documents because
# random.sample raises ValueError when asked for more items than it is given.
sample_size=min(len(lcdocs),100)
# Fraction of the collection that is sampled (0.0 for an empty collection).
percent_total=sample_size/len(lcdocs) if lcdocs else 0.0
print(percent_total)

# Get a random sample of lcdocs
lcdocs_random = random.sample(lcdocs, sample_size)

In [None]:
# Name of the collection the test set belongs to; it keys the test set and
# every cache file written further down.
# NOTE(review): assumes `db` still refers to the collection `lcdocs` was
# loaded from - confirm.
index_name = db['index_name']
testset_path = f'testset_{index_name}.csv'

# Generate the test set only when no CSV exists yet for this collection.
# The check uses the same file that is written and read below.
if not os.path.exists(testset_path):
    run_config=RunConfig(timeout=1000,
                    max_retries=50,
                    max_wait=1000,
                    max_workers=1)

    n_questions=50
    testset = generator.generate_with_langchain_docs(lcdocs_random, 
                                                    test_size=n_questions,
                                                    with_debugging_logs=True,
                                                    is_async=False,
                                                    run_config=run_config,
                                                    raise_exceptions=False)
    df_testset=testset.to_pandas()
    df_testset.to_csv(testset_path, index=False)
else:
    # Import the previously generated test set into a DataFrame
    df_testset = pd.read_csv(testset_path)

In [None]:
# Display the generated (or reloaded) test set.
df_testset

### Format dataset and database for RAG

In [None]:
# Keep only the question and its ground truth, label every row as
# "Question <index>", record which model generated it, and put the id first.
df_questions = (
    df_testset[['question', 'ground_truth']]
    .copy()
    .assign(
        id=lambda frame: 'Question ' + frame.index.astype(str),
        question_by=generator_model,
    )[['id', 'question', 'ground_truth', 'question_by']]
)

In [None]:
# Display the formatted questions.
df_questions

In [None]:
# Fetch every stored entry again and tabulate it, one row per document:
# an id computed by `_stable_hash_meta` from the metadata, the source, the
# page (-1 when the metadata has no page), the text and the embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df_docs = pd.DataFrame(
    dict(
        id=list(map(_stable_hash_meta, all_docs["metadatas"])),
        source=[meta.get("source") for meta in all_docs["metadatas"]],
        page=[meta.get("page", -1) for meta in all_docs["metadatas"]],
        document=all_docs["documents"],
        embedding=all_docs["embeddings"],
    )
)

In [None]:
# Attach the cached RAG answers and source document ids to the questions.
# Each column comes from the same cache file and is created empty when the
# cache is not available.
df_questions_answers = df_questions
for cached_column in ("answer", "source_documents"):
    df_questions_answers = add_cached_column_from_file(
        df_questions_answers,
        f'rag_response_cache_{index_name}.txt',
        "question",
        cached_column,
    )

In [None]:
# Display the questions together with any cached answers and sources.
df_questions_answers

# RAG questions/answers (batch mode)

The cell below demonstrates using aerospace_chatbot in batch mode. It requires some basic parameter setup and a QA_model.

In [None]:
# TODO turn this into a function/standalone notebook to show how to use the QA model in batch mode

# Batch-mode settings for the QA model.
# `index_name` must already be defined by an earlier cell; it selects the
# collection to query and keys the response cache file.
index_type='ChromaDB'
query_model=synthetic_embeddings
llm=synthetic_generator_llm

QA_model_params={'rag_type':'Standard',
                 'k':4,
                 'search_type':'similarity',
                 'local_db_path':os.getenv('LOCAL_DB_PATH')}

# This has to be the file the cached answers are loaded from
# (`rag_response_cache_{index_name}.txt`); with a different name the cache is
# never found and every answer is regenerated on each run.
rag_cache_file=f'rag_response_cache_{index_name}.txt'

for i, row in df_questions_answers.iterrows():
    # Only questions without a cached answer are sent to the model.
    if not (row['answer'] is None or pd.isnull(row['answer']) or row['answer']==''):
        continue

    print(f"Processing question {i+1}/{len(df_questions_answers)}")

    # Use the QA model to query the documents
    qa_obj=queries.QA_Model(index_type,
                    index_name,
                    query_model,
                    embedding_model,
                    llm,
                    **QA_model_params)
    qa_obj.query_docs(row['question'])
    response=qa_obj.result

    answer_text=response['answer'].content
    ids=[_stable_hash_meta(source_document.metadata)
        for source_document in response['references']]

    # `i` is the index label yielded by iterrows(), so rows are addressed by label.
    df_questions_answers.loc[i, "answer"] = answer_text
    df_questions_answers.loc[i, "source_documents"] = ', '.join(ids)

    # Save the response to cache file
    response_dict = {
        "question": row['question'],
        "answer": answer_text,
        "source_documents": ids,
    }
    write_dict_to_file(response_dict, rag_cache_file)

In [None]:
# Get the context documents content for each question


def _split_source_ids(value):
    """Return the document ids held in one 'source_documents' cell as a list.

    Freshly generated answers store the ids as a ', '-joined string, while
    answers loaded from the cache store them as a list; a missing value
    (no answer yet) yields an empty list.
    """
    if isinstance(value, str):
        return [doc_id for doc_id in value.strip('[]').split(', ') if doc_id]
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


# One lookup table instead of scanning df_docs for every id; the first row
# wins when an id occurs more than once.
document_by_id = df_docs.drop_duplicates(subset="id").set_index("id")["document"].to_dict()

source_documents_list = [
    [document_by_id[doc_id] for doc_id in _split_source_ids(cell)]
    for cell in df_questions_answers['source_documents']
]
df_questions_answers["contexts"]=source_documents_list

In [None]:
# Additionally get embeddings for questions

embeddings_cache = Path(f'question_embeddings_{index_name}.pickle')

# Embed the questions only when no cached embeddings exist for this collection.
if not embeddings_cache.exists():
    question_embeddings = [
        synthetic_embeddings.embed_query(question)
        for question in df_questions_answers["question"]
    ]
    with embeddings_cache.open("wb") as f:
        pickle.dump(question_embeddings, f)

# The embeddings are always read back from the cache file; the context
# manager closes the file handle.
# NOTE: unpickling is acceptable here only because the file is a local cache
# written by this notebook.
with embeddings_cache.open("rb") as f:
    question_embeddings = pickle.load(f)
df_questions_answers["embedding"] = question_embeddings

In [None]:
# Display the questions with answers, contexts and question embeddings.
df_questions_answers

## Ragas eval

### Evaluate

In [None]:
# Attach previously computed answer_correctness scores from the ragas cache
# file, or add the column empty when no cache is available.
df_questions_answers = add_cached_column_from_file(
    df=df_questions_answers,
    file_name=f'ragas_result_cache_{index_name}.txt',
    merge_on="question",
    column="answer_correctness",
)
df_questions_answers

In [None]:
# Some rows come back without a usable ground_truth (any non-string value);
# the reason is unclear, so those rows are simply dropped.
df_questions_answers = df_questions_answers.loc[
    df_questions_answers['ground_truth'].map(lambda value: isinstance(value, str))
]
df_questions_answers

In [None]:
# prepare the dataframe for evaluation
# The evaluation works on a copy, so scores are written to `df_qa_eval`
# rather than directly to `df_questions_answers`.
df_qa_eval = df_questions_answers.copy()

In [None]:
# evaluate the answer correctness if not already done
fields = ["question", "answer", "contexts", "ground_truth"]
ragas_cache_file = f'ragas_result_cache_{index_name}.txt'

for i, row in df_qa_eval.iterrows():
    print(i, row["question"])
    # Rows that already carry a score are skipped.
    if not (row["answer_correctness"] is None or pd.isnull(row["answer_correctness"])):
        continue

    # `i` is an index label, not a position: once rows have been filtered out
    # the two differ, so the row is selected by label. The index is reset so
    # the one-row frame handed to `Dataset.from_pandas` has a plain 0-based index.
    single_row = df_qa_eval.loc[[i], fields].reset_index(drop=True)
    evaluation_result = evaluate(
        Dataset.from_pandas(single_row),
        [answer_correctness],
    )
    score = evaluation_result["answer_correctness"]
    df_qa_eval.loc[i, "answer_correctness"] = score

    # optionally save the response to cache
    response_dict = {
        "question": row["question"],
        "answer_correctness": score,
    }
    write_dict_to_file(response_dict, ragas_cache_file)

# write the answer correctness to the original dataframe
df_questions_answers["answer_correctness"] = df_qa_eval["answer_correctness"]

### Link from documents to questions, that used the document as source

This section adds columns to the documents dataframe (`df_documents_agg`) containing the number and the ids of the questions that used each document as a source.

In [None]:
# add the infos about questions using each document to the documents dataframe


def _source_ids_as_list(value):
    """Turn a ', '-joined string of document ids into a list of ids.

    Values that are not strings (lists loaded from the cache, missing values)
    are returned unchanged.
    """
    if isinstance(value, str):
        return [doc_id for doc_id in value.strip("[]").split(", ") if doc_id]
    return value


# Explode 'source_documents' so each document ID is in its own row alongside the question ID.
# The ids are normalized to lists first: explode() leaves a comma-joined
# string untouched, and such a string never matches a document id.
df_questions_exploded = df_qa_eval.assign(
    source_documents=df_qa_eval["source_documents"].map(_source_ids_as_list)
).explode("source_documents")

# Group by exploded 'source_documents' (document IDs) and aggregate
agg = (
    df_questions_exploded.groupby("source_documents")
    .agg(
        num_questions=("id", "count"),  # Count of questions referencing the document
        question_ids=("id", list),  # List of question IDs referencing the document
    )
    .reset_index()
    .rename(columns={"source_documents": "id"})
)

# Merge the aggregated information back into the documents dataframe
df_documents_agg = pd.merge(df_docs, agg, on="id", how="left")

# Use apply to replace NaN values with empty lists for 'question_ids'
df_documents_agg["question_ids"] = df_documents_agg["question_ids"].apply(
    lambda x: x if isinstance(x, list) else []
)
# Replace NaN values in 'num_questions' with 0
df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

In [None]:
# Stack the question rows on top of the document rows in a single frame.
df = pd.concat([df_qa_eval, df_documents_agg])

In [None]:
# create UMAP only using documents and apply it to the documents and questions
# Three projections are fitted in turn (on the question rows, on the document
# rows, and on all rows); each one is then applied to every row of `df`.
from umap import UMAP
import numpy as np

# Rows with a question are the question rows; note that this rebinds `df_questions`.
# NOTE(review): `umap_questions` is computed but not stored in `df` or used
# later in the visible cells - confirm whether a column was intended.
df_questions = df[~df["question"].isna()]
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df_questions["embedding"].values.tolist()
)
umap_questions = umap.transform(df["embedding"].values.tolist())


# Projection fitted on the document rows only (rows without a question),
# stored in the "umap_docs" column.
df_without_questions = df[df["question"].isna()]
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df_without_questions["embedding"].values.tolist()
)
umap_docs = umap.transform(df["embedding"].values.tolist())
df["umap_docs"] = umap_docs.tolist()

# Projection fitted on questions and documents together, stored in the "umap" column.
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df["embedding"].values.tolist()
)
umap_all = umap.transform(df["embedding"].values.tolist())
df["umap"] = umap_all.tolist()


# find the nearest question (by embedding) for each document
question_embeddings = np.array(df[df["question"].notna()]["embedding"].tolist())

# The distance is the Euclidean norm of the embedding difference, and it is
# computed for every row of `df` (question rows included).
df["nearest_question_dist"] = [  # brute force, could be optimized using ChromaDB
    np.min([np.linalg.norm(np.array(doc_emb) - question_embeddings, axis=1)])
    for doc_emb in df["embedding"].values
]

# write the dataframe to parquet for later use
df.to_parquet(f'df_{index_name}.parquet')

### Visualize

Adapt the first cell to use the downloaded dataframes if you skipped the preparation steps.

In [None]:
# concat the df containing the questions and the df containing the documents
import pandas as pd

# df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

# OR Load the data from downloaded file https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/df_f1_rag_docs_and_questions_umaps_v3.parquet
# As written, the combined frame is reloaded from the parquet file named
# after `index_name`, i.e. the file the preparation cell writes.
df = pd.read_parquet(f'df_{index_name}.parquet')

In [None]:
# Open the combined questions/documents frame in Renumics Spotlight using the
# published RAG demo layout.
from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

spotlight.show(
    df,
    # Every column whose name contains "umap" is declared as an embedding.
    dtype={name: Embedding for name in df.columns if "umap" in name},
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
)

## UMAP visualization forms clusters of the questions; workaround: fit UMAP only on the documents