# Ragas evaluation
Tests the batch-mode query workflow of aerospace_chatbot and the evaluation of its answers with Ragas.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [1]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

True

## Nifty functions

In [2]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as one JSON-encoded line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file holding one record per call.
    """
    with open(filename, "a") as out_file:
        # print() terminates the record with the newline that separates entries.
        print(json.dumps(data_dict), file=out_file)

def read_dicts_from_file(filename):
    """Yield one dictionary per record of a JSON-lines file.

    Each non-blank line of ``filename`` is decoded with ``json.loads`` and
    yielded in file order, so ``list(read_dicts_from_file(path))`` loads the
    whole file as a list of dictionaries.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filename, "r") as f:
        for line in f:
            # Blank lines (e.g. left behind by hand-editing the cache) carry
            # no record and would otherwise make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a new dataframe equal to ``df`` plus ``column`` taken from a cache file.

    The cache is a JSON-lines file of dictionaries (see ``read_dicts_from_file``).

    Args:
        df: Dataframe to extend; it is not modified.
        file_name: Path of the JSON-lines cache file.
        merge_on: Name of the key present both in ``df`` and in the cached records.
        column: Name of the cached field to attach as a column.

    Returns:
        If the cache holds usable records, ``df`` left-merged with the cached
        ``column`` on ``merge_on`` (rows without a cached value get NaN, index
        reset). Otherwise a copy of ``df`` with ``column`` set to ``None`` and
        placed last.
    """
    cached = None
    if Path(file_name).exists():
        records = pd.DataFrame(list(read_dicts_from_file(file_name)))
        # An empty cache file, or one whose records lack the requested fields,
        # has nothing to merge: treat it like a missing cache instead of
        # failing with a KeyError.
        if {merge_on, column}.issubset(records.columns):
            cached = (
                records[[column, merge_on]]
                # Drop incomplete records first so they cannot shadow a later
                # complete record for the same key.
                .dropna()
                .drop_duplicates(subset=[merge_on])
                .reset_index(drop=True)
            )

    if cached is not None:
        return df.merge(cached, on=merge_on, how="left").reset_index(drop=True)

    # No usable cache: add an empty column and make sure it is the last one.
    df_out = df.copy()
    df_out[column] = None
    other_columns = [name for name in df_out.columns if name != column]
    return df_out[other_columns + [column]]

## Connect to database

In [3]:
# Open the on-disk Chroma database located at <LOCAL_DB_PATH>/chromadb.
chroma_path = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_path)

# Embedding model handed to the vector store as its embedding function.
query_model = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# Display the collections available in this database.
collections = persistent_client.list_collections()
collections

[Collection(name=chromadb-text-embedding-ada-002-ams-nomerge-2000-2020-400-0)]

In [4]:

index_name = 'chromadb-text-embedding-ada-002-ams-nomerge-2000-2020-400-0'

# Connect to the existing collection through LangChain's Chroma wrapper.
# Original author's note: no chunking was done for this collection, only full
# PDF pages. NOTE(review): confirm against how the collection was built.
docs_vectorstore = Chroma(
    client=persistent_client,
    collection_name=index_name,
    embedding_function=query_model,
)

In [5]:
# Fetch every stored entry together with its metadata and embedding.
all_docs = docs_vectorstore.get(
    include=["metadatas", "documents", "embeddings"],
)

In [6]:
# Rebuild LangChain Document objects by pairing each text with its metadata.
lcdocs = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

In [7]:
len(lcdocs)

38377

In [8]:
lcdocs[0]

Document(page_content='locking actually relaxes the usual expectation of adhesive to maintain preload to the common expectation for prevailing torque locking features which is to prevent disassembly.', metadata={'page': 207, 'source': 'AMS_2016.pdf', 'start_index': 2696})

## Generate synthetic dataset

In [9]:
# OpenAI model used to generate the synthetic questions (and, later in this
# notebook, the RAG answers). The model name doubles as the run tag.
generator_model = "gpt-3.5-turbo-16k"
synthetic_generator_llm = ChatOpenAI(
    model=generator_model,
    tags=[generator_model],
)


# Hugging face LLM
# generator_model="google/gemma-7b-it"
# synthetic_generator_llm = ChatOpenAI(base_url='https://api-inference.huggingface.co/v1',
#                             model=generator_model,
#                             api_key=os.getenv('HUGGINGFACEHUB_API_TOKEN'),
#                             tags=generator_model)

# Local LLM via LM studio
# synthetic_generator_llm = ChatOpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

In [10]:
# Critic model for test set generation; tagged with its own name.
critic_model = "gpt-3.5-turbo-0125"
# critic_model="gpt-4"
synthetic_critic_llm = ChatOpenAI(
    model=critic_model,
    tags=[critic_model],
)

# Embedding model used for test set generation and for embedding the questions.
embedding_model = "text-embedding-ada-002"
synthetic_embeddings = OpenAIEmbeddings(
    model=embedding_model,
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [11]:
# Build the Ragas test set generator from the generator LLM, the critic LLM
# and the embeddings configured above.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm, synthetic_critic_llm, synthetic_embeddings
)

# Use at most 20 documents. Original note: 500 is the max size before you'll
# hit rate limits with a tier 3 openai account.
n_available = len(lcdocs)
sample_size = n_available if n_available < 20 else 20
percent_total = sample_size / n_available
print(percent_total)

# Draw the random subset of lcdocs that questions are generated from.
lcdocs_random = random.sample(lcdocs, sample_size)

0.0005211454777601167


In [12]:
# Reuse a previously generated test set when testset.csv is present;
# otherwise generate one and save it for the next run.
if os.path.exists('./testset.csv'):
    # Import testset.csv into a DataFrame
    df_testset = pd.read_csv('testset.csv')
else:
    run_config = RunConfig(
        timeout=1000,
        max_retries=50,
        max_wait=1000,
        max_workers=1,
    )

    n_questions = 10
    testset = generator.generate_with_langchain_docs(
        lcdocs_random,
        test_size=n_questions,
        with_debugging_logs=True,
        is_async=False,
        run_config=run_config,
        raise_exceptions=False,
    )
    df_testset = testset.to_pandas()
    df_testset.to_csv('testset.csv', index=False)

embedding nodes:   0%|          | 0/40 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/10 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 3, 'relevance': 3, 'score': 2.75}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Model of the setup', 'Test data', 'Predicted response', 'Sine sweeps', 'Tap testing']
[ragas.testset.evolutions.INFO] seed question generated: How is the predicted response of the model validated in the setup?
[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear and specific, asking about the validation process for the predicted response of a model within a given setup. It provides a focused inquiry into the validation methodology used for model predictions, making it understandable and answerable based on the details provided.', 'verdict': 1}
[ragas.testset.evolutions.DEBUG] [ReasoningEvolution] simple question generated: How is the predicted response of the model validated in the setup?
[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question is clear in

In [13]:
df_testset

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the purpose of the basepoint in the op...,[outfitted with a grapple fixture or a basepoi...,The purpose of the basepoint in the operation ...,simple,"[{'page': 337, 'source': 'AMS_2014.pdf', 'star...",True
1,What is the qualified temperature range for ro...,[development time Mass < 12.4 kg < 15.4 kg...,-45°C to +55°C,simple,"[{'page': 326, 'source': 'AMS_2020.pdf', 'star...",True
2,How do accumulators affect the resonance and e...,[displacement canbeobtained ifthepumpisrunatre...,If the pump uses accumulators so that the pist...,simple,"[{'page': 175, 'source': 'AMS_2000.pdf', 'star...",True
3,What are the EVA overrides used for in the mec...,[Mechanisms : a. Camera & Lighting Unit (CLU)...,"The EVA overrides are used for GM, IST, TRM, a...",simple,"[{'page': 337, 'source': 'AMS_2014.pdf', 'star...",True
4,What is the purpose of the bogie wheel drive i...,[The de cision on how to deploy the b ogie wa ...,The purpose of the bogie wheel drive in the ro...,simple,"[{'page': 206, 'source': 'AMS_2004.pdf', 'star...",True
5,What is the max mass of the aluminum structure...,[development time Mass < 12.4 kg < 15.4 kg...,Not Fully Defined,reasoning,"[{'page': 326, 'source': 'AMS_2020.pdf', 'star...",True
6,What is the maximum mass of the aluminum struc...,[development time Mass < 12.4 kg < 15.4 kg...,15.4 kg,reasoning,"[{'page': 326, 'source': 'AMS_2020.pdf', 'star...",True
7,What process is used to verify the accuracy of...,[themselves are characterized and are accurate...,The process used to verify the accuracy of the...,reasoning,"[{'page': 448, 'source': 'AMS_2016.pdf', 'star...",True


### Format dataset and database for RAG

In [14]:
# df_questions['id'] = f"Question {i}" for i, _ in enumerate(questions_all),

# Keep only the question and its reference answer, label each row with a
# readable id ("Question <index>") and record which model wrote the question.
df_questions = (
    df_testset[['question', 'ground_truth']]
    .copy()
    .assign(
        id=lambda frame: 'Question ' + frame.index.astype(str),
        question_by=generator_model,
    )
)[['id', 'question', 'ground_truth', 'question_by']]

In [15]:
df_questions

Unnamed: 0,id,question,ground_truth,question_by
0,Question 0,What is the purpose of the basepoint in the op...,The purpose of the basepoint in the operation ...,gpt-3.5-turbo-16k
1,Question 1,What is the qualified temperature range for ro...,-45°C to +55°C,gpt-3.5-turbo-16k
2,Question 2,How do accumulators affect the resonance and e...,If the pump uses accumulators so that the pist...,gpt-3.5-turbo-16k
3,Question 3,What are the EVA overrides used for in the mec...,"The EVA overrides are used for GM, IST, TRM, a...",gpt-3.5-turbo-16k
4,Question 4,What is the purpose of the bogie wheel drive i...,The purpose of the bogie wheel drive in the ro...,gpt-3.5-turbo-16k
5,Question 5,What is the max mass of the aluminum structure...,Not Fully Defined,gpt-3.5-turbo-16k
6,Question 6,What is the maximum mass of the aluminum struc...,15.4 kg,gpt-3.5-turbo-16k
7,Question 7,What process is used to verify the accuracy of...,The process used to verify the accuracy of the...,gpt-3.5-turbo-16k


In [16]:
# One row per stored document: a stable id derived from its metadata, where
# it came from, its text and its embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
doc_metadatas = all_docs["metadatas"]
df_docs = pd.DataFrame(
    {
        "id": list(map(_stable_hash_meta, doc_metadatas)),
        "source": [meta.get("source") for meta in doc_metadatas],
        # -1 marks documents whose metadata carries no page number.
        "page": [meta.get("page", -1) for meta in doc_metadatas],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [18]:
# Attach the cached RAG answers and source document ids, one column at a
# time; each column is created empty when the cache file does not exist.
df_questions_answers = df_questions
for cached_column in ("answer", "source_documents"):
    df_questions_answers = add_cached_column_from_file(
        df_questions_answers, "rag_response_cache.txt", "question", cached_column
    )

In [19]:
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents
0,Question 0,What is the purpose of the basepoint in the op...,The purpose of the basepoint in the operation ...,gpt-3.5-turbo-16k,,
1,Question 1,What is the qualified temperature range for ro...,-45°C to +55°C,gpt-3.5-turbo-16k,,
2,Question 2,How do accumulators affect the resonance and e...,If the pump uses accumulators so that the pist...,gpt-3.5-turbo-16k,,
3,Question 3,What are the EVA overrides used for in the mec...,"The EVA overrides are used for GM, IST, TRM, a...",gpt-3.5-turbo-16k,,
4,Question 4,What is the purpose of the bogie wheel drive i...,The purpose of the bogie wheel drive in the ro...,gpt-3.5-turbo-16k,,
5,Question 5,What is the max mass of the aluminum structure...,Not Fully Defined,gpt-3.5-turbo-16k,,
6,Question 6,What is the maximum mass of the aluminum struc...,15.4 kg,gpt-3.5-turbo-16k,,
7,Question 7,What process is used to verify the accuracy of...,The process used to verify the accuracy of the...,gpt-3.5-turbo-16k,,


# RAG questions/answers (batch mode)

The cell below demonstrates using aerospace_chatbot in batch mode. It requires some basic parameter setup and a QA_model.

In [20]:
# TODO turn this into a function/standalone notebook to show how to use the QA model in batch mode

# Parameters handed to the QA model: the collection, embeddings and LLM
# configured earlier in this notebook are reused.
index_type='ChromaDB'
query_model=synthetic_embeddings
llm=synthetic_generator_llm

QA_model_params={'rag_type':'Standard',
                 'k':4,
                 'search_type':'similarity',
                 'local_db_path':os.getenv('LOCAL_DB_PATH')}

# Must be the file that add_cached_column_from_file() is pointed at when the
# answer/source_documents columns are created. Writing to a different file
# means cached answers are never found and every run queries the LLM again.
rag_cache_file="rag_response_cache.txt"

n_total=len(df_questions_answers)
for position, (i, row) in enumerate(df_questions_answers.iterrows(), start=1):
    answer=row['answer']
    if not (answer is None or pd.isnull(answer) or answer==''):
        continue  # an answer is already present (loaded from the cache)

    print(f"Processing question {position}/{n_total}")

    # Use the QA model to query the documents
    qa_obj=queries.QA_Model(index_type,
                    index_name,
                    query_model,
                    embedding_model,
                    llm,
                    **QA_model_params)
    qa_obj.query_docs(row['question'])
    response=qa_obj.result
    answer_text=response['answer'].content

    # Ids of the documents the answer was built from, kept as one
    # comma-separated string per question.
    ids=[_stable_hash_meta(source_document.metadata)
        for source_document in response['references']]
    source_ids=', '.join(ids)

    # i is the index *label* yielded by iterrows(), so address the row by label.
    df_questions_answers.loc[i, "answer"] = answer_text
    df_questions_answers.loc[i, "source_documents"] = source_ids

    # Save the response to the cache file in the same format as the dataframe
    # columns, so a reloaded row looks exactly like a freshly computed one.
    response_dict = {
        "question": row['question'],
        "answer": answer_text,
        "source_documents": source_ids,
    }
    write_dict_to_file(response_dict, rag_cache_file)

Processing question 1/8
Processing question 2/8
Processing question 3/8
Processing question 4/8
Processing question 5/8
Processing question 6/8
Processing question 7/8
Processing question 8/8


In [21]:
# Get the context documents content for each question

# Map each document id to its text once, instead of filtering the whole
# documents dataframe for every single id. setdefault keeps the first
# occurrence of an id, matching a "take the first match" lookup.
document_by_id = {}
for doc_id, document in zip(df_docs["id"], df_docs["document"]):
    document_by_id.setdefault(doc_id, document)

source_documents_list = []
for cell in df_questions_answers['source_documents']:
    # The cell is a comma-separated string of ids (optionally in brackets).
    doc_ids = cell.strip('[]').split(', ')
    # Empty ids (a question without any source document) are skipped; an id
    # missing from df_docs raises a KeyError naming that id.
    source_documents_list.append(
        [document_by_id[doc_id] for doc_id in doc_ids if doc_id]
    )
df_questions_answers["contexts"]=source_documents_list

In [22]:
# Additionally get embeddings for the questions, cached on disk between runs.

embeddings_cache = Path("question_embeddings.pickle")
question_embeddings = None

if embeddings_cache.exists():
    # NOTE: unpickling is only acceptable here because the file is written by
    # this notebook; never load a pickle that comes from an untrusted source.
    with embeddings_cache.open("rb") as f:
        question_embeddings = pickle.load(f)
    # A cache written for a different set of questions cannot be assigned to
    # the dataframe; discard it and compute the embeddings again.
    if len(question_embeddings) != len(df_questions_answers):
        question_embeddings = None

if question_embeddings is None:
    question_embeddings = [
        synthetic_embeddings.embed_query(question)
        for question in df_questions_answers["question"]
    ]
    with embeddings_cache.open("wb") as f:
        pickle.dump(question_embeddings, f)

df_questions_answers["embedding"] = question_embeddings

In [23]:
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents,contexts,embedding
0,Question 0,What is the purpose of the basepoint in the op...,The purpose of the basepoint in the operation ...,gpt-3.5-turbo-16k,The basepoint contributes to the operation of ...,"a98e7630535742532ce18fede52330803e38599f, c733...",[outfitted with a grapple fixture or a basepoi...,"[-0.009081070091247421, 0.00030956256765110504..."
1,Question 1,What is the qualified temperature range for ro...,-45°C to +55°C,gpt-3.5-turbo-16k,The specific temperature range that falls unde...,"19432f243439fb274b12c6bfa1dc4f061637de8b, c980...","[During the development phase, the qualificati...","[0.015085699714604356, -0.011740906173298446, ..."
2,Question 2,How do accumulators affect the resonance and e...,If the pump uses accumulators so that the pist...,gpt-3.5-turbo-16k,The relationship between accumulators and the ...,"0c55ca2d965e2a2767f636e1bbc84e35d8dd0923, dc5c...",[displacement canbeobtained ifthepumpisrunatre...,"[0.00039506384056605716, 0.007486311915247879,..."
3,Question 3,What are the EVA overrides used for in the mec...,"The EVA overrides are used for GM, IST, TRM, a...",gpt-3.5-turbo-16k,EVA overrides are used in the mechanical subsy...,"d48e0eb3020c1a5fda8987ae42568a3bab11ae75, 6979...",[assembly are performed automatically by the a...,"[0.004960150191388086, -0.008385667914037838, ..."
4,Question 4,What is the purpose of the bogie wheel drive i...,The purpose of the bogie wheel drive in the ro...,gpt-3.5-turbo-16k,The rover's final configuration involves the d...,"57f3cbdfb03c715fbc131b4c34b0dcfc7759c5e5, ee9f...",[The de cision on how to deploy the b ogie wa ...,"[-0.0045647970726753586, 0.002929850968035887,..."
5,Question 5,What is the max mass of the aluminum structure...,Not Fully Defined,gpt-3.5-turbo-16k,The maximum mass of the aluminum structure for...,"7360ce9753059f79fdac230b5f9fc2716f27c860, f63f...",[The P5 truss weighs approximately 1225 kg (27...,"[0.021069765480615614, 0.01359533357432055, -0..."
6,Question 6,What is the maximum mass of the aluminum struc...,15.4 kg,gpt-3.5-turbo-16k,"According to the structural specifications, th...","7360ce9753059f79fdac230b5f9fc2716f27c860, 0923...",[The P5 truss weighs approximately 1225 kg (27...,"[0.01563425546177735, 0.01364178487976738, -0...."
7,Question 7,What process is used to verify the accuracy of...,The process used to verify the accuracy of the...,gpt-3.5-turbo-16k,Sine sweeps and tap testing are two methods us...,"80a5de80b34befe30d44844cc96830d0e123813f, 1c17...","[Test Design, Several lessons learned came fro...","[0.002622058088138635, 0.0006828275931649681, ..."


## Ragas eval

### Evaluate

In [24]:
# Attach previously computed answer_correctness scores from the Ragas result
# cache; the column is created empty when no cache file exists yet.
ragas_cache_file = "ragas_result_cache.txt"
df_questions_answers = add_cached_column_from_file(
    df_questions_answers, ragas_cache_file, "question", "answer_correctness"
)
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents,contexts,embedding,answer_correctness
0,Question 0,What is the purpose of the basepoint in the op...,The purpose of the basepoint in the operation ...,gpt-3.5-turbo-16k,The basepoint contributes to the operation of ...,"a98e7630535742532ce18fede52330803e38599f, c733...",[outfitted with a grapple fixture or a basepoi...,"[-0.009081070091247421, 0.00030956256765110504...",
1,Question 1,What is the qualified temperature range for ro...,-45°C to +55°C,gpt-3.5-turbo-16k,The specific temperature range that falls unde...,"19432f243439fb274b12c6bfa1dc4f061637de8b, c980...","[During the development phase, the qualificati...","[0.015085699714604356, -0.011740906173298446, ...",
2,Question 2,How do accumulators affect the resonance and e...,If the pump uses accumulators so that the pist...,gpt-3.5-turbo-16k,The relationship between accumulators and the ...,"0c55ca2d965e2a2767f636e1bbc84e35d8dd0923, dc5c...",[displacement canbeobtained ifthepumpisrunatre...,"[0.00039506384056605716, 0.007486311915247879,...",
3,Question 3,What are the EVA overrides used for in the mec...,"The EVA overrides are used for GM, IST, TRM, a...",gpt-3.5-turbo-16k,EVA overrides are used in the mechanical subsy...,"d48e0eb3020c1a5fda8987ae42568a3bab11ae75, 6979...",[assembly are performed automatically by the a...,"[0.004960150191388086, -0.008385667914037838, ...",
4,Question 4,What is the purpose of the bogie wheel drive i...,The purpose of the bogie wheel drive in the ro...,gpt-3.5-turbo-16k,The rover's final configuration involves the d...,"57f3cbdfb03c715fbc131b4c34b0dcfc7759c5e5, ee9f...",[The de cision on how to deploy the b ogie wa ...,"[-0.0045647970726753586, 0.002929850968035887,...",
5,Question 5,What is the max mass of the aluminum structure...,Not Fully Defined,gpt-3.5-turbo-16k,The maximum mass of the aluminum structure for...,"7360ce9753059f79fdac230b5f9fc2716f27c860, f63f...",[The P5 truss weighs approximately 1225 kg (27...,"[0.021069765480615614, 0.01359533357432055, -0...",
6,Question 6,What is the maximum mass of the aluminum struc...,15.4 kg,gpt-3.5-turbo-16k,"According to the structural specifications, th...","7360ce9753059f79fdac230b5f9fc2716f27c860, 0923...",[The P5 truss weighs approximately 1225 kg (27...,"[0.01563425546177735, 0.01364178487976738, -0....",
7,Question 7,What process is used to verify the accuracy of...,The process used to verify the accuracy of the...,gpt-3.5-turbo-16k,Sine sweeps and tap testing are two methods us...,"80a5de80b34befe30d44844cc96830d0e123813f, 1c17...","[Test Design, Several lessons learned came fro...","[0.002622058088138635, 0.0006828275931649681, ...",


In [25]:
# Sometimes a generated question has no ground_truth (the value is then not a
# string, e.g. NaN). Filter those rows out, since the evaluation needs a
# reference answer. The explicit copy makes the result an independent frame,
# so adding or overwriting columns later does not act on a slice of the
# unfiltered dataframe (pandas' SettingWithCopyWarning).
has_ground_truth = df_questions_answers['ground_truth'].apply(lambda x: isinstance(x, str))
df_questions_answers = df_questions_answers[has_ground_truth].copy()
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents,contexts,embedding,answer_correctness
0,Question 0,What is the purpose of the basepoint in the op...,The purpose of the basepoint in the operation ...,gpt-3.5-turbo-16k,The basepoint contributes to the operation of ...,"a98e7630535742532ce18fede52330803e38599f, c733...",[outfitted with a grapple fixture or a basepoi...,"[-0.009081070091247421, 0.00030956256765110504...",
1,Question 1,What is the qualified temperature range for ro...,-45°C to +55°C,gpt-3.5-turbo-16k,The specific temperature range that falls unde...,"19432f243439fb274b12c6bfa1dc4f061637de8b, c980...","[During the development phase, the qualificati...","[0.015085699714604356, -0.011740906173298446, ...",
2,Question 2,How do accumulators affect the resonance and e...,If the pump uses accumulators so that the pist...,gpt-3.5-turbo-16k,The relationship between accumulators and the ...,"0c55ca2d965e2a2767f636e1bbc84e35d8dd0923, dc5c...",[displacement canbeobtained ifthepumpisrunatre...,"[0.00039506384056605716, 0.007486311915247879,...",
3,Question 3,What are the EVA overrides used for in the mec...,"The EVA overrides are used for GM, IST, TRM, a...",gpt-3.5-turbo-16k,EVA overrides are used in the mechanical subsy...,"d48e0eb3020c1a5fda8987ae42568a3bab11ae75, 6979...",[assembly are performed automatically by the a...,"[0.004960150191388086, -0.008385667914037838, ...",
4,Question 4,What is the purpose of the bogie wheel drive i...,The purpose of the bogie wheel drive in the ro...,gpt-3.5-turbo-16k,The rover's final configuration involves the d...,"57f3cbdfb03c715fbc131b4c34b0dcfc7759c5e5, ee9f...",[The de cision on how to deploy the b ogie wa ...,"[-0.0045647970726753586, 0.002929850968035887,...",
5,Question 5,What is the max mass of the aluminum structure...,Not Fully Defined,gpt-3.5-turbo-16k,The maximum mass of the aluminum structure for...,"7360ce9753059f79fdac230b5f9fc2716f27c860, f63f...",[The P5 truss weighs approximately 1225 kg (27...,"[0.021069765480615614, 0.01359533357432055, -0...",
6,Question 6,What is the maximum mass of the aluminum struc...,15.4 kg,gpt-3.5-turbo-16k,"According to the structural specifications, th...","7360ce9753059f79fdac230b5f9fc2716f27c860, 0923...",[The P5 truss weighs approximately 1225 kg (27...,"[0.01563425546177735, 0.01364178487976738, -0....",
7,Question 7,What process is used to verify the accuracy of...,The process used to verify the accuracy of the...,gpt-3.5-turbo-16k,Sine sweeps and tap testing are two methods us...,"80a5de80b34befe30d44844cc96830d0e123813f, 1c17...","[Test Design, Several lessons learned came fro...","[0.002622058088138635, 0.0006828275931649681, ...",


In [26]:
# prepare the dataframe for evaluation: the scores are written into this copy
# row by row and copied back to df_questions_answers after the loop
df_qa_eval = df_questions_answers.copy()

In [27]:
# evaluate the answer correctness if not already done
fields = ["question", "answer", "contexts", "ground_truth"]
for i, row in df_qa_eval.iterrows():
    print(i, row["question"])
    if row["answer_correctness"] is None or pd.isnull(row["answer_correctness"]):
        # i is an index *label*. Once rows have been filtered out, labels no
        # longer equal positions, so the row must be selected by label (.loc);
        # a positional slice would evaluate a different question.
        # The index is reset so the one-row frame converts to a Dataset
        # without an extra index column.
        single_row = df_qa_eval.loc[[i], fields].reset_index(drop=True)
        evaluation_result = evaluate(
            Dataset.from_pandas(single_row),
            [answer_correctness],
        )
        score = evaluation_result["answer_correctness"]
        df_qa_eval.loc[i, "answer_correctness"] = score

        # save the score to the cache so it is not recomputed on the next run
        response_dict = {
            "question": row["question"],
            "answer_correctness": score,
        }
        write_dict_to_file(response_dict, "ragas_result_cache.txt")

# write the answer correctness to the original dataframe (aligned on the
# index); assign() builds a new frame instead of writing into a possible slice
df_questions_answers = df_questions_answers.assign(
    answer_correctness=df_qa_eval["answer_correctness"]
)

0 What is the purpose of the basepoint in the operation of the second end effector?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

1 What is the qualified temperature range for routine engineering design in this context?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

2 How do accumulators affect the resonance and efficiency of the pump system?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

3 What are the EVA overrides used for in the mechanical subsystems and mechanisms?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

4 What is the purpose of the bogie wheel drive in the rover's final configuration?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

5 What is the max mass of the aluminum structure for the project?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

6 What is the maximum mass of the aluminum structure?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

7 What process is used to verify the accuracy of the model's predicted response after correlating it with test data and validating it through sine sweeps and tap testing?


Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]

### Link from documents to questions, that used the document as source

This section adds two columns to the documents dataframe (`df_documents_agg`, built from `df_docs`): the number of questions that used each document as a source, and the ids of those questions.

In [34]:
# add the infos about questions using each document to the documents dataframe


def _as_id_list(value):
    """Return the document ids held in one `source_documents` cell as a list."""
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        # Missing value (None/NaN): the question references no documents.
        return []
    return [doc_id for doc_id in value.strip('[]').split(', ') if doc_id]


# 'source_documents' holds the ids as one comma-separated string per question.
# explode() only splits list-likes, so convert each cell to a list first;
# afterwards each document ID is in its own row alongside the question ID.
df_questions_exploded = df_qa_eval.assign(
    source_documents=df_qa_eval["source_documents"].apply(_as_id_list)
).explode("source_documents")

# Group by exploded 'source_documents' (document IDs) and aggregate
agg = (
    df_questions_exploded.groupby("source_documents")
    .agg(
        num_questions=("id", "count"),  # Count of questions referencing the document
        question_ids=(
            "id",
            lambda x: list(x),
        ),  # List of question IDs referencing the document
    )
    .reset_index()
    .rename(columns={"source_documents": "id"})
)

# Merge the aggregated information back into the documents dataframe; the
# left join keeps documents that no question referenced.
df_documents_agg = pd.merge(df_docs, agg, on="id", how="left")

# Use apply to replace NaN values with empty lists for 'question_ids'
df_documents_agg["question_ids"] = df_documents_agg["question_ids"].apply(
    lambda x: x if isinstance(x, list) else []
)
# Replace NaN values in 'num_questions' with 0
df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

In [36]:
# Stack the question rows and the document rows into one dataframe; columns
# that exist in only one of the two frames are filled with NaN in the other.
df = pd.concat((df_qa_eval, df_documents_agg))

In [37]:
# create UMAP only using documents and apply it to the documents and questions
from umap import UMAP
import numpy as np

# Embeddings of every row (questions and documents), built once and reused.
all_embeddings = df["embedding"].values.tolist()

# Projection fitted on the document rows only (rows without a question),
# then applied to all rows.
df_without_questions = df[df["question"].isna()]
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df_without_questions["embedding"].values.tolist()
)
df["umap_docs"] = umap.transform(all_embeddings).tolist()

# Projection fitted on questions and documents together.
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    all_embeddings
)
df["umap"] = umap.transform(all_embeddings).tolist()


# find the nearest question (by embedding) for each row
question_embeddings = np.array(df[df["question"].notna()]["embedding"].tolist())

df["nearest_question_dist"] = [  # brute force, could be optimized using ChromaDB
    # Euclidean distance from this row to every question; keep the smallest.
    np.linalg.norm(np.asarray(doc_emb) - question_embeddings, axis=1).min()
    for doc_emb in df["embedding"].values
]

# write the dataframe to parquet for later use
df.to_parquet(f'df_{index_name}.parquet')

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")
  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


### Visualize

Adapt the first cell to use the downloaded dataframes if you skipped the preparation steps.

In [39]:
# Get the dataframe that holds both the questions and the documents.
import pandas as pd

# Option 1: rebuild it in memory from the two frames
# df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

# Option 2: load it from a parquet file. This reads the file written by the
# preparation steps; the original article's data can be downloaded from
# https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/df_f1_rag_docs_and_questions_umaps_v3.parquet
parquet_file = f'df_{index_name}.parquet'
df = pd.read_parquet(parquet_file)

In [40]:
# show the dataframe with the question and answer in spotlight
from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# Every column whose name contains "umap" is declared as an embedding column.
umap_dtypes = {column: Embedding for column in df.keys() if "umap" in column}

spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=umap_dtypes,
)

## UMAP visualization forms a cluster of the questions; workaround: fit UMAP only on the documents

VBox(children=(Label(value='Spotlight running on http://127.0.0.1:60555/'), HBox(children=(Button(description=…