# Ragas evaluation
Tests aerospace_chatbot in batch mode and evaluates its answers with Ragas.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [1]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta, archive_db

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

True

## Nifty functions

In [2]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file with one record per call.
    """
    with open(filename, mode="a") as handle:
        # print() supplies the trailing newline that terminates the record.
        print(json.dumps(data_dict), file=handle)

def read_dicts_from_file(filename):
    """Yield one dictionary per JSON line of ``filename``.

    Blank or whitespace-only lines (for example a stray trailing newline) are
    skipped instead of raising ``json.JSONDecodeError``. Wrap the call in
    ``list(...)`` to load every record at once.
    """
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                # Not a record: nothing to decode.
                continue
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a copy of ``df`` with ``column`` filled from a JSON-lines cache.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is not modified.
    file_name : str or Path
        JSON-lines cache file, read with ``read_dicts_from_file``.
    merge_on : str
        Key column present in both ``df`` and the cached records.
    column : str
        Cached column to attach to ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged with the cached values. When the file does not
        exist, is empty, or lacks ``merge_on``/``column``, the result is ``df``
        with ``column`` set to ``None`` and placed last.
    """
    cached = None
    if Path(file_name).exists():
        cached = pd.DataFrame(list(read_dicts_from_file(file_name)))
        if not {merge_on, column}.issubset(cached.columns):
            # Empty cache, or a cache without this column: treat as "nothing cached"
            # rather than failing with a KeyError on the column selection.
            cached = None

    if cached is None:
        df_out = df.copy()
        df_out[column] = None
        # Keep the new column at the end of the frame.
        ordered = [name for name in df_out.columns if name != column] + [column]
        return df_out[ordered]

    cached = (
        cached[[column, merge_on]]
        # Drop null records BEFORE de-duplicating: otherwise a null first record
        # for a key hides a valid later one and the value is lost.
        .dropna()
        .drop_duplicates(subset=[merge_on])
        .reset_index(drop=True)
    )
    return df.merge(cached, on=merge_on, how="left").reset_index(drop=True)

## Connect to database

### Pinecone

In [3]:
# The top-of-notebook import aliases the Pinecone class as `pinecone_client`, so
# `pinecone_client = pinecone_client(...)` replaced the class with an instance and
# the cell failed when run a second time. Construct from the class under its own
# name so the cell can be re-run; `pinecone_client` is still the client instance
# that later cells use.
from pinecone import Pinecone

pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes=pinecone_client.list_indexes()
indexes

{'indexes': [{'dimension': 1024,
              'host': 'voyage-large-2-instruct-ams-2merge-none-dj30h8y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'voyage-large-2-instruct-ams-2merge-none',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [3]:
# Pinecone indexes available for evaluation, each paired with the embedding
# model object stored as its 'query_model'.
voyage_query_model = VoyageAIEmbeddings(
    model='voyage-large-2-instruct',
    voyage_api_key=os.getenv('VOYAGE_API_KEY'),
    truncation=False,
)
dbs = [
    {
        'index_name': 'voyage-large-2-instruct-ams-2merge-none',
        'query_model': voyage_query_model,
    },
]

In [4]:
# Selection to use: the entry of `dbs` (index name + query model) that the
# Pinecone cells below operate on.
db=dbs[0]

In [6]:
# Handle to the selected Pinecone index, used below to list ids and fetch vectors.
index = pinecone_client.Index(db['index_name'])

In [7]:
# Collect every vector id in the index. Each item yielded by index.list() is
# extended into `ids`, so it is presumably a batch (page) of ids - TODO confirm.
# The loop variable is not called `id`, to avoid shadowing the builtin in the
# kernel namespace.
ids=[]
for id_batch in index.list():
    ids.extend(id_batch)

In [8]:
# Fetch the stored vectors in batches of `chunk_size` ids and accumulate them in `docs`.
docs=[]
chunk_size=200  # Tune to whatever doesn't error out
for i in range(0, len(ids), chunk_size):
    print(f"Fetching {i} to {i+chunk_size}")
    id_batch = ids[i:i+chunk_size]
    # The 'vectors' entry maps id -> record; only the records are kept.
    vector_data = list(index.fetch(id_batch)['vectors'].values())
    docs += vector_data

Fetching 0 to 200
Fetching 200 to 400
Fetching 400 to 600
Fetching 600 to 800
Fetching 800 to 1000
Fetching 1000 to 1200
Fetching 1200 to 1400
Fetching 1400 to 1600
Fetching 1600 to 1800
Fetching 1800 to 2000
Fetching 2000 to 2200
Fetching 2200 to 2400
Fetching 2400 to 2600
Fetching 2600 to 2800


In [9]:
# Schema for accessing the data
# data=vector_data[0]['metadata']
# page_content=data['page_content']
# metadata={'page':data['page'],'source':data['source']}
# embedding=vector_data[0]['values']

In [16]:
# Rebuild LangChain documents from the fetched Pinecone records: the text lives in
# metadata['page_content']; only 'page' and 'source' are kept as document metadata.
lcdocs_pinecone = [
    Document(
        page_content=meta['page_content'],
        metadata={'page': meta['page'], 'source': meta['source']},
    )
    for meta in (record['metadata'] for record in docs)
]

In [17]:
# Number of LangChain documents rebuilt from the Pinecone index.
len(lcdocs_pinecone)

2696

In [18]:
# Spot-check the first rebuilt document.
lcdocs_pinecone[0]

Document(page_content='228 deployable hinge mechanisms. The SWOT mission has two identical reflector masts, each with two  deployable hinges. NISAR has a single mast with four deployable hinges. These masts can be seen in  Figure 1. Operationally, the deployable masts are launched in a stowed state wi th a launch restraint system  composed of separation nut devices. When commanded, the launch restraints release a pre- tensioned  spring and damper mechanism which deploys each hinge. Hinge deployment progress is monitored on the  ground using a potentiometer as well as a limit switch  on each hinge. Upon completion of the deployment,  an actuator -driven latching mechanism preloads precision alignment features on either side of the hinge  together. Figure 2 displays a n overview of  the mechanisms.  \n\nFigure 2. Hinge Deploy & Latching Mechanisms (NISAR configuration s hown)  \n\nMechanism Design and  Fabrication  \n\nMechanism Design   Each deployable hinge for  the SWOT and NISAR mast

In [5]:
# Archive the selected Pinecone index with archive_db and keep the result in
# df_temp_pinecone. export_pickle=False is passed, so presumably no pickle file is
# written here - TODO confirm in data_processing.archive_db.
# To process every entry of `dbs`, wrap the call in: for db in dbs:
df_temp_pinecone=archive_db('Pinecone',db['index_name'],db['query_model'],export_pickle=False)

Fetching 0 to 200
Fetching 200 to 400
Fetching 400 to 600
Fetching 600 to 800
Fetching 800 to 1000
Fetching 1000 to 1200
Fetching 1200 to 1400
Fetching 1400 to 1600
Fetching 1600 to 1800
Fetching 1800 to 2000
Fetching 2000 to 2200
Fetching 2200 to 2400
Fetching 2400 to 2600
Fetching 2600 to 2800


Unnamed: 0,id,source,page,metadata,document,embedding
0,0000f9019f67f0266ac21eb6c543745fd6b44d71,"['AMS_2016.pdf', 'AMS_2016.pdf']","[293, 294]","{'page': '[293, 294]', 'source': '['AMS_2016.p...",279 James W ebb Space Telescope Deployment Bru...,"[0.00720933825, 0.0139422296, 0.00683793798, 0..."
1,001641be8dc3700ce8a5dfb7b89ba34cf1532a7a,"['AMS_2018.pdf', 'AMS_2018.pdf']","[68, 69]","{'page': '[68, 69]', 'source': '['AMS_2018.pdf...",by driving the percussor cam through the secon...,"[0.0130121643, 0.0220245682, -0.00461992202, 0..."
2,002f2a22399b4cddc2c4fc6e64701373f164c9ca,"['AMS_2020.pdf', 'AMS_2020.pdf']","[238, 239]","{'page': '[238, 239]', 'source': '['AMS_2020.p...",228 deployable hinge mechanisms. The SWOT miss...,"[0.0119848633, 0.0316759646, -0.0328605771, 0...."
3,004393406340fc757284d5fac68a9e11b5e83035,"['AMS_2006.pdf', 'AMS_2006.pdf']","[149, 150]","{'page': '[149, 150]', 'source': '['AMS_2006.p...",As part of the trade-off process an assessment...,"[0.013438114, 0.0535361692, -0.000656775548, 0..."
4,007433204368b888003b42689fbd32dd40f2d5fc,"['AMS_2018.pdf', 'AMS_2018.pdf']","[285, 287]","{'page': '[285, 287]', 'source': '['AMS_2018.p...",Application to Future Platforms With the succ...,"[0.0171286669, 0.0204658043, -0.0117366463, 0...."
...,...,...,...,...,...,...
2691,ff84a007bfa20d87454dc56a385b7e75cdc18285,"['AMS_2008.pdf', 'AMS_2008.pdf']","[30, 31]","{'page': '[30, 31]', 'source': '['AMS_2008.pdf...",Figure 1. High Resolution Rotary Actuator The...,"[0.00829231, 0.0277320761, -0.0370412283, 0.02..."
2692,ff8b812b601e9adadebd534c041a594057fd5163,"['AMS_2004.pdf', 'AMS_2004.pdf']","[319, 320]","{'page': '[319, 320]', 'source': '['AMS_2004.p...",mech anisms were al so performe d afte r ran d...,"[-0.00865876116, 0.0209979694, -0.0138918078, ..."
2693,ff8e46cc86c6a9a374df1023df243bd2cb65ff56,"['AMS_2002.pdf', 'AMS_2002.pdf']","[61, 62]","{'page': '[61, 62]', 'source': '['AMS_2002.pdf...","moment---notshowninthefigure---was13Nm,whichco...","[0.0218223, 0.0741071329, -0.0443298817, 0.023..."
2694,ff95dfb82721e17b22018b6f3a362ec9ae36807a,"['AMS_2004.pdf', 'AMS_2004.pdf']","[280, 281]","{'page': '[280, 281]', 'source': '['AMS_2004.p...",The second function of the launch restraint is...,"[0.0170335546, 0.0291346312, -0.0195622407, 0...."


### ChromaDB

In [9]:
# Open the local ChromaDB store under $LOCAL_DB_PATH/chromadb and list its collections.
chroma_db_path = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_db_path)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-ams-nomerge-400),
 Collection(name=mixedbread-ai-mxbai-embed-large-v1-ams-nomerge-400),
 Collection(name=text-embedding-3-small-ams-2merge-none),
 Collection(name=text-embedding-3-large-ams-nomerge-400-parent-child),
 Collection(name=text-embedding-3-large-ams-2merge-none)]

In [10]:
# ChromaDB collections available for evaluation, each paired with the embedding
# model object stored as its 'query_model'.
def _openai_query_model(model_name):
    """Return an OpenAI embeddings client for ``model_name``."""
    return OpenAIEmbeddings(model=model_name, openai_api_key=os.getenv('OPENAI_API_KEY'))


dbs = [
    {'index_name': 'text-embedding-3-large-ams-2merge-none',
     'query_model': _openai_query_model('text-embedding-3-large')},
    {'index_name': 'text-embedding-3-large-ams-nomerge-400',
     'query_model': _openai_query_model('text-embedding-3-large')},
    {'index_name': 'text-embedding-3-small-ams-2merge-none',
     'query_model': _openai_query_model('text-embedding-3-small')},
    {'index_name': 'text-embedding-3-large-ams-nomerge-400-parent-child',
     'query_model': _openai_query_model('text-embedding-3-large')},
    # NOTE(review): Hugging Face model ids normally contain a '/', e.g.
    # 'mixedbread-ai/mxbai-embed-large-v1' - confirm this model_name resolves.
    {'index_name': 'mixedbread-ai-mxbai-embed-large-v1-ams-nomerge-400',
     'query_model': HuggingFaceInferenceAPIEmbeddings(
         model_name='mixedbread-ai-mxbai-embed-large-v1',
         api_key=os.getenv('HUGGINGFACEHUB_API_TOKEN'))},
]

In [11]:
# Selection to use: dbs[3] is the
# 'text-embedding-3-large-ams-nomerge-400-parent-child' collection.
db=dbs[3]

In [12]:
# LangChain vector store over the selected Chroma collection, using the
# collection's paired query model as the embedding function.
docs_vectorstore = Chroma(client=persistent_client,
                        collection_name=db['index_name'],
                        embedding_function=db['query_model'])  

In [13]:
# Pull every stored item from the collection, requesting metadata, text and embeddings.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])

In [14]:
# Pair each stored text with its metadata and wrap the pair as a LangChain Document.
chroma_texts = all_docs['documents']
chroma_metadatas = all_docs['metadatas']
lcdocs_chroma = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(chroma_texts, chroma_metadatas)
]

In [15]:
# Number of LangChain documents rebuilt from the Chroma collection.
len(lcdocs_chroma)

41206

In [16]:
# Spot-check the first rebuilt document and its metadata.
lcdocs_chroma[0]

Document(page_content='Figure 3 - Comparison of Gas Strut Separation and BDM Separation', metadata={'doc_id': '6faab80591d1ea3f0e2cf6b14e2af8734331c09e', 'page': 347, 'source': 'AMS_2008.pdf', 'start_index': 0})

In [None]:
# Archive the selected ChromaDB collection with archive_db and keep the result in
# df_temp_chroma. export_pickle=False is passed, so presumably no pickle file is
# written here - TODO confirm in data_processing.archive_db.
# To process every entry of `dbs`, wrap the call in: for db in dbs:
df_temp_chroma=archive_db('ChromaDB',db['index_name'],db['query_model'],export_pickle=False)

## Generate synthetic dataset

In [None]:
# Model names used for synthetic test-set generation.
generator_model = "gpt-3.5-turbo-16k"
critic_model = 'gpt-4o'
embedding_model = "text-embedding-ada-002"

# Clients built from those names; each chat model is tagged with its own model name.
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])
synthetic_critic_llm = ChatOpenAI(model=critic_model, tags=[critic_model])
synthetic_embeddings = OpenAIEmbeddings(
    model=embedding_model,
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [None]:
# Ragas test-set generator: generator LLM, critic LLM and embeddings.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings
)

# `lcdocs` was never defined (NameError on a fresh kernel). The currently selected
# `db` and `docs_vectorstore` are the ChromaDB ones, so sample from the Chroma
# documents. Switch to `lcdocs_pinecone` to evaluate the Pinecone index instead.
lcdocs = lcdocs_chroma

# 500 is the max size before you'll hit rate limits with a tier 3 openai account.
# Capped at the population size because random.sample raises ValueError when asked
# for more items than exist.
sample_size = min(100, len(lcdocs))
percent_total = sample_size/len(lcdocs)
print(percent_total)

# Get a random sample of lcdocs. A dedicated seeded generator makes the sample
# reproducible without touching the global `random` state.
lcdocs_random = random.Random(42).sample(lcdocs, sample_size)

In [None]:
# Generate the synthetic test set once per index and cache it as CSV.
# `index_name` was used here before being defined anywhere; derive it from the
# selected database entry.
index_name = db['index_name']
testset_path = f'testset_{index_name}.csv'

# The existence check now targets the file that is actually written and read.
# Checking './testset.csv' regenerated the test set on every run, or tried to read
# a missing per-index file.
if not os.path.exists(testset_path):
    run_config=RunConfig(timeout=1000,
                    max_retries=50,
                    max_wait=1000,
                    max_workers=1)

    n_questions=50
    testset = generator.generate_with_langchain_docs(lcdocs_random, 
                                                    test_size=n_questions,
                                                    with_debugging_logs=True,
                                                    is_async=False,
                                                    run_config=run_config,
                                                    raise_exceptions=False)
    df_testset=testset.to_pandas()
    df_testset.to_csv(testset_path, index=False)
else:
    # Import the cached test set into a DataFrame
    df_testset = pd.read_csv(testset_path)

In [None]:
# Inspect the generated (or cached) test set.
df_testset

### Format dataset and database for RAG

In [None]:
# Keep only question and ground truth, label each row 'Question <index>', and record
# which model generated the questions.
df_questions = (
    df_testset[['question', 'ground_truth']]
    .copy()
    .assign(
        id=lambda frame: 'Question ' + frame.index.astype(str),
        question_by=generator_model,
    )
    [['id', 'question', 'ground_truth', 'question_by']]
)

In [None]:
# Inspect the formatted questions.
df_questions

In [None]:
# Flatten the vector store into one row per stored document: an id hashed from the
# metadata, the source, the page (-1 when absent), the text and the embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
doc_metadatas = all_docs["metadatas"]
df_docs = pd.DataFrame(
    {
        "id": [_stable_hash_meta(meta) for meta in doc_metadatas],
        "source": [meta.get("source") for meta in doc_metadatas],
        "page": [meta.get("page", -1) for meta in doc_metadatas],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [None]:
# Attach the cached RAG answers and source document ids to the questions; a column
# with no cached values is created empty.
rag_cache_file = f'rag_response_cache_{index_name}.txt'

df_questions_answers = df_questions
for cached_column in ("answer", "source_documents"):
    df_questions_answers = add_cached_column_from_file(
        df_questions_answers, rag_cache_file, "question", cached_column)

In [None]:
# Inspect the questions with any cached answers attached.
df_questions_answers

# RAG questions/answers (batch mode)

The cell below demonstrates using aerospace_chatbot in batch mode. It requires some basic parameter setup and a QA_model.

In [None]:
# TODO turn this into a function/standalone notebook to show how to use the QA model in batch mode

# Answer every question that has no cached answer yet with the aerospace_chatbot
# QA model, and append each response to the per-index cache file.

index_type='ChromaDB'
# Was `index_name=index_name`, a NameError on a fresh kernel. The name comes from
# the selected database entry.
index_name=db['index_name']
# NOTE(review): the selected collection is paired with db['query_model'], but the
# query model passed here is the ada-002 `synthetic_embeddings` - confirm this is
# intended before relying on the retrieval results.
query_model=synthetic_embeddings
llm=synthetic_generator_llm

QA_model_params={'rag_type':'Standard',
                 'k':4,
                 'search_type':'similarity',
                 'local_db_path':os.getenv('LOCAL_DB_PATH')}

# Same file name the cache-loading cell reads. Writing to '.json' meant the cached
# answers were never found and every question was re-queried on each run.
rag_cache_path = f'rag_response_cache_{index_name}.txt'
n_total = len(df_questions_answers)

for position, (i, row) in enumerate(df_questions_answers.iterrows(), start=1):
    if row['answer'] is None or pd.isnull(row['answer']) or row['answer']=='':
        print(f"Processing question {position}/{n_total}")

        # Use the QA model to query the documents
        qa_obj=queries.QA_Model(index_type,
                        index_name,
                        query_model,
                        embedding_model,
                        llm,
                        **QA_model_params)
        qa_obj.query_docs(row['question'])
        response=qa_obj.result
        answer_text = response['answer'].content

        # Named `source_ids` so the Pinecone `ids` list from earlier is not overwritten.
        source_ids=[_stable_hash_meta(source_document.metadata)
            for source_document in response['references']]

        # `i` is the index label from iterrows, so address the row by label.
        df_questions_answers.loc[i, "answer"] = answer_text
        # The frame keeps the ids as one comma-separated string; the cache keeps the list.
        df_questions_answers.loc[i, "source_documents"] = ', '.join(source_ids)

        # Save the response to cache file
        response_dict = {
            "question": row['question'],
            "answer": answer_text,
            "source_documents": source_ids,
        }
        write_dict_to_file(response_dict, rag_cache_path)

In [None]:
# Get the context documents content for each question

# One lookup table instead of scanning df_docs for every id. The first row per id
# is kept, matching the previous `.values[0]` choice.
doc_text_by_id = (
    df_docs.drop_duplicates(subset="id").set_index("id")["document"].to_dict()
)

source_documents_list = []
for raw_ids in df_questions_answers['source_documents']:
    if isinstance(raw_ids, str):
        # Freshly generated answers store the ids as "id1, id2, ..."; drop the empty
        # piece an empty string would produce.
        doc_ids = [doc_id for doc_id in raw_ids.strip('[]').split(', ') if doc_id]
    elif isinstance(raw_ids, (list, tuple)):
        # Answers loaded from the cache file carry the ids as a list already.
        doc_ids = list(raw_ids)
    else:
        # None/NaN: the question has no answer (and no sources) yet.
        doc_ids = []
    # An unknown id raises KeyError naming the id.
    source_documents_list.append([doc_text_by_id[doc_id] for doc_id in doc_ids])
df_questions_answers["contexts"]=source_documents_list

In [None]:
# Additionally get embeddings for the questions, cached per index in a pickle file.
# NOTE: unpickling executes code from the file; only load pickles this notebook wrote.

embeddings_path = Path(f'question_embeddings_{index_name}.pickle')

question_embeddings = None
if embeddings_path.exists():
    # `with` closes the file; the previous pickle.load(open(...)) leaked the handle.
    with embeddings_path.open("rb") as f:
        question_embeddings = pickle.load(f)
    if len(question_embeddings) != len(df_questions_answers):
        # Cache from a different question set: recompute rather than fail on assignment.
        question_embeddings = None

if question_embeddings is None:
    question_embeddings = [
        synthetic_embeddings.embed_query(question)
        for question in df_questions_answers["question"]
    ]
    with embeddings_path.open("wb") as f:
        pickle.dump(question_embeddings, f)

df_questions_answers["embedding"] = question_embeddings

In [None]:
# Inspect the questions with answers, contexts and embeddings.
df_questions_answers

## Ragas eval

### Evaluate

In [None]:
# Attach previously computed answer_correctness scores from the per-index cache;
# the column is created empty when nothing is cached.
ragas_cache_file = f'ragas_result_cache_{index_name}.txt'
df_questions_answers = add_cached_column_from_file(
    df_questions_answers,
    ragas_cache_file,
    "question",
    "answer_correctness",
)
df_questions_answers

In [None]:
# Unclear why but sometimes ground_truth does not provide a response. Just filter those out.
has_ground_truth = df_questions_answers['ground_truth'].apply(lambda x: isinstance(x, str))
# reset_index gives the filtered frame a gap-free 0..n-1 index, so index labels and
# positions agree for the evaluation loop, and returns a new frame so later column
# assignments do not write into a slice of the unfiltered one.
df_questions_answers = df_questions_answers[has_ground_truth].reset_index(drop=True)
df_questions_answers

In [None]:
# prepare the dataframe for evaluation: work on a copy so scores are written to
# df_qa_eval and not directly into df_questions_answers
df_qa_eval = df_questions_answers.copy()

In [None]:
# evaluate the answer correctness if not already done
fields = ["question", "answer", "contexts", "ground_truth"]
ragas_cache_path = f'ragas_result_cache_{index_name}.txt'

for i, row in df_qa_eval.iterrows():
    print(i, row["question"])
    if row["answer_correctness"] is None or pd.isnull(row["answer_correctness"]):
        # `i` is an index label, so select the row by label. iloc[i : i + 1] treated
        # it as a position and evaluated the wrong row (or none) once rows had been
        # filtered out of the frame.
        single_row = df_qa_eval.loc[[i], fields]
        evaluation_result = evaluate(
            # preserve_index=False keeps the index out of the dataset columns.
            Dataset.from_pandas(single_row, preserve_index=False),
            [answer_correctness],
        )
        score = evaluation_result["answer_correctness"]
        df_qa_eval.loc[i, "answer_correctness"] = score

        # optionally save the response to cache
        response_dict = {
            "question": row["question"],
            "answer_correctness": score,
        }
        write_dict_to_file(response_dict, ragas_cache_path)

# write the answer correctness to the original dataframe
df_questions_answers["answer_correctness"] = df_qa_eval["answer_correctness"]

### Link from documents to questions, that used the document as source

This section adds two columns to the documents dataframe (`df_docs`, producing `df_documents_agg`): the number of questions that used each document as a source, and the ids of those questions.

In [None]:
# add the infos about questions using each document to the documents dataframe

# 'source_documents' holds either a list of ids (loaded from the cache) or one
# "id1, id2, ..." string (freshly generated). explode() only splits list-likes, so
# turn strings into lists first. Otherwise each joined string stays one value and
# never matches a document id in the merge below.
source_id_lists = df_qa_eval["source_documents"].apply(
    lambda value: [doc_id for doc_id in value.strip('[]').split(', ') if doc_id]
    if isinstance(value, str)
    else value
)

# Explode 'source_documents' so each document ID is in its own row alongside the question ID
df_questions_exploded = df_qa_eval.assign(source_documents=source_id_lists).explode(
    "source_documents"
)

# Group by exploded 'source_documents' (document IDs) and aggregate
agg = (
    df_questions_exploded.groupby("source_documents")
    .agg(
        num_questions=("id", "count"),  # Count of questions referencing the document
        question_ids=("id", list),  # List of question IDs referencing the document
    )
    .reset_index()
    .rename(columns={"source_documents": "id"})
)

# Merge the aggregated information back into the documents dataframe
df_documents_agg = pd.merge(df_docs, agg, on="id", how="left")

# Use apply to replace NaN values with empty lists for 'question_ids'
df_documents_agg["question_ids"] = df_documents_agg["question_ids"].apply(
    lambda x: x if isinstance(x, list) else []
)
# Replace NaN values in 'num_questions' with 0
df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

In [None]:
# Stack the question rows on top of the document rows in one frame; columns that
# exist in only one of the two frames are left empty in the other's rows.
df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

In [None]:
# Fit UMAP three ways - on the question rows only, on the document rows only, and
# on all rows - and apply each fitted model to every row of `df`.
from umap import UMAP
import numpy as np

# Every model is applied to all rows, so build the embedding list once.
all_embeddings = df["embedding"].values.tolist()

# Named `df_question_rows` so the `df_questions` frame built earlier in the
# notebook is not overwritten with rows of the combined frame.
df_question_rows = df[~df["question"].isna()]
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df_question_rows["embedding"].values.tolist()
)
# NOTE(review): this projection is computed but not stored in `df`.
umap_questions = umap.transform(all_embeddings)


df_without_questions = df[df["question"].isna()]
umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    df_without_questions["embedding"].values.tolist()
)
umap_docs = umap.transform(all_embeddings)
df["umap_docs"] = umap_docs.tolist()

umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
    all_embeddings
)
umap_all = umap.transform(all_embeddings)
df["umap"] = umap_all.tolist()


# find the nearest question (by embedding) for each row.
# Kept in its own name so the `question_embeddings` list from the embedding cell
# is not replaced by an array that also depends on the filtered rows.
question_embedding_matrix = np.array(df_question_rows["embedding"].tolist())

df["nearest_question_dist"] = [  # brute force, could be optimized using ChromaDB
    np.min(np.linalg.norm(np.array(doc_emb) - question_embedding_matrix, axis=1))
    for doc_emb in df["embedding"].values
]

# write the dataframe to parquet for later use
df.to_parquet(f'df_{index_name}.parquet')

### Visualize

Adapt the first cell to use the downloaded dataframes if you skipped the preparation steps.

In [None]:
# Load the combined questions + documents frame that the preparation cell wrote to parquet.
import pandas as pd

# Alternative kept for reference - rebuild the frame in memory instead of loading it:
# df = pd.concat([df_qa_eval, df_documents_agg], axis=0)

# The article's own demo data (a different dataset) can be downloaded from https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/df_f1_rag_docs_and_questions_umaps_v3.parquet
df = pd.read_parquet(f'df_{index_name}.parquet')

In [None]:
# Open the combined frame in Renumics Spotlight, using the article's layout and
# treating every column whose name contains "umap" as an embedding.
from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

spotlight_layout = "https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json"
umap_dtypes = {column: Embedding for column in df.keys() if "umap" in column}

spotlight.show(df, layout=spotlight_layout, dtype=umap_dtypes)

## UMAP visualization forms clusters of the questions; workaround: fit UMAP only on documents