# Ragas evaluation
Tests batch RAG question answering and the Ragas evaluation capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [41]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

from umap import UMAP
import numpy as np

# Import local packages
# NOTE: the path is relative, so it is resolved against the kernel's current working directory.
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta, archive_db, get_docs_questions_df

# Set environment variables with .env
# override=True lets values from the .env file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

## Utility functions

In [42]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON-encoded line.

    The file is opened in append mode, so repeated calls build up a
    JSON-lines file with one dictionary per line.
    """
    with open(filename, mode="a") as out_file:
        serialized = json.dumps(data_dict)
        out_file.write(f"{serialized}\n")

def read_dicts_from_file(filename):
    """Yield one dictionary per non-empty line of a JSON-lines file.

    Blank or whitespace-only lines (for example a trailing newline left by an
    editor) are skipped instead of being handed to ``json.loads``.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a copy of ``df`` with ``column`` filled from a JSON-lines cache file.

    Args:
        df: DataFrame to extend; it is not modified.
        file_name: Path of the cache file (one JSON dictionary per line).
        merge_on: Column present in both ``df`` and the cache, used as join key.
        column: Name of the cached column to add.

    Returns:
        ``df`` left-merged with the cached values when the cache holds both
        ``merge_on`` and ``column``; otherwise a copy of ``df`` with ``column``
        set to ``None`` and placed last.
    """
    if Path(file_name).exists():
        cached = pd.DataFrame(list(read_dicts_from_file(file_name)))
        # An empty cache file, or one without the requested column, is treated like a missing cache.
        if merge_on in cached.columns and column in cached.columns:
            cached = (
                cached[[column, merge_on]]
                # Drop null entries first so that a null record does not shadow
                # a later valid record for the same key.
                .dropna()
                .drop_duplicates(subset=[merge_on])
                .reset_index(drop=True)
            )
            return df.merge(
                cached,
                on=merge_on,
                how="left",
            ).reset_index(drop=True)

    # No usable cache: add an empty column at the end of a copy of the DataFrame.
    df_out = df.copy()
    df_out[column] = None
    columns = [name for name in df_out.columns if name != column] + [column]
    return df_out[columns]

In [59]:
def _split_source_ids(cell):
    """Return the document ids held in one ``source_documents`` cell as a list."""
    if isinstance(cell, str):
        # In-memory format: ids joined with ', ' (optionally wrapped in brackets).
        return [doc_id for doc_id in cell.strip('[]').split(', ') if doc_id]
    if cell is None or (isinstance(cell, float) and pd.isnull(cell)):
        return []
    # Cache format: the ids are stored as a list.
    return list(cell)


def rag_responses(index_type, index_name, query_model, llm, QA_model_params, df_questions_answers_in, df_docs):
    """Answer all unanswered questions with the RAG pipeline and attach contexts and embeddings.

    Args:
        index_type, index_name, query_model, llm: Passed positionally to ``queries.QA_Model``.
        QA_model_params: Extra keyword arguments for ``queries.QA_Model``.
        df_questions_answers_in: DataFrame with the columns ``question``, ``answer`` and
            ``source_documents``; it is not modified.
        df_docs: DataFrame with the columns ``id`` and ``document``.

    Returns:
        A copy of the input with ``answer`` and ``source_documents`` filled in and the
        columns ``contexts`` (document texts per question) and ``question_embedding`` added.

    Raises:
        KeyError: if a referenced source document id is not present in ``df_docs``.

    Side effects:
        Appends every new response to ``rag_response_cache_<index_name>.json`` and
        writes/reads ``question_embeddings_<index_name>.pickle`` in the working directory.
    """
    df_questions_answers_out=df_questions_answers_in.copy()
    n_questions = len(df_questions_answers_out)

    # Generate responses using RAG with input parameters
    for position, (label, row) in enumerate(df_questions_answers_out.iterrows(), start=1):
        answer = row['answer']
        if not (answer is None or pd.isnull(answer) or answer == ''):
            continue
        print(f"Processing question {position}/{n_questions}")

        # Use the QA model to query the documents (a new model object is built per question)
        qa_obj=queries.QA_Model(index_type,
                        index_name,
                        query_model,
                        llm,
                        **QA_model_params)
        qa_obj.query_docs(row['question'])
        response=qa_obj.result

        ids=[_stable_hash_meta(source_document.metadata)
            for source_document in response['references']]

        # iterrows() yields index labels, so address the row by label rather than by position.
        df_questions_answers_out.loc[label, "answer"] = response['answer'].content
        df_questions_answers_out.loc[label, "source_documents"] = ', '.join(ids)

        # Save the response to cache file
        response_dict = {
            "question": row['question'],
            "answer": response['answer'].content,
            "source_documents": ids,
        }
        write_dict_to_file(response_dict, f'rag_response_cache_{index_name}.json')

    # Get the context documents content for each question.
    # Build the id -> text lookup once; setdefault keeps the first document for a repeated id.
    document_by_id = {}
    for doc_id, document in zip(df_docs["id"], df_docs["document"]):
        document_by_id.setdefault(doc_id, document)

    source_documents_list = []
    for cell in df_questions_answers_out['source_documents']:
        context = []
        for doc_id in _split_source_ids(cell):
            if doc_id not in document_by_id:
                raise KeyError(f"Source document id {doc_id!r} not found in df_docs")
            context.append(document_by_id[doc_id])
        source_documents_list.append(context)
    df_questions_answers_out["contexts"]=source_documents_list

    # Additionally get embeddings for questions, reusing the pickle cache when it fits.
    embeddings_path = Path(f'question_embeddings_{index_name}.pickle')
    question_embeddings = None
    if embeddings_path.exists():
        # NOTE: pickle must only be used on trusted, locally produced cache files.
        with embeddings_path.open("rb") as f:
            question_embeddings = pickle.load(f)
        # A cache built for a different number of questions cannot be assigned; rebuild it.
        if len(question_embeddings) != n_questions:
            question_embeddings = None

    if question_embeddings is None:
        question_embeddings = [
            query_model.embed_query(question)
            for question in df_questions_answers_out["question"]
        ]
        with embeddings_path.open("wb") as f:
            pickle.dump(question_embeddings, f)

    df_questions_answers_out["question_embedding"] = question_embeddings
    return df_questions_answers_out

In [60]:
def eval_rag(index_name, df_questions_answers_in):
    """Score every answered question with the Ragas ``answer_correctness`` metric.

    Scores already stored in ``ragas_result_cache_<index_name>.txt`` are reused; each newly
    computed score is appended to that file.

    Args:
        index_name: Name used to build the cache file name.
        df_questions_answers_in: DataFrame with the columns ``question``, ``answer``,
            ``contexts`` and ``ground_truth``; it is not modified.

    Returns:
        Tuple ``(df_qa_eval, df_questions_answers_out)``; both hold the rows with a string
        ``ground_truth`` plus the ``answer_correctness`` column.
    """
    cache_file = f'ragas_result_cache_{index_name}.txt'

    # Add answer correctness column, fill in if it exists
    df_questions_answers_out = add_cached_column_from_file(
        df_questions_answers_in.copy(), cache_file, "question", "answer_correctness"
    )

    # Unclear why but sometimes ground_truth does not provide a response. Just filter those out.
    has_ground_truth = df_questions_answers_out['ground_truth'].apply(lambda x: isinstance(x, str))
    # Reset the index so the labels from iterrows() match the positions used by iloc below.
    df_questions_answers_out = df_questions_answers_out[has_ground_truth].reset_index(drop=True)

    # Prepare the dataframe for evaluation
    df_qa_eval = df_questions_answers_out.copy()

    # Evaluate the answer correctness if not already done
    fields = ["question", "answer", "contexts", "ground_truth"]
    for i, row in df_qa_eval.iterrows():
        print(i, row["question"])
        cached_score = row["answer_correctness"]
        if cached_score is None or pd.isnull(cached_score):
            evaluation_result = evaluate(
                Dataset.from_pandas(df_qa_eval.iloc[i : i + 1][fields]),
                [answer_correctness],
            )
            score = evaluation_result["answer_correctness"]
            df_qa_eval.loc[i, "answer_correctness"] = score

            # Save the score to the cache so later runs can skip this question
            response_dict = {
                "question": row["question"],
                "answer_correctness": score,
            }
            write_dict_to_file(response_dict, cache_file)

    # write the answer correctness to the original dataframe
    df_questions_answers_out["answer_correctness"] = df_qa_eval["answer_correctness"]

    return df_qa_eval, df_questions_answers_out

In [61]:
def _source_document_ids(cell):
    """Return the document ids stored in one ``source_documents`` cell as a list."""
    if isinstance(cell, str):
        # rag_responses stores the ids as a ', '-joined string.
        return [doc_id for doc_id in cell.strip('[]').split(', ') if doc_id]
    if cell is None or (isinstance(cell, float) and np.isnan(cell)):
        return []
    return list(cell)


def data_viz_prep(index_name,df_qa_eval,df_docs):
    """Combine questions and documents into one DataFrame prepared for visualization.

    Adds to every document the questions that used it as a source, projects all embeddings
    with UMAP and stores the result as ``df_<index_name>.parquet``.

    Args:
        index_name: Name used to build the parquet file name.
        df_qa_eval: Questions with the columns ``id``, ``question``, ``source_documents``
            and ``question_embedding``.
        df_docs: Documents with the column ``id`` and their vectors in ``embedding``
            (``question_embedding`` is used when no ``embedding`` column exists).

    Returns:
        The concatenated DataFrame (question rows first, then document rows) with the
        added columns ``umap_docs``, ``umap`` and ``nearest_question_dist``.
    """
    # Explode 'source_documents' so each document ID is in its own row alongside the question ID.
    # The ids are converted to lists first: explode() leaves plain strings untouched.
    question_sources = df_qa_eval[["id", "source_documents"]].copy()
    question_sources["source_documents"] = question_sources["source_documents"].apply(
        _source_document_ids
    )
    df_questions_exploded = question_sources.explode("source_documents")

    # Group by exploded 'source_documents' (document IDs) and aggregate
    agg = (
        df_questions_exploded.groupby("source_documents")
        .agg(
            num_questions=("id", "count"),  # Count of questions referencing the document
            question_ids=(
                "id",
                lambda x: list(x),
            ),  # List of question IDs referencing the document
        )
        .reset_index()
        .rename(columns={"source_documents": "id"})
    )

    # Merge the aggregated information back into the documents
    df_documents_agg = pd.merge(df_docs, agg, on="id", how="left")

    # Use apply to replace NaN values with empty lists for 'question_ids'
    df_documents_agg["question_ids"] = df_documents_agg["question_ids"].apply(
        lambda x: x if isinstance(x, list) else []
    )
    # Replace NaN values in 'num_questions' with 0
    df_documents_agg["num_questions"] = df_documents_agg["num_questions"].fillna(0)

    # Concatenate the two dataframes (question rows first, then document rows)
    df_visualize = pd.concat([df_qa_eval, df_documents_agg], axis=0)

    # One vector per row of df_visualize, in the same order as the concatenation above.
    # Document rows have no 'question_embedding', so take their vectors from 'embedding'.
    doc_embedding_column = (
        "embedding" if "embedding" in df_documents_agg.columns else "question_embedding"
    )
    all_embeddings = (
        df_qa_eval["question_embedding"].tolist()
        + df_documents_agg[doc_embedding_column].tolist()
    )
    is_question = df_visualize["question"].notna().tolist()

    # Create UMAP only using documents and apply it to the documents and questions
    document_embeddings = [
        emb for emb, from_question in zip(all_embeddings, is_question) if not from_question
    ]
    umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
        document_embeddings
    )
    df_visualize["umap_docs"] = umap.transform(all_embeddings).tolist()

    # Second projection fitted on documents and questions together
    umap = UMAP(n_neighbors=20, min_dist=0.15, metric="cosine", random_state=42).fit(
        all_embeddings
    )
    df_visualize["umap"] = umap.transform(all_embeddings).tolist()

    # find the nearest question (by embedding distance) for each row
    question_embeddings = np.array(
        [emb for emb, from_question in zip(all_embeddings, is_question) if from_question]
    )
    df_visualize["nearest_question_dist"] = [  # brute force, could be optimized using ChromaDB
        np.min(np.linalg.norm(np.array(emb) - question_embeddings, axis=1))
        for emb in all_embeddings
    ]

    # write the dataframe to parquet for later use
    df_visualize.to_parquet(f'df_{index_name}.parquet')

    return df_visualize

# ChromaDB

## Connect to database

In [3]:
# Open the on-disk Chroma database stored below LOCAL_DB_PATH and list its collections
chroma_db_path = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_db_path)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400)]

In [4]:
# Collections to process, each paired with the embedding model used to query it
dbs = [
    {
        'index_name': 'text-embedding-3-large-2merge-0',
        'query_model': OpenAIEmbeddings(
            model='text-embedding-3-large',
            openai_api_key=os.getenv('OPENAI_API_KEY'),
        ),
    },
]

In [5]:
# Inspect the first db: load every stored chunk and wrap it as a LangChain Document
db = dbs[0]
docs_vectorstore = Chroma(
    client=persistent_client,
    collection_name=db['index_name'],
    embedding_function=db['query_model'],
)
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
lcdocs_chroma = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

print(len(lcdocs_chroma))

In [8]:
# Export all collections to pickles to store them
# NOTE: the loop rebinds `db`, and `df_temp_chroma` keeps only the frame of the last collection.
for db in dbs:
    df_temp_chroma=archive_db('ChromaDB',db['index_name'],db['query_model'],export_pickle=True)

In [9]:
# Preview the first rows of the last exported collection
df_temp_chroma.head(5)

Unnamed: 0,id,source,page,metadata,document,embedding
0,001889abd34c56d712ba79f905d7b90159f5c354,"['AMS_2001_reocr.pdf', 'AMS_2001_reocr.pdf']","[37, 38]","{'page': '[37, 38]', 'source': '['AMS_2001_reo...",Using a Ball Aerospace-developed lubricant f...,"[0.013638148084282875, -0.023859944194555283, ..."
1,001bc8af131361ed46a8999ade3e7068522572a1,"['AMS_2016_reocr.pdf', 'AMS_2016_reocr.pdf']","[172, 173]","{'page': '[172, 173]', 'source': '['AMS_2016_r...",158 Design Ideology and Testing \n\nEarly Des...,"[0.0009454930550418794, 0.018359793350100517, ..."
2,0030b47c6180d167c2d013c8e7a8b8bb0c16de44,"['AMS_2001_reocr.pdf', 'AMS_2001_reocr.pdf']","[43, 44]","{'page': '[43, 44]', 'source': '['AMS_2001_reo...",The Vertrel XF treated bearing had a slight am...,"[-0.00619348743930459, -0.028075991198420525, ..."
3,00356e19ac534a10e86a774fa22603978622d654,"['AMS_2018_reocr.pdf', 'AMS_2018_reocr.pdf']","[523, 524]","{'page': '[523, 524]', 'source': '['AMS_2018_r...",•Terminate the test at the first onset of unex...,"[-0.036841981112957, -0.022101974114775658, -0..."
4,003edd3f9efa244ccfd87445ebd5edff5c3155b3,"['AMS_2020_reocr.pdf', 'AMS_2020_reocr.pdf']","[197, 198]","{'page': '[197, 198]', 'source': '['AMS_2020_r...","7 oe ~ Of Dual ~187 a highly loaded planet, a ...","[-0.002627367153763771, -0.006074088159948587,..."


## Generate synthetic dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

In [10]:
# Set generator inputs
generator_model = "gpt-3.5-turbo-16k"
critic_model = 'gpt-4o'
embedding_model = 'text-embedding-3-large'

# Each chat model is tagged with its own model name
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])
synthetic_critic_llm = ChatOpenAI(model=critic_model, tags=[critic_model])

synthetic_embeddings = OpenAIEmbeddings(
    model=embedding_model,
    api_key=os.getenv('OPENAI_API_KEY'),
)

In [11]:
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings
)

# random.sample() raises ValueError when asked for more items than exist, so cap the sample size.
sample_size=min(50, len(lcdocs_chroma))
# Fraction of all documents that is sampled (0.0 when there are no documents).
percent_total=sample_size/len(lcdocs_chroma) if lcdocs_chroma else 0.0
print(percent_total)

# Get a random sample of lcdocs
lcdocs_random = random.sample(lcdocs_chroma, sample_size)

0.022502250225022502


In [12]:
# Reuse the test set saved for this index if it exists, otherwise generate the synthetic dataset.
# The same path is used for the existence check, the write and the read.
testset_path = f"testset_{db['index_name']}.csv"
if not os.path.exists(testset_path):
    run_config=RunConfig(timeout=1000,
                    max_retries=50,
                    max_wait=1000,
                    max_workers=1)

    n_questions=5
    testset = generator.generate_with_langchain_docs(lcdocs_random, 
                                                    test_size=n_questions,
                                                    with_debugging_logs=True,
                                                    is_async=False,
                                                    run_config=run_config,
                                                    raise_exceptions=False)
    df_testset=testset.to_pandas()
    df_testset.to_csv(testset_path, index=False)
else:
    # Import the saved test set into a DataFrame
    df_testset = pd.read_csv(testset_path)

embedding nodes:   0%|          | 0/176 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Microvibration', 'System-level', 'Transmission path', 'Performance requirements', 'Microvibration verification']
[ragas.testset.evolutions.INFO] seed question generated: How is the transmission path of microvibrations investigated at the system level?
[ragas.testset.filters.DEBUG] filtered question: {'feedback': 'The question asks about the investigation of the transmission path of microvibrations at the system level. It is clear in specifying the topic of interest (transmission path of microvibrations) and the scope (system level). However, it could benefit from more specificity regarding the type of system or the methods used for investigation. For example, specifying whether the system is mechanical, electronic, or another type, and what kind of investigative techniques are being referred to (e.g., exper

In [13]:
# Display the synthetic test set
df_testset

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How is the transmission path of microvibration...,[System-Level Aspects and Microvibration Requi...,System-level microvibration assessments invest...,simple,"[{'page': '[213, 214]', 'source': '['AMS_2018_...",True
1,What is the purpose of the recent testing camp...,[n ee a2 GF @ Life in vacuum (millions ofrevs....,The purpose of the recent testing campaigns at...,simple,"[{'page': '[170, 171]', 'source': '['AMS_2018_...",True
2,In what situations is test and analysis verifi...,[System-Level Aspects and Microvibration Requi...,Verification by test and analysis or a combina...,reasoning,"[{'page': '[213, 214]', 'source': '['AMS_2018_...",True
3,What challenges are there in verifying microvi...,[System-Level Aspects and Microvibration Requi...,The challenges in verifying microvibration per...,multi_context,"[{'page': '[213, 214]', 'source': '['AMS_2018_...",True
4,How is the total UF for preload in bolted join...,[Preloddyom (1 — %LOSSaverage ) Prelo adye ...,"A total UF, accounting for the scatter in prel...",reasoning,"[{'page': '[227, 228]', 'source': '['AMS_2018_...",True


## RAG questions/answers (batch mode)

### Format dataset and database for RAG

In [44]:
# Keep the question and its reference answer, then label each row and record the generating model
df_questions = df_testset[['question', 'ground_truth']].copy()
df_questions = df_questions.assign(
    id='Question ' + df_questions.index.astype(str),
    question_by=generator_model,
)[['id', 'question', 'ground_truth', 'question_by']]

In [45]:
# Display the formatted questions
df_questions

Unnamed: 0,id,question,ground_truth,question_by
0,Question 0,How is the transmission path of microvibration...,System-level microvibration assessments invest...,gpt-3.5-turbo-16k
1,Question 1,What is the purpose of the recent testing camp...,The purpose of the recent testing campaigns at...,gpt-3.5-turbo-16k
2,Question 2,In what situations is test and analysis verifi...,Verification by test and analysis or a combina...,gpt-3.5-turbo-16k
3,Question 3,What challenges are there in verifying microvi...,The challenges in verifying microvibration per...,gpt-3.5-turbo-16k
4,Question 4,How is the total UF for preload in bolted join...,"A total UF, accounting for the scatter in prel...",gpt-3.5-turbo-16k


In [46]:
# Collect every stored chunk with its metadata and embedding into one DataFrame
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
doc_metadatas = all_docs["metadatas"]
df_docs = pd.DataFrame(
    {
        "id": [_stable_hash_meta(meta) for meta in doc_metadatas],
        "source": [meta.get("source") for meta in doc_metadatas],
        "page": [meta.get("page", -1) for meta in doc_metadatas],  # -1 when no page is stored
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [47]:
# Load the cached RAG answers and source_documents ids from a file - or create an empty column.
# rag_responses() appends its results to 'rag_response_cache_<index_name>.json', so read that file.
rag_cache_file = f"rag_response_cache_{db['index_name']}.json"

df_questions_answers = add_cached_column_from_file(
    df_questions, rag_cache_file, "question", "answer")

df_questions_answers = add_cached_column_from_file(
    df_questions_answers, rag_cache_file, "question", "source_documents")

# The cache stores the ids as a list, while rag_responses() parses a ', '-separated string.
df_questions_answers["source_documents"] = df_questions_answers["source_documents"].apply(
    lambda ids: ', '.join(ids) if isinstance(ids, list) else ids
)

In [48]:
# Display the questions together with any cached answers
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents
0,Question 0,How is the transmission path of microvibration...,System-level microvibration assessments invest...,gpt-3.5-turbo-16k,,
1,Question 1,What is the purpose of the recent testing camp...,The purpose of the recent testing campaigns at...,gpt-3.5-turbo-16k,,
2,Question 2,In what situations is test and analysis verifi...,Verification by test and analysis or a combina...,gpt-3.5-turbo-16k,,
3,Question 3,What challenges are there in verifying microvi...,The challenges in verifying microvibration per...,gpt-3.5-turbo-16k,,
4,Question 4,How is the total UF for preload in bolted join...,"A total UF, accounting for the scatter in prel...",gpt-3.5-turbo-16k,,


### Use RAG to generate responses

In [56]:
# Inputs for the batch RAG run
index_type = 'ChromaDB'
index_name = db['index_name']
query_model = synthetic_embeddings
llm = synthetic_generator_llm

# Extra keyword arguments forwarded to queries.QA_Model
QA_model_params = dict(
    rag_type='Standard',
    k=4,
    search_type='similarity',
    local_db_path=os.getenv('LOCAL_DB_PATH'),
)


In [57]:
# Answer every question that has no cached answer yet, then display the result
df_questions_answers_rag=rag_responses(index_type, index_name, query_model, llm, QA_model_params, df_questions_answers, df_docs)
df_questions_answers_rag

## Ragas eval, visualize

In [None]:
# Evaluate the answers; returns the evaluated frame and the filtered input with scores attached
df_qa_eval, df_questions_answers_rag = eval_rag(index_name, df_questions_answers_rag)

In [None]:
# Link from documents to questions that used the document as source. Add UMAP columns for visualization purposes.
# The result is also written to f'df_{index_name}.parquet' by data_viz_prep.
df_visualize=data_viz_prep(index_name,df_qa_eval,df_docs)

In [33]:
# Load the combined questions/documents dataframe written by data_viz_prep
df = pd.read_parquet(f'df_{index_name}.parquet')

# Show the dataframe in Spotlight; every column whose name contains "umap" is typed as an Embedding
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype={column_name: Embedding for column_name in df.columns if "umap" in column_name},
)

## UMAP visualization forms clusters of the questions; workaround: fit UMAP only on the documents

# Pinecone

## Connect to database

In [None]:
# Import the class under its own name: the variable below reuses the alias `pinecone_client`,
# so calling that alias again when the cell is re-run would call the client instance.
from pinecone import Pinecone

pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes=pinecone_client.list_indexes()
indexes

In [None]:
# Pinecone indexes to process, each paired with the embedding model used to query it
voyage_embeddings = VoyageAIEmbeddings(
    model='voyage-large-2-instruct',
    voyage_api_key=os.getenv('VOYAGE_API_KEY'),
    truncation=False,
)
dbs = [
    {
        'index_name': 'voyage-large-2-instruct-ams-2merge-none',
        'query_model': voyage_embeddings,
    },
]

In [None]:
# Selection to use: the first entry of dbs
db=dbs[0]

In [None]:
# Handle to the selected Pinecone index
index = pinecone_client.Index(db['index_name'])

In [None]:
# Collect all vector ids; each item yielded by index.list() is itself a collection of ids.
# The loop variable is not called `id`, which would shadow the builtin for the rest of the notebook.
ids=[]
for id_batch in index.list():
    ids.extend(id_batch)

In [None]:
# Fetch the stored vectors in chunks of ids and collect them in one list
docs = []
chunk_size = 200  # Tune to whatever doesn't error out
for i in range(0, len(ids), chunk_size):
    print(f"Fetching {i} to {i+chunk_size}")
    vector = index.fetch(ids[i:i+chunk_size])['vectors']
    # Only the fetched records are kept; their keys are dropped
    vector_data = list(vector.values())
    docs.extend(vector_data)

In [None]:
# Rebuild LangChain Documents from the metadata stored with every fetched vector
lcdocs_pinecone = [
    Document(
        page_content=record['metadata']['page_content'],
        metadata={
            'page': record['metadata']['page'],
            'source': record['metadata']['source'],
        },
    )
    for record in docs
]

In [None]:
# Number of documents rebuilt from the Pinecone index
len(lcdocs_pinecone)

In [None]:
# Inspect the first rebuilt document
lcdocs_pinecone[0]

In [None]:
# Export all collections to pickles to store them
# NOTE: the loop rebinds `db`, and `df_temp_pinecone` keeps only the frame of the last index.
for db in dbs:
    df_temp_pinecone=archive_db('Pinecone',db['index_name'],db['query_model'],export_pickle=True)