# Ragas evaluation
Tests batch-mode question answering with `aerospace_chatbot` and evaluation of the results with Ragas.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [8]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

True

## Nifty functions

In [9]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON-encoded line.

    The file is opened in append mode, so calling this repeatedly builds up
    a file holding one JSON record per line.
    """
    with open(filename, "a") as handle:
        handle.write(f"{json.dumps(data_dict)}\n")

def read_dicts_from_file(filename):
    """Lazily yield one dictionary per JSON line of ``filename``.

    Wrap the call in ``list(...)`` to load every record at once. Blank or
    whitespace-only lines are skipped so that a stray empty line does not
    abort the whole read with a ``json.JSONDecodeError``.
    """
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                continue
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Return a copy of ``df`` with ``column`` filled in from a JSON-lines cache file.

    Args:
        df: DataFrame to extend. It is not modified.
        file_name: Path of the cache file (one JSON dictionary per line).
        merge_on: Key present both in ``df`` and in the cached records.
        column: Cached field to attach as a new column.

    Returns:
        A new DataFrame. Rows whose ``merge_on`` value has a cached, non-null
        ``column`` value receive it; all other rows get a missing value. When
        the cache file does not exist, is empty, or does not contain both
        ``column`` and ``merge_on``, the new column is added with ``None`` in
        every row and placed last.
    """
    cached_records = None
    if Path(file_name).exists():
        cached_records = pd.DataFrame(list(read_dicts_from_file(file_name)))
        # An empty or unrelated cache has nothing to merge; treat it like a
        # missing file instead of failing with a KeyError further down.
        if not {column, merge_on}.issubset(cached_records.columns):
            cached_records = None

    if cached_records is None:
        df_out = df.copy()
        df_out[column] = None
        # Keep the new column as the last one, also when it already existed.
        other_columns = [name for name in df_out.columns if name != column]
        return df_out[other_columns + [column]]

    cached_values = (
        cached_records[[column, merge_on]]
        # Drop incomplete records first so that a null entry cannot shadow a
        # later, valid entry for the same key.
        .dropna()
        .drop_duplicates(subset=[merge_on])
        .reset_index(drop=True)
    )
    return df.merge(cached_values, on=merge_on, how="left").reset_index(drop=True)

## Connect to database

In [10]:
# On-disk ChromaDB client rooted at <LOCAL_DB_PATH>/chromadb.
persistent_client = chromadb.PersistentClient(
    path=os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb'),
)

# Embedding model handed to the vector store as its embedding function.
query_model = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

index_name = 'chromadb-openai-ams-400chunk-2000-2020'

# Open the existing collection through the LangChain Chroma wrapper.
# NOTE(review): an earlier comment described this collection as unchunked,
# full PDF pages, while the collection name mentions "400chunk" - confirm.
vectorstore = Chroma(
    client=persistent_client,
    collection_name=index_name,
    embedding_function=query_model,
)

In [11]:
# Fetch every stored record together with its metadata, text and embedding.
all_docs = vectorstore.get(
    include=["metadatas", "documents", "embeddings"],
)

In [12]:
# Wrap each stored text and its metadata in a LangChain Document.
lcdocs = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

## Generate synthetic dataset

In [13]:
# Model names for the two LLM roles of the test set generator.
# "gpt-4" can be substituted as the critic model.
generator_model = "gpt-3.5-turbo-16k"
critic_model = "gpt-3.5-turbo-16k"

synthetic_generator_llm = ChatOpenAI(model=generator_model)
synthetic_critic_llm = ChatOpenAI(model=critic_model)

# Embeddings with the library's default OpenAI embedding model.
synthetic_embeddings = OpenAIEmbeddings()

In [14]:
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings
)

# Use a random cross section of the stored documents rather than all of them.
# To sample a fraction instead of a fixed count, use e.g.:
#   sample_size = int(len(lcdocs) * 0.005)
sample_size = 100
sample_seed = 42  # fixed seed so the same documents are drawn on every run

# random.sample raises ValueError when asked for more items than are
# available, so cap the request at the number of documents.
sample_size = min(sample_size, len(lcdocs))

# A dedicated Random instance keeps the draw reproducible without touching
# the global random state.
lcdocs_random = random.Random(sample_seed).sample(lcdocs, sample_size)

In [15]:
# Reuse a test set saved by an earlier run; generate and save one otherwise.
if os.path.exists('./testset.csv'):
    # Import testset.csv into a DataFrame
    df_testset = pd.read_csv('testset.csv')
else:
    n_questions = 5
    run_config = RunConfig(
        timeout=120,
        max_retries=20,
        max_wait=120,
        max_workers=8,
    )

    testset = generator.generate_with_langchain_docs(
        lcdocs_random,
        test_size=n_questions,
        with_debugging_logs=True,
        is_async=False,
        run_config=run_config,
    )
    df_testset = testset.to_pandas()
    df_testset.to_csv('testset.csv', index=False)

embedding nodes:   0%|          | 0/200 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] node filter: {'score': 5.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 5.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 5.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 7.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Refined gearbox design', 'Chariot Lunar Rover', 'Steve Bauman', 'David Lewicki']
[ragas.testset.filters.DEBUG] node filter: {'score': 5.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 4.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 5.5}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] node filter: {'score': 4.0}
[ragas.testset.evolutions.INFO] r

In [16]:
# Show the test set (freshly generated or loaded from testset.csv).
df_testset

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the significance of the single tooth b...,[singletoothinbending limitatyield.Atapproxima...,The significance of the single tooth bending l...,simple,"[{'page': 96, 'source': 'AMS_2000.pdf', 'start...",True
1,What is the purpose of incorporating leaf spri...,[heaters tosimilartypesofmaterial. Leafsprings...,The purpose of incorporating leaf springs into...,simple,"[{'page': 309, 'source': 'AMS_2001.pdf', 'star...",True
2,What determines the upper limit of the measure...,[Figure 1. Microvibrations measurement setup:...,The upper limit of the measurement frequency r...,reasoning,"[{'page': 252, 'source': 'AMS_2020.pdf', 'star...",True
3,How can the utilization of the nichrome burn w...,"[commercial suppliers such as McMaster-Carr, t...",The utilization of the nichrome burn wire rele...,multi_context,"[{'page': 494, 'source': 'AMS_2012.pdf', 'star...",True
4,What steps are taken after SLM process simulat...,[The manufacturability of the design should th...,"A post-processing sequence, including thermal ...",reasoning,"[{'page': 530, 'source': 'AMS_2020.pdf', 'star...",True


### Format dataset and database for RAG

In [10]:
# Keep only each question and its reference answer, and record which model
# the questions were generated with.
df_questions = (
    df_testset.loc[:, ['question', 'ground_truth']]
    .copy()
    .assign(question_by=generator_model)
)

In [11]:
# Show the questions that will be sent to the RAG pipeline.
df_questions

# NOTE: duplicate questions are not removed here. A previous, disabled
# variant of this cell built the frame from a list of dicts, added an
# "id" column of the form "Question {i}", and kept only the first
# occurrence of each question via
#   df_questions.drop_duplicates(subset=["question"])

Unnamed: 0,question,ground_truth,question_by
0,How does the clasp function in the deployment ...,The clasp functions in the deployment of the a...,gpt-3.5-turbo-16k
1,What is the purpose of keeping translational m...,To avoid control issues,gpt-3.5-turbo-16k
2,What was the condition of the silver coating a...,silver coating was found to be in good condition,gpt-3.5-turbo-16k
3,How is film durability measured in terms of pi...,The film durability is measured in terms of th...,gpt-3.5-turbo-16k
4,What are the behavior differences between AEGS...,Behavior differences between AEGSE and RMCA ex...,gpt-3.5-turbo-16k


In [12]:
# Read the whole collection and tabulate it, one row per stored record.
all_docs = vectorstore.get(
    include=["metadatas", "documents", "embeddings"],
)
df_docs = pd.DataFrame(
    {
        # Hash of the metadata, used as the record identifier.
        "id": list(map(_stable_hash_meta, all_docs["metadatas"])),
        "source": [meta.get("source") for meta in all_docs["metadatas"]],
        # -1 marks records whose metadata has no page entry.
        "page": [meta.get("page", -1) for meta in all_docs["metadatas"]],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

In [13]:
# Preview the first ten document records.
df_docs.head(10)

Unnamed: 0,id,source,page,document,embedding
0,0001a4a31c369c6418707c7e2ce6e6ef75a68adc,AMS_2018.pdf,323,simply reinforce good system and test design ...,"[-0.008877139538526535, 0.012146159075200558, ..."
1,00053953623d054747d255c6cd28c089854ab4b5,AMS_2018.pdf,473,mechanisms test. Electrical pinout verificatio...,"[-0.043031420558691025, -0.00593230128288269, ..."
2,0009a16bdab4f2763980e652e22c9abaa96c1762,AMS_2018.pdf,235,design guidelines exist. This paper aims to q...,"[-0.0049348436295986176, 0.00696054520085454, ..."
3,000cad7233c3c63ac61a0f06474f18d0f50184e6,AMS_2020.pdf,153,"be used on the flight unit, to an axial load ...","[-0.012982546351850033, 0.00916499923914671, -..."
4,001186298ba7a8f8570a5febd7fc9f3d7b362616,AMS_2018.pdf,490,spacecraft docking harkens back to the days o...,"[0.012490472756326199, -0.02411811798810959, 0..."
5,0011e034a24f29562d42b9bbf26fe68bb114af28,AMS_2020.pdf,45,Figure 7. Exterior View of the C rack One pos...,"[0.000985087244771421, -0.009893814101815224, ..."
6,001839d3c8b21537be890d4cda7fca528fa48c8a,AMS_2018.pdf,405,"the issue, which it passed. The redesign was t...","[0.0039053959771990776, 0.018659112975001335, ..."
7,001bb8e0950fc993b461d544372d531962780fd2,AMS_2018.pdf,310,test the motors in the correct thermal and vac...,"[-0.02209189534187317, -0.00885580200701952, -..."
8,001ec0d154612eedbdcc7cca4dde94e01fe6b31f,AMS_2018.pdf,128,"affected the preload, starting torque, running...","[-0.03811594843864441, -0.005840575788170099, ..."
9,002c17eef38ec942554fd79e0935a507288458d8,AMS_2018.pdf,165,µavg = 0.2 measured in the 30°C experiments w...,"[0.00458584725856781, 0.005623244680464268, 0...."


In [30]:
# Load the cached RAG answers and source-document ids, or create empty
# columns when no cache exists yet.
# The file name has to be the one the batch-query cell appends to
# ("rag_response_cache.json"); reading any other name means the cache is
# never found and every question is sent to the model again.
rag_cache_file = "rag_response_cache.json"

df_questions_answers = add_cached_column_from_file(
    df_questions, rag_cache_file, "question", "answer")

df_questions_answers = add_cached_column_from_file(
    df_questions_answers, rag_cache_file, "question", "source_documents")

# The cache stores the ids as a JSON list, while the rest of the notebook
# keeps them as one ", "-separated string per question. Convert cached lists
# to that string form; missing values are left untouched.
df_questions_answers["source_documents"] = df_questions_answers["source_documents"].map(
    lambda ids: ", ".join(ids) if isinstance(ids, (list, tuple)) else ids
)

In [31]:
# Show the questions with whatever answers were restored from the cache.
df_questions_answers

Unnamed: 0,question,ground_truth,question_by,answer,source_documents
0,How does the clasp function in the deployment ...,The clasp functions in the deployment of the a...,gpt-3.5-turbo-16k,,
1,What is the purpose of keeping translational m...,To avoid control issues,gpt-3.5-turbo-16k,,
2,What was the condition of the silver coating a...,silver coating was found to be in good condition,gpt-3.5-turbo-16k,,
3,How is film durability measured in terms of pi...,The film durability is measured in terms of th...,gpt-3.5-turbo-16k,,
4,What are the behavior differences between AEGS...,Behavior differences between AEGSE and RMCA ex...,gpt-3.5-turbo-16k,,


# RAG questions/answers (batch mode)

The cell below demonstrates using aerospace_chatbot in batch mode. It requires some basic parameter setup and a QA_model.

In [32]:
# TODO turn this into a function/standalone notebook to show how to use the QA model in batch mode

# Inputs for queries.QA_Model; index_name is reused from the database
# connection cell.
index_type = 'ChromaDB'
query_model = synthetic_embeddings
llm = synthetic_generator_llm

QA_model_params = dict(
    rag_type='Standard',
    k=4,
    search_type='similarity',
    local_db_path=os.getenv('LOCAL_DB_PATH'),
)

for i, row in df_questions_answers.iterrows():
    # Only questions without a stored answer are sent to the model.
    existing_answer = row['answer']
    has_answer = not (
        existing_answer is None
        or pd.isnull(existing_answer)
        or existing_answer == ''
    )
    if has_answer:
        continue

    print(f"Processing question {i+1}/{len(df_questions_answers)}")

    # Use the QA model to query the documents. A new QA_Model is built for
    # every question.
    qa_obj = queries.QA_Model(index_type, index_name, query_model, llm, **QA_model_params)
    qa_obj.query_docs(row['question'])
    response = qa_obj.result

    row_label = df_questions_answers.index[i]
    answer_text = response['answer'].content
    df_questions_answers.loc[row_label, "answer"] = answer_text

    # Identify each referenced document by the hash of its metadata.
    ids = [_stable_hash_meta(reference.metadata) for reference in response['references']]
    df_questions_answers.loc[row_label, "source_documents"] = ', '.join(ids)

    # Save the response to cache file
    write_dict_to_file(
        {
            "question": row['question'],
            "answer": answer_text,
            "source_documents": ids,
        },
        "rag_response_cache.json",
    )

Processing question 1/5
Processing question 2/5
Processing question 3/5
Processing question 4/5
Processing question 5/5


In [39]:
# Get the context documents content for each question

# Build the id -> text lookup once instead of filtering df_docs for every id.
# Keeping the first row per id matches taking the first match of a filter.
doc_text_by_id = (
    df_docs.drop_duplicates(subset="id")
    .set_index("id")["document"]
    .to_dict()
)

source_documents_list = []
for source_ids in df_questions_answers['source_documents']:
    if isinstance(source_ids, str):
        # Ids are kept as one ", "-separated string, optionally in brackets.
        id_list = source_ids.strip('[]').split(', ')
    else:
        # Also accept ids that are already a list.
        id_list = list(source_ids)
    # An id that is not present in df_docs raises KeyError here.
    source_documents_list.append([doc_text_by_id[doc_id] for doc_id in id_list])

df_questions_answers["contexts"] = source_documents_list

In [43]:
# Additionally get embeddings for the questions, cached in a pickle file.
# NOTE: pickle.load can execute arbitrary code; only load a cache file that
# this notebook wrote itself.
embeddings_cache = Path("question_embeddings.pickle")
questions = df_questions_answers["question"].tolist()

question_embeddings = None
if embeddings_cache.exists():
    with embeddings_cache.open("rb") as f:
        question_embeddings = pickle.load(f)
    # A cache written for a different number of questions cannot be assigned
    # to the frame; discard it and recompute.
    # NOTE(review): a cache of equal length but for different questions is
    # not detected - delete the pickle after regenerating the test set.
    if len(question_embeddings) != len(questions):
        question_embeddings = None

if question_embeddings is None:
    question_embeddings = [
        synthetic_embeddings.embed_query(question)
        for question in questions
    ]
    with embeddings_cache.open("wb") as f:
        pickle.dump(question_embeddings, f)

df_questions_answers["embedding"] = question_embeddings

In [44]:
# Show the final frame, now including the contexts and question embeddings.
df_questions_answers

Unnamed: 0,question,ground_truth,question_by,answer,source_documents,contexts,embedding
0,How does the clasp function in the deployment ...,The clasp functions in the deployment of the a...,gpt-3.5-turbo-16k,The specific type of clasp used in the deploym...,"d8f8942b0a2128f0a7e4e37ecd977e4b0eff3175, 10e3...","[When deployed, the clamshells hinge open alon...","[0.0017221946481310274, 0.015560210153071317, ..."
1,What is the purpose of keeping translational m...,To avoid control issues,gpt-3.5-turbo-16k,The excitation frequencies of cryocoolers are ...,"8339dcb0e5a72689d38520e706bd667ddca26299, af15...","[510 resonance. Thus, cryo- cooler s micro -vi...","[0.0014575446162674052, -0.015633314746379735,..."
2,What was the condition of the silver coating a...,silver coating was found to be in good condition,gpt-3.5-turbo-16k,Based on the information provided in the refer...,"7ba3d7ae456de6149e0cd77c2c6498a0d4ca58c2, e4cd...",[- no defect observed post thermal cycling (F...,"[0.01731043051087476, 0.012807833432222686, 0...."
3,How is film durability measured in terms of pi...,The film durability is measured in terms of th...,gpt-3.5-turbo-16k,Figure 4 illustrates the measurement of film d...,"c08e83ca7c5b4d715c55287e21489fe16fef2421, d534...",[thrust force. Film durability was determined...,"[0.013993376826443391, -0.011561762670698585, ..."
4,What are the behavior differences between AEGS...,Behavior differences between AEGSE and RMCA ex...,gpt-3.5-turbo-16k,"During ACA-level testing, there are behavior d...","f903161ffc1a3a9a1119e7681768c7db72408da1, b3b0...",[operated using Actuator Electrical Ground Su...,"[0.005842556044597406, 0.010622214075739735, 0..."
