In [1]:
# Reload imported modules automatically before each cell runs, so edits to them
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Ragas evaluation
Test batch and ragas capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [2]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from ragatouille import RAGPretrainedModel

from langchain_pinecone import PineconeVectorStore
# from langchain_community.vectorstores import Chroma
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd

from ragas import evaluate
from ragas.metrics import answer_correctness, faithfulness, context_recall
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# from umap import UMAP
# import numpy as np

# Import local packages
# The project modules are not installed; their folder (relative to the working directory)
# is appended to sys.path so the plain imports below resolve.
# NOTE(review): the local module name `eval` shadows Python's builtin eval() for the rest of this notebook.
sys.path.append('../src/aerospace_chatbot')
import eval
import admin
import data_processing
import queries

# Set environment variables with .env
# override=True lets values from the .env file replace variables already set in the process environment.
load_dotenv(find_dotenv(), override=True)

[nltk_data] Downloading package punkt_tab to /Users/danmueller/Documen
[nltk_data]     ts/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Set secrets
secrets={}
sb={}

secrets['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
secrets['VOYAGE_API_KEY'] = os.getenv('VOYAGE_API_KEY')
secrets['PINECONE_API_KEY'] = os.getenv('PINECONE_API_KEY')
secrets['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Create test (synthetic) dataset, generate docs

## Connect to database

### ChromaDB

Use chroma with standard RAG to generate synthetic dataset

In [4]:
# Open the on-disk Chroma store located in the 'chromadb' folder under LOCAL_DB_PATH
# and list the collections it holds.
chroma_db_dir = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_db_dir)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-small-2merge-0),
 Collection(name=text-embedding-3-large-2merge-0-parent-child-queries),
 Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-small-2merge-0-queries),
 Collection(name=text-embedding-3-large-2merge-0-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries)]

In [5]:
# Chroma _embedding_function isn't compatible like embedding objects. Index by embeddings used.
# Entry i is the embedding model paired with collections[i] when a collection is selected later;
# only the first three positions are provided here.
query_models = [
    OpenAIEmbeddings(model=embedding_name, openai_api_key=os.getenv('OPENAI_API_KEY'))
    for embedding_name in (
        'text-embedding-3-small',
        'text-embedding-3-large',
        'text-embedding-3-large',
    )
]

### Pinecone



In [None]:
# Connect to Pinecone and list the available indexes.
# The client instance gets its own name: `pinecone_client` is the alias of the imported
# Pinecone class, and rebinding it to an instance would make this cell fail when re-run
# (the second run would try to call the instance instead of the class).
pinecone_db_client = pinecone_client(api_key=os.getenv('PINECONE_API_KEY'))
indexes = pinecone_db_client.list_indexes()
indexes

In [None]:
# Voyage embedding models used to query the Pinecone indexes (looked up by index position later).
# NOTE(review): this rebinds `query_models`, replacing the list built for ChromaDB. With
# index_type="ChromaDB" the later selection reads query_models[2], which this two-element
# list does not have -- run only the cells for the backend actually being used.
query_models = [
    VoyageAIEmbeddings(
        model='voyage-large-2-instruct',
        voyage_api_key=os.getenv('VOYAGE_API_KEY'),
        truncation=False,
    )
    for _ in range(2)
]

### RAGatouille



In [None]:
# indexes=admin.show_ragatouille_indexes(format=False)
# indexes

In [None]:
# dbs=['colbert-ir/colbertv2.0']

In [None]:
# idx=0
# db=dbs[idx]

# # TODO get this as a langchain retriever, pull docs
# query_model = RAGPretrainedModel.from_pretrained(db,index_root=os.path.join(os.getenv('LOCAL_DB_PATH'),'.ragatouille'))
# docs_vectorstore=data_processing.initialize_database('RAGatouille',
#                                                      'colbert-ir-colbertv2.0-2merge-0',
#                                                      query_model,
#                                                      'Standard',
#                                                      os.getenv('LOCAL_DB_PATH'),
#                                                      init_ragatouille=False,
#                                                      clear=False)

In [None]:
# docs=docs_vectorstore.model.collection  # Document chunks (chunked smaller according to token size)
# print(docs[0])
# print(len(docs))

# metadata=docs_vectorstore.model.docid_metadata_map  # Document metadata for original documents
# print(len(metadata))

# map=docs_vectorstore.model.pid_docid_map    # Map of document chunks to original document
# print(len(map))

Can't quite figure out functionality to export the encodings from RAGatouille for each document. Won't pursue unless RAGatouille has exceptional performance.

### Generate Docs

In [7]:
index_type="ChromaDB"
# index_type="Pinecone"

# Select database for generating docs.
# The same position is used for the store and for its embedding model, so `query_models`
# must be the list that was built for the chosen backend.
if index_type=="ChromaDB":
    # Most reasonable baseline (text-embedding-3-large-2merge-0): top of the line embeddings,
    # 2 page size good to generate questions from.
    idx_chroma=2
    docs_vectorstore=collections[idx_chroma]
    query_model=query_models[idx_chroma]
elif index_type=="Pinecone":
    idx_pinecone=0
    docs_vectorstore=indexes[idx_pinecone]
    query_model=query_models[idx_pinecone]
else:
    # Fail loudly instead of silently keeping docs_vectorstore/query_model from an earlier run.
    raise ValueError(f"Unsupported index_type: {index_type!r} (expected 'ChromaDB' or 'Pinecone')")


In [9]:
# Export the documents of the selected store; no pickle is written (export_pickle=False).
# NOTE(review): presumably df_docs is a dataframe view and lcdocs a list of LangChain
# documents -- confirm in eval.lcdoc_export.
df_docs, lcdocs = eval.lcdoc_export(
    index_type,
    docs_vectorstore,
    query_model,
    export_pickle=False,
)
n_lcdocs = len(lcdocs)
print(n_lcdocs)

2222


## Generate dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

Sometimes you'll get a tricky threading error. Fully close vs studio, open a new window, restart the kernel, and it'll clear. It also appears to be related to versions newer than 0.1.6 for ragas. I'll stick with that for now until I find ways to test an upgrade.

In [None]:
# Set generator inputs: one chat model writes the synthetic questions, a second acts as critic.
# Each model is tagged with its own name.
generator_model = "gpt-4o-mini"
critic_model = 'gpt-4o'
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])
synthetic_critic_llm = ChatOpenAI(model=critic_model, tags=[critic_model])

# Reuse the embedding model that matches the selected document store.
# embedding_model='text-embedding-3-large'
# synthetic_embeddings = OpenAIEmbeddings(model=embedding_model,api_key=os.getenv('OPENAI_API_KEY'))
synthetic_embeddings = query_model

# Run parameters for testset generation: long timeouts, many retries, a single worker.
run_config = RunConfig(timeout=1000, max_retries=50, max_wait=1000, max_workers=1)

# Create generator
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings,
    run_config=run_config,
)

In [None]:
# Input parameters
n_questions = 30   # Number of questions to generate for each evaluation sample.

# Number of samples to evaluate at a time. Intended to circumvent OpenAI API rate limits.
# Currently the whole document set is used as a single batch.
# eval_size=100
eval_size = len(lcdocs)

# Output CSV, named after the document store the questions are generated from.
testset_csv_name = f"testset_{docs_vectorstore.name}_full.csv"
fname = os.path.join('output', testset_csv_name)

In [None]:
# Generate the synthetic test set from the exported documents using the parameters set above.
# (Fixes the misspelled call `generat7e_testset`, which would raise AttributeError.)
df_testset=eval.generate_testset(lcdocs,generator,eval_size,n_questions,fname,run_config)

# RAG Evaluation

This section will use the same base data as the synthetic test dataset but apply different RAG strategies:
* Different chunk sizes
* Embedding models
* LLMs
* Advanced RAG (parent-child, RAGatouille)

The database may not be the same as the synthetic test dataset but uses the same base data.

## Format dataset and database for RAG

In [8]:
# Read in test dataset. Skip this if you have generated it above.

# testset_name=docs_vectorstore.name    # Uncomment if you want to use the most recent testset
testset_name = 'text-embedding-3-small-2merge-0_full'
fname = os.path.join('output', f"testset_{testset_name}.csv")

# With import_csv=False the df_testset already in memory (from the generation step) is used.
import_csv = True
if import_csv:
    df_testset = pd.read_csv(fname)

# In case there was missing truth, drop the question
df_testset = df_testset.dropna(subset=['ground_truth'])

# temporarily reduce the quantity to evaluate the functionality
# df_testset=df_testset.head(2)

# Create template dataframe to iterate over later.
# question_id is the row label that survived the dropna above, placed as the first column.
df_qa_template = (
    df_testset[['question', 'ground_truth']]
    .assign(question_id=lambda frame: frame.index)
    [['question_id', 'question', 'ground_truth']]
)
# for column in ["answer", "source_documents", "answer_by", "query_model"]:
#     df_qa_template[column] = None

From here, you have a template dataframe of questions and ground truths, with the answers still to be generated and evaluated. For each model and database in `setup_data` below, this template dataframe is what gets filled with RAG responses and evaluated against the RAGAS criteria.

## Use RAG to generate responses, evaluate

In [9]:
# Read setup data, determining the evaluation models and databases
json_file_path = "eval_models.json"
setup_data = json.loads(Path(json_file_path).read_text())

In [10]:
# Display the loaded configuration to confirm which models and databases will be evaluated.
setup_data

{'eval_models': [{'index_type': 'ChromaDB',
   'index_name': 'text-embedding-3-large-2merge-0',
   'query_model': {'query_model': 'OpenAI',
    'embedding_name': 'text-embedding-3-large'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4o',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'rag_type': 'Standard',
    'k': 4,
    'search_type': 'similarity',
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}},
  {'index_type': 'ChromaDB',
   'index_name': 'text-embedding-3-small-2merge-0',
   'query_model': {'query_model': 'OpenAI',
    'embedding_name': 'text-embedding-3-small'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4o',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'rag_type': 'Standard',
    'k': 4,
    'search_type': 'similarity',
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}},
  {'index_type': 'ChromaDB',
   '

In [11]:
# Iterate through the evaluation models and databases, dump data as you go.
# This will cache data along the way into rag_responses, so you can pick up where you left off.
# A pickled dataframe is also exported at the end of each iteration, but not row-by-row.
# NOTE(review): `sb` is shared across iterations, so a key set for one model stays in place
# for the next unless that model overwrites it.
qa_pickle_path = os.path.join('output', f'df_qa_{testset_name}.pickle')
df_qa = pd.DataFrame()
for model in setup_data['eval_models']:
    print(model)

    # Database
    index_type = sb['index_type'] = model['index_type']
    index_name = sb['index_name'] = model['index_name']

    # Query model and llm: copy this model's settings into sb, then build the objects from it
    sb.update(model['query_model'])
    query_model = admin.get_query_model(sb, secrets)
    sb.update(model['llm'])
    llm = admin.set_llm(sb, secrets)

    # QA model params
    qa_model_params = model['qa_model_params']

    df_qa_iter = eval.rag_responses(
        index_type, index_name, query_model, llm, qa_model_params,
        df_qa_template, df_docs, testset_name,
    )
    df_qa = pd.concat([df_qa, df_qa_iter], ignore_index=True)

    # After each iteration, export a pickle of the dataframe accumulated so far
    with open(qa_pickle_path, "wb") as f:
        pickle.dump(df_qa, f)

{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-large-2merge-0', 'query_model': {'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}, 'llm': {'llm_source': 'OpenAI', 'llm_model': 'gpt-4o', 'model_options': {'temperature': 0.2, 'output_level': 1000}}, 'qa_model_params': {'rag_type': 'Standard', 'k': 4, 'search_type': 'similarity', 'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}
{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-small-2merge-0', 'query_model': {'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-small'}, 'llm': {'llm_source': 'OpenAI', 'llm_model': 'gpt-4o', 'model_options': {'temperature': 0.2, 'output_level': 1000}}, 'qa_model_params': {'rag_type': 'Standard', 'k': 4, 'search_type': 'similarity', 'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}
{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-large-2merge-0', 'query_model': {'query_model': 'OpenAI', 'embeddin



In [16]:
# Evaluate the collected RAG responses against the listed criteria.
# NOTE(review): df_qa is replaced by the evaluated result, so re-running this cell feeds
# already-evaluated data back in -- confirm eval.eval_rag tolerates that.
eval_criterias = [
    "answer_correctness",
    "faithfulness",
    "context_recall",
]
df_qa = eval.eval_rag(df_qa, eval_criterias, testset_name)

0 How does the motion on the PoD tribometer differ from the motion on the SOT tribometer, and how does this difference impact the assessment of lubricant behavior in hybrid lubrication?
1 What testing methods were used to assess the performance of the spring strut hardware, and what were the results of these tests?
2 What challenges were faced with the surface of the L7 deployment floor and how did it impact the deployment process?
3 How was the position sensor selected for the Future Actuator design?
4 How does the presence of lubrication on threads affect the nut factor ranges in bolted joints?
5 What role do DC motors play in space applications and why is proper selection and specification essential for the success of future space missions?
6 How do modern Fourier Transform Spectrometers address the effects of velocity errors in sampling infrared interferograms?
7 What difficulties were encountered during the design of the Engage-Disengage Mechanism and how were they addressed?
8 Ho

In [17]:
# Persist the evaluated dataframe (write=True) or restore a previously saved one (write=False).
# NOTE(review): pickle.load executes arbitrary code from the file -- only load pickles you created.
write = True
qa_pickle_path = os.path.join('output', f'df_qa_{testset_name}.pickle')
if not write:
    with open(qa_pickle_path, "rb") as f:
        df_qa = pickle.load(f)
else:
    with open(qa_pickle_path, "wb") as f:
        pickle.dump(df_qa, f)

## Ragas eval, visualize

In [None]:
# Link from documents to questions, that used the document as source. Add UMAP column for visualization purposes.
# NOTE(review): index_name is whatever the last iteration of the evaluation loop left behind,
# so only that database is prepared for visualization here.
df_visualize=eval.data_viz_prep(index_name,df_qa,df_docs)

In [None]:
# Load the dataframe that combines the questions and the documents from parquet.
# NOTE(review): presumably this file is written by eval.data_viz_prep -- confirm.
df = pd.read_parquet(f'df_{index_name}.parquet')

# Every column whose name contains "umap" is rendered as an embedding.
umap_dtypes = {column: Embedding for column in df.keys() if "umap" in column}

# show the dataframe with the question and answer in spotlight
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=umap_dtypes,
)

## UMAP visualization forms a cluster of the questions; workaround: UMAP only on documents