In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (including the local project modules) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Ragas evaluation
Test batch and ragas capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [2]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# from umap import UMAP
# import numpy as np

# Import local packages
# The project modules are not installed as a package, so their source folder is
# appended to sys.path before importing them.
sys.path.append('../src/aerospace_chatbot')
import queries
# NOTE(review): binding the local module to the name `eval` shadows Python's builtin eval()
# for the rest of the notebook.
import eval
import admin
from data_processing import _stable_hash_meta, archive_db

# Set environment variables with .env
# override=True lets values from the .env file replace variables already present in the environment.
load_dotenv(find_dotenv(), override=True)

True

In [3]:
# Set secrets
# `sb` starts empty and is filled later with per-model settings; `secrets` maps each
# credential name to the value found in the environment (None when the variable is unset).
sb = {}
secrets = {
    credential_name: os.getenv(credential_name)
    for credential_name in (
        'OPENAI_API_KEY',
        'VOYAGE_API_KEY',
        'PINECONE_API_KEY',
        'HUGGINGFACEHUB_API_TOKEN',
    )
}

# Create test (synthetic) dataset, generate docs+

## Connect to database

### ChromaDB

Use chroma with standard RAG to generate synthetic dataset

In [69]:
# Open the on-disk Chroma database stored in the 'chromadb' folder under LOCAL_DB_PATH
# and list the collections it contains.
chroma_db_path = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_db_path)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-large-0merge-400-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries)]

In [70]:
# Chroma's _embedding_function is not interchangeable with the embedding objects used elsewhere,
# so keep a parallel list of embedding objects, looked up by the same position as the collection.
# NOTE(review): five entries are created while the recorded listing shows four collections.
query_models = [
    OpenAIEmbeddings(model='text-embedding-3-large', openai_api_key=os.getenv('OPENAI_API_KEY'))
    for _ in range(5)
]

In [71]:
# Export all collections to pickles to store them. Set export=True if desired, takes a while.
export=False

if export:
    # Pair each collection with the embedding object at the same position in query_models:
    # Chroma's own _embedding_function is not compatible with the embedding objects the
    # rest of the notebook uses, and the Pinecone export passes the query model as well.
    for collection, collection_query_model in zip(collections, query_models):
        df_temp_chroma = archive_db('ChromaDB', collection.name, collection_query_model, export_pickle=True)
        # An expression nested inside if/for is not auto-displayed by Jupyter, so print a
        # preview of each exported frame explicitly (assumes archive_db returns a DataFrame).
        print(collection.name)
        print(df_temp_chroma.head(5))

In [72]:
# Select database for determining synthetic dataset
idx=0   # Most reasonable baseline (text-embedding-3-large-2merge-0), top of the line embeddings, 2 page size good to generate questions from.

# The same position selects both the Chroma collection and its matching embedding object.
docs_vectorstore=collections[idx]
query_model=query_models[idx]

In [73]:
# Pull every record (metadata, text, embedding) from the selected collection and wrap each
# text/metadata pair in a LangChain Document; these feed the synthetic test dataset.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])

lcdocs_chroma = []
for doc_text, doc_meta in zip(all_docs['documents'], all_docs['metadatas']):
    lcdocs_chroma.append(Document(page_content=doc_text, metadata=doc_meta))

print(len(lcdocs_chroma))

2222


In [74]:
# Format docs into dataframe: one row per stored chunk, with a hash-based id derived from
# the metadata, the source and page (page falls back to -1 when missing), the text and the embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
chroma_metadatas = all_docs["metadatas"]
df_docs = pd.DataFrame(dict(
    id=[_stable_hash_meta(chunk_meta) for chunk_meta in chroma_metadatas],
    source=[chunk_meta.get("source") for chunk_meta in chroma_metadatas],
    page=[chunk_meta.get("page", -1) for chunk_meta in chroma_metadatas],
    document=all_docs["documents"],
    embedding=all_docs["embeddings"],
))

### Pinecone



In [56]:
# Connect to Pinecone and list the indexes visible to this API key.
# The imports cell aliases the Pinecone class as `pinecone_client`, and this cell binds that
# same name to the client instance. Importing the class under its own name here keeps the
# cell re-runnable: a second run no longer tries to call the already-created instance.
from pinecone import Pinecone
pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes = pinecone_client.list_indexes()
indexes

{'indexes': [{'dimension': 1024,
              'host': 'voyage-large-2-instruct-2merge-0-dj30h8y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'voyage-large-2-instruct-2merge-0',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 1024,
              'host': 'voyage-large-2-instruct-0merge-400-dj30h8y.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'voyage-large-2-instruct-0merge-400',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [57]:
# One entry per Pinecone index: the index name plus its own Voyage embedding object.
pinecone_index_names = [
    'voyage-large-2-instruct-2merge-0',
    'voyage-large-2-instruct-0merge-400',
]
dbs = [
    {
        'index_name': pinecone_index_name,
        'query_model': VoyageAIEmbeddings(
            model='voyage-large-2-instruct',
            voyage_api_key=os.getenv('VOYAGE_API_KEY'),
            truncation=False,
        ),
    }
    for pinecone_index_name in pinecone_index_names
]

In [58]:
# Select database for determining synthetic dataset
# NOTE(review): this rebinds `idx`, which the Chroma selection cell also uses.
idx=0
# Honor idx instead of a hard-coded position so changing idx actually switches the index.
db=dbs[idx]
index = pinecone_client.Index(db['index_name'])

In [86]:
# Collect every vector id in the index; index.list() yields the ids in batches.
# (The loop variable is not called `id`, so the builtin id() stays usable afterwards.)
ids = [vector_id for id_batch in index.list() for vector_id in id_batch]

# Fetch the full records (id, values, metadata) in chunks of chunk_size ids.
docs = []
chunk_size = 200  # Tune to whatever doesn't error out, 200 won't for serverless
for chunk_start in range(0, len(ids), chunk_size):
    print(f"Fetching {chunk_start} to {chunk_start+chunk_size}")
    fetched_vectors = index.fetch(ids[chunk_start:chunk_start+chunk_size])['vectors']
    docs.extend(fetched_vectors.values())

# Build the dataframe once from all fetched records instead of concatenating per chunk:
# this avoids quadratic copying and gives every row a unique 0..N-1 index label
# (per-chunk concatenation repeated the labels 0..chunk_size-1).
# NOTE(review): this rebinds df_docs, replacing the frame built from Chroma above.
df_docs = pd.DataFrame(
    {
        "id": [record["id"] for record in docs],
        "source": [record["metadata"]["source"] for record in docs],
        "page": [record["metadata"]["page"] for record in docs],
        "document": [record["metadata"]["page_content"] for record in docs],
        "embedding": [record["values"] for record in docs],
    }
)

# LangChain Documents built from the stored metadata (text, page and source).
lcdocs_pinecone = [
    Document(page_content=record['metadata']['page_content'],
             metadata={'page': record['metadata']['page'], 'source': record['metadata']['source']})
    for record in docs
]

print(len(lcdocs_pinecone))

Fetching 0 to 200
Fetching 200 to 400
Fetching 400 to 600
Fetching 600 to 800
Fetching 800 to 1000
Fetching 1000 to 1200
Fetching 1200 to 1400
Fetching 1400 to 1600
Fetching 1600 to 1800
Fetching 1800 to 2000
Fetching 2000 to 2200
Fetching 2200 to 2400
2222


In [None]:
# Export all collections to pickles to store them
# for db in dbs:
#     df_temp_pinecone=archive_db('Pinecone',db['index_name'],db['query_model'],export_pickle=True)

### RAGatouille



In [55]:
# List the RAGatouille indexes reported by the project's admin helper.
# NOTE(review): this rebinds `indexes`, which previously held the Pinecone index listing.
indexes=admin.show_ragatouille_indexes(format=False)
indexes

{'status': True, 'message': ['colbert-ir-colbertv2.0-2merge-0']}

## Generate dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

Sometimes you'll get a tricky threading error. To clear it, fully close the IDE (VS Code/Visual Studio), open a new window, and restart the kernel. The error also appears to be related to ragas versions newer than 0.1.6, so I'll stick with 0.1.6 until I find a way to test an upgrade.

In [None]:
# Set generator inputs
# Each chat model is tagged with its own model name.
generator_model = "gpt-3.5-turbo-0125"
critic_model = 'gpt-4o'
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])
synthetic_critic_llm = ChatOpenAI(model=critic_model, tags=[critic_model])

# Reuse the embedding object selected together with the database above.
# embedding_model='text-embedding-3-large'
# synthetic_embeddings = OpenAIEmbeddings(model=embedding_model,api_key=os.getenv('OPENAI_API_KEY'))
synthetic_embeddings = query_model

# Run parameters for testset generation
run_config = RunConfig(
    timeout=1000,
    max_retries=50,
    max_wait=1000,
    max_workers=1,
)

# Create generator
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings,
    run_config=run_config,
)

In [None]:
# Input parameters
eval_size=100    # Number of samples to evaluate at a time. Intended to circumvent OpenAI API rate limits.
n_questions=10   # Number of questions to generate for each evaluation sample.
# The test set CSV is named after the selected Chroma collection and written under output/.
fname=os.path.join('output',f"testset_{docs_vectorstore.name}.csv")
# Generate questions from the Chroma documents.
lcdocs=lcdocs_chroma

In [None]:
# Generate the synthetic test set with the project's eval helper, using the documents,
# generator, batch sizes, output file name and run configuration defined above.
df_testset=eval.generate_testset(lcdocs,generator,eval_size,n_questions,fname,run_config)

# RAG Evaluation

This section will use the same base data as the synthetic test dataset but apply different RAG strategies:
* Different chunk sizes
* Embedding models
* LLMs
* Advanced RAG (parent-child, RAGatouille)

The database may not be the same as the synthetic test dataset but uses the same base data.

## Format dataset and database for RAG

In [10]:
# Read in test dataset. Skip this if you have generated it above.

# testset_name=docs_vectorstore.name    # Uncomment if you want to use the most recent testset
testset_name = 'text-embedding-3-large-2merge-0'
fname = os.path.join('output', f"testset_{testset_name}.csv")

import_csv = True
if import_csv:
    df_testset = pd.read_csv(fname)

# temporarily reduce the quantity to evaluate the functionality
df_testset = df_testset.head(4)

# Create template dataframe to iterate over later: the question and ground truth columns,
# preceded by a question_id column taken from the row index.
df_qa_template = (
    df_testset[['question', 'ground_truth']]
    .assign(question_id=lambda frame: frame.index)
    .loc[:, ['question_id', 'question', 'ground_truth']]
)
# for column in ["answer", "source_documents", "answer_by", "query_model"]:
#     df_qa_template[column] = None

From here, you have a template dataframe of questions and ground truths, with no answers yet. For each model and database in `setup_data` below, this template is filled with RAG responses and then evaluated against the RAGAS criteria.

## Use RAG to generate responses, evaluate

In [33]:
# Read setup data, determining the evaluation models and databases
# TODO add parent-child
json_file_path = "eval_models.json"
setup_data = json.loads(Path(json_file_path).read_text())

# Only the last two configured models are evaluated.
last_two_models = setup_data['eval_models'][-2:]
setup_data['eval_models'] = last_two_models

In [34]:
# Display the evaluation configuration that the loop below iterates over.
setup_data

{'eval_models': [{'index_type': 'Pinecone',
   'index_name': 'voyage-large-2-instruct-2merge-0',
   'query_model': {'query_model': 'Voyage',
    'embedding_name': 'voyage-large-2-instruct'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4o',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'rag_type': 'Standard',
    'k': 4,
    'search_type': 'similarity',
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}},
  {'index_type': 'RAGatouille',
   'index_name': 'colbert-ir-colbertv2.0-2merge-0',
   'query_model': {'embedding_name': 'colbert-ir/colbertv2.0'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4o',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'k': 4,
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}]}

In [36]:
# Iterate through the evaluation models and databases, dump data as you go.
# This will cache data along the way into rag_responses, so you can pick up where you left off.
# A pickled dataframe is also exported at the end of each iteration, but not row-by-row.
# NOTE(review): this rebinds query_model, which the testset generator cell reads as its embeddings.
df_qa = pd.DataFrame()
previous_model_keys = []
for model in setup_data['eval_models']:
    print(model)

    # Drop the settings contributed by the previous model so they cannot leak into this one
    # (e.g. a stale 'query_model' entry when the current config does not define that key).
    for stale_key in previous_model_keys:
        sb.pop(stale_key, None)
    previous_model_keys = [*model['query_model'], *model['llm']]

    # Database
    index_type = model['index_type']
    index_name = model['index_name']
    sb['index_type'] = index_type
    sb['index_name'] = index_name

    # Query model and llm: the config entries are copied into sb, which the admin helpers read.
    sb.update(model['query_model'])
    query_model = admin.get_query_model(sb, secrets)
    sb.update(model['llm'])
    llm = admin.set_llm(sb, secrets)

    # QA model params
    qa_model_params = model['qa_model_params']

    df_qa_iter = eval.rag_responses(index_type, index_name, query_model, llm, qa_model_params,
                                    df_qa_template, df_docs, testset_name)
    df_qa = pd.concat([df_qa, df_qa_iter], ignore_index=True)

    # After each iteration, export a pickle of the dataframe.
    # The file is named after the current index but holds the results accumulated so far.
    with open(os.path.join('output', f'df_qa_{index_name}.pickle'), "wb") as f:
        pickle.dump(df_qa, f)

{'index_type': 'Pinecone', 'index_name': 'voyage-large-2-instruct-2merge-0', 'query_model': {'query_model': 'Voyage', 'embedding_name': 'voyage-large-2-instruct'}, 'llm': {'llm_source': 'OpenAI', 'llm_model': 'gpt-4o', 'model_options': {'temperature': 0.2, 'output_level': 1000}}, 'qa_model_params': {'rag_type': 'Standard', 'k': 4, 'search_type': 'similarity', 'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}
   question_id                                           question  \
0            0  How much power does the four-element piezo mot...   
1            1  How did statistical analysis determine the num...   
2            2  How is the stress field in the Si3N4 ball and ...   
3            3  How important is testing in ensuring the relia...   

                                              answer  \
0  The normal operating conditions on Earth for t...   
1  The statistical method used to determine the o...   
2  The presence of a ball scar within the conta



Empty DataFrame
Columns: [question_id, question, answer, source_documents, answer_by, query_model, qa_model_params, index_type, index_name]
Index: []
Processing question 1/4




Loading searcher for index colbert-ir-colbertv2.0-2merge-0 for the first time... This may take a few seconds
[Jul 14, 11:26:44] #> Loading codec...
[Jul 14, 11:26:44] #> Loading IVF...
[Jul 14, 11:26:44] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 547.63it/s]

[Jul 14, 11:26:44] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 10.55it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What are the normal operating conditions on Earth for the four-element piezo motor, and how much power does it consume under these conditions?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  2024,  1996,  3671,  4082,  3785,  2006,  3011,
         2005,  1996,  2176,  1011,  5783, 11345,  6844,  5013,  1010,  1998,
         2129,  2172,  2373,  2515,  2009, 16678,  2104,  2122,  3785,  1029,
          102,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0])

{'question_id': 0, 'question': 'How much power does the four-element piezo motor consume under normal operating conditions on Earth?', 'answer': 'The normal operating conditions on Earth for the four-element piezo motor involve it consuming approximately 3.25 watts of power. The drive electronics



Loading searcher for index colbert-ir-colbertv2.0-2merge-0 for the first time... This may take a few seconds
[Jul 14, 11:26:49] #> Loading codec...
[Jul 14, 11:26:49] #> Loading IVF...
[Jul 14, 11:26:49] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 885.62it/s]

[Jul 14, 11:26:49] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 38.53it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . What specific statistical methods were used to determine the number of scars needed on each ball for the bearing fatigue life test?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2054,  3563,  7778,  4725,  2020,  2109,  2000,  5646,
         1996,  2193,  1997, 13521,  2734,  2006,  2169,  3608,  2005,  1996,
         7682, 16342,  2166,  3231,  1029,   102,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0])

{'question_id': 1, 'question': 'How did statistical analysis determine the number of scars needed on each ball for the bearing fatigue life test?', 'answer': 'The specific statistical method used to determine the number of scars needed on each ball for the bearing fatigue life test was based on ensuring a h



Loading searcher for index colbert-ir-colbertv2.0-2merge-0 for the first time... This may take a few seconds
[Jul 14, 11:26:57] #> Loading codec...
[Jul 14, 11:26:57] #> Loading IVF...
[Jul 14, 11:26:57] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1349.95it/s]

[Jul 14, 11:26:57] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 37.95it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . How does the presence of a ball scar within the contact patch influence the stress distribution in the Si3N4 ball and raceway?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2129,  2515,  1996,  3739,  1997,  1037,  3608, 11228,
         2306,  1996,  3967,  8983,  3747,  1996,  6911,  4353,  1999,  1996,
         9033,  2509,  2078,  2549,  3608,  1998, 23018,  1029,   102,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0])

{'question_id': 2, 'question': 'How is the stress field in the Si3N4 ball and raceway affected by the presence of a ball scar within the contact patch?', 'answer': 'The presence of a ball scar within the contact patch influences the stress distribution in the Si₃N₄ ball and raceway in the following ways:\n\n1. *



Loading searcher for index colbert-ir-colbertv2.0-2merge-0 for the first time... This may take a few seconds
[Jul 14, 11:27:05] #> Loading codec...
[Jul 14, 11:27:05] #> Loading IVF...
[Jul 14, 11:27:05] #> Loading doclens...


100%|██████████| 1/1 [00:00<00:00, 1968.23it/s]

[Jul 14, 11:27:05] #> Loading codes and residuals...



100%|██████████| 1/1 [00:00<00:00, 17.75it/s]


Searcher loaded!

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . How does testing contribute to the reliability of aerospace mechanisms?, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2129,  2515,  5604,  9002,  2000,  1996, 15258,  1997,
        13395, 10595,  1029,   102,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

{'question_id': 3, 'question': 'How important is testing in ensuring the reliability of aerospace mechanisms?', 'answer': 'Testing plays a crucial role in ensuring the reliability of aerospace mechanisms. Here are several key points on how testing contributes to this reliability:\n\n1. **Early Detection of Failures**: Initial testing often reveals that many mechanis

AttributeError: 'RAGPretrainedModel' object has no attribute 'embed_query'

In [50]:
# TESTING
# Ad-hoc debugging cell: encodes one sample string, prints the model's in_memory_embed_docs
# attribute, then calls clear_encoded_docs().
# NOTE(review): uses whatever query_model the evaluation loop bound last (presumably the
# RAGatouille model — confirm); remove this cell once the embed_query error is resolved.
query_model.model.encode(["hello world2"])
print(query_model.model.in_memory_embed_docs)
query_model.model.clear_encoded_docs()

tensor([[[ 0.0315,  0.0850, -0.0128,  ...,  0.0547, -0.0918,  0.0366],
         [ 0.0012, -0.0008,  0.0097,  ..., -0.0881,  0.0215,  0.0804],
         [-0.0275, -0.0470,  0.0361,  ..., -0.0999,  0.0272,  0.1023],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])


In [None]:
# Evaluate
# Passes the collected RAG responses to the project's eval helper and rebinds df_qa to its result.
# NOTE(review): index_name is whatever the last loop iteration left behind.
df_qa = eval.eval_rag(index_name, df_qa)

In [None]:
# Persist df_qa to output/df_qa_<testset_name>.pickle when write is True; otherwise
# restore df_qa from that file.
# NOTE(review): pickle.load executes arbitrary code from the file — only load pickles you created.
write = False
df_qa_pickle_path = os.path.join('output', f'df_qa_{testset_name}.pickle')
if not write:
    with open(df_qa_pickle_path, "rb") as f:
        df_qa = pickle.load(f)
else:
    with open(df_qa_pickle_path, "wb") as f:
        pickle.dump(df_qa, f)

## Ragas eval, visualize

In [None]:
# Link each document to the questions that used it as a source, and add a UMAP column for
# visualization purposes (per the project's eval.data_viz_prep helper).
df_visualize=eval.data_viz_prep(index_name,df_qa,df_docs)

In [None]:
# Load the combined questions + documents dataframe from df_<index_name>.parquet
# (presumably written by the visualization prep step — confirm).
df = pd.read_parquet(f'df_{index_name}.parquet')

# Every column whose name contains "umap" is rendered by Spotlight as an Embedding.
umap_dtypes = {column_name: Embedding for column_name in df.keys() if "umap" in column_name}

# show the dataframe with the question and answer in spotlight
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=umap_dtypes,
)

## UMAP visualization forms clusters of the questions; workaround: fit UMAP only on documents