In [2]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# (e.g. the local project packages) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Ragas evaluation
Test batch and ragas capability.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [47]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from ragatouille import RAGPretrainedModel

from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# from umap import UMAP
# import numpy as np

# Import local packages
# The path is relative to the current working directory, so it must be added to the import
# path before the project modules below can be imported.
sys.path.append('../src/aerospace_chatbot')
# NOTE(review): binding the name `eval` to this module shadows Python's built-in eval()
# in the notebook namespace.
import eval
import admin
import data_processing
import queries

# Set environment variables with .env
# override=True lets values found in the .env file replace variables already set in the environment.
load_dotenv(find_dotenv(), override=True)

True

In [31]:
# Collect the API credentials from environment variables into one dict.
# A key whose variable is unset maps to None.
secrets = {
    key_name: os.getenv(key_name)
    for key_name in (
        'OPENAI_API_KEY',
        'VOYAGE_API_KEY',
        'PINECONE_API_KEY',
        'HUGGINGFACEHUB_API_TOKEN',
    )
}

# Settings dict handed to the `admin` helpers; it is filled in per model further down.
sb = {}

# Create test (synthetic) dataset, generate docs+

## Connect to database

### ChromaDB

Use chroma with standard RAG to generate synthetic dataset

In [32]:
# Open the on-disk Chroma store located under LOCAL_DB_PATH/chromadb and list its collections.
chroma_db_dir = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_db_dir)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-large-0merge-400-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries)]

In [33]:
# Chroma's stored `_embedding_function` isn't compatible with embedding objects, so keep a
# separate list of query models, indexed the same way as the collections that use them.
# Five independent instances of the same OpenAI embedding model are created.
query_models = [
    OpenAIEmbeddings(model='text-embedding-3-large', openai_api_key=os.getenv('OPENAI_API_KEY'))
    for _ in range(5)
]

In [34]:
# Export all collections to pickles to store them. Set export=True if desired; it takes a while.
export = False

if export:
    # Stays None when there are no collections, so the preview below cannot hit a NameError.
    df_temp_chroma = None
    for collection in collections:
        df_temp_chroma = data_processing.archive_db(
            'ChromaDB', collection.name, collection._embedding_function, export_pickle=True
        )

    # Jupyter only auto-renders the last *top-level* expression of a cell; an expression nested
    # inside `if` is silently discarded, so show the preview of the last export explicitly.
    if df_temp_chroma is not None:
        display(df_temp_chroma.head(5))

In [35]:
# Select the database used to build the synthetic dataset.
# Index 0 (text-embedding-3-large-2merge-0) is the most reasonable baseline: top-of-the-line
# embeddings, and a 2-page chunk size that is good to generate questions from.
idx = 0

# The collection and its matching query model share the same position in their lists.
docs_vectorstore, query_model = collections[idx], query_models[idx]

In [36]:
# Pull every stored chunk (metadata, text, embedding) out of the selected collection and wrap
# each text/metadata pair as a LangChain Document; these feed the synthetic test dataset.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])

lcdocs_chroma = []
for chunk_text, chunk_meta in zip(all_docs['documents'], all_docs['metadatas']):
    lcdocs_chroma.append(Document(page_content=chunk_text, metadata=chunk_meta))

# Number of documents retrieved from the collection.
print(len(lcdocs_chroma))

2222


In [37]:
# Build one dataframe row per stored chunk: a hash of its metadata as the id, the source file,
# the page (-1 when the metadata has no page), the text, and the embedding vector.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
chunk_metas = all_docs["metadatas"]

doc_columns = {
    "id": [data_processing._stable_hash_meta(chunk_meta) for chunk_meta in chunk_metas],
    "source": [chunk_meta.get("source") for chunk_meta in chunk_metas],
    "page": [chunk_meta.get("page", -1) for chunk_meta in chunk_metas],
    "document": all_docs["documents"],
    "embedding": all_docs["embeddings"],
}
df_docs = pd.DataFrame(doc_columns)

### Pinecone



In [None]:
# The notebook imports the Pinecone class under the alias `pinecone_client`, and this cell then
# rebinds that same name to a client instance. On a second run the name refers to the instance,
# so calling it fails. Importing the class under its own name keeps the cell re-runnable while
# leaving `pinecone_client` bound to the instance that later cells use.
from pinecone import Pinecone

pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes = pinecone_client.list_indexes()
indexes

In [None]:
# One entry per Pinecone index: the index name plus the Voyage embedding model used to query it.
# Each entry gets its own embeddings instance.
dbs = [
    {
        'index_name': pinecone_index_name,
        'query_model': VoyageAIEmbeddings(model='voyage-large-2-instruct',
                                          voyage_api_key=os.getenv('VOYAGE_API_KEY'),
                                          truncation=False),
    }
    for pinecone_index_name in ('voyage-large-2-instruct-2merge-0',
                                'voyage-large-2-instruct-0merge-400')
]

In [None]:
# Select database for determining synthetic dataset
idx = 0
# Index with `idx` (not a hard-coded 0) so that changing the selector above actually changes
# the database, matching how the other database sections pick their entry.
db = dbs[idx]
# Handle to the selected Pinecone index, used below to list and fetch vectors.
index = pinecone_client.Index(db['index_name'])

In [None]:
# Flatten the id batches produced by index.list() into a single list of vector ids.
# (The loop variable is deliberately not called `id`, which would shadow the builtin.)
ids = []
for id_batch in index.list():
    ids.extend(id_batch)

# Fetch the vectors in slices of `chunk_size` ids and keep every returned record.
docs = []
chunk_size = 200  # Tune to whatever doesn't error out, 200 won't for serverless
for start in range(0, len(ids), chunk_size):
    print(f"Fetching {start} to {start+chunk_size}")
    fetched = index.fetch(ids[start:start+chunk_size])['vectors']
    docs.extend(fetched.values())

# Build the dataframe once from all records instead of concatenating inside the loop.
# This avoids quadratic copying and yields a clean 0..n-1 index rather than index labels
# that restart at 0 for every fetched slice.
df_docs = pd.DataFrame(
    [
        {
            "id": vector_elm["id"],
            "source": vector_elm["metadata"]["source"],
            "page": vector_elm["metadata"]["page"],
            "document": vector_elm["metadata"]["page_content"],
            "embedding": vector_elm["values"],
        }
        for vector_elm in docs
    ],
    columns=["id", "source", "page", "document", "embedding"],
)

# Wrap each record's text and page/source metadata as a LangChain Document.
lcdocs_pinecone = [
    Document(page_content=vector_elm['metadata']['page_content'],
             metadata={'page': vector_elm['metadata']['page'],
                       'source': vector_elm['metadata']['source']})
    for vector_elm in docs
]

print(len(lcdocs_pinecone))

In [None]:
# Export all collections to pickles to store them
# for db in dbs:
#     df_temp_pinecone=data_processing.archive_db('Pinecone',db['index_name'],db['query_model'],export_pickle=True)

### RAGatouille



In [12]:
# Ask the project's admin helper for the existing RAGatouille indexes and display the result.
# Note: this rebinds `indexes`, which the Pinecone section also assigns.
# NOTE(review): format=False presumably returns the raw result rather than a formatted one — confirm in admin.
indexes=admin.show_ragatouille_indexes(format=False)
indexes

{'status': True, 'message': ['colbert-ir-colbertv2.0-2merge-0']}

In [13]:
# Pretrained model names available for RAGatouille; the selected entry is passed to
# RAGPretrainedModel.from_pretrained in the next cell.
# Note: this rebinds `dbs`, which the Pinecone section uses for a list of dicts.
dbs=['colbert-ir/colbertv2.0']

In [14]:
# Choose which pretrained model entry to load.
idx = 0
db = dbs[idx]

# TODO get this as a langchain retriever, pull docs
# Load the pretrained model, pointing it at the .ragatouille folder under LOCAL_DB_PATH.
ragatouille_root = os.path.join(os.getenv('LOCAL_DB_PATH'), '.ragatouille')
query_model = RAGPretrainedModel.from_pretrained(db, index_root=ragatouille_root)

# Attach to the existing index through the project helper, without re-initialising or clearing it.
docs_vectorstore = data_processing.initialize_database(
    'RAGatouille',
    'colbert-ir-colbertv2.0-2merge-0',
    query_model,
    'Standard',
    os.getenv('LOCAL_DB_PATH'),
    init_ragatouille=False,
    clear=False,
)



In [29]:
# Document chunks (chunked smaller according to token size): show the first one and the count.
docs = docs_vectorstore.model.collection
print(docs[0])
print(len(docs))

# Document metadata for original documents.
metadata = docs_vectorstore.model.docid_metadata_map
print(len(metadata))

# Map of document chunks to original document.
# Named `pid_docid_map` rather than `map`: a global called `map` would shadow Python's built-in
# map() for every cell run afterwards in this kernel.
pid_docid_map = docs_vectorstore.model.pid_docid_map
print(len(pid_docid_map))

NASA/CP—2008–215252 39th Aerospace Mechanisms Symposium Compiled by  E.A. Boesiger Lockheed Martin Space Systems Company, Sunnyvale, California May 2008Proceedings of a Symposium held at the V on Braun Center, Huntsville, Alabama,  Hosted by Marshall Space Flight Center  and Lockheed Martin Space Systems Company, Organized by the Mechanisms Education Association May 7–9, 2008 The NASA STI Program…in Proﬁle  Since its founding, NASA has been dedicated   to the advancement of aeronautics and space  science. The NASA Scientiﬁc and Technical  Information (STI) Program Ofﬁce plays a key   part in helping NASA maintain this important role.  The NASA STI program operates under the  auspices of the Agency Chief Information Ofﬁcer.  It collects, organizes, provides for archiving, and  disseminates NASA’s STI.
15090
2222
15090


I couldn't figure out how to export the per-document encodings from RAGatouille. I won't pursue this unless RAGatouille shows exceptional performance.

## Generate dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

Sometimes you'll get a tricky threading error. To clear it, fully close VS Studio, open a new window, and restart the kernel. The error also appears to be related to ragas versions newer than 0.1.6, so I'll stick with 0.1.6 until I find a way to test an upgrade.

In [None]:
# Generator LLM, tagged with its own model name.
generator_model = "gpt-3.5-turbo-0125"
synthetic_generator_llm = ChatOpenAI(model=generator_model, tags=[generator_model])

# Critic LLM, tagged with its own model name.
critic_model = 'gpt-4o'
synthetic_critic_llm = ChatOpenAI(model=critic_model, tags=[critic_model])

# Embeddings: reuse whichever `query_model` was assigned most recently in the sections above.
# Alternative (dedicated OpenAI embeddings):
# embedding_model='text-embedding-3-large'
# synthetic_embeddings = OpenAIEmbeddings(model=embedding_model,api_key=os.getenv('OPENAI_API_KEY'))
synthetic_embeddings = query_model

# Run parameters for testset generation: a single worker with long timeouts and many retries.
run_config = RunConfig(timeout=1000, max_retries=50, max_wait=1000, max_workers=1)

# Build the ragas test set generator from the LangChain LLM and embedding objects.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm, synthetic_critic_llm, synthetic_embeddings, run_config=run_config
)

In [None]:
# Inputs for synthetic test set generation.
# eval_size: number of samples to evaluate at a time (intended to circumvent OpenAI API rate limits).
eval_size = 100
# n_questions: number of questions to generate for each evaluation sample.
n_questions = 10

# Documents to generate questions from, and the CSV path named after the selected vector store.
lcdocs = lcdocs_chroma
testset_filename = f"testset_{docs_vectorstore.name}.csv"
fname = os.path.join('output', testset_filename)

In [None]:
# Generate the synthetic test set with the project helper.
# NOTE(review): `fname` is presumably where the helper writes the CSV — confirm in eval.generate_testset.
df_testset = eval.generate_testset(
    lcdocs,
    generator,
    eval_size,
    n_questions,
    fname,
    run_config,
)

# RAG Evaluation

This section will use the same base data as the synthetic test dataset but apply different RAG strategies:
* Different chunk sizes
* Embedding models
* LLMs
* Advanced RAG (parent-child, RAGatouille)

The database may not be the same as the synthetic test dataset but uses the same base data.

## Format dataset and database for RAG

In [39]:
# Load a previously generated test set from CSV. Set import_csv=False to keep the `df_testset`
# produced by the generation cell above instead.

# testset_name=docs_vectorstore.name    # Uncomment if you want to use the most recent testset
testset_name = 'text-embedding-3-large-2merge-0'
fname = os.path.join('output', f"testset_{testset_name}.csv")

import_csv = True
if import_csv:
    df_testset = pd.read_csv(fname)

# Temporarily keep only the first 4 rows to exercise the pipeline quickly.
df_testset = df_testset.head(4)

# Template dataframe to iterate over later: question id (taken from the row index), question
# text, and ground truth, in that column order.
df_qa_template = (
    df_testset[['question', 'ground_truth']]
    .copy()
    .assign(question_id=lambda frame: frame.index)
    .loc[:, ['question_id', 'question', 'ground_truth']]
)
# for column in ["answer", "source_documents", "answer_by", "query_model"]:
#     df_qa_template[column] = None

From here, you have a template dataframe of questions and ground truths to generate answers for and evaluate. For each model and database in `setup_data` below, this template dataframe is what gets filled in with RAG responses and scored against the RAGAS criteria.

## Use RAG to generate responses, evaluate

In [40]:
# Load the evaluation setup (which models and databases to evaluate) from the JSON config file.
json_file_path = "eval_models.json"
setup_data = json.loads(Path(json_file_path).read_text())

# Optionally restrict the run to the last two entries:
# setup_data['eval_models'] = setup_data['eval_models'][-2:]

In [41]:
# Display the loaded evaluation configuration for inspection.
setup_data

{'eval_models': [{'index_type': 'ChromaDB',
   'index_name': 'text-embedding-3-large-2merge-0',
   'query_model': {'query_model': 'OpenAI',
    'embedding_name': 'text-embedding-3-large'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4o',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'rag_type': 'Standard',
    'k': 4,
    'search_type': 'similarity',
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}},
  {'index_type': 'ChromaDB',
   'index_name': 'text-embedding-3-large-2merge-0',
   'query_model': {'query_model': 'OpenAI',
    'embedding_name': 'text-embedding-3-large'},
   'llm': {'llm_source': 'OpenAI',
    'llm_model': 'gpt-4',
    'model_options': {'temperature': 0.2, 'output_level': 1000}},
   'qa_model_params': {'rag_type': 'Standard',
    'k': 4,
    'search_type': 'similarity',
    'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}},
  {'index_type': 'ChromaDB',
   'i

In [48]:
# Debug cell: parent-child is not functioning properly. It is just returning the document
# chunks of the child, not the parent. Run one configuration on its own to investigate.
debug_model = setup_data['eval_models'][2]

# Database selection, mirrored into the shared settings dict.
index_type, index_name = debug_model['index_type'], debug_model['index_name']
sb.update(index_type=index_type, index_name=index_name)

# Query model and llm: copy each settings group into `sb` right before building its object.
sb.update(debug_model['query_model'])
query_model = admin.get_query_model(sb, secrets)
sb.update(debug_model['llm'])
llm = admin.set_llm(sb, secrets)

# QA model params are forwarded as keyword arguments.
qa_model_params = debug_model['qa_model_params']

# Use the QA model to query the documents, then keep its result.
qa_obj = queries.QA_Model(index_type, index_name, query_model, llm, **qa_model_params)
qa_obj.query_docs("Can you describe the failure modes of a latch?")
response = qa_obj.result

In [49]:
# Display the QA model object created in the debug cell.
qa_obj

<queries.QA_Model at 0x36ad65550>

In [44]:
# Run every configured model/database combination and accumulate the answers in `df_qa`.
# Data is cached along the way into rag_responses, so you can pick up where you left off.
# The accumulated dataframe is also pickled after each iteration (not row-by-row).
df_qa = pd.DataFrame()
for model in setup_data['eval_models']:
    print(model)

    # Database selection, mirrored into the shared settings dict.
    index_type, index_name = model['index_type'], model['index_name']
    sb.update(index_type=index_type, index_name=index_name)

    # Query model and llm: copy each settings group into `sb` right before building its object.
    sb.update(model['query_model'])
    query_model = admin.get_query_model(sb, secrets)
    sb.update(model['llm'])
    llm = admin.set_llm(sb, secrets)

    # QA model params
    qa_model_params = model['qa_model_params']

    # Generate responses for the template questions and append them to the running dataframe.
    df_qa_iter = eval.rag_responses(
        index_type, index_name, query_model, llm, qa_model_params,
        df_qa_template, df_docs, testset_name,
    )
    df_qa = pd.concat([df_qa, df_qa_iter], ignore_index=True)

    # Checkpoint: the file is named after the current index and holds everything gathered so far.
    checkpoint_path = os.path.join('output', f'df_qa_{index_name}.pickle')
    with open(checkpoint_path, "wb") as f:
        pickle.dump(df_qa, f)

{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-large-2merge-0', 'query_model': {'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}, 'llm': {'llm_source': 'OpenAI', 'llm_model': 'gpt-4o', 'model_options': {'temperature': 0.2, 'output_level': 1000}}, 'qa_model_params': {'rag_type': 'Standard', 'k': 4, 'search_type': 'similarity', 'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}
{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-large-2merge-0', 'query_model': {'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}, 'llm': {'llm_source': 'OpenAI', 'llm_model': 'gpt-4', 'model_options': {'temperature': 0.2, 'output_level': 1000}}, 'qa_model_params': {'rag_type': 'Standard', 'k': 4, 'search_type': 'similarity', 'local_db_path': '/Users/danmueller/Documents/GitHub/aerospace_chatbot/db'}}
{'index_type': 'ChromaDB', 'index_name': 'text-embedding-3-large-0merge-400-parent-child', 'query_model': {'query_model': 'Open

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
# Evaluate
# Passes the accumulated responses to the project's eval helper and rebinds `df_qa` to its
# return value, so re-running this cell feeds the previous result back in.
# `index_name` is whatever value the last-run cell above left it with.
df_qa = eval.eval_rag(index_name, df_qa)

In [None]:
# Persist or restore the evaluated dataframe, keyed by test set name.
# write=True saves the current `df_qa`; write=False (default) loads the saved copy instead.
write = False
qa_pickle_path = os.path.join('output', f'df_qa_{testset_name}.pickle')

if not write:
    with open(qa_pickle_path, "rb") as f:
        df_qa = pickle.load(f)
else:
    with open(qa_pickle_path, "wb") as f:
        pickle.dump(df_qa, f)

## Ragas eval, visualize

In [None]:
# Link from documents to questions, that used the document as source. Add UMAP column for visualization purposes.
# NOTE(review): the next cell reads f'df_{index_name}.parquet'; presumably this helper writes
# that file — confirm in eval.data_viz_prep.
df_visualize=eval.data_viz_prep(index_name,df_qa,df_docs)

In [None]:
# Load the combined questions-and-documents dataframe from parquet.
df = pd.read_parquet(f'df_{index_name}.parquet')

# Treat every column whose name contains "umap" as an embedding in Spotlight.
umap_dtypes = {column: Embedding for column in df.keys() if "umap" in column}

# Show the dataframe with the questions and answers in Spotlight, using the public RAG demo layout.
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=umap_dtypes,
)

## UMAP visualization forms clusters of the questions; workaround: UMAP only on documents