In [1]:
%load_ext autoreload
%autoreload 2

# Ragas evaluation
Test batch-mode RAG question answering and Ragas evaluation capabilities.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [2]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# from umap import UMAP
# import numpy as np

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
import eval
from data_processing import _stable_hash_meta, archive_db

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

True

# ChromaDB

## Connect to database

In [3]:
# Open the on-disk Chroma store located at <LOCAL_DB_PATH>/chromadb.
persistent_client = chromadb.PersistentClient(
    path=os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
)
collections = persistent_client.list_collections()

# Show the available collections as the cell output.
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-large-0merge-400-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries),
 Collection(name=text-embedding-3-large-ams-none-400-queries)]

In [4]:
# A Chroma collection's _embedding_function is not interchangeable with an embedding object,
# so the embedding model for each collection is kept in this list instead and looked up by
# the same position as the collection.
query_models = [
    OpenAIEmbeddings(model='text-embedding-3-large', openai_api_key=os.getenv('OPENAI_API_KEY'))
    for _ in range(5)  # five separate instances, all configured with the same model
]

## Export pickles

In [5]:
# Export all collections to pickles to store them. Takes a while, so it is off by default;
# set export=True to run it.
export=False

if export:
    # query_models is indexed by collection position, so the two lists must line up.
    assert len(collections)==len(query_models), "query_models needs one entry per collection"

    # Pass the embedding object from query_models rather than the collection's private
    # _embedding_function, which is not interchangeable with an embedding object.
    for collection, collection_query_model in zip(collections, query_models):
        df_temp_chroma=archive_db('ChromaDB',collection.name,collection_query_model,export_pickle=True)

    # An expression nested inside `if` is not auto-rendered by the notebook, so display the
    # preview of the last exported collection explicitly (skipped when nothing was exported).
    if collections:
        display(df_temp_chroma.head(5))

## Create data for synthetic dataset

In [6]:
# Position of the Chroma collection used to build the synthetic dataset.
idx=0

# Pick the collection and the embedding model stored at that same position.
docs_vectorstore, query_model = collections[idx], query_models[idx]

In [7]:
# Pull every record from the selected collection and wrap each text chunk, together with
# its metadata, in a langchain Document. These documents feed the synthetic test dataset.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
lcdocs_chroma = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

# Report how many documents were loaded.
print(len(lcdocs_chroma))

2222


In [8]:
# Build one dataframe row per stored chunk: a hash of its metadata as the id, its source and
# page (-1 when the metadata has no page), the chunk text, and its embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df_docs = pd.DataFrame(
    {
        "id": list(map(_stable_hash_meta, all_docs["metadatas"])),
        "source": [meta.get("source") for meta in all_docs["metadatas"]],
        "page": [meta.get("page", -1) for meta in all_docs["metadatas"]],
        "document": all_docs["documents"],
        "embedding": all_docs["embeddings"],
    }
)

## Generate synthetic dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

Sometimes you'll get a tricky threading error. To clear it, fully close vs studio, open a new window, and restart the kernel. The error also appears to be related to ragas versions newer than 0.1.6, so I'll stick with 0.1.6 until I find a way to test an upgrade.

In [9]:
# LLM that writes the synthetic questions; the model name doubles as its tag.
generator_model="gpt-3.5-turbo-0125"
synthetic_generator_llm = ChatOpenAI(
    model=generator_model,
    tags=[generator_model],
)

# LLM passed to the generator as the critic; tagged with its model name as well.
critic_model='gpt-4o'
synthetic_critic_llm = ChatOpenAI(
    model=critic_model,
    tags=[critic_model],
)

# Reuse the embedding model selected for the database. Alternative, left disabled:
# embedding_model='text-embedding-3-large'
# synthetic_embeddings = OpenAIEmbeddings(model=embedding_model,api_key=os.getenv('OPENAI_API_KEY'))
synthetic_embeddings=query_model

# Run parameters for testset generation
run_config=RunConfig(
    timeout=1000,
    max_retries=50,
    max_wait=1000,
    max_workers=1,
)

# Assemble the testset generator from the two LLMs and the embedding model.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings,
    run_config=run_config,
)

In [10]:
# Inputs for testset generation.
# eval_size:   number of samples to evaluate at a time; intended to circumvent OpenAI API rate limits.
# n_questions: number of questions to generate for each evaluation sample.
eval_size=100
n_questions=10

# The testset CSV is named after the selected collection and lives under output/.
fname=os.path.join(
    'output',
    f"testset_{docs_vectorstore.name}.csv",
)

# Documents the questions are generated from.
lcdocs=lcdocs_chroma

In [None]:
# Generate the synthetic testset from the documents using the inputs configured above.
df_testset=eval.generate_testset(
    lcdocs,
    generator,
    eval_size,
    n_questions,
    fname,
    run_config,
)

In [11]:
# When import_csv is set, load the testset from the CSV at fname instead of relying on
# a df_testset already present in the kernel.
import_csv=True
if import_csv:
    df_testset = pd.read_csv(fname)

# Temporarily keep only the first 10 rows so the rest of the flow can be exercised quickly.
df_testset=df_testset.head(10)

## RAG questions/answers (batch mode)

### Format dataset and database for RAG

In [12]:
# Keep the question/ground-truth pairs, label each row "Question <index>", record which
# model generated the questions, and put the columns in a fixed order.
df_questions = (
    df_testset[['question', 'ground_truth']]
    .copy()
    .assign(
        id=lambda frame: 'Question ' + frame.index.astype(str),
        question_by=generator_model,
    )
    [['id', 'question', 'ground_truth', 'question_by']]
)

In [13]:
# Load the cached RAG answers and source_documents ids from a file - or create an empty column.
# Both columns come from the same per-collection cache file, keyed on the question text.
df_questions_answers = df_questions
for cached_column in ("answer", "source_documents"):
    df_questions_answers = eval.add_cached_column_from_file(
        df_questions_answers,
        f"rag_response_cache_{docs_vectorstore.name}.txt",
        "question",
        cached_column,
    )

### Use RAG to generate responses

In [14]:
# Settings handed to the RAG run: which index to query, the embedding model and LLM to use,
# and the QA model parameters.
index_type='ChromaDB'
index_name=docs_vectorstore.name
query_model=synthetic_embeddings
llm=synthetic_critic_llm

QA_model_params={
    'rag_type':'Standard',
    'k':4,
    'search_type':'similarity',
    'local_db_path':os.getenv('LOCAL_DB_PATH'),
}


In [15]:
# Run the RAG pipeline over the questions; the returned frame replaces df_questions_answers.
# TODO add all of the parameters used to generate answers in the dataframe
df_questions_answers=eval.rag_responses(
    index_type,
    index_name,
    query_model,
    llm,
    QA_model_params,
    df_questions_answers,
    df_docs,
)

Processing question 1/10
Processing question 2/10
Processing question 3/10
Processing question 4/10
Processing question 5/10
Processing question 6/10
Processing question 7/10
Processing question 8/10
Processing question 9/10
Processing question 10/10


In [16]:
# Display the frame returned by eval.rag_responses for inspection.
df_questions_answers

Unnamed: 0,id,question,ground_truth,question_by,answer,source_documents,contexts,embedding
0,Question 0,How much power does the four-element piezo mot...,The four-element piezo motor itself consumes a...,gpt-3.5-turbo-0125,The power consumption of the four-element piez...,"099ec9c21af691b1133d034105a024e7148c71dd, b19f...","[ITI1S/117 ITT111117°a flight mechanism, addit...","[-0.009815779160309207, 0.006825634497646335, ..."
1,Question 1,How did statistical analysis determine the num...,Statistical analysis determined that 2 scars o...,gpt-3.5-turbo-0125,The specific statistical method used to determ...,"5845a4c11b7f34dd976c86e7c374d86d0e31b34b, 05c3...",[A Study on the Effects of Ball Defects on the...,"[0.02447172302812749, -0.03881108837734852, -0..."
2,Question 2,How is the stress field in the Si3N4 ball and ...,The stress field in the Si3N4 ball and raceway...,gpt-3.5-turbo-0125,The presence of a ball scar within the contact...,"0a22815cea1effbf06b31313a9a816dc0c2bf2b7, 05c3...",[Figure 5. Micron element size FEM (left) and ...,"[0.03700122098490776, -0.012941841482224214, -..."
3,Question 3,How important is testing in ensuring the relia...,Testing is crucial in ensuring the reliability...,gpt-3.5-turbo-0125,### How Testing Contributes to the Reliability...,"08865a28dee2dd08c9bf3de207520646e0314af7, 8eee...","[component failure, requires the understandi...","[-0.0055112459916348755, 0.01983761914535086, ..."
4,Question 4,How does the wear rate vary when different coa...,The lowest wear rate was obtained with the CSE...,gpt-3.5-turbo-0125,The types of coatings found to significantly a...,"00b8df52420abd80582ac5508f51154b4faf12ff, 4a1d...",[Block on Ring Friction Coefficient The avera...,"[-0.003935487775594324, -0.03574427439627591, ..."
5,Question 5,How did the ball-on-ball apparatus help create...,The ball-on-ball apparatus helped create artif...,gpt-3.5-turbo-0125,The ball-on-ball apparatus was used to create ...,"05c38a1ac579c4b37cc07249f581c9ebd9d82dae, 5c2e...","[15kVU , S68 SBbm Br 3 Balltresponsible for it...","[0.01197364974896236, -0.015068469939456788, -..."
6,Question 6,What apparatus is used to create artificial sc...,The apparatus used to create artificial scars ...,gpt-3.5-turbo-0125,The apparatus designed to create artificial sc...,"05c38a1ac579c4b37cc07249f581c9ebd9d82dae, 5c2e...","[15kVU , S68 SBbm Br 3 Balltresponsible for it...","[-0.0004984276446224549, -0.02846046075657779,..."
7,Question 7,How does the rail system design affect the pow...,The rail system design does not directly affec...,gpt-3.5-turbo-0125,The rail system design has a significant impac...,"099ec9c21af691b1133d034105a024e7148c71dd, 2ef0...","[ITI1S/117 ITT111117°a flight mechanism, addit...","[-0.01693620432043042, 0.022915279992765804, -..."
8,Question 8,How were artificial scars induced on Si 3N4 ba...,Artificial scars were induced on Si 3N4 balls ...,gpt-3.5-turbo-0125,### Methods for Inducing Artificial Scars on S...,"0a22815cea1effbf06b31313a9a816dc0c2bf2b7, 05c3...",[Figure 5. Micron element size FEM (left) and ...,"[0.018049016853648507, -0.003485609827243578, ..."
9,Question 9,What design changes were made to the launch la...,Design changes were made to address material a...,gpt-3.5-turbo-0125,### Changes to the Launch Latch Mechanism\n\n#...,"08837cf1bdb7c984624981368696dcbcbaf38f02, 7f97...",[Development and Testing \n\nDevelopment test ...,"[0.0036130547195561877, 0.0001756986150636199,..."


## Ragas eval, visualize

In [None]:
# Evaluate the RAG answers; the frame returned by eval.eval_rag replaces df_questions_answers.
df_questions_answers = eval.eval_rag(
    index_name,
    df_questions_answers,
)

In [None]:
# Display the frame returned by eval.eval_rag for inspection.
df_questions_answers

In [None]:
# Link from documents to questions, that used the document as source. Add UMAP column for visualization purposes.
# The evaluated questions/answers live in df_questions_answers (assigned by the eval.eval_rag
# cell); no df_qa_eval variable is defined anywhere in this notebook.
df_visualize=eval.data_viz_prep(index_name,df_questions_answers,df_docs)

In [None]:
# Load the visualization frame stored as df_<index_name>.parquet
# (presumably written by eval.data_viz_prep — TODO confirm).
df = pd.read_parquet(f'df_{index_name}.parquet')

# Open the frame in Spotlight, treating every column whose name contains "umap" as an embedding.
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype={column: Embedding for column in df.columns if "umap" in column},
)

## UMAP visualization forms a cluster of the questions; workaround: UMAP only on documents

# Pinecone

## Connect to database

In [None]:
# The import cell binds the Pinecone class to the name `pinecone_client`, and this cell
# rebinds that same name to a client instance. Construct the client from the class itself
# so that re-running this cell does not try to call the already-created instance.
from pinecone import Pinecone

pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes=pinecone_client.list_indexes()

# Show the available indexes as the cell output.
indexes

In [None]:
# One entry per Pinecone index: the index name and the embedding model used to query it.
# Each entry gets its own VoyageAIEmbeddings instance with identical settings.
dbs = [
    {
        'index_name': pinecone_index_name,
        'query_model': VoyageAIEmbeddings(model='voyage-large-2-instruct',
                                          voyage_api_key=os.getenv('VOYAGE_API_KEY'),
                                          truncation=False),
    }
    for pinecone_index_name in ('voyage-large-2-instruct-2merge-0',
                                'voyage-large-2-instruct-0merge-400')
]

In [None]:
# Inspect the first db, save for synthetic test dataset
db=dbs[0]
index = pinecone_client.Index(db['index_name'])

# index.list() is iterated in batches of ids; flatten them into one list.
# The loop variable is deliberately not called `id`, which would shadow the builtin
# for the rest of the kernel session.
ids=[]
for id_batch in index.list():
    ids.extend(id_batch)

# Fetch the stored vectors in chunks and collect every returned record.
docs=[]
chunk_size=200  # Tune to whatever doesn't error out, 200 won't for serverless
for start in range(0, len(ids), chunk_size):
    print(f"Fetching {start} to {start+chunk_size}")
    fetched=index.fetch(ids[start:start+chunk_size])['vectors']
    docs.extend(fetched.values())

# Rebuild langchain Documents from the metadata stored with each vector; only the page
# and source fields are carried over as Document metadata.
lcdocs_pinecone = []
for record in docs:
    record_meta=record['metadata']
    lcdocs_pinecone.append(Document(page_content=record_meta['page_content'],
                           metadata={'page':record_meta['page'],'source':record_meta['source']}))

print(len(lcdocs_pinecone))

In [None]:
# Archive every configured Pinecone index to a pickle, passing each index's own query model.
# df_temp_pinecone ends up holding the result for the last index only.
for db in dbs:
    df_temp_pinecone=archive_db(
        'Pinecone',
        db['index_name'],
        db['query_model'],
        export_pickle=True,
    )