In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Ragas evaluation
Tests batch-mode RAG question answering and Ragas evaluation.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [38]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from pinecone import Pinecone as pinecone_client, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_voyageai import VoyageAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

from ragas import evaluate
from ragas.metrics import answer_correctness
from datasets import Dataset

from renumics import spotlight
from renumics.spotlight import Embedding
import pandas as pd

# from umap import UMAP
# import numpy as np

# Import local packages
# The relative path is resolved against the kernel's working directory, so the notebook
# must be started from a directory where '../src/aerospace_chatbot' exists.
sys.path.append('../src/aerospace_chatbot')
import queries
# NOTE: binds the name `eval` to the local module, shadowing Python's built-in eval() in this notebook.
import eval
from data_processing import _stable_hash_meta, archive_db

# Set environment variables with .env
# override=True lets values from the .env file replace variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

# ChromaDB

## Connect to database

In [39]:
# Open the on-disk Chroma store located under LOCAL_DB_PATH/chromadb and list its collections
persistent_client = chromadb.PersistentClient(
    path=os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb'),
)
collections = persistent_client.list_collections()
collections

[Collection(name=text-embedding-3-large-2merge-0),
 Collection(name=text-embedding-3-large-0merge-400),
 Collection(name=text-embedding-3-large-0merge-400-parent-child),
 Collection(name=text-embedding-3-large-2merge-0-queries),
 Collection(name=text-embedding-3-large-ams-none-400-queries)]

In [40]:
# Chroma's collection._embedding_function isn't interchangeable with LangChain embedding objects,
# so keep a parallel list of query models indexed the same way as `collections`.
# One model is created per collection instead of hand-copying a fixed number of entries, so the
# list can never be shorter or longer than `collections`.
# NOTE(review): assumes every collection uses text-embedding-3-large (the collection names
# suggest so) - adjust here if a collection with a different embedding model is added.
query_models=[OpenAIEmbeddings(model='text-embedding-3-large',openai_api_key=os.getenv('OPENAI_API_KEY'))
              for _ in collections]

## Export pickles

In [41]:
# Export all collections to pickles to store them. Disabled by default because it takes a while;
# set export=True to run it.
export=False

if export:
    from IPython.display import display

    # Stays None when there are no collections, so the preview below cannot raise NameError.
    df_temp_chroma=None
    for collection in collections:
        # NOTE(review): this passes Chroma's private _embedding_function, whereas the Pinecone
        # export passes the LangChain query model - confirm which one archive_db expects.
        df_temp_chroma=archive_db('ChromaDB',collection.name,collection._embedding_function,export_pickle=True)

    # IPython only auto-displays the last *top-level* expression of a cell; an expression nested
    # inside `if` is silently discarded, so show the preview of the last export explicitly.
    if df_temp_chroma is not None:
        display(df_temp_chroma.head(5))

## Create data for synthetic dataset

In [42]:
# Choose which collection, and the query model stored at the same position, drives the synthetic dataset
idx=0

docs_vectorstore, query_model = collections[idx], query_models[idx]

In [43]:
# Pull every stored chunk from the selected collection and wrap each one as a LangChain
# Document; these documents seed the synthetic test dataset.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
lcdocs_chroma = [
    Document(page_content=text, metadata=meta)
    for text, meta in zip(all_docs['documents'], all_docs['metadatas'])
]

print(len(lcdocs_chroma))

2222


In [44]:
# Tabulate the collection: one row per stored chunk with an id derived from its metadata
# (via _stable_hash_meta), its source, page, text and embedding.
all_docs = docs_vectorstore.get(include=["metadatas", "documents", "embeddings"])
df_docs = pd.DataFrame(
    dict(
        id=[_stable_hash_meta(meta) for meta in all_docs["metadatas"]],
        source=[meta.get("source") for meta in all_docs["metadatas"]],
        page=[meta.get("page", -1) for meta in all_docs["metadatas"]],  # -1 when no page is recorded
        document=all_docs["documents"],
        embedding=all_docs["embeddings"],
    )
)

## Generate synthetic dataset

Good article on how models/embeddings are used in the `TestsetGenerator`: https://www.pondhouse-data.com/blog/evaluate-rag-performance-using-ragas

Sometimes you'll get a tricky threading error. Fully close vs studio, open a new window, restart the kernel, and it'll clear. The error also appears to be related to ragas versions newer than 0.1.6, so I'll stick with 0.1.6 for now until I find a way to test an upgrade.

In [45]:
# Generator LLM for the synthetic test set
generator_model="gpt-3.5-turbo-0125"
synthetic_generator_llm = ChatOpenAI(
    model=generator_model,
    tags=[generator_model],
)

# Critic LLM for the synthetic test set
critic_model='gpt-4o'
synthetic_critic_llm = ChatOpenAI(
    model=critic_model,
    tags=[critic_model],
)

# Previously tried alternative for either LLM above (Hugging Face inference endpoint), kept for reference:
#   ChatOpenAI(base_url='https://api-inference.huggingface.co/v1',
#              model='meta-llama/Meta-Llama-3-8B-Instruct',
#              api_key=os.getenv('HUGGINGFACEHUB_API_TOKEN'),
#              temperature=0.1,
#              max_tokens=500)

# Embeddings: reuse the query model selected for the database.
# Previously tried alternative, kept for reference:
#   OpenAIEmbeddings(model='text-embedding-3-large',api_key=os.getenv('OPENAI_API_KEY'))
synthetic_embeddings=query_model

# Run parameters for testset generation (single worker, long timeout, many retries)
run_config=RunConfig(
    timeout=1000,
    max_retries=50,
    max_wait=1000,
    max_workers=1,
)

# Build the generator from the LangChain LLM and embedding objects
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings,
    run_config=run_config,
)

In [46]:
# Inputs for testset generation
lcdocs=lcdocs_chroma   # Documents the test set is generated from.
eval_size=100          # Number of samples to evaluate at a time. Intended to circumvent OpenAI API rate limits.
n_questions=10         # Number of questions to generate for each evaluation sample.
fname=os.path.join(
    'output',
    f"testset_{docs_vectorstore.name}.csv",
)

In [36]:
# Generate the synthetic test set; every argument is defined in the preceding setup cells
df_testset=eval.generate_testset(
    lcdocs,
    generator,
    eval_size,
    n_questions,
    fname,
    run_config,
)

Index loop: [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200]
Processing index 0 to 100...


embedding nodes:   0%|          | 0/352 [00:00<?, ?it/s]

Exception in thread Thread-70:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/threading.py", line 1038, in _bootstrap_inner
    self.run()
  File "/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/ragas/executor.py", line 96, in run
    results = self.loop.run_until_complete(self._aresults())
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/ragas/executor.py", line 84, in _aresults
    raise e
  File "/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/ragas/executor.py", line 79, in _aresults
    r = await future
        ^^^^^^^^^^^^
  File "/Library/Frameworks/Py

ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exceptions=False` incase you want to show only a warning message instead.

## RAG questions/answers (batch mode)

### Format dataset and database for RAG

In [None]:
# Keep only the question/ground-truth pairs, label each row "Question <index>", record which
# model wrote the questions, and put the columns in their final order.
df_questions = (
    df_testset[['question', 'ground_truth']]
    .assign(
        id=lambda frame: 'Question ' + frame.index.astype(str),
        question_by=generator_model,
    )
    [['id', 'question', 'ground_truth', 'question_by']]
)

In [None]:
# Pull previously cached RAG answers and source-document ids (keyed on the question text)
# from the per-collection cache file - or get empty columns when nothing is cached.
rag_cache_file = f"rag_response_cache_{docs_vectorstore.name}.txt"

df_questions_answers = df_questions
for cached_column in ("answer", "source_documents"):
    df_questions_answers = eval.add_cached_column_from_file(
        df_questions_answers, rag_cache_file, "question", cached_column)

### Use RAG to generate responses

In [None]:
# Database and models used to answer the questions
index_type='ChromaDB'
index_name=docs_vectorstore.name
query_model=synthetic_embeddings
llm=synthetic_generator_llm

# Retrieval settings passed through to the QA model
QA_model_params=dict(
    rag_type='Standard',
    k=4,
    search_type='similarity',
    local_db_path=os.getenv('LOCAL_DB_PATH'),
)


In [None]:
# Answer the questions with the RAG pipeline configured in the previous cell
df_questions_answers_rag=eval.rag_responses(
    index_type,
    index_name,
    query_model,
    llm,
    QA_model_params,
    df_questions_answers,
    df_docs,
)

## Ragas eval, visualize

In [None]:
# Run the evaluation; returns the evaluation frame and an updated question/answer frame
(df_qa_eval,
 df_questions_answers_rag) = eval.eval_rag(
    index_name,
    df_questions_answers_rag,
)

In [None]:
# Display the question/answer frame returned by eval.eval_rag
df_questions_answers_rag

In [None]:
# Link each document to the questions that used it as a source, and add a UMAP column
# for visualization purposes.
df_visualize=eval.data_viz_prep(
    index_name,
    df_qa_eval,
    df_docs,
)

In [None]:
# Read the combined questions/documents frame for this index from parquet
# (presumably written by eval.data_viz_prep - TODO confirm)
df = pd.read_parquet(f'df_{index_name}.parquet')

# Every column whose name contains "umap" is declared to Spotlight as an embedding
umap_dtypes = {column: Embedding for column in df.columns if "umap" in column}

# Open the frame in Spotlight using the published RAG demo layout
spotlight.show(
    df,
    layout="https://spotlightpublic.blob.core.windows.net/docs-data/rag_demo/layout_rag_3.json",
    dtype=umap_dtypes,
)

## UMAP visualization forms a cluster of the questions; workaround: UMAP only on documents

# Pinecone

## Connect to database

In [None]:
# Import the class under its own name: the notebook-level alias `pinecone_client` is rebound to
# the client instance below, so calling the alias again when this cell is re-run would raise
# TypeError (the instance is not callable). Constructing from `Pinecone` keeps the cell re-runnable
# while leaving the `pinecone_client` name that later cells use unchanged.
from pinecone import Pinecone

pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
indexes=pinecone_client.list_indexes()
indexes

In [None]:
# One entry per Pinecone index, each paired with its own Voyage embedding model instance
dbs=[
    {'index_name':pinecone_index_name,
     'query_model': VoyageAIEmbeddings(model='voyage-large-2-instruct',
                                       voyage_api_key=os.getenv('VOYAGE_API_KEY'), truncation=False)}
    for pinecone_index_name in ('voyage-large-2-instruct-2merge-0',
                                'voyage-large-2-instruct-0merge-400')
]

In [None]:
# Inspect the first db, save for synthetic test dataset
db=dbs[0]
index = pinecone_client.Index(db['index_name'])

# index.list() is iterated in batches; flatten them into one list of vector ids.
# The loop variable is deliberately not called `id`: at notebook top level that would
# overwrite the built-in id() for every cell run afterwards.
ids=[]
for id_batch in index.list():
    ids.extend(id_batch)

# Fetch the stored records in chunks of ids
docs=[]
chunk_size=200  # Tune to whatever doesn't error out, 200 won't for serverless
for i in range(0, len(ids), chunk_size):
    print(f"Fetching {i} to {i+chunk_size}")
    vector=index.fetch(ids[i:i+chunk_size])['vectors']
    docs.extend(vector.values())

# Rebuild LangChain Documents from each record's metadata (text, page and source)
lcdocs_pinecone = [
    Document(page_content=meta['page_content'],
             metadata={'page':meta['page'],'source':meta['source']})
    for meta in (record['metadata'] for record in docs)
]

print(len(lcdocs_pinecone))

In [None]:
# Archive every Pinecone index to a pickle, using the query model paired with it in `dbs`
for db in dbs:
    df_temp_pinecone=archive_db(
        'Pinecone',
        db['index_name'],
        db['query_model'],
        export_pickle=True,
    )