# Ragas evaluation
Test the batch query mode of aerospace_chatbot and the capabilities of Ragas.

Uses this article as a model: https://towardsdatascience.com/visualize-your-rag-data-evaluate-your-retrieval-augmented-generation-system-with-ragas-fc2486308557

Ragas repository: https://github.com/explodinggradients/ragas/tree/main

In [None]:
import os, sys
import json
from pathlib import Path
import pickle

from ragas.testset import TestsetGenerator
from ragas import RunConfig
from dotenv import load_dotenv,find_dotenv
import chromadb
from chromadb import PersistentClient
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents import Document
import pandas as pd
import random

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
from data_processing import _stable_hash_meta

# Set environment variables with .env
load_dotenv(find_dotenv(), override=True)

## Nifty functions

In [None]:
def write_dict_to_file(data_dict, filename):
    """Append ``data_dict`` to ``filename`` as a single JSON line.

    The file is opened in append mode, so repeated calls accumulate one
    serialized dictionary per line.
    """
    with open(filename, mode="a") as handle:
        serialized = json.dumps(data_dict)
        handle.write(f"{serialized}\n")

def read_dicts_from_file(filename):
    """Lazily yield the parsed JSON value of every line in ``filename``.

    The file is expected to hold one JSON document per line (the format
    produced by ``write_dict_to_file``). Wrap the call in ``list(...)`` to
    load all records at once.

    Blank or whitespace-only lines are skipped, so a stray empty line (for
    example after hand-editing the file) does not abort the whole read.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(filename, "r") as f:
        for line in f:
            # An empty line is not valid JSON; ignore it instead of failing.
            if not line.strip():
                continue
            yield json.loads(line)
            
def add_cached_column_from_file(df, file_name, merge_on, column):
    """Attach ``column`` to ``df`` from a JSON-lines cache file, if one exists.

    When ``file_name`` exists, its records are loaded into a DataFrame, the
    first record per ``merge_on`` value is kept, records with a missing value
    in ``column`` or ``merge_on`` are dropped, and the result is left-merged
    onto ``df`` using ``merge_on`` as the key.

    When ``file_name`` does not exist, a copy of ``df`` is returned in which
    ``column`` is filled with ``None`` and placed last.

    ``df`` itself is never modified.
    """
    if not Path(file_name).exists():
        # No cache yet: hand back a copy with an empty placeholder column.
        placeholder = df.copy()
        placeholder[column] = None
        ordered = [name for name in placeholder.columns if name != column]
        ordered.append(column)
        return placeholder[ordered]

    cached = pd.DataFrame(list(read_dicts_from_file(file_name)))
    cached = cached.drop_duplicates(subset=[merge_on])
    cached = cached[[column, merge_on]].dropna().reset_index(drop=True)

    merged = df.merge(cached, on=merge_on, how="left")
    return merged.reset_index(drop=True)

## Connect to database

In [None]:
# Open the on-disk Chroma database stored under $LOCAL_DB_PATH/chromadb.
chroma_dir = os.path.join(os.getenv('LOCAL_DB_PATH'), 'chromadb')
persistent_client = chromadb.PersistentClient(path=chroma_dir)

# Embedding model handed to the vectorstore as its embedding function.
query_model = OpenAIEmbeddings(
    model='text-embedding-ada-002',
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# Collection where no chunking was done (only full PDF pages).
# Alternative collection: 'chromadb-openai-ams-400chunk-2000-2020'
index_name = 'chromadb-openai-ams-full-2000-2020'

vectorstore = Chroma(
    client=persistent_client,
    collection_name=index_name,
    embedding_function=query_model,
)

In [None]:
# Fetch every record in the collection together with its metadata, text and embedding.
fields_to_fetch = ["metadatas", "documents", "embeddings"]
all_docs = vectorstore.get(include=fields_to_fetch)

In [None]:
# Wrap each stored text and its metadata in a LangChain Document.
lcdocs = []
for page_text, page_meta in zip(all_docs['documents'], all_docs['metadatas']):
    lcdocs.append(Document(page_content=page_text, metadata=page_meta))

In [None]:
# Show the first Document as a quick sanity check of the conversion.
lcdocs[0]

## Generate synthetic dataset

In [None]:
# LLM passed to the testset generator as the question generator.
generator_model = "gpt-3.5-turbo-16k"
synthetic_generator_llm = ChatOpenAI(
    model=generator_model,
    tags=[generator_model],
)

# Alternative generator back-ends (disabled):
# generator_model="google/gemma-7b-it"
# synthetic_generator_llm = ChatOpenAI(base_url='https://api-inference.huggingface.co/v1',
#                             model=generator_model,
#                             api_key=os.getenv('HUGGINGFACEHUB_API_TOKEN'),
#                             tags=generator_model)
# synthetic_generator_llm = ChatOpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

# LLM passed to the testset generator as the critic.
# Alternative critic: critic_model="gpt-4"
critic_model = "gpt-3.5-turbo-0125"
synthetic_critic_llm = ChatOpenAI(
    model=critic_model,
    tags=[critic_model],
)

# Embedding model passed to the testset generator (library default model).
synthetic_embeddings = OpenAIEmbeddings()

In [None]:
# Build the Ragas testset generator from the generator LLM, critic LLM and embeddings.
generator = TestsetGenerator.from_langchain(
    synthetic_generator_llm,
    synthetic_critic_llm,
    synthetic_embeddings,
)

if not lcdocs:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("No documents were loaded from the vectorstore; nothing to sample.")

# 500 is the max size before you'll hit rate limits. random.sample raises
# ValueError when asked for more items than the population holds, so cap the
# request at the number of available documents.
max_sample_size = 500
sample_size = min(max_sample_size, len(lcdocs))
percent_total = sample_size / len(lcdocs)
print(percent_total)

# Get a random sample of lcdocs
lcdocs_random = random.sample(lcdocs, sample_size)

In [None]:
# TODO run this in batches according to the rate limits

# Reuse a previously generated testset when testset.csv is present;
# otherwise generate one and save it for later runs.
if os.path.exists('./testset.csv'):
    df_testset = pd.read_csv('testset.csv')
else:
    n_questions = 5
    run_config = RunConfig(
        timeout=1000,
        max_retries=50,
        max_wait=1000,
        max_workers=1,
    )

    testset = generator.generate_with_langchain_docs(
        lcdocs_random,
        test_size=n_questions,
        with_debugging_logs=True,
        is_async=True,
        run_config=run_config,
        raise_exceptions=True,
    )
    df_testset = testset.to_pandas()
    df_testset.to_csv('testset.csv', index=False)

In [None]:
# Display the generated (or reloaded) testset.
df_testset

### Format dataset and database for RAG

In [None]:
# Keep only the question / ground-truth pairs and record which model wrote them.
df_questions = df_testset[['question', 'ground_truth']].assign(
    question_by=generator_model,
)

In [None]:
# Display the question table.
df_questions

# Disabled earlier variant: built df_questions from a `questions_all` list,
# added an "id" column and dropped duplicated questions.
# df_questions = pd.DataFrame(
#     {
#         "id": [f"Question {i}" for i, _ in enumerate(questions_all)],
#         "question": [qa["question"] for qa in questions_all],
#         "ground_truth": [qa["ground_truth"] for qa in questions_all],
#         "question_by": [qa["question_by"] for qa in questions_all],
#     }
# )
# # keep only the first question if questions are duplicated
# df_questions = df_questions.drop_duplicates(subset=["question"])
# df_questions

In [None]:
# Fetch every stored record (metadata, text and embedding) and tabulate it,
# using the hash of each record's metadata as its id.
all_docs = vectorstore.get(include=["metadatas", "documents", "embeddings"])
doc_metadatas = all_docs["metadatas"]
df_docs = pd.DataFrame(
    dict(
        id=list(map(_stable_hash_meta, doc_metadatas)),
        source=[meta.get("source") for meta in doc_metadatas],
        page=[meta.get("page", -1) for meta in doc_metadatas],  # -1 when no page is recorded
        document=all_docs["documents"],
        embedding=all_docs["embeddings"],
    )
)

In [None]:
# Preview the first ten rows of the document table.
df_docs.head(10)

In [None]:
# Load the cached RAG answers and source-document ids from a file, or create
# empty columns when no cache exists yet.
#
# The batch RAG loop appends its responses to "rag_response_cache.json", so
# that is the file to read back. (This cell used to read
# "rag_response_cache.txt", which nothing writes, so cached answers were never
# reused and every question was re-queried on each run.)
rag_cache_file = "rag_response_cache.json"

df_questions_answers = add_cached_column_from_file(
    df_questions, rag_cache_file, "question", "answer")

df_questions_answers = add_cached_column_from_file(
    df_questions_answers, rag_cache_file, "question", "source_documents")

# The cache stores source_documents as a JSON list of ids, whereas the batch
# loop and the context-lookup cell work with a ', '-joined string. Normalise
# cached lists to that string form; missing values are left untouched.
df_questions_answers["source_documents"] = df_questions_answers["source_documents"].map(
    lambda ids: ', '.join(ids) if isinstance(ids, list) else ids
)

In [None]:
# Display the questions together with any cached answers and source documents.
df_questions_answers

# RAG questions/answers (batch mode)

The cell below demonstrates using aerospace_chatbot in batch mode. It requires some basic parameter setup and a `queries.QA_Model` instance.

In [None]:
# TODO turn this into a function/standalone notebook to show how to use the QA model in batch mode

# Parameters for the QA model. index_name is the collection name set in the
# database-connection cell.
index_type = 'ChromaDB'
query_model = synthetic_embeddings
llm = synthetic_generator_llm

QA_model_params = dict(
    rag_type='Standard',
    k=4,
    search_type='similarity',
    local_db_path=os.getenv('LOCAL_DB_PATH'),
)

for i, row in df_questions_answers.iterrows():
    # Skip questions that already have an answer (e.g. loaded from the cache).
    existing_answer = row['answer']
    if not (existing_answer is None or pd.isnull(existing_answer) or existing_answer == ''):
        continue

    print(f"Processing question {i+1}/{len(df_questions_answers)}")

    # A new QA model is built for every question, then queried.
    qa_obj = queries.QA_Model(
        index_type,
        index_name,
        query_model,
        llm,
        **QA_model_params,
    )
    qa_obj.query_docs(row['question'])
    response = qa_obj.result

    answer_text = response['answer'].content
    row_label = df_questions_answers.index[i]
    df_questions_answers.loc[row_label, "answer"] = answer_text

    # Identify each referenced document by the hash of its metadata.
    ids = [_stable_hash_meta(reference.metadata) for reference in response['references']]
    df_questions_answers.loc[row_label, "source_documents"] = ', '.join(ids)

    # Save the response to cache file
    write_dict_to_file(
        {
            "question": row['question'],
            "answer": answer_text,
            "source_documents": ids,
        },
        "rag_response_cache.json",
    )

In [None]:
# Get the context documents content for each question

# Build the id -> document text lookup once instead of scanning df_docs for
# every single id. drop_duplicates keeps the first row per id, matching the
# previous "first match wins" lookup.
doc_by_id = (
    df_docs.drop_duplicates(subset="id")
    .set_index("id")["document"]
    .to_dict()
)

source_documents_list = []
for cell in df_questions_answers['source_documents']:
    if isinstance(cell, str):
        # ', '-joined ids, optionally wrapped in brackets; ignore empty pieces.
        doc_ids = [doc_id for doc_id in cell.strip('[]').split(', ') if doc_id]
    elif isinstance(cell, (list, tuple)):
        doc_ids = list(cell)
    else:
        # None / NaN: the question has no recorded source documents.
        doc_ids = []

    unknown_ids = [doc_id for doc_id in doc_ids if doc_id not in doc_by_id]
    if unknown_ids:
        raise KeyError(f"source_documents ids not found in df_docs: {unknown_ids}")

    source_documents_list.append([doc_by_id[doc_id] for doc_id in doc_ids])

df_questions_answers["contexts"] = source_documents_list

In [None]:
# Additionally get embeddings for the questions, cached on disk so they are
# not recomputed on every run.
embeddings_cache_path = Path("question_embeddings.pickle")

question_embeddings = None
if embeddings_cache_path.exists():
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # this notebook created itself.
    with embeddings_cache_path.open("rb") as f:
        question_embeddings = pickle.load(f)
    if len(question_embeddings) != len(df_questions_answers):
        # The cache was built for a different set of questions; assigning it
        # would fail with a length mismatch, so rebuild it instead.
        # (A stale cache of the same length is not detected by this check.)
        question_embeddings = None

if question_embeddings is None:
    question_embeddings = [
        synthetic_embeddings.embed_query(question)
        for question in df_questions_answers["question"]
    ]
    with embeddings_cache_path.open("wb") as f:
        pickle.dump(question_embeddings, f)

# answer_embeddings = pickle.load(open("answer_embeddings_2040214_1111.pickle", "rb"))
df_questions_answers["embedding"] = question_embeddings

In [None]:
# Display the final table including the "contexts" and "embedding" columns.
df_questions_answers