In [1]:
# Import necessary libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness
import pandas as pd
from rag_pipeline import create_rag_pipeline, create_embeddings, create_vector_store

# Load your documents (use the same documents as the Chainlit app)
pdf_path_1 = "../docs/Blueprint-for-an-AI-Bill-of-Rights.pdf"
pdf_path_2 = "../docs/NIST_AI_600-1.pdf"
loader1 = PyMuPDFLoader(pdf_path_1)
loader2 = PyMuPDFLoader(pdf_path_2)
documents1 = loader1.load()
documents2 = loader2.load()
documents = documents1 + documents2

# Split the documents into chunks
text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50
)
eval_documents = text_splitter_eval.split_documents(documents)

# Set up embeddings using ADA
EMBEDDING_MODEL = "text-embedding-ada-002"
embeddings = create_embeddings(model_name=EMBEDDING_MODEL)

# Set up the Qdrant vector store
vectorstore = create_vector_store(eval_documents, embeddings)

# Create the RAG pipeline using the current ADA model
retriever = vectorstore.as_retriever()
rag_pipeline = create_rag_pipeline(retriever)

# Define the LLMs for test set generation
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")

# Initialize the TestsetGenerator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define test set distribution (simple, multi_context, reasoning)
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# Generate synthetic test set (5 QA pairs for this example)
num_qa_pairs = 5
testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions)

# Convert test set to pandas DataFrame for inspection
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

# Load the test set questions and ground truth answers
test_df = pd.read_csv("testset.csv")
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

embedding nodes:   0%|          | 0/1522 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
# Prepare the answers and context lists
answers = []
contexts = []

# Generate answers using the current RAG pipeline
for question in test_questions:
    # Invoke the RAG pipeline and get the response
    response = rag_pipeline.invoke({"query": question})
    
    # Append the generated answer (content) from the response
    answers.append(response.content)
    
    # Access the retrieved context separately from the response if available
    retrieved_context = response.additional_kwargs.get("context", [])
    contexts.append([context.page_content for context in retrieved_context])


In [5]:
# Create the HuggingFace Dataset for evaluation
response_dataset = Dataset.from_dict({
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths
})

# Evaluate using RAGAS metrics
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness
]
results = evaluate(response_dataset, metrics)

# Convert results to pandas DataFrame for analysis
results_df = results.to_pandas()
print(results_df)

# Save results to CSV
results_df.to_csv("ragas_evaluation_results.csv")


Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

                                            question contexts  \
0  What information should be documented regardin...       []   
1  What is the importance of implementing safety ...       []   
2  How is the AI model validated and documented f...       []   
3  How can diverse AI teams establish contextual ...       []   
4  How do privacy risks increase with GAI using p...       []   

                                              answer  \
0  The information that should be documented rega...   
1  The importance of implementing safety measures...   
2  The AI model is explained, validated, and docu...   
3  To establish contextual relevance with domain ...   
4  Privacy risks increase with GAI using personal...   

                                        ground_truth  faithfulness  \
0  Information about the AI system's knowledge li...           0.0   
1  Implementing safety measures, both prior to de...           0.0   
2  The AI model is validated, explained, and docu...          