In [1]:
# Import necessary libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness
import pandas as pd
from rag_pipeline import create_rag_pipeline, create_embeddings, create_vector_store

In [2]:
# Load your documents (use the same documents as the Chainlit app)
pdf_path_1 = "../docs/Blueprint-for-an-AI-Bill-of-Rights.pdf"
pdf_path_2 = "../docs/NIST_AI_600-1.pdf"
loader1 = PyMuPDFLoader(pdf_path_1)
loader2 = PyMuPDFLoader(pdf_path_2)
documents1 = loader1.load()
documents2 = loader2.load()
documents = documents1 + documents2

In [3]:
print(f"Number of documents loaded: {len(documents)}")

Number of documents loaded: 137


In [4]:
# Split the documents into chunks
text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=50
)
eval_documents = text_splitter_eval.split_documents(documents)

In [5]:
print(f"Number of document chunks created: {len(eval_documents)}")

Number of document chunks created: 761


In [6]:
# Set up embeddings using ADA
EMBEDDING_MODEL = "text-embedding-ada-002"
embeddings = create_embeddings(model_name=EMBEDDING_MODEL)

# Set up the Qdrant vector store
vectorstore = create_vector_store(eval_documents, embeddings)

# Create the RAG pipeline using the current ADA model
retriever = vectorstore.as_retriever()
rag_pipeline = create_rag_pipeline(retriever)

# Define the LLMs for test set generation
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")

# Initialize the TestsetGenerator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define test set distribution (simple, multi_context, reasoning)
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# Generate synthetic test set (5 QA pairs for this example)
num_qa_pairs = 5
testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions)

embedding nodes:   0%|          | 0/1522 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
print("Generated synthetic test set:")
print(testset.to_pandas().head())

Generated synthetic test set:
                                            question  \
0  What tasks are AI Actors responsible for in th...   
1  How does the lack of transparency in GAI syste...   
2  How can organizations document AI risk within ...   
3  How does generative AI impact data poisoning i...   
4  How can AI teams ensure diverse representation...   

                                            contexts  \
0  [Chain and Component Integration \nMP-5.2-002 ...   
1  [2.4. Data Privacy \nGAI systems raise several...   
2  [40 \nMANAGE 1.3: Responses to the AI risks de...   
3  [arXiv. https://arxiv.org/pdf/2310.07879 \nLen...   
4  [AI Actor Tasks: AI Deployment \n \nMAP 1.2: I...   

                                        ground_truth evolution_type  \
0  AI Actors responsible for the integration of c...         simple   
1  The lack of transparency in GAI system trainin...         simple   
2  Responses to the AI risks deemed high priority...  multi_context   
3  The answe

In [8]:
# Convert test set to pandas DataFrame for inspection
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

# Load the test set questions and ground truth answers
test_df = pd.read_csv("testset.csv")
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [9]:
# Prepare the answers and context lists
answers = []
contexts = []

# Generate answers using the current RAG pipeline
for question in test_questions:
    # Invoke the RAG pipeline and get the response
    response = rag_pipeline.invoke({"query": question})
    
    # Append the generated answer (content) from the response
    answers.append(response.content)
    
    # Access the retrieved context separately from the response if available
    retrieved_context = response.additional_kwargs.get("context", [])
    contexts.append([context.page_content for context in retrieved_context])


In [10]:
print(f"Generated answer for question '{question}': {response.content}")
print(f"Retrieved context: {[context.page_content for context in retrieved_context]}")

Generated answer for question 'How can AI teams ensure diverse representation for interdisciplinary collaborations?': To ensure diverse representation for interdisciplinary collaborations, AI teams can establish and empower interdisciplinary teams that reflect a wide range of capabilities, competencies, demographic groups, domain expertise, and educational backgrounds. This approach helps prioritize opportunities for interdisciplinary collaboration and ensures a diverse mix of perspectives and skills within the team.
Retrieved context: []


In [11]:
# Create the HuggingFace Dataset for evaluation
response_dataset = Dataset.from_dict({
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths
})

# Evaluate using RAGAS metrics
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness
]
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/25 [00:00<?, ?it/s]

In [13]:
# Convert results to pandas DataFrame for analysis
results_df = results.to_pandas()
print(results_df)

                                            question contexts  \
0  What tasks are AI Actors responsible for in th...       []   
1  How does the lack of transparency in GAI syste...       []   
2  How can organizations document AI risk within ...       []   
3  How does generative AI impact data poisoning i...       []   
4  How can AI teams ensure diverse representation...       []   

                                              answer  \
0  AI Actors are responsible for tasks such as AI...   
1  The lack of transparency in GAI system trainin...   
2  To document AI risk within their limits, organ...   
3                                      I don't know.   
4  To ensure diverse representation for interdisc...   

                                        ground_truth  faithfulness  \
0  AI Actors responsible for the integration of c...           0.0   
1  The lack of transparency in GAI system trainin...           0.0   
2  Responses to the AI risks deemed high priority...          

In [14]:
# Save results to CSV
results_df.to_csv("ragas_evaluation_results.csv")
