In [1]:
!pip install -q langchain==0.2.16 langchain-community==0.2.16 langchain-core==0.2.38 langchain-experimental==0.0.65 langchain-openai==0.1.23 langchain-qdrant==0.1.4 qdrant-client==1.11.3 ragas==0.1.20

In [3]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.11-cp38-abi3-macosx_10_9_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-macosx_10_9_x86_64.whl (18.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.9/18.9 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.24.11


In [4]:
# Import necessary libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness
import pandas as pd
from rag_pipeline import create_rag_pipeline, create_embeddings, create_vector_store
from pydantic.v1 import BaseModel


# Load your documents (use the same documents as the Chainlit app)
pdf_path_1 = "../docs/Blueprint-for-an-AI-Bill-of-Rights.pdf"
pdf_path_2 = "../docs/NIST_AI_600-1.pdf"
loader1 = PyMuPDFLoader(pdf_path_1)
loader2 = PyMuPDFLoader(pdf_path_2)
documents1 = loader1.load()
documents2 = loader2.load()
documents = documents1 + documents2
print(f"Number of documents loaded: {len(documents)}")

# Use Semantic Chunking instead of RecursiveCharacterTextSplitter
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
semantic_chunker = SemanticChunker(embeddings)

# Split the documents into semantic chunks
semantic_documents = semantic_chunker.split_documents(documents)
print(f"Number of document chunks created: {len(semantic_documents)}")

# Set up embeddings using ADA
EMBEDDING_MODEL = "text-embedding-ada-002"
embeddings = create_embeddings(model_name=EMBEDDING_MODEL)

# Set up the Qdrant vector store
vectorstore = create_vector_store(semantic_documents, embeddings)

# Create the RAG pipeline using the current ADA model
retriever = vectorstore.as_retriever()
rag_pipeline = create_rag_pipeline(retriever)

# Define the LLMs for test set generation
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")

# Initialize the TestsetGenerator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Define test set distribution (simple, multi_context, reasoning)
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

# Generate synthetic test set (5 QA pairs for this example)
num_qa_pairs = 5
testset = generator.generate_with_langchain_docs(semantic_documents, num_qa_pairs, distributions)
print("Generated synthetic test set:")
print(testset.to_pandas().head())

# Convert test set to pandas DataFrame for inspection
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

# Load the test set questions and ground truth answers
test_df = pd.read_csv("testset.csv")
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

# Prepare the answers and context lists
answers = []
contexts = []

# Generate answers using the current RAG pipeline
for question in test_questions:
    try:
        # Invoke the RAG pipeline and get the response
        response = rag_pipeline.invoke({"query": question})
        
        # Append the generated answer (content) from the response
        answers.append(response.content)
        
        # Access the retrieved context separately from the response if available
        retrieved_context = response.additional_kwargs.get("context", [])
        contexts.append([context.page_content for context in retrieved_context])

        print(f"Generated answer for question '{question}': {response.content}")
        print(f"Retrieved context: {[context.page_content for context in retrieved_context]}")
    except Exception as e:
        print(f"Error processing question '{question}': {e}")

# Create the HuggingFace Dataset for evaluation
response_dataset = Dataset.from_dict({
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths
})

# Evaluate using RAGAS metrics
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness
]

# Run RAGAS evaluation
results = evaluate(response_dataset, metrics)

# Convert results to pandas DataFrame for analysis
results_df = results.to_pandas()
print(results_df)

# Save results to CSV
results_df.to_csv("ragas_evaluation_results_semantic.csv")


Number of documents loaded: 137
Number of document chunks created: 322


Filename and doc_id are the same for all nodes.                   
Generating: 100%|██████████| 5/5 [00:43<00:00,  8.67s/it]


Generated synthetic test set:
                                            question  \
0  What are some examples of security measures th...   
1  How can harmful bias be addressed in the GAI s...   
2  How can diverse teams improve AI red-teaming i...   
3  How can GAI risks be reduced with adversarial ...   
4  What expectations should be met by automated s...   

                                            contexts  \
0  [ \n33 \nMEASURE 2.7: AI system security and r...   
1  [Confabulation \nMG-4.1-005 \nShare transparen...   
2  [Small user studies can provide feedback from ...   
3  [Action ID \nSuggested Action \nGAI Risks \nMS...   
4  [ \n \n \n \n \n \nDATA PRIVACY \nWHAT SHOULD ...   

                                        ground_truth evolution_type  \
0  Some examples of security measures that should...         simple   
1  Harmful bias in the GAI system can be addresse...         simple   
2  Diverse AI red teams can improve AI red-teamin...  multi_context   
3  Conductin

Evaluating: 100%|██████████| 25/25 [00:16<00:00,  1.54it/s]


                                            question contexts  \
0  What are some examples of security measures th...       []   
1  How can harmful bias be addressed in the GAI s...       []   
2  How can diverse teams improve AI red-teaming i...       []   
3  How can GAI risks be reduced with adversarial ...       []   
4  What expectations should be met by automated s...       []   

                                              answer  \
0  Some examples of security measures that should...   
1  To address harmful bias in the GAI system to e...   
2  Diverse teams can improve AI red-teaming in GA...   
3  To reduce GAI risks with adversarial testing a...   
4  The expectations that should be met by automat...   

                                        ground_truth  faithfulness  \
0  Some examples of security measures that should...           0.0   
1  Harmful bias in the GAI system can be addresse...           0.0   
2  Diverse AI red teams can improve AI red-teamin...          