In [3]:
# imports
import tiktoken
import os
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from qdrant_client.http.models import Distance, VectorParams
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import Qdrant
from langchain_openai.llms import OpenAI
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain.chains.conversation.memory import ConversationSummaryBufferMemory
from langchain.chains.conversation.base import ConversationChain
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.messages import SystemMessage, AIMessage, HumanMessage
from langchain_core.prompts import (ChatMessagePromptTemplate, SystemMessagePromptTemplate, 
                                    AIMessagePromptTemplate, HumanMessagePromptTemplate)
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts.chat import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain.output_parsers import OutputFixingParser

# RAGAS imports
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context

In [5]:
from dotenv import load_dotenv; _ = load_dotenv()

# Synthetic Test Data Generation

## Index relevant documents
We begin by first loading and indexing documents related to AI Ethics. These documents will be used to generate synthetic test data.

In [6]:
# Load datasets
pdf_loader = PyMuPDFLoader('Blueprint-for-an-AI-Bill-of-Rights.pdf')
docs_bill_of_rights = pdf_loader.load()

In [7]:
pdf_loader = PyMuPDFLoader('NIST.AI.600-1.pdf')
docs_nist = pdf_loader.load()

In [8]:
documents = docs_bill_of_rights + docs_nist

In [49]:
# Split documents in preparation for SDG
def tiktoken_len(text, model='text-embedding-3-small'):
    embedding = tiktoken.encoding_for_model(model)
    query = embedding.encode(text)
    return len(query)


text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
    length_function=tiktoken_len
)

In [None]:
split_documents = text_splitter.split_documents(documents)

In [13]:
# Index documents
LOCATION = ":memory:"
COLLECTION_NAME = "ai-ethics-sdg"
VECTOR_SIZE = 1536

embedding = OpenAIEmbeddings(model='text-embedding-3-small')

qdrant_client = QdrantClient(LOCATION)

qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embedding
)

qdrant_vector_store.add_documents(split_documents)

['2f5197eba494411283efc7efce5a8960',
 '2012da8723dd46808ac7f96e922aefb1',
 '9a68d8eb03bf4a27a6a4562aca5e791e',
 'ac265d97ebd44634b2f26a779a1bfb28',
 'c01bdb229a7041978cc124a5e1332ba7',
 '59d6abb72460429fbb198d274dc55c58',
 '3b09e720beb84d72b1e3c2a3cc728750',
 'e3cde20aa5764e7d91e5548101505bf7',
 'ea6c8e687ec448b394247e485df0cbf9',
 '1b9d9265d6f2433ab3ee4cfe1c2d5054',
 '3f82bb4754f440bfa83369a3d169aaed',
 '727f0defb1744bb6802db51177099bf5',
 '715ab65b957e4c3d9a3eeed4cac362fe',
 '77f86b63c75b4693bdebbeeb459bed66',
 'bb20387ec91e47e78610bcbaf08d1151',
 '8d486d77d52e4ed78f7fdf356f6a87f4',
 '523e5f60aeea408fb28e1501145f1067',
 '885f99f398da4007b348567d33b99fc9',
 '9e4d5a52c6f64356afea9e81cd1d31dd',
 '51183dc3ba374fb39eee25597ced6a91',
 '645c08de20ef40edbbf4c32a4b80a7b7',
 'd540a540aa3c4ba892b12047f4bef6a7',
 'f057a3b9021e403597f532472388bc0b',
 'edfd39342f554800a0dc7d67480c350d',
 '5f5cd2822dc0482ca773b7e31b46b253',
 '4f2577d89eb04189aef32861bbaadb74',
 'cb5efb3353464d91aa0dff3fad211a4f',
 

In [35]:
# Set up the RAG retrieval chain
from operator import itemgetter
prompt = """
Please answer the question below using the provided context. If the question cannnot be answered
using the context, politely state that you can't answer that question.

Question:
{question}

Context:
{context}
"""
prompt = ChatPromptTemplate.from_template(prompt)
embedding = OpenAIEmbeddings(model='text-embedding-3-small')
retriever = qdrant_vector_store.as_retriever()
llm = ChatOpenAI(model='gpt-4o', temperature=0)
rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

## Generate Synthetic Data
With the retrieval QA chain ready to test, we generate synthetic data

In [None]:
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.75,
    multi_context: 0.20,
    reasoning: 0.05
}

num_qa_pairs = 200 # You can reduce the number of QA pairs to 5 if you're experiencing rate-limiting issues

testset = generator.generate_with_langchain_docs(split_documents, num_qa_pairs, distributions)

In [17]:
df = testset.to_pandas()
df.to_csv('golden_eval_set.csv')

In [18]:
df.head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How can organizations enhance risk management ...,"[acquisition, human resources, legal, complian...",Organizations can enhance risk management rega...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
1,How is information integrity maintained in the...,[Information Integrity \nAI Actor Tasks: AI De...,Information integrity in the context of AI dev...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
2,How do conventional cybersecurity practices ne...,"[11 \nvalue chain (e.g., data inputs, processi...",Conventional cybersecurity practices may need ...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
3,How can the verification of fine-tuning ensure...,[MS-2.7-004 \nIdentify metrics that reﬂect the...,The verification of fine-tuning ensures that s...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True
4,What are examples of sensitive information tha...,[entities. \n7 What is categorized as sensiti...,Examples of sensitive information that may be ...,simple,"[{'source': 'NIST.AI.600-1.pdf', 'file_path': ...",True


## Evaluating RAG for text-embedding-3-small Pipeline using RAGAS

In [19]:
import pandas as pd
df = pd.read_csv('golden_eval_set.csv')

In [41]:
# Get answers to questions using our rag_chain
from tqdm.auto import tqdm
test_questions = df["question"].values.tolist()
test_groundtruths = df["ground_truth"].values.tolist()
answers = []
contexts = []

for question in tqdm(test_questions):
  response = rag_chain.invoke({"question": question})
  answers.append(response['response'].content)
  contexts.append([context.page_content for context in response["context"]])

In [42]:
from datasets import Dataset
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})
response_dataset.save_to_disk('te3-responses')

Saving the dataset (0/1 shards):   0%|          | 0/193 [00:00<?, ? examples/s]

In [43]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [44]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/965 [00:00<?, ?it/s]

Failed to parse output. Returning None.
Failed to parse output. Returning None.


In [45]:
results

{'faithfulness': 0.9130, 'answer_relevancy': 0.8136, 'context_recall': 0.9033, 'context_precision': 0.8869, 'answer_correctness': 0.6251}

Looks like `text-embedding-3-small` model performs quite well out of the box. 

In [48]:
# Saving the results
import pickle
with open('te3-eval-results', 'wb') as f:
    pickle.dump(results, f)

## Evaluating RAG for text-embedding-3-large and SemanticChunker Pipeline using RAGAS

In [54]:
!pip install -qU langchain-experimental

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.5 which is incompatible.
langchain-openai 0.1.25 requires langchain-core<0.3.0,>=0.2.40, but you have langchain-core 0.3.5 which is incompatible.
ragas 0.1.20 requires langchain-core<0.3, but you have langchain-core 0.3.5 which is incompatible.[0m[31m
[0m

In [56]:
from functools import partial
from langchain_experimental.text_splitter import SemanticChunker

text_splitter = SemanticChunker(OpenAIEmbeddings(model='text-embedding-3-large'))
split_documents = text_splitter.split_documents(documents)

In [70]:
# Save the semantically chunked docs because they are hard to compute
with open('semantic_chunked_docs', 'wb') as f:
    pickle.dump(split_documents, f)

In [57]:
# Index Documents
# Index documents
LOCATION = ":memory:"
COLLECTION_NAME = "ai-ethics-sdg-te3-large-semantic"
VECTOR_SIZE = 3072

embedding = OpenAIEmbeddings(model='text-embedding-3-large')

qdrant_client = QdrantClient(LOCATION)

qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
)

qdrant_vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
    embedding=embedding
)

_ = qdrant_vector_store.add_documents(split_documents)

In [58]:
# Set up the RAG retrieval chain
from operator import itemgetter
prompt = """
Please answer the question below using the provided context. If the question cannnot be answered
using the context, politely state that you can't answer that question.

Question:
{question}

Context:
{context}
"""
prompt = ChatPromptTemplate.from_template(prompt)
embedding = OpenAIEmbeddings(model='text-embedding-3-large')
retriever = qdrant_vector_store.as_retriever()
llm = ChatOpenAI(model='gpt-4o', temperature=0)
rag_chain = (
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": prompt | llm, "context": itemgetter("context")}
)

In [60]:
# Get answers to questions using our rag_chain
from tqdm.auto import tqdm
test_questions = df["question"].values.tolist()
test_groundtruths = df["ground_truth"].values.tolist()
answers = []
contexts = []

for question in tqdm(test_questions):
  response = rag_chain.invoke({"question": question})
  answers.append(response['response'].content)
  contexts.append([context.page_content for context in response["context"]])

  0%|          | 0/193 [00:00<?, ?it/s]

In [61]:
from datasets import Dataset
response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})
response_dataset.save_to_disk('te3-large-responses-semantic')

Saving the dataset (0/1 shards):   0%|          | 0/193 [00:00<?, ? examples/s]

In [62]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/965 [00:00<?, ?it/s]

Failed to parse output. Returning None.


In [63]:
results

{'faithfulness': 0.9020, 'answer_relevancy': 0.8151, 'context_recall': 0.8581, 'context_precision': 0.9013, 'answer_correctness': 0.6180}

In [64]:
# Saving the results
import pickle
with open('te3-large-semantic-eval-results', 'wb') as f:
    pickle.dump(results, f)

In [66]:
with open('te3-eval-results', 'rb') as f:
    results_te3_small = pickle.load(f)

In [67]:
with open('te3-large-semantic-eval-results', 'rb') as f:
    results_te3_large_semantic = pickle.load(f)

# Comparing the two Chunking Strategies

In [68]:
df_te3_small = pd.DataFrame(list(results_te3_small.items()), columns=['Metric', 'TE3-Small'])
df_te3_large_semantic = pd.DataFrame(list(results_te3_large_semantic.items()), columns=['Metric', 'TE3-Large-Semantic'])
df_merged = pd.merge(df_te3_small, df_te3_large_semantic, on='Metric')
df_merged['TE3-Small -> TE3-Large-Semantic'] = df_merged['TE3-Large-Semantic'] - df_merged['TE3-Small']
df_merged

Unnamed: 0,Metric,TE3-Small,TE3-Large-Semantic,TE3-Small -> TE3-Large-Semantic
0,faithfulness,0.91295,0.902031,-0.01092
1,answer_relevancy,0.813614,0.815119,0.001505
2,context_recall,0.903331,0.858105,-0.045226
3,context_precision,0.886874,0.901267,0.014393
4,answer_correctness,0.625057,0.618008,-0.007049


The above chart shows that while there are minor changes in some of the metrics, **there is no significant difference in these two chunking strategies.**