In [1]:
!pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-experimental 0.0.64 requires langchain-community<0.3.0,>=0.2.10, but you have langchain-community 0.3.0 which is incompatible.
langchain-experimental 0.0.64 requires langchain-core<0.3.0,>=0.2.27, but you have langchain-core 0.3.2 which is incompatible.
langgraph 0.2.16 requires langchain-core<0.3,>=0.2.27, but you have langchain-core 0.3.2 which is incompatible.
langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.2 which is incompatible.
ragas 0.1.20 requires langchain-core<0.3, but you have langchain-core 0.3.2 which is incompatible.
langgraph-checkpoint 1.0.6 requires langchain-core<0.3,>=0.2.22, but you have langchain-core 0.3.2 which is incompatible.[0m[31m
[0m

In [2]:
!pip install -qU pymupdf ragas

In [3]:
import os
import getpass

# Turn on LangChain tracing for this session. The API key is prompted for
# interactively so that it is never written into the notebook source.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")

In [4]:
from uuid import uuid4

# Project name = fixed prefix + the first 8 hex characters of a fresh UUID,
# so every kernel session gets its own project.
os.environ["LANGCHAIN_PROJECT"] = f"AIM_Midterm - SDG - {uuid4().hex[:8]}"

In [5]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# RAG Chain

In [6]:
from aimakerspace.text_utils import CharacterTextSplitter, PDFFileLoader

# NIST generative-AI profile: build the loader, then read the document texts.
pdf_loader_NIST = PDFFileLoader("data/NIST.AI.600-1.pdf")
documents_NIST = pdf_loader_NIST.load_documents()

# Blueprint for an AI Bill of Rights: same two steps.
pdf_loader_Blueprint = PDFFileLoader("data/Blueprint-for-an-AI-Bill-of-Rights.pdf")
documents_Blueprint = pdf_loader_Blueprint.load_documents()

# A single splitter with its default settings chunks both documents.
text_splitter = CharacterTextSplitter()
split_documents_NIST, split_documents_Blueprint = (
    text_splitter.split_texts(documents_NIST),
    text_splitter.split_texts(documents_Blueprint),
)

In [7]:
from aimakerspace.openai_utils.prompts import (
    UserRolePrompt,
    SystemRolePrompt,
)
from aimakerspace.openai_utils.chatmodel import ChatOpenAI
from aimakerspace.vectordatabase import VectorDatabase


# System-role instructions for the RAG pipeline. The literal opens with a
# space followed by a backslash-newline, so the resulting string starts with a
# single leading space and no leading line break.
RAG_PROMPT_TEMPLATE = """ \
Use the provided context to answer the user's query.
You may not answer the user's query unless there is specific context in the following text.
If you do not know the answer, or cannot answer, please respond with "I don't know".
"""

# Wrapped once here; the pipeline class below calls rag_prompt.create_message().
rag_prompt = SystemRolePrompt(RAG_PROMPT_TEMPLATE)

# User-role template with two placeholders, {context} and {question}, that are
# filled in per query via user_prompt.create_message(question=..., context=...).
USER_PROMPT_TEMPLATE = """ \
Context:
{context}
User Query:
{question}
"""

user_prompt = UserRolePrompt(USER_PROMPT_TEMPLATE)

class RetrievalAugmentedQAPipeline:
    """Retrieve context for a question, build the prompts, and stream the model's answer.

    Relies on the module-level ``rag_prompt`` (system message) and
    ``user_prompt`` (user message template) defined alongside this class.
    """

    def __init__(self, llm: ChatOpenAI, vector_db_retriever: VectorDatabase) -> None:
        """Store the chat model and the vector database used for retrieval.

        Args:
            llm: Chat model; must provide ``astream(messages)``.
            vector_db_retriever: Vector database; must provide
                ``search_by_text(question, k=...)``.
        """
        # The annotation is the class, not an instance: annotations are
        # evaluated when the ``def`` runs, so writing ``ChatOpenAI()`` there
        # would construct a chat client as a side effect of defining the class.
        self.llm = llm
        self.vector_db_retriever = vector_db_retriever

    async def arun_pipeline(self, question: str):
        """Answer ``question`` using the four closest retrieved passages.

        Args:
            question: The user's query text.

        Returns:
            dict with two keys:
              * ``"response"`` - an async generator yielding chunks produced by
                ``llm.astream`` (nothing is sent to the model until it is iterated);
              * ``"context"`` - the raw list returned by ``search_by_text``.
        """
        context_list = self.vector_db_retriever.search_by_text(question, k=4)

        # Element [0] of every hit is used as the passage text; each passage is
        # terminated with a newline.
        context_prompt = "".join(context[0] + "\n" for context in context_list)

        formatted_system_prompt = rag_prompt.create_message()
        formatted_user_prompt = user_prompt.create_message(
            question=question, context=context_prompt
        )

        async def generate_response():
            # Re-yield the model's stream so the caller controls consumption.
            async for chunk in self.llm.astream([formatted_system_prompt, formatted_user_prompt]):
                yield chunk

        return {"response": generate_response(), "context": context_list}


# Generate synthetic data

In [8]:
from langchain_community.document_loaders import PyMuPDFLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Source PDFs, loaded page-by-page as LangChain documents.
# NOTE(review): documents_NIST / documents_Blueprint were bound earlier to the
# aimakerspace loader output; from here on they hold PyMuPDFLoader results.
filepath_NIST = "data/NIST.AI.600-1.pdf"
filepath_Blueprint = "data/Blueprint-for-an-AI-Bill-of-Rights.pdf"

documents_NIST = PyMuPDFLoader(filepath_NIST).load()
documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()
documents = documents_NIST + documents_Blueprint

# Models used by the test-set generator: one writes, one critiques, one embeds.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini", tags=["base_llm"])
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

generator = TestsetGenerator.from_langchain(generator_llm, critic_llm, embeddings)

# Share of each question evolution type in the generated set (sums to 1.0).
distributions = {simple: 0.5, multi_context: 0.4, reasoning: 0.1}

# Generate 20 questions across both documents, with debug logging enabled.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions=distributions,
    with_debugging_logs=True,
)
testset.to_pandas()

embedding nodes:   0%|          | 0/284 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 1, 'structure': 1, 'relevance': 2, 'score': 1.25}
[ragas.testset.evolutions.INFO] retrying evolution: 0 times
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Listening to the American People', 'Algorithmic and data-driven harms', 'Panel discussions', 'Consumer rights and protections', 'Automated society']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Risk Management Framework', 'Trustworthy AI', 'Bias in Artificial Intelligence', 'Language models', 'AI deception']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Deepf

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,Why is providing notice and explanation import...,[ \n \n \n \n \nNOTICE & \nEXPLANATION \nWHY ...,Providing notice and explanations in the conte...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
1,"How can designers, developers, and deployers o...",[ ­­­­­­­\nALGORITHMIC DISCRIMINATION Protecti...,"Designers, developers, and deployers of automa...",simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
2,What information should be included in reporti...,"[ \n \n \n \n \n \n \nHUMAN ALTERNATIVES, \nCO...",Reporting for automated systems used in sensit...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
3,What mechanisms are in place for inventorying ...,[ \n16 \nGOVERN 1.5: Ongoing monitoring and pe...,Mechanisms for inventorying AI systems include...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
4,What resources are provided in the National In...,[ \n57 \nNational Institute of Standards and T...,The National Institute of Standards and Techno...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
5,Why is it important for both police and the pu...,[ \n \n \n \n \nNOTICE & \nEXPLANATION \nWHY ...,Both police and the public deserve to understa...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
6,How can continuous monitoring be implemented t...,[ \n29 \nMS-1.1-006 \nImplement continuous mon...,Continuous monitoring of GAI system impacts ca...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
7,What sparked the state-wide biometrics morator...,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,A state-wide biometrics moratorium was sparked...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
8,How can bias in artificial intelligence be ide...,[ \n57 \nNational Institute of Standards and T...,Towards a Standard for Identifying and Managin...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
9,How do advertisement delivery systems reinforc...,[ \n \n \nWHY THIS PRINCIPLE IS IMPORTANT\nTh...,Advertisement delivery systems reinforce racia...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True


# Dataset creation

In [10]:

from langsmith import Client
from datasets import Dataset


# LangSmith client; credentials come from the environment variables set earlier.
client = Client()

# NOTE(review): re-running this cell when a dataset with this name already
# exists may be rejected by the service — confirm before a full re-run.
dataset_name = "Implications of AI"
dataset = client.create_dataset(
    dataset_name=dataset_name, description="Questions about the implications of AI"
)

In [11]:
# Upload one LangSmith example per generated test row.
# iterrows() yields (index, row) pairs; the row carries the actual columns.
for _, row in testset.to_pandas().iterrows():
  client.create_example(
      inputs={
          "question": row["question"]
      },
      outputs={
          "answer": row["ground_truth"]
      },
      metadata={
          # Store the source passages the question was generated from, not the
          # DataFrame row index (the first element of the iterrows() pair).
          "context": list(row["contexts"])
      },
      dataset_id=dataset.id
  )



In [12]:
# Pull the generated questions and their reference answers out as plain lists.
test_questions, test_groundtruths = (
    testset.to_pandas()[column].values.tolist()
    for column in ("question", "ground_truth")
)

# Show the first question/answer pair as a quick sanity check.
print(test_questions[0], test_groundtruths[0], sep="\n")

Why is providing notice and explanation important in the context of automated decision-making processes?
Providing notice and explanations in the context of automated decision-making processes is important because it allows individuals to understand how automated systems are impacting their lives. Without clear explanations, people may not know why certain decisions are made, leading to a lack of transparency and accountability. Notice and explanations also help experts verify the reasonableness of recommendations before they are enacted, ensuring safety and efficacy. In order to guard against potential harms, it is crucial for the public to know if automated systems are being used and how they are making decisions that affect rights, opportunities, and access. Clear and valid explanations should be recognized as a baseline requirement to build trust and confidence in the use of automated systems.


In [91]:
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant

# Baseline setup: embeddings created without an explicit model name, and the
# chat model that writes the answers.
# NOTE(review): this rebinds `embeddings`, which the synthetic-data cell had
# set to OpenAIEmbeddings(model="text-embedding-3-small").
embeddings = OpenAIEmbeddings()
primary_qa_llm = ChatOpenAI(model_name="gpt-4o-mini")


# In-memory Qdrant collection built from the PyMuPDF page documents.
vectorstore = Qdrant.from_documents(
    documents=documents,
    embedding=embeddings,
    location=":memory:",
    collection_name="Implications of AI"
)

retriever = vectorstore.as_retriever()

# Build the aimakerspace vector database from both sets of text chunks.
# NOTE(review): `vector_db` is not referenced by the chain below, which
# retrieves through the Qdrant `retriever` — confirm whether it is still needed.
vector_db = VectorDatabase()
vector_db = await vector_db.abuild_from_list(split_documents_NIST)
vector_db = await vector_db.abuild_from_list(split_documents_Blueprint)

# Single-message template (instructions + {context} + {question}) for the
# LangChain chain. This rebinds RAG_PROMPT_TEMPLATE; `rag_prompt` was already
# constructed from the earlier string before this reassignment.
RAG_PROMPT_TEMPLATE = """ \
Use the provided context to answer the user's query.
You may not answer the user's query unless there is specific context in the following text.
If you do not know the answer, or cannot answer, please respond with "I don't know".
Context:
{context}
User Query:
{question}
"""

prompt = ChatPromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

retrieval_augmented_qa_chain = (
    # Step 1: retrieve documents for the question and carry the question along.
    {"context": itemgetter("question") | retriever, "question": itemgetter("question")}
    # Step 2: re-assigns "context" to the value it already has (appears to be a no-op).
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Step 3: return the model output together with the retrieved documents.
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

# Smoke test: "response" is the chat model's output, "context" the retrieved documents.
result = retrieval_augmented_qa_chain.invoke({"question" : "Is AI a threat to humanity?"})
print(result)

{'response': AIMessage(content="I don't know.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 1515, 'total_tokens': 1519, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_1bb46167f9', 'finish_reason': 'stop', 'logprobs': None}, id='run-f51c923b-1e1d-4a11-a971-2b6d7fbb9aee-0', usage_metadata={'input_tokens': 1515, 'output_tokens': 4, 'total_tokens': 1519}), 'context': [Document(metadata={'source': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 3, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': '', 'id': '8b974c51-84d5-4da7-a434-077c375bad2

In [71]:
result = retrieval_augmented_qa_chain.invoke({"question" : "What is NIST?"})
print(result)

{'response': 'NIST stands for the National Institute of Standards and Technology. It develops measurements, technology, tools, and standards to advance reliable, safe, transparent, explainable, privacy-enhanced, and fair artificial intelligence (AI). NIST has been conducting work on AI for more than a decade and is involved in efforts to fulfill the 2023 Executive Order on Safe, Secure, and Trustworthy AI.', 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 2, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': '', 'id': '516fbe2f-9208-4540-a9ee-6d03200f3341', '_id': 

In [72]:
result = retrieval_augmented_qa_chain.invoke({"question" : "What is a concern of AI?"})
print(result)

{'response': 'A concern of AI is that it can produce outputs that may be erroneous, leading to ill-founded decision-making or amplifying harmful biases. Additionally, AI incidents can cause harm to health, disrupt critical infrastructure, violate human rights, or lead to misinformation.', 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 7, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:20240805143048-04'00'", 'trapped': '', 'id': '0e3c2613-ec8e-40c1-96b5-3453c2d22503', '_id': '04147c3070094fc3affb7f1e7b26043c', '_collection_name': 'Implications of AI'}, page_content='outputs, which may be erroneo

In [75]:
# Run every test question through the baseline chain, keeping the answer text
# and the text of each retrieved document side by side.
answers, contexts = [], []

for question in test_questions:
    response = retrieval_augmented_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([doc.page_content for doc in response["context"]])

In [28]:
from datasets import Dataset

# Assemble questions, generated answers, retrieved contexts and reference
# answers into one dataset under these four column names.
response_dataset = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=answers,
        contexts=contexts,
        ground_truth=test_groundtruths,
    )
)
response_dataset[0]

{'question': 'Why is providing notice and explanation important in the context of automated decision-making processes?',
 'answer': 'Providing notice and explanation is important in the context of automated decision-making processes because it helps individuals understand if and how automated systems are being used to influence significant outcomes in their lives, such as employment, credit, and legal decisions. Clear and accessible notice allows the public to be informed about the use of these systems, which is essential for ensuring accountability and trust. \n\nMoreover, explanations of how and why decisions are made by automated systems are crucial for validating the fairness and accuracy of those decisions. This transparency enables individuals to contest or challenge decisions that may adversely affect them, as it provides the necessary information to understand the reasoning behind the outcomes. Without such notice and explanations, people may feel powerless and unable to addres

In [29]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

# The five ragas metrics imported above, collected into one list that is
# passed to evaluate().
metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [30]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [31]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,Why is providing notice and explanation import...,[ \n \n \n \n \nNOTICE & \nEXPLANATION \nWHY ...,Providing notice and explanation is important ...,Providing notice and explanations in the conte...,1.0,0.996779,1.0,1.0,0.747138
1,"How can designers, developers, and deployers o...",[ ­­­­­­­\nALGORITHMIC DISCRIMINATION Protecti...,"Designers, developers, and deployers of automa...","Designers, developers, and deployers of automa...",1.0,0.976883,1.0,1.0,0.653547
2,What information should be included in reporti...,[ \n \n \n \n \n \n \nDATA PRIVACY \nWHAT SHOU...,Reporting for automated systems used in sensit...,Reporting for automated systems used in sensit...,1.0,1.0,1.0,1.0,0.266613
3,What mechanisms are in place for inventorying ...,[ \n16 \nGOVERN 1.5: Ongoing monitoring and pe...,Mechanisms for inventorying AI systems include...,Mechanisms for inventorying AI systems include...,1.0,1.0,1.0,1.0,0.996998
4,What resources are provided in the National In...,[ \nNIST Trustworthy and Responsible AI \nNIS...,I don't know.,The National Institute of Standards and Techno...,0.0,0.0,0.0,0.416667,0.179853
5,Why is it important for both police and the pu...,[ \n \n \n \n \nNOTICE & \nEXPLANATION \nWHY ...,It is important for both police and the public...,Both police and the public deserve to understa...,1.0,0.995294,0.5,1.0,0.78481
6,How can continuous monitoring be implemented t...,[ \n36 \nMEASURE 2.11: Fairness and bias – as ...,Continuous monitoring of GAI system impacts ca...,Continuous monitoring of GAI system impacts ca...,1.0,0.89833,1.0,1.0,0.76127
7,What sparked the state-wide biometrics morator...,[ \n \nAPPENDIX\nPanel 3: Equal Opportunitie...,The state-wide biometrics moratorium was spark...,A state-wide biometrics moratorium was sparked...,1.0,0.928142,1.0,1.0,0.529175
8,How can bias in artificial intelligence be ide...,[ \n \n \nAbout AI at NIST: The National Insti...,According to the National Institute of Standar...,Towards a Standard for Identifying and Managin...,0.769231,0.951132,1.0,0.5,0.227159
9,How do advertisement delivery systems reinforc...,[ \n \n \nWHY THIS PRINCIPLE IS IMPORTANT\nTh...,Advertisement delivery systems reinforce racia...,Advertisement delivery systems reinforce racia...,1.0,1.0,1.0,0.916667,0.997382


# Fine Tuning

In [35]:
import nest_asyncio
nest_asyncio.apply()
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters
!pip install -qU faiss-cpu unstructured==0.15.7 python-pptx==1.0.2 nltk==3.9.1

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-experimental 0.0.64 requires langchain-community<0.3.0,>=0.2.10, but you have langchain-community 0.3.0 which is incompatible.
langchain-experimental 0.0.64 requires langchain-core<0.3.0,>=0.2.27, but you have langchain-core 0.3.2 which is incompatible.
langgraph 0.2.16 requires langchain-core<0.3,>=0.2.27, but you have langchain-core 0.3.2 which is incompatible.
ragas 0.1.20 requires langchain-core<0.3, but you have langchain-core 0.3.2 which is incompatible.
langgraph-checkpoint 1.0.6 requires langchain-core<0.3,>=0.2.22, but you have langchain-core 0.3.2 which is incompatible.[0m[31m
[0m

In [43]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter: chunks of at most 750 characters (measured with len),
# overlapping by 20. This rebinds `text_splitter`, which previously held the
# aimakerspace CharacterTextSplitter.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=750,
    chunk_overlap=20,
    length_function=len,
)

# Chunk the combined page documents; later cells slice this list into splits.
training_documents = text_splitter.split_documents(documents)

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")

In [44]:
import uuid

# Give every chunk a unique string id in its metadata.
id_set = set()

for document in training_documents:
  doc_id = str(uuid.uuid4())
  # Regenerate on the (extremely unlikely) collision. The retry must also be
  # converted to str: a raw UUID object never compares equal to the strings in
  # id_set, and it is not JSON-serializable when the corpora are dumped later.
  while doc_id in id_set:
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [45]:
# Fixed positional split of the chunk list: first 300 for training, the next
# 50 for validation, the next 50 for test. Slicing silently yields shorter
# lists if fewer than 400 chunks exist.
training_split_documents, val_split_documents, test_split_documents = (
    training_documents[:300],
    training_documents[300:350],
    training_documents[350:400],
)

In [40]:
from langchain_openai import ChatOpenAI

# Chat model used to write questions; temperature is set to 0.
qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)
from langchain_core.prompts import ChatPromptTemplate

# Prompt with two placeholders: {n_questions} (how many questions to write)
# and {context} (the chunk text). The requested output is a numbered list,
# one question per line, which create_questions() later splits line by line.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

# Prompt piped into the chat model; invoked with a dict supplying both placeholders.
question_generation_chain = qa_prompt_template | qa_chat_model

In [46]:
import tqdm

def create_questions(documents, n_questions):
  """Generate questions for each document and record which document answers them.

  Each document's ``page_content`` is sent through ``question_generation_chain``
  and the reply is parsed as a numbered list ("1. ...", "2. ...").

  Args:
    documents: Iterable of objects exposing ``page_content`` and ``metadata["id"]``.
    n_questions: Number of questions requested from the model per document.

  Returns:
    A ``(questions, relevant_docs)`` tuple:
      * ``questions`` maps a fresh UUID string to the question text;
      * ``relevant_docs`` maps the same UUID string to a one-element list
        holding the id of the document the question was generated from.
  """
  import re  # local import so the function does not depend on another cell

  # "<number>." or "<number>)" followed by the question text, which is captured
  # whole so periods inside it ("U.S.", "v1.0") are preserved.
  numbered_line = re.compile(r"^\s*\d+[.)]\s*(.+?)\s*$")

  questions = {}
  relevant_docs = {}

  for document in tqdm.tqdm(documents):
    questions_generated = question_generation_chain.invoke(
        {"context": document.page_content, "n_questions": n_questions}
    )

    for line in questions_generated.content.split("\n"):
      match = numbered_line.match(line)
      if match is None:
        # Blank separator lines and unnumbered text are not questions; skipping
        # them avoids storing empty-string entries.
        continue

      question_id = str(uuid.uuid4())
      questions[question_id] = match.group(1)
      relevant_docs[question_id] = [document.metadata["id"]]

  return questions, relevant_docs

In [42]:
# Generate 2 questions per chunk for each split (one model call per chunk).
# NOTE(review): `test_questions` was bound earlier to a plain list of testset
# questions; after this cell it is a dict keyed by question id, so cells that
# iterate over `test_questions` behave differently depending on run order.
training_questions, training_relevant_contexts = create_questions(training_split_documents, 2)
val_questions, val_relevant_contexts = create_questions(val_split_documents, 2)
test_questions, test_relevant_contexts = create_questions(test_split_documents, 2)

100%|██████████| 300/300 [06:01<00:00,  1.20s/it]
100%|██████████| 50/50 [00:59<00:00,  1.18s/it]
100%|██████████| 50/50 [00:56<00:00,  1.14s/it]


In [47]:
import json

# Chunk id -> chunk text for the training split.
training_corpus = dict(
    (chunk.metadata["id"], chunk.page_content) for chunk in training_split_documents
)

# Questions, question -> chunk-id links, and the corpus in one structure.
train_dataset = dict(
    questions=training_questions,
    relevant_contexts=training_relevant_contexts,
    corpus=training_corpus,
)

# Written as a single JSON object, despite the .jsonl suffix.
with open("training_dataset.jsonl", "w") as f:
    json.dump(train_dataset, f)

In [48]:
# Chunk id -> chunk text for the validation split.
val_corpus = dict(
    (chunk.metadata["id"], chunk.page_content) for chunk in val_split_documents
)

# Questions, question -> chunk-id links, and the corpus in one structure.
val_dataset = dict(
    questions=val_questions,
    relevant_contexts=val_relevant_contexts,
    corpus=val_corpus,
)

# Written as a single JSON object, despite the .jsonl suffix.
with open("val_dataset.jsonl", "w") as f:
    json.dump(val_dataset, f)

In [49]:
import json
# Chunk id -> chunk text for the *test* split. Named test_corpus to match the
# training_corpus / val_corpus pattern and to avoid confusion with the
# training data.
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}
# Backward-compatible alias for the name this cell used to define.
train_corpus = test_corpus

# Questions, question -> chunk-id links, and the corpus in one structure.
test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

# Written as a single JSON object, despite the .jsonl suffix.
with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [64]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# In-memory collection over the recursively-split chunks, embedded with
# whatever `embeddings` is bound to when this cell runs.
# NOTE(review): this store is not queried anywhere in the visible notebook —
# confirm whether it is still needed.
trained_docs_vectorstore = Qdrant.from_documents(
    documents=training_documents, # RecursiveCharacterTextSplitter
    embedding=embeddings,
    location=":memory:",
    collection_name="Fine Tuned Implications of AI- RecursiveCharacterTextSplitter"
)

# Comparison setup: same page-level documents as the baseline store, but
# embedded with text-embedding-3-small.
finetune_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

finetune_vectorstore = Qdrant.from_documents(
    documents=documents,
    embedding=finetune_embeddings,
    location=":memory:",
    collection_name="Fine Tuned Implications of AI- TE3"
)

# Retrieve the top 6 documents from the comparison store. This is the only
# binding of the name, so it is clear which store the comparison chain reads from.
finetune_retriever = finetune_vectorstore.as_retriever(search_kwargs={"k": 6})

In [65]:
# Same three-step shape as the baseline chain, with the same prompt and chat
# model, but retrieving through finetune_retriever.
finetune_rag_chain = (
    # Retrieve documents for the question and carry the question along.
    {"context": itemgetter("question") | finetune_retriever, "question": itemgetter("question")}
    # Re-assigns "context" to the value it already has (appears to be a no-op).
    | RunnablePassthrough.assign(context=itemgetter("context"))
    # Return the model output together with the retrieved documents.
    | {"response": prompt | primary_qa_llm, "context": itemgetter("context")}
)

# Smoke test with the same question used against the baseline chain.
fine_tuned_result = finetune_rag_chain.invoke({"question" : "What is a concern of AI?"})
print(fine_tuned_result)


{'response': 'A concern of AI is the potential for outputs to be erroneous, which may lead to ill-founded decision-making or amplify harmful biases. Additionally, there are risks associated with human-AI configuration, such as inappropriate anthropomorphizing of AI systems, algorithmic aversion, automation bias, over-reliance, or emotional entanglement with AI systems. Furthermore, there is a lowered barrier to entry for generating and supporting the exchange of content that may not distinguish fact from opinion or fiction, which could be leveraged for large-scale disinformation campaigns.', 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 7, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'pro

In [55]:
!pip install -qU ragas

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-huggingface 0.1.0 requires langchain-core<0.4,>=0.3.0, but you have langchain-core 0.2.41 which is incompatible.[0m[31m
[0m

In [56]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import OpenAIEmbeddings

# Models for the second test-set generator: one writes, one critiques, one embeds.
# NOTE(review): this rebinds `embeddings` (no explicit model name here); cells
# that read `embeddings` therefore depend on which assignment ran last.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

In [57]:
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

In [58]:
testset = generator.generate_with_langchain_docs(test_split_documents, test_size=20, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25})

embedding nodes:   0%|          | 0/100 [00:00<?, ?it/s]

[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Risk assessment', 'Public inspection', 'Trade secrets', 'Discovery in criminal matter']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Principles into practice', 'Real-life examples', 'Combat discrimination', 'Mortgage lending', 'Nationwide initiative']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Americans with Disabilities Act', 'Title I', 'Disparity assessments', 'Healthcare algorithm', "Black patients' healthcare access"]}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Automated systems', 'Technical standards', 'Algorithmic discrimination', 'Anti-discrimination law', 'Proactive technical steps']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Protection Bureau', 'Prudential regulators', 'Automated Valuation Models', 'Equal Employment Opportunity Commission', 'Department of Justice']}
[ragas.testset.extractor.DEBUG] topics: {'keyphrases': ['Sociodemographic variables', "Algorithm's o

Generating:   0%|          | 0/20 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.25}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Target measure', 'Unobservable targets', 'Inappropriate use of proxies', 'Algorithmic discrimination', 'Equity goals']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.25}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Americans with Disabilities Act', 'Title I', 'Disparity assessments', 'Healthcare algorithm', "Black patients' healthcare access"]
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.25}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated systems', 'Technical standards', 'Algorithmic discrimination', 'Independent evaluation', 'Public sector uses']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 2, 'struc

In [59]:
testset.to_pandas().head()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,"How are standards, guidance, audits, and impac...","[launched, preventing harm to the public. Fede...",Standards and guidance for the use of automate...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
1,How did the healthcare algorithm discriminate ...,[under Title I of the Americans with Disabilit...,The healthcare algorithm discriminated against...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
2,What proactive steps can be taken to ensure au...,[WHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\...,The proactive steps that can be taken to ensur...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
3,What steps are involved in the design stage eq...,"[consultation, design stage equity assessments...",The steps involved in the design stage equity ...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
4,How has the Department of Justice addressed th...,[Protection Bureau and prudential regulators. ...,The Equal Employment Opportunity Commission an...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True


In [77]:
from datasets import Dataset

def generate_answers(chain, testset):
  """Run every testset question through ``chain`` and collect the results.

  Args:
    chain: Object whose ``invoke({"question": ...})`` returns a mapping with
      ``"response"`` (either plain text or an object exposing ``.content``)
      and ``"context"`` (an iterable of objects exposing ``.page_content``).
    testset: Object whose ``to_pandas()`` result has ``"question"`` and
      ``"ground_truth"`` columns.

  Returns:
    The result of ``Dataset.from_dict`` with the columns ``question``,
    ``answer``, ``contexts`` and ``ground_truth``.
  """
  # Convert once and read both columns from the same frame.
  testset_df = testset.to_pandas()
  questions = testset_df["question"].values.tolist()
  ground_truths = testset_df["ground_truth"].values.tolist()

  answers = []
  contexts = []

  for question in tqdm.tqdm(questions):
    answer = chain.invoke({"question": question})
    raw_response = answer["response"]
    # The response may be a message object (text in .content) or already a string.
    answers.append(raw_response.content if hasattr(raw_response, "content") else raw_response)
    contexts.append([context.page_content for context in answer["context"]])

  return Dataset.from_dict({
      "question" : questions,
      "answer" : answers,
      "contexts" : contexts,
      "ground_truth" : ground_truths
  })

In [78]:
base_dataset = generate_answers(retrieval_augmented_qa_chain, testset)


  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [00:53<00:00,  2.69s/it]


In [79]:
finetune_dataset = generate_answers(finetune_rag_chain, testset)

100%|██████████| 20/20 [00:53<00:00,  2.67s/it]


In [80]:
from ragas.metrics import (
    context_recall,
    context_precision,
)

In [81]:
from ragas import evaluate

result = evaluate(
    base_dataset,
    metrics=[
        context_precision,
        context_recall,
    ],
)

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [82]:
result

{'context_precision': 0.9417, 'context_recall': 0.9000}

In [83]:
result.to_pandas().head()

Unnamed: 0,question,contexts,answer,ground_truth,context_precision,context_recall
0,"How are standards, guidance, audits, and impac...","[launched, preventing harm to the public. Fede...",Federal government agencies have been developi...,Standards and guidance for the use of automate...,1.0,1.0
1,How did the healthcare algorithm discriminate ...,[under Title I of the Americans with Disabilit...,The healthcare algorithm discriminated against...,The healthcare algorithm discriminated against...,1.0,1.0
2,What proactive steps can be taken to ensure au...,[and deployers of automated systems should tak...,Proactive steps that can be taken to ensure au...,The proactive steps that can be taken to ensur...,1.0,0.666667
3,What steps are involved in the design stage eq...,"[consultation, design stage equity assessments...",The steps involved in the design stage equity ...,The steps involved in the design stage equity ...,0.833333,1.0
4,How has the Department of Justice addressed th...,[the severity of certain diseases in Black Ame...,The Department of Justice has clearly laid out...,The Equal Employment Opportunity Commission an...,1.0,1.0


In [85]:
fine_tuned_result = evaluate(
    finetune_dataset,
    metrics=[
        context_precision,
        context_recall,
    ],
)


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [86]:
fine_tuned_result


{'context_precision': 0.9347, 'context_recall': 0.9500}

In [87]:
fine_tuned_result.to_pandas().head()

Unnamed: 0,question,contexts,answer,ground_truth,context_precision,context_recall
0,"How are standards, guidance, audits, and impac...","[launched, preventing harm to the public. Fede...",Federal government agencies have been developi...,Standards and guidance for the use of automate...,1.0,1.0
1,How did the healthcare algorithm discriminate ...,[under Title I of the Americans with Disabilit...,The healthcare algorithm discriminated against...,The healthcare algorithm discriminated against...,1.0,1.0
2,What proactive steps can be taken to ensure au...,[and deployers of automated systems should tak...,Proactive steps that can be taken to ensure au...,The proactive steps that can be taken to ensur...,1.0,0.666667
3,What steps are involved in the design stage eq...,"[consultation, design stage equity assessments...",The design stage equity assessments for access...,The steps involved in the design stage equity ...,0.926667,1.0
4,How has the Department of Justice addressed th...,[the severity of certain diseases in Black Ame...,The Department of Justice has clearly laid out...,The Equal Employment Opportunity Commission an...,1.0,1.0
