In [1]:
!pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-experimental 0.0.64 requires langchain-community<0.3.0,>=0.2.10, but you have langchain-community 0.3.0 which is incompatible.
langchain-experimental 0.0.64 requires langchain-core<0.3.0,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.
langgraph 0.2.16 requires langchain-core<0.3,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.
langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.1 which is incompatible.
langgraph-checkpoint 1.0.6 requires langchain-core<0.3,>=0.2.22, but you have langchain-core 0.3.1 which is incompatible.[0m[31m
[0m

In [20]:
!pip install -qU pymupdf ragas

In [3]:
import os
import getpass

# Environment configuration for LangSmith tracing.
# NOTE(review): LANGCHAIN_TRACING_V2 is presumably the switch LangChain reads to
# turn tracing on — confirm against the LangSmith docs.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# getpass prompts for the key at run time, so the secret is never written into the notebook.
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangChain API Key:")

In [4]:
from uuid import uuid4

# Tag the project name with the first eight hex characters of a fresh UUID so
# that every execution of this cell produces a distinct project name.
run_suffix = uuid4().hex[:8]
os.environ["LANGCHAIN_PROJECT"] = "AIM_Midterm - SDG - " + run_suffix

In [5]:
# Prompted via getpass so the OpenAI key is typed at run time rather than stored in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

# RAG Chain

In [24]:
from langchain_community.document_loaders import PyMuPDFLoader

# Locations of the two source PDFs, relative to the notebook's working directory.
filepath_NIST, filepath_Blueprint = (
    "data/NIST.AI.600-1.pdf",
    "data/Blueprint-for-an-AI-Bill-of-Rights.pdf",
)

# Load each PDF with PyMuPDFLoader, NIST first and the Blueprint second.
documents_NIST, documents_Blueprint = (
    PyMuPDFLoader(pdf_path).load()
    for pdf_path in (filepath_NIST, filepath_Blueprint)
)


In [26]:
# Single corpus holding the loaded Documents of both PDFs (NIST first, then the
# Blueprint); this combined list is what the text splitter is applied to.
documents = documents_NIST + documents_Blueprint

In [27]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the combined corpus into the chunks that get embedded and indexed.
rag_documents = text_splitter.split_documents(documents)

In [28]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Index the chunks in an in-memory Qdrant collection.
# QdrantVectorStore (langchain_qdrant) is used instead of the deprecated
# langchain_community `Qdrant` wrapper; the keyword arguments are the same.
# The imports above are left untouched in case other cells rely on them.
vectorstore = QdrantVectorStore.from_documents(
    documents=rag_documents,
    embedding=embeddings,
    location=":memory:",
    collection_name="Implications of AI",
)

# Retriever with the vector store's default search settings.
retriever = vectorstore.as_retriever()

In [29]:
from langchain.prompts import ChatPromptTemplate

# Prompt text for the RAG chain. The backslash right after the opening quotes
# keeps the string from starting with a newline. {context} and {question} are
# the template variables; the chains defined later supply both keys.
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

# Chat prompt template built from the text above.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

# Generate synthetic data

In [6]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# `documents` (both PDFs) and `embeddings` (text-embedding-3-small) are the
# objects built in the "RAG Chain" cells above. This cell used to re-parse the
# same two PDFs and construct an identical embedding client; reusing the
# existing objects avoids the duplicated work and keeps one source of truth.
generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini", tags=["base_llm"])

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings,
)

# Share of generated questions per evolution type (values sum to 1.0).
distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1,
}

# Number of synthetic questions to request.
TEST_SIZE = 20

testset = generator.generate_with_langchain_docs(
    documents,
    test_size=TEST_SIZE,
    distributions=distributions,
    with_debugging_logs=True,
)
testset.to_pandas()

embedding nodes:   0%|          | 0/284 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 2, 'relevance': 3, 'score': 2.0}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Unacceptable use', 'Harmful bias and homogenization', 'GAI risks', 'Information integrity', 'Transparent policies']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 1, 'depth': 2, 'structure': 1, 'relevance': 2, 'score': 1.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['AI Risk Management Framework', 'Bias in Artificial Intelligence', 'Trustworthy AI', 'Language models', 'Synthetic media transparency']
[ragas.testset.filters.DEBUG] context scoring: {'clarity': 2, 'depth': 3, 'structure': 2, 'relevance': 3, 'score': 2.5}
[ragas.testset.evolutions.DEBUG] keyphrases in merged node: ['Automated sentiment analyzer', 'Search engine results', 'Advertisement delivery systems', 'Body scanners at airport checkpoints', 'Algorithmic discrimination protections']
[ragas.testset.filters.DEBUG] c

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,How do language models contribute to the reduc...,[ \n57 \nNational Institute of Standards and T...,The answer to given question is not present in...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
1,What should be provided in terms of notice and...,"[ \n \n \n \n \n \nHUMAN ALTERNATIVES, \nCONSI...",Those impacted by an automated system should b...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
2,"How can designers, developers, and deployers o...",[ ­­­­­­­\nALGORITHMIC DISCRIMINATION Protecti...,"Designers, developers, and deployers of automa...",simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
3,What benefits have been publicly described by ...,[ \nENDNOTES\n12. Expectations about reporting...,The benefits of 'traffic calming' measures hav...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
4,What is the purpose of AI Red-teaming in testi...,[ \n49 \nearly lifecycle TEVV approaches are d...,AI Red-teaming is a structured testing exercis...,simple,"[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...",True
5,What is the importance of training and assessm...,"[ \n \n \n \n \n \n \nHUMAN ALTERNATIVES, \nCO...",Training and assessment are crucial in ensurin...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
6,How do advertisement delivery systems reinforc...,[ \n \n \nWHY THIS PRINCIPLE IS IMPORTANT\nTh...,Advertisement delivery systems reinforce racia...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
7,What is the purpose of the Blueprint for an AI...,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \nAbo...,The Blueprint for an AI Bill of Rights is inte...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
8,What are the key privacy protections provided ...,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,The Privacy Act of 1974 provides privacy prote...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True
9,How does the Fair Credit Reporting Act ensure ...,[ \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n...,The Fair Credit Reporting Act ensures that con...,simple,[{'source': 'data/Blueprint-for-an-AI-Bill-of-...,True


# Dataset creation

In [7]:

from datasets import Dataset
from langsmith import Client

dataset_name = "Implications of AI"

# LangSmith client and the dataset that will receive the synthetic examples.
client = Client()
dataset = client.create_dataset(dataset_name=dataset_name, description="Questions about the implications of AI")

In [11]:
# Upload every generated test row to the LangSmith dataset as one example.
# iterrows() yields (index, row) pairs; the index is discarded and the row's
# `contexts` column is stored as metadata (previously the row index was stored
# under the "context" key, which carried no context at all).
for _, row in testset.to_pandas().iterrows():
    client.create_example(
        inputs={"question": row["question"]},
        outputs={"answer": row["ground_truth"]},
        # list(...) keeps the value a plain list of passages.
        metadata={"context": list(row["contexts"])},
        dataset_id=dataset.id,
    )



In [18]:
# Materialise the test set once and pull out the two columns used for evaluation.
testset_df = testset.to_pandas()
test_questions = testset_df["question"].tolist()
test_groundtruths = testset_df["ground_truth"].tolist()

# Show the first question / reference-answer pair.
print(test_questions[0])
print(test_groundtruths[0])

How do language models contribute to the reduction of content diversity in writing?
The answer to given question is not present in context


In [39]:
from operator import itemgetter

from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini", tags=["base_llm"])

# First step shared by both chains: retrieve context for the incoming question
# and pass the question itself through unchanged.
question_and_context = {
    "context": itemgetter("question") | retriever,
    "question": itemgetter("question"),
}

# Plain question -> answer-string chain.
rag_chain = question_and_context | rag_prompt | llm | StrOutputParser()

# Chain that returns the model response together with the retrieved context.
# The RunnablePassthrough.assign step leaves the data unchanged, but it must sit
# between the two dict literals: without a Runnable there, `dict | dict` would be
# evaluated by Python as a plain dict union instead of composing a chain.
rag_qa_chain = (
    question_and_context
    | RunnablePassthrough.assign(context=itemgetter("context"))
    | {"response": rag_prompt | llm, "context": itemgetter("context")}
)

# Smoke test with a single question.
result = rag_qa_chain.invoke({"question" : "Is AI a threat to humanity?"})
print(result)

{'response': AIMessage(content="I don't know.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 1238, 'total_tokens': 1242, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_2d87079ca9', 'finish_reason': 'stop', 'logprobs': None}, id='run-6db82f54-ddff-4079-b8a4-dd0dbe43a358-0', usage_metadata={'input_tokens': 1238, 'output_tokens': 4, 'total_tokens': 1242}), 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 6, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': "D:20240805141702-04'00'", 'modDate': "D:202408

In [43]:
# Run every synthetic question through the RAG chain, keeping the answer text
# and the text of each retrieved document, in question order.
answers, contexts = [], []

for question in test_questions:
    response = rag_qa_chain.invoke({"question": question})
    answers.append(response["response"].content)
    contexts.append([doc.page_content for doc in response["context"]])

In [44]:
from datasets import Dataset

# Column-oriented view of the run: one entry per test question in every list.
evaluation_columns = {
    "question": test_questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": test_groundtruths,
}

response_dataset = Dataset.from_dict(evaluation_columns)
response_dataset[0]

{'question': 'How do language models contribute to the reduction of content diversity in writing?',
 'answer': 'Language models can contribute to the reduction of content diversity in writing by producing overly homogenized outputs, which can be incorrect or lead to unreliable decision-making and amplify harmful biases. This phenomenon can flow from foundation models to downstream models and systems, with the foundation models acting as “bottlenecks” or single points of failure. Overly homogenized content can also contribute to what is referred to as “model collapse.”',
 'contexts': ['https://doi.org/10.1787/2448f04b-en \nOECD (2024) "Deﬁning AI incidents and related terms" OECD Artiﬁcial Intelligence Papers, No. 16, OECD \nPublishing, Paris. https://doi.org/10.1787/d1a8d965-en \nOpenAI (2023) GPT-4 System Card. https://cdn.openai.com/papers/gpt-4-system-card.pdf \nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \nPadmakumar, V. et al. (2024) Does writing with la

In [None]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size = 500,
#     chunk_overlap = 50
# )

# rag_documents = text_splitter.split_documents(rag_documents)

In [45]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# Metrics passed to evaluate(); the list order is kept exactly as before.
metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]

In [46]:
# Score the collected answers and contexts with the selected metrics.
results = evaluate(dataset=response_dataset, metrics=metrics)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [47]:
# Convert the evaluation result to a DataFrame and show it as the cell output.
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,How do language models contribute to the reduc...,[https://doi.org/10.1787/2448f04b-en \nOECD (2...,Language models can contribute to the reductio...,The answer to given question is not present in...,1.0,0.967219,1.0,0.0,0.178897
1,What should be provided in terms of notice and...,"[alternative, where appropriate \nBrief, clear...","There should be a brief, clear notice that ind...",Those impacted by an automated system should b...,1.0,0.958077,1.0,1.0,0.952916
2,"How can designers, developers, and deployers o...",[systems have the capacity to drive extraordin...,"Designers, developers, and deployers of automa...","Designers, developers, and deployers of automa...",1.0,0.945499,1.0,1.0,0.770302
3,What benefits have been publicly described by ...,"[15. See, e.g., Charles Pruitt. People Doing W...",I don't know.,The benefits of 'traffic calming' measures hav...,0.0,0.0,1.0,1.0,0.181544
4,What is the purpose of AI Red-teaming in testi...,"[sense of AI-generated information, and subseq...",The purpose of AI Red-teaming in testing AI sy...,AI Red-teaming is a structured testing exercis...,1.0,1.0,1.0,0.916667,0.817249
5,What is the importance of training and assessm...,[Training and assessment. Anyone administering...,Training and assessment are important in ensur...,Training and assessment are crucial in ensurin...,0.75,0.980111,0.75,1.0,0.862464
6,How do advertisement delivery systems reinforc...,[ering ads in ways that reinforce racial and g...,Advertisement delivery systems reinforce racia...,Advertisement delivery systems reinforce racia...,1.0,1.0,1.0,1.0,0.846395
7,What is the purpose of the Blueprint for an AI...,[Examples of automated systems for which the B...,The purpose of the Blueprint for an AI Bill of...,The Blueprint for an AI Bill of Rights is inte...,1.0,0.974553,1.0,1.0,0.993355
8,What are the key privacy protections provided ...,"[records systems, including limits on data ret...",The key privacy protections provided by the Pr...,The Privacy Act of 1974 provides privacy prote...,1.0,1.0,1.0,1.0,0.845967
9,How does the Fair Credit Reporting Act ensure ...,[beyond simple notice to include reporting ele...,The Fair Credit Reporting Act ensures that con...,The Fair Credit Reporting Act ensures that con...,1.0,0.915813,1.0,1.0,0.620742


In [None]:
# eval_llm = ChatOpenAI(model="gpt-4o-mini", tags=["base_llm"]) 