# Task 0:  Setup 

In [1]:
# Uninstall incompatible version of langchain-core as a caution
%pip uninstall langchain-core -y

# Installs
%pip install langchain-community langchain-core==0.2.40 langchain-huggingface==0.0.3 langchain-qdrant 
%pip install openai ragas==0.1.20 tqdm

# Check for any remaining package conflicts
%pip check

Found existing installation: langchain-core 0.2.40
Uninstalling langchain-core-0.2.40:
  Successfully uninstalled langchain-core-0.2.40
Note: you may need to restart the kernel to use updated packages.
Collecting langchain-core==0.2.40
  Using cached langchain_core-0.2.40-py3-none-any.whl.metadata (6.2 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant-client<2.0.0,>=1.10.1->langchain-qdrant)
  Downloading protobuf-5.28.2-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Using cached langchain_core-0.2.40-py3-none-any.whl (396 kB)
Downloading protobuf-5.28.2-cp38-abi3-macosx_10_9_universal2.whl (414 kB)
Installing collected packages: protobuf, langchain-core
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.5
    Uninstalling protobuf-4.25.5:
      Successfully uninstalled protobuf-4.25.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sour

In [2]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader

def load_pdfs(paths: list) -> list:

    # List of file paths for the PDFs you want to load
    paths = paths

    # Create a list to store loaded documents
    documents = []

    # Loop through each PDF and load it
    for path in paths:
        loader = PyMuPDFLoader(path)
        documents.extend(loader.load())  # Add the documents to the list

    return documents 

#####

from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_docs_recursive(documents: list, chunk_size: int, chunk_overlap: int) -> list:

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    chunks = text_splitter.split_documents(documents)

    return chunks

#####

from langchain.text_splitter import NLTKTextSplitter

def chunk_docs_nltk(documents: list, chunk_size: int, chunk_overlap: int) -> list:

    text_splitter = NLTKTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap)

    chunks = text_splitter.split_documents(documents)

    return chunks

#####

# from langchain_openai import OpenAIEmbeddings

# def create_embeddings_openai(model: str) -> OpenAIEmbeddings:

#     # Initialize the OpenAIEmbeddings class
#     embeddings = OpenAIEmbeddings(model=model)

#     return embeddings

#####

from langchain_huggingface import HuggingFaceEmbeddings

def create_embeddings_opensource(model: str) -> HuggingFaceEmbeddings:

    # Initialize the OpenAIEmbeddings class
    embeddings = HuggingFaceEmbeddings(model_name=model)

    return embeddings

#####

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

def create_vector_store(location: str, collection_name: str, vector_size: int, embeddings, documents: list) -> QdrantVectorStore:

    # Initialize the Qdrant client
    qdrant_client = QdrantClient(
        location=location
        )

    # Create a collection in Qdrant
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=vector_size, 
            distance=Distance.COSINE
            )
        )

        # Initialize QdrantVectorStore with the Qdrant client
    qdrant_vector_store = QdrantVectorStore(
            client=qdrant_client,
            collection_name=collection_name,
            embedding=embeddings,
        )
    
    qdrant_vector_store.add_documents(documents)
    
    return qdrant_vector_store

#####

def create_retriever_from_qdrant(vector_store: QdrantVectorStore):
  retriever = vector_store.as_retriever()

  return retriever

#####

from langchain.prompts import ChatPromptTemplate

def create_chat_prompt_template() -> ChatPromptTemplate:
    template = """
    Only answer the question using the context below.  If the answer can't be found in the context, respond "I don't know". 

    Question:
    {question}

    Context:
    {context}
    """
    prompt = ChatPromptTemplate.from_template(template)

    return prompt

#####

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from operator import itemgetter

def create_chain_openai(model: str, prompt: ChatPromptTemplate, retriever):

    llm = ChatOpenAI(
        model_name="gpt-4o-mini", 
        temperature=0
        )

    chain = (
        {"context": itemgetter("question") | retriever, "question": itemgetter("question")} 
        | RunnablePassthrough.assign(context=itemgetter("context")) 
        | {"response": prompt | llm, "context": itemgetter("context")}
        )

    return chain

#####

# Task 1:  Dealing with the Data

In [7]:
documents = load_pdfs(
    ["https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf", 
     "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
     )

print(len(documents))

print(documents[0])

137
page_content=' 
 
 
 
 
 
 
 
 
 
BLUEPRINT FOR AN 
AI BILL OF 
RIGHTS 
MAKING AUTOMATED 
SYSTEMS WORK FOR 
THE AMERICAN PEOPLE 
OCTOBER 2022 
' metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': ''}


In [8]:
chunks = chunk_docs_recursive(documents, 500, 50)

print(len(chunks))
print(chunks[0])

910
page_content='BLUEPRINT FOR AN 
AI BILL OF 
RIGHTS 
MAKING AUTOMATED 
SYSTEMS WORK FOR 
THE AMERICAN PEOPLE 
OCTOBER 2022' metadata={'source': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 0, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': "D:20220920133035-04'00'", 'modDate': "D:20221003104118-04'00'", 'trapped': ''}


In [9]:
# embeddings = create_embeddings_openai("text-embedding-ada-002")

# query = "This is an example sentence for generating embeddings."
# embedding_vector = embeddings.embed_query(query)
# print(embedding_vector)


NameError: name 'create_embeddings_openai' is not defined

In [10]:
embeddings = create_embeddings_opensource("sentence-transformers/all-MiniLM-L6-v2")

query = "This is an example sentence for generating embeddings."
embedding_vector = embeddings.embed_query(query)
print(embedding_vector)

  from tqdm.autonotebook import tqdm, trange


[-0.00017142681463155895, -0.017052756622433662, 0.0466596782207489, 0.07441818714141846, 0.03471486642956734, 0.05450190603733063, 0.01252576895058155, -0.049386944621801376, 0.029524460434913635, -0.04392328858375549, 0.06013912707567215, -0.05279029905796051, 0.0978655070066452, -0.009004428051412106, -0.014963291585445404, 0.0728435218334198, 0.052706003189086914, 0.006455709226429462, -0.061940401792526245, -0.01770511269569397, 0.022993385791778564, 0.04449153319001198, 0.07146460562944412, -0.04201526194810867, 0.012547018937766552, -0.05232679471373558, -0.03799056634306908, 0.06573976576328278, 0.15933726727962494, 0.00011792029545176774, 0.05847509205341339, -0.007675816770642996, -0.04133718088269234, 0.043803952634334564, 0.026761561632156372, 0.11080886423587799, -0.01858428306877613, 0.07720502465963364, -0.02903367020189762, -0.001317247748374939, 0.04944780841469765, 0.028358951210975647, 0.0126786008477211, 0.0531744509935379, 0.019705265760421753, -0.09619426727294922

In [11]:
qdrant_vector_store = create_vector_store(":memory:", "Midterm", 384, embeddings, documents)

In [12]:
retriever = create_retriever_from_qdrant(qdrant_vector_store)

In [13]:
# retrieved_documents = retriever.invoke("What are underserved communities?")
# retrieved_documents = retriever.invoke("What should be expected of automated systems?")
retrieved_documents = retriever.invoke("What is action ID GV-1.3-001?")

for doc in retrieved_documents:
  print(doc)

page_content=' 
19 
GV-4.1-003 
Establish policies, procedures, and processes for oversight functions (e.g., senior 
leadership, legal, compliance, including internal evaluation) across the GAI 
lifecycle, from problem formulation and supply chains to system decommission. 
Value Chain and Component 
Integration 
AI Actor Tasks: AI Deployment, AI Design, AI Development, Operation and Monitoring 
 
GOVERN 4.2: Organizational teams document the risks and potential impacts of the AI technology they design, develop, deploy, 
evaluate, and use, and they communicate about the impacts more broadly. 
Action ID 
Suggested Action 
GAI Risks 
GV-4.2-001 
Establish terms of use and terms of service for GAI systems. 
Intellectual Property; Dangerous, 
Violent, or Hateful Content; 
Obscene, Degrading, and/or 
Abusive Content 
GV-4.2-002 
Include relevant AI Actors in the GAI system risk identiﬁcation process. 
Human-AI Conﬁguration 
GV-4.2-003 
Verify that downstream GAI system impacts (such as the u

# Task 2:  Building a Quick End-to-End Prototype

In [14]:
prompt = create_chat_prompt_template()

In [16]:
chain_1 = create_chain_openai("gpt-4o-mini", prompt, retriever)

In [17]:
question = "What is confabulation?"
#question = "What is action ID GV-1.3-002?"

result = chain_1.invoke({"question" : question})

print(result["response"].content)

Confabulation refers to a phenomenon in which GAI systems generate and confidently present erroneous or false content in response to prompts. It includes generated outputs that diverge from the prompts or contradict previously generated statements in the same context. These are also colloquially referred to as "hallucinations" or "fabrications."


# Task 3: Creating a Golden Test Data Set

In [18]:
text_splitter_eval = RecursiveCharacterTextSplitter(
    chunk_size = 600,
    chunk_overlap = 50
)

eval_documents = text_splitter_eval.split_documents(documents)

In [19]:
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
embeddings = OpenAIEmbeddings()

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

distributions = {
    simple: 0.5,
    multi_context: 0.4,
    reasoning: 0.1
}

num_qa_pairs = 20 # You can reduce the number of QA pairs to 5 if you're experiencing rate-limiting issues

testset = generator.generate_with_langchain_docs(eval_documents, num_qa_pairs, distributions)
testset.to_pandas()

embedding nodes:   0%|          | 0/1522 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the role of NCII in addressing risks r...,"[disinformation, deepfakes, including NCII, or...",The answer to given question is not present in...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
1,What initiatives has OSTP undertaken to gather...,[APPENDIX\nSummaries of Additional Engagements...,OSTP created an email address (ai-equity@ostp....,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
2,How can interdisciplinary AI teams establish c...,[AI Actor Tasks: AI Deployment \n \nMAP 1.2: I...,Establishing context reflecting demographic di...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,How do pretrial risk assessments play a role i...,"[policing, pretrial risk assessments, automate...",Pretrial risk assessments play a role in ensur...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
4,What is the importance of implementing safety ...,"[safety measures, both prior to deployment and...",Implementing safety measures both prior to dep...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
5,What role do technical standards play in the d...,[SAFE AND EFFECTIVE \nSYSTEMS \nWHAT SHOULD BE...,Technical standards play a crucial role in the...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
6,What factors contribute to the quality of AI r...,[environment and in collaboration with AI deve...,The quality of AI red-teaming outputs in pre-d...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
7,What contingency processes are in place to han...,[GOVERN 6.2: Contingency processes are in plac...,Contingency processes are in place to handle f...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
8,How does the documentation of AI systems' know...,[decision-making criteria. \nIntellectual Prop...,The answer to given question is not present in...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
9,How should derived data from high-risk inputs ...,"[tracked, e.g., via a specialized type in a da...",Derived data from high-risk inputs should be c...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True


In [20]:
testset.test_data[0]
testset_df = testset.to_pandas()
testset_df.to_csv("testset.csv")

In [21]:
import pandas as pd

test_df = pd.read_csv("testset.csv")

In [22]:
test_df

Unnamed: 0.1,Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,0,What is the role of NCII in addressing risks r...,"['disinformation, deepfakes, including NCII, o...",The answer to given question is not present in...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
1,1,What initiatives has OSTP undertaken to gather...,['APPENDIX\nSummaries of Additional Engagement...,OSTP created an email address (ai-equity@ostp....,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
2,2,How can interdisciplinary AI teams establish c...,['AI Actor Tasks: AI Deployment \n \nMAP 1.2: ...,Establishing context reflecting demographic di...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
3,3,How do pretrial risk assessments play a role i...,"['policing, pretrial risk assessments, automat...",Pretrial risk assessments play a role in ensur...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
4,4,What is the importance of implementing safety ...,"['safety measures, both prior to deployment an...",Implementing safety measures both prior to dep...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
5,5,What role do technical standards play in the d...,['SAFE AND EFFECTIVE \nSYSTEMS \nWHAT SHOULD B...,Technical standards play a crucial role in the...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True
6,6,What factors contribute to the quality of AI r...,['environment and in collaboration with AI dev...,The quality of AI red-teaming outputs in pre-d...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
7,7,What contingency processes are in place to han...,['GOVERN 6.2: Contingency processes are in pla...,Contingency processes are in place to handle f...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
8,8,How does the documentation of AI systems' know...,['decision-making criteria. \nIntellectual Pro...,The answer to given question is not present in...,simple,[{'source': 'https://nvlpubs.nist.gov/nistpubs...,True
9,9,How should derived data from high-risk inputs ...,"['tracked, e.g., via a specialized type in a d...",Derived data from high-risk inputs should be c...,simple,[{'source': 'https://www.whitehouse.gov/wp-con...,True


In [23]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [24]:
answers = []
contexts = []

for question in test_questions:
  response = chain_1.invoke({"question" : question})
  answers.append(response["response"].content)
  contexts.append([context.page_content for context in response["context"]])

In [25]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [26]:
response_dataset[0]

{'question': 'What is the role of NCII in addressing risks related to disinformation, deepfakes, and tampered content?',
 'answer': 'The role of NCII in addressing risks related to disinformation, deepfakes, and tampered content includes identifying potential content provenance harms, such as misinformation or disinformation, deepfakes, including NCII, or tampered content. It involves enumerating and ranking risks based on their likelihood and potential impact, and determining how well provenance solutions address specific risks and/or harms. Additionally, it emphasizes the need for policies and mechanisms to prevent GAI systems from generating NCII or content that violates the law.',
 'contexts': [' \n5 \noperations, or other cyberattacks; increased attack surface for targeted cyberattacks, which may \ncompromise a system’s availability or the conﬁdentiality or integrity of training data, code, or \nmodel weights.  \n10. Intellectual Property: Eased production or replication of allege

In [27]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

In [28]:
results = evaluate(response_dataset, metrics)

Evaluating:   0%|          | 0/100 [00:00<?, ?it/s]

In [29]:
results

{'faithfulness': 0.6992, 'answer_relevancy': 0.7329, 'context_recall': 0.7333, 'context_precision': 0.6736, 'answer_correctness': 0.4846}

In [30]:
results_df = results.to_pandas()
results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the role of NCII in addressing risks r...,"[ \n5 \noperations, or other cyberattacks; inc...",The role of NCII in addressing risks related t...,The answer to given question is not present in...,0.666667,0.989388,1.0,0.0,0.17573
1,What initiatives has OSTP undertaken to gather...,[APPENDIX\nSummaries of Additional Engagements...,OSTP has undertaken several initiatives to gat...,OSTP created an email address (ai-equity@ostp....,1.0,1.0,1.0,1.0,0.992843
2,How can interdisciplinary AI teams establish c...,[ \n \n \n \n \nAPPENDIX\nPanel 4: Artificial ...,Interdisciplinary AI teams can establish conte...,Establishing context reflecting demographic di...,1.0,0.975474,0.0,1.0,0.373225
3,How do pretrial risk assessments play a role i...,[ \n \n \n \n \nNOTICE & \nEXPLANATION \nWHY ...,I don't know.,Pretrial risk assessments play a role in ensur...,0.0,0.0,0.0,0.333333,0.175914
4,What is the importance of implementing safety ...,[ \n \n \nSAFE AND EFFECTIVE SYSTEMS \nYou sho...,The importance of implementing safety measures...,Implementing safety measures both prior to dep...,1.0,0.944502,1.0,1.0,0.665653
5,What role do technical standards play in the d...,[ \n \n \n \n \n \n \nSAFE AND EFFECTIVE \nSYS...,Technical standards play a role in the develop...,Technical standards play a crucial role in the...,1.0,1.0,1.0,1.0,0.823056
6,What factors contribute to the quality of AI r...,[ \n29 \nMS-1.1-006 \nImplement continuous mon...,Factors that contribute to the quality of AI r...,The quality of AI red-teaming outputs in pre-d...,1.0,1.0,1.0,1.0,0.35503
7,What contingency processes are in place to han...,"[ \n3 \nthe abuse, misuse, and unsafe repurpos...",I don't know.,Contingency processes are in place to handle f...,0.0,0.0,1.0,1.0,0.177631
8,How does the documentation of AI systems' know...,[TABLE OF CONTENTS\nFROM PRINCIPLES TO PRACTIC...,The documentation of AI systems' knowledge lim...,The answer to given question is not present in...,0.833333,0.963713,1.0,0.0,0.928695
9,How should derived data from high-risk inputs ...,[ \n \n \n \n \n \n \n \n \n \nSAFE AND EFFECT...,Derived data from high-risk inputs should be c...,Derived data from high-risk inputs should be c...,1.0,0.934824,1.0,1.0,0.781843


# Task 4: Fine-Tuning Open Source Embeddings

In [24]:
import uuid

id_set = set()

# Assign each chunk a unique identifier
for chunk in chunks:
  id = str(uuid.uuid4())

  # Check for uniqueness
  while id in id_set:
    id = uuid.uuid4()

  # Add the id to the set
  id_set.add(id)

  # Add the id to the document's metadata
  chunk.metadata["id"] = id

In [None]:
len(chunks)

In [26]:
training_chunks = chunks[:300]
validation_chunks = chunks[300:350]
test_chunks = chunks[350:400]

In [27]:
from langchain_openai import ChatOpenAI

llm_2 = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [28]:
from langchain_core.prompts import ChatPromptTemplate

prompt_2 = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

prompt_template_2 = ChatPromptTemplate.from_template(prompt_2)

In [29]:
chain_2 = prompt_template_2 | llm_2

In [30]:
import tqdm

def create_questions(documents, n_questions):
  questions = {}
  relevant_docs = {}

  for document in tqdm.tqdm(documents):
    # Get the context and a empty list object for the generated questions
    document_content = {"context" : document.page_content, "questions" : []}

    # Generate the questions the chain - note we pass in as parameters the context and the number of questions we want generated
    questions_generated = chain_2.invoke({"context": document.page_content, "n_questions": n_questions})

    for question in questions_generated.content.split("\n"):
      # Generate a unique id
      question_id = str(uuid.uuid4())
      
      # Extract question text
      questions[question_id] = "".join(question.split(".")[1:]).strip()

      # Associate the related chunk
      relevant_docs[question_id] = [document.metadata["id"]]

  return questions, relevant_docs

In [None]:
training_questions, training_relevant_contexts = create_questions(training_chunks, 2)
validation_questions, val_relevant_contexts = create_questions(validation_chunks, 2)
test_questions, test_relevant_contexts = create_questions(test_chunks, 2)

In [32]:
import json

training_corpus = {train_item.metadata["id"] : train_item.page_content for train_item in training_chunks}

train_dataset = {
    "questions" : training_questions,
    "relevant_contexts" : training_relevant_contexts,
    "corpus" : training_corpus
}

with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, f)

In [33]:
import json

val_corpus = {val_item.metadata["id"] : val_item.page_content for val_item in validation_chunks}

val_dataset = {
    "questions" : validation_questions,
    "relevant_contexts" : val_relevant_contexts,
    "corpus" : val_corpus
}

with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, f)

In [34]:
import json

train_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_chunks}

test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : train_corpus
}

with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [None]:
%pip install -qU sentence_transformers datasets pyarrow

from sentence_transformers import InputExample, SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

from torch.utils.data import DataLoader, Dataset

model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

examples = []
for query_id, query in queries.items():
    doc_id = relevant_docs[query_id][0]
    text = corpus[doc_id]
    example = InputExample(texts=[query, text])
    examples.append(example)

BATCH_SIZE = 20
loader = DataLoader(
    examples, batch_size=BATCH_SIZE
)

corpus = val_dataset['corpus']
queries = val_dataset['questions']
relevant_docs = val_dataset['relevant_contexts']

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [None]:
EPOCHS = 5

warmup_steps = int(len(loader) * EPOCHS * 0.1)

model.fit(
    train_objectives=[(loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

In [None]:
from sentence_transformers import SentenceTransformer
from huggingface_hub import notebook_login

# Login to Hugging Face
notebook_login()  # or use huggingface-cli login

# After training the model, push the fine-tuned model to Hugging Face
model.push_to_hub("dstampfli/finetuned-snowflake-arctic-embed-m")

# Task 5: Assessing Performance

In [4]:
from datasets import Dataset
import pandas as pd
from ragas import evaluate
from ragas.metrics import (faithfulness, answer_relevancy,answer_correctness, context_recall, context_precision,)

def conduct_ragas_evaluation(filename: str, chain):

    test_df = pd.read_csv("testset.csv")
    test_questions = test_df["question"].values.tolist()
    test_groundtruths = test_df["ground_truth"].values.tolist()

    answers = []
    contexts = []

    for question in test_questions:
        response = chain.invoke({"question" : question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])

    response_dataset = Dataset.from_dict({
        "question" : test_questions,
        "answer" : answers,
        "contexts" : contexts,
        "ground_truth" : test_groundtruths
    })

    metrics = [
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision,
        answer_correctness,
    ]

    results = evaluate(response_dataset, metrics)
    
    results_df = results.to_pandas()
    
    return results, results_df

In [None]:
documents = load_pdfs(
    ["https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf", 
     "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
     )

chunks = chunk_docs_recursive(documents, 500, 50)
embeddings = create_embeddings_opensource("sentence-transformers/all-MiniLM-L6-v2")
qdrant_vector_store = create_vector_store(":memory:", "Midterm", 384, embeddings, chunks)
retriever = create_retriever_from_qdrant(qdrant_vector_store)
prompt = create_chat_prompt_template()
chain = create_chain_openai("gpt-4o-mini", prompt, retriever)

results, results_df = conduct_ragas_evaluation("testset.csv", chain)

results
results_df

In [None]:
documents = load_pdfs(
    ["https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf", 
     "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
     )

chunks = chunk_docs_recursive(documents, 500, 50)
embeddings = create_embeddings_opensource("dstampfli/finetuned-snowflake-arctic-embed-m")
qdrant_vector_store = create_vector_store(":memory:", "Midterm", 768, embeddings, chunks)
retriever = create_retriever_from_qdrant(qdrant_vector_store)
prompt = create_chat_prompt_template()
chain = create_chain_openai("gpt-4o-mini", prompt, retriever)

results, results_df = conduct_ragas_evaluation("testset.csv", chain)

results
results_df

In [None]:
documents = load_pdfs(
    ["https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf", 
     "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf"]
     )

chunks = chunk_docs_nltk(documents, 500, 50)
embeddings = create_embeddings_opensource("dstampfli/finetuned-snowflake-arctic-embed-m")
qdrant_vector_store = create_vector_store(":memory:", "Midterm", 768, embeddings, chunks)
retriever = create_retriever_from_qdrant(qdrant_vector_store)
prompt = create_chat_prompt_template()
chain = create_chain_openai("gpt-4o-mini", prompt, retriever)

results, results_df = conduct_ragas_evaluation("testset.csv", chain)

results
results_df