In [4]:
# Install necessary libraries
!pip install langchain sentence-transformers pandas scikit-learn faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Using cached faiss_cpu-1.8.0.post1-cp311-cp311-macosx_10_14_x86_64.whl.metadata (3.7 kB)
Using cached faiss_cpu-1.8.0.post1-cp311-cp311-macosx_10_14_x86_64.whl (7.3 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [5]:
# Import necessary libraries
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np

# Locations of the two source PDFs, relative to the notebook's working directory
pdf_path_1 = "../docs/Blueprint-for-an-AI-Bill-of-Rights.pdf"
pdf_path_2 = "../docs/NIST_AI_600-1.pdf"

# One PyMuPDFLoader per file; both loaders are constructed before either is read
loader1, loader2 = (PyMuPDFLoader(path) for path in (pdf_path_1, pdf_path_2))
documents1, documents2 = loader1.load(), loader2.load()

# Single corpus: documents from the first PDF followed by those from the second
documents = documents1 + documents2

# Break the corpus into overlapping chunks (size 500, overlap 100) so that
# retrieval later operates on small passages instead of whole loaded documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
split_documents = text_splitter.split_documents(documents)

# Embedding model used both to index the chunks and, later, to embed questions
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS index over every chunk for similarity search
vectorstore = FAISS.from_documents(
    documents=split_documents,
    embedding=embedding_model,
)

# Hand-written questions a stakeholder might ask about AI regulation.
# Each one is used below as a retrieval query against the indexed PDF chunks;
# the order here determines the row order of the resulting dataset.
sample_questions = [
    # Transparency and privacy
    "What are the key principles of AI transparency in the AI Bill of Rights?",
    "How does the AI Bill of Rights protect individual privacy?",
    # Decision-making and compliance
    "What are the implications of AI in decision-making processes?",
    "How should companies implement AI systems that comply with regulatory guidelines?",
    # Standards and risk management
    "What role does NIST play in establishing AI standards?",
    "What should business leaders understand about AI risk management?",
    # Bias, accountability, validation
    "How does the AI Bill of Rights address bias in AI systems?",
    "What steps should organizations take to ensure AI accountability?",
    "How are AI systems validated for safety and fairness according to NIST standards?",
    # Data protection
    "What kind of data protection measures are required under current AI regulations?",
]

# Accumulator: one record per (question, retrieved chunk) pair
qa_pairs = []

# How many of the closest chunks to keep for each question
top_n = 5

for question in sample_questions:
    # Embed the question, then ask the index for its top_n nearest chunks
    query_vector = embedding_model.embed_query(question)
    hits = vectorstore.similarity_search_with_score_by_vector(query_vector, k=top_n)

    # The score coming back from FAISS is a distance (lower = more similar).
    # 1 / (1 + distance) flips that so higher = more similar, and lands in
    # (0, 1] for any non-negative distance.
    qa_pairs.extend(
        {
            "question": question,
            "content": chunk.page_content,
            "score": 1 / (1 + distance),
        }
        for chunk, distance in hits
    )

# Turn the collected records into a table with one row per question/chunk
# pair (columns: question, content, score)
df = pd.DataFrame(data=qa_pairs)

# Quick sanity check: show the first five rows
print(df.head(n=5))

# Persist the dataset for the fine-tuning step; the row index is not written
df.to_csv(path_or_buf="ai_regulation_finetuning_dataset.csv", index=False)

print("Dataset saved as ai_regulation_finetuning_dataset.csv")




                                            question  \
0  What are the key principles of AI transparency...   
1  What are the key principles of AI transparency...   
2  What are the key principles of AI transparency...   
3  What are the key principles of AI transparency...   
4  What are the key principles of AI transparency...   

                                             content     score  
0  the principles described in the Blueprint for ...  0.694441  
1  civil rights, civil liberties, and privacy. Th...  0.673516  
2  played a central role in shaping the Blueprint...  0.673443  
3  provide a concrete vision for actualizing the ...  0.669545  
4  govern automated systems and AI, such as the D...  0.667604  
Dataset saved as ai_regulation_finetuning_dataset.csv
