In [None]:
pip install langchain openai faiss-cpu pymupdf langchain-community tiktoken


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-

In [None]:
from pathlib import Path
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import os


In [None]:
# Setup environment and API key
import os

try:
    # Colab-only secret store; not importable in a local Jupyter kernel.
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    # Running in Colab: read the key from the notebook's secrets panel.
    os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
elif not os.environ.get('OPENAI_API_KEY'):
    # Outside Colab we rely on the variable already being exported; fail loudly
    # here instead of letting the first OpenAI call fail later.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting "
        "the kernel, or run this notebook in Colab with the secret configured."
    )

In [None]:
# Load PDF
pdf_path = "bop_guidelines_draft2.pdf"  # adjust path if needed

# Fail early with an actionable message when a local file is missing (the loader's
# own error does not say where it looked). URL inputs are passed through untouched.
if "://" not in pdf_path and not Path(pdf_path).is_file():
    raise FileNotFoundError(
        f"PDF not found at {pdf_path!r} (working directory: {Path.cwd()}). "
        "Upload the file or update pdf_path before running this cell."
    )

loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Split into retrievable chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Tag each chunk with its source.
# Derived from pdf_path so the tag cannot drift from the file actually loaded.
source_name = Path(pdf_path).name
for chunk in chunks:
    chunk.metadata["source"] = source_name

# Create embeddings using OpenAI
embeddings = OpenAIEmbeddings()

# Create and save FAISS vector store
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("bop_vectorstore")

print(f"Vector store created with {len(chunks)} chunks.")

ValueError: File path bop_guidelines_draft2.pdf is not a valid file or url

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI

# Load vectorstore
# SECURITY NOTE: allow_dangerous_deserialization=True opts in to pickle-based
# loading. Only point this at a "bop_vectorstore" directory this notebook wrote.
db = FAISS.load_local("bop_vectorstore", OpenAIEmbeddings(), allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_kwargs={"k": 4})

# Example: retrieve guidelines for coffee shops
query = "What are the underwriting guidelines for coffee shops?"
# Retrievers implement the Runnable interface; .invoke() replaces the deprecated
# get_relevant_documents() and avoids the deprecation warning on every call.
docs = retriever.invoke(query)

# Pretty print retrieved chunks (first 500 characters of each)
for i, doc in enumerate(docs, start=1):
    print(f"--- Document {i} ---")
    print(doc.page_content[:500])


  docs = retriever.get_relevant_documents(query)


--- Document 1 ---
3.40
Coffee Shops & Tea Houses
Underwriting Appetite
• Preferred:
– Area 7,500 sq ft, seating 150
– No alcohol service or live entertainment
– Operates year-round
– Strong fire safety and loss prevention
• Acceptable:
– Incidental music
– Limited alcohol sales (<25%)
– Catering up to 15% of total sales
• Not Acceptable:
– Alcohol-focused operations (bars)
– Live entertainment or seasonal closures > 30 days
– Large venues or poor fire controls
Key Risk Considerations
• Fire from coffee equipment 
--- Document 2 ---
Key Risk Considerations
• Misrepresentation, errors in transactions, tenant disputes
Industry-Specific Restrictions
• Property management excluded
• Coverage based on BPP and location
Preferred Optional Coverages
• Cyber, EPLI, D&O Liability
• Commercial Auto and Business Interruption
3.99
Restaurants (Full Service, Limited Service, Fast Food)
Underwriting Appetite
• Preferred:
– <7,500 sq ft, 150 seats
– Controlled alcohol and catering percentages
– NFPA-c

In [None]:
# Step 1: Load the CSV file
import pandas as pd

df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# Step 2: Format the required fields into a description list
# One "- <element name>: <description>" bullet per CSV row, newline-joined.
description_lines = []
for _, element in df.iterrows():
    description_lines.append(f"- {element['Element Name']}: {element['Description']}")
field_descriptions = "\n".join(description_lines)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Load Vector Store
# The index is read from the local "bop_vectorstore" directory; the
# allow_dangerous_deserialization flag is an explicit opt-in required by load_local here.
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
# Return the 4 nearest chunks per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# LLM Setup
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)

# Step 1: Retrieve Guidelines
def get_guidelines(business_type: str) -> str:
    """Fetch guideline text relevant to a business type from the vector store.

    Builds the query "Underwriting guidelines for <business_type>", runs it
    through the module-level ``retriever`` and concatenates the page content
    of the returned documents.

    Args:
        business_type: Free-text business description used in the query.

    Returns:
        The retrieved chunks' text joined by blank lines (empty string when
        nothing is retrieved).
    """
    query = f"Underwriting guidelines for {business_type}"
    # .invoke() is the Runnable entry point; get_relevant_documents() is deprecated.
    docs = retriever.invoke(query)
    return "\n\n".join(doc.page_content for doc in docs)

# Step 2: Generate Questions
# Prompt text kept in its own constant so the chain wiring below stays readable.
QUESTION_TEMPLATE = """
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate a list of 5 to 10 key underwriting questions that would help assess eligibility or risk for this business. Use a mix of yes/no and short-answer formats. Each question should reflect a specific requirement or risk consideration.
"""

question_prompt = PromptTemplate(
    template=QUESTION_TEMPLATE,
    input_variables=["business_type", "guidelines"],
)
question_chain = LLMChain(prompt=question_prompt, llm=llm)

# Step 3: Generate Answers
# The answer prompt receives the questions produced by the previous step.
ANSWER_TEMPLATE = """
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Generate realistic, internally consistent answers for each question. Use natural business behavior and practical values. Provide short but detailed answers.
"""

answer_prompt = PromptTemplate(
    template=ANSWER_TEMPLATE,
    input_variables=["business_type", "questions"],
)
answer_chain = LLMChain(prompt=answer_prompt, llm=llm)

# Step 4: Define an application generation prompt
from langchain.prompts import PromptTemplate

# Prompt that turns the answered questions plus the required field list into a
# full application record, requested as JSON.
APPLICATION_TEMPLATE = """
You are filling out a Business Owner Policy (BOP) insurance application for a "{business_type}".

Based on the following answered underwriting questions, generate a complete and realistic application record. Even if a field is not mentioned in the questions, infer it using common industry knowledge and consistency. Do not leave any values blank.

You must fill in the following fields:
{fields_to_fill}

Answered Questions:
{qa_pairs}

Return the application as a well-structured JSON object.
"""

application_prompt = PromptTemplate(
    template=APPLICATION_TEMPLATE,
    input_variables=["business_type", "qa_pairs", "fields_to_fill"],
)

from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# Reuse the `llm` already configured above in this cell (gpt-4, temperature 0.7)
# instead of constructing a second, identical client.
application_chain = LLMChain(llm=llm, prompt=application_prompt)

# Step 5: run function
def generate_bop_application(business_type: str, qa_pairs: str) -> str:
    """Ask the application chain for a filled-in BOP application.

    Invokes the module-level ``application_chain`` with the business type, the
    answered questions and the module-level ``field_descriptions``.

    Args:
        business_type: Business description inserted into the prompt.
        qa_pairs: Answered underwriting questions, one pair per line.

    Returns:
        The model's reply as text (intended to be a JSON document; it is NOT
        parsed here). If the whole reply is wrapped in a markdown code fence
        (``` or ```json), the fence is removed so callers can pass the result
        straight to ``json.loads``; any other reply is returned unchanged.
    """
    response = application_chain.invoke({
        "business_type": business_type,
        "qa_pairs": qa_pairs,
        "fields_to_fill": field_descriptions
    })
    text = response["text"]

    # Chat models frequently wrap JSON in a fenced block, which json.loads rejects.
    stripped = text.strip()
    if len(stripped) >= 6 and stripped.startswith("```") and stripped.endswith("```"):
        inner = stripped[3:-3]
        if inner[:4].lower() == "json":  # optional language tag after the opening fence
            inner = inner[4:]
        return inner.strip()
    return text

# Run the pipeline
the_biz = "Manufacturing technologies"
guidelines = get_guidelines(the_biz)

questions = question_chain.invoke({
    "business_type": the_biz,
    "guidelines": guidelines
})["text"]

answers = answer_chain.invoke({
    "business_type": the_biz,
    "questions": questions
})["text"]

# Pair question i with answer i. Blank lines are dropped BEFORE zipping: filtering
# after the zip (as a condition on each pair) shifts every later pair whenever
# only one of the two texts contains a blank separator line.
question_lines = [line.strip() for line in questions.splitlines() if line.strip()]
answer_lines = [line.strip() for line in answers.splitlines() if line.strip()]
qa_pairs = "\n".join(
    f"{q} — {a}"
    for q, a in zip(question_lines, answer_lines)
)

application_json = generate_bop_application(the_biz, qa_pairs)

# === Print the result ===
import json
print("Underwriting Questions:\n", questions, "\n")
print("Simulated Answers:\n", answers, "\n")

# The model is asked for JSON but nothing enforces it; fall back to the raw text
# so the (paid-for) result is still shown instead of the cell ending in a traceback.
try:
    application_pretty = json.dumps(json.loads(application_json), indent=2)
except json.JSONDecodeError:
    print("Warning: application output is not valid JSON; showing raw text.")
    application_pretty = application_json
print("Final Application JSON:\n", application_pretty)


Underwriting Questions:
 1. How many manufacturing locations does your business operate from?
2. Does your building have any outdated core systems or roofing over the specified age limits?
3. Does your business involve emergency response, LPG work, or any other manufacturing activities that could trigger higher ratings or exclusions?
4. Do you require any of the preferred optional coverages such as equipment breakdown insurance, business interruption insurance, professional liability insurance, commercial auto insurance, or cyber liability insurance? If yes, please specify.
5. Does your inventory value exceed $1M or do your online sales account for more than 50% of your total revenue?
6. Has your business had more than two claims in the past three years?
7. Does more than 25% of your revenue come from off-premises services?
8. If your business operates in a disaster-prone area, do you carry flood/wind/earthquake coverage?
9. Does your business derive over 75% of its revenue from online