In [1]:
pip install langchain openai faiss-cpu pymupdf langchain-community tiktoken


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.26.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community

In [2]:
from pathlib import Path
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import os


In [3]:
# Expose the OpenAI credential to downstream clients through the process environment
import os
from google.colab import userdata

# The key is looked up through google.colab's userdata.get, never hard-coded here.
os.environ.update({'OPENAI_API_KEY': userdata.get('OPENAI_API_KEY')})

In [5]:
# Load PDF
pdf_path = "bop_policyguide_draft2.pdf"  # adjust path if needed
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Split into retrievable chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Fail here with a clear message instead of deep inside the embedding/FAISS call.
if not chunks:
    raise ValueError(f"No text chunks were extracted from {pdf_path!r}")

# Tag each chunk with the file it actually came from. The tag is derived from
# pdf_path so it cannot drift from the document that was really loaded (the
# previous hard-coded tag named a different file than pdf_path).
source_name = Path(pdf_path).name
for chunk in chunks:
    chunk.metadata["source"] = source_name

# Create embeddings using OpenAI
embeddings = OpenAIEmbeddings()

# Create and save FAISS vector store
vectorstore = FAISS.from_documents(chunks, embeddings)
vectorstore.save_local("bop_vectorstore")

print(f"Vector store created with {len(chunks)} chunks.")

  embeddings = OpenAIEmbeddings()


Vector store created with 168 chunks.


In [6]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI

# Load vectorstore
# NOTE(review): allow_dangerous_deserialization=True opts in to loading serialized
# Python objects from disk; only point this at a "bop_vectorstore" folder that this
# notebook produced itself.
db = FAISS.load_local("bop_vectorstore", OpenAIEmbeddings(), allow_dangerous_deserialization=True)
retriever = db.as_retriever(search_kwargs={"k": 4})

# Example: retrieve guidelines for coffee shops
query = "What are the underwriting guidelines for coffee shops?"
# invoke() is the supported retriever entry point; get_relevant_documents() is
# deprecated and emitted a warning on every run of this cell.
docs = retriever.invoke(query)

# Pretty print retrieved chunks (first 500 characters of each)
for i, doc in enumerate(docs):
    print(f"--- Document {i+1} ---")
    print(doc.page_content[:500])


  docs = retriever.get_relevant_documents(query)


--- Document 1 ---
3.40
Coffee Shops & Tea Houses
Underwriting Appetite
• Preferred:
– Area ≤7,500 sq ft, seating ≤150
– No alcohol service or live entertainment
– Operates year-round
– Strong fire safety and loss prevention
• Acceptable:
– Incidental music
– Limited alcohol sales (<25%)
– Catering up to 15% of total sales
• Not Acceptable:
– Alcohol-focused operations (bars)
– Live entertainment or seasonal closures > 30 days
– Large venues or poor fire controls
Key Risk Considerations
• Fire from coffee equipmen
--- Document 2 ---
Key Risk Considerations
• Misrepresentation, errors in transactions, tenant disputes
Industry-Specific Restrictions
• Property management excluded
• Coverage based on BPP and location
Preferred Optional Coverages
• Cyber, EPLI, D&O Liability
• Commercial Auto and Business Interruption
3.99
Restaurants (Full Service, Limited Service, Fast Food)
Underwriting Appetite
• Preferred:
– <7,500 sq ft, ≤150 seats
– Controlled alcohol and catering percentages
– NFPA-

In [7]:
# Step 1: Read the submission-element definitions from the CSV export
import pandas as pd

df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# Step 2: Render one "- <element name>: <description>" bullet per CSV row
field_descriptions = "\n".join(
    f"- {element_name}: {description}"
    for element_name, description in zip(df['Element Name'], df['Description'])
)

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Vector store: reopen the FAISS index saved under "bop_vectorstore"
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
# Retriever configured with k=4 results per query
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# Chat model used by the chains defined below
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)

# Step 1: Retrieve Guidelines
def get_guidelines(business_type: str) -> str:
    """Fetch underwriting-guideline text relevant to a business type.

    Args:
        business_type: Free-text business category used to build the search query.

    Returns:
        The page content of every retrieved chunk, separated by blank lines.
    """
    query = f"Underwriting guidelines for {business_type}"
    # invoke() replaces the deprecated get_relevant_documents() retriever call.
    docs = retriever.invoke(query)
    return "\n\n".join(doc.page_content for doc in docs)

# Step 2: Prompt + chain that turns guidelines into underwriting questions
question_prompt = PromptTemplate(
    template="""
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate a list of 5 to 10 key underwriting questions that would help assess eligibility or risk for this business. Use a mix of yes/no and short-answer formats. Each question should reflect a specific requirement or risk consideration.
""",
    input_variables=["business_type", "guidelines"],
)
question_chain = LLMChain(prompt=question_prompt, llm=llm)

# Step 3: Prompt + chain that simulates an applicant answering those questions
answer_prompt = PromptTemplate(
    template="""
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Generate realistic, internally consistent answers for each question. Use natural business behavior and practical values. Provide short but detailed answers.
""",
    input_variables=["business_type", "questions"],
)
answer_chain = LLMChain(prompt=answer_prompt, llm=llm)

# Step 4: Prompt that asks for the filled-in application as JSON
from langchain.prompts import PromptTemplate

application_prompt = PromptTemplate(
    template="""
You are filling out a Business Owner Policy (BOP) insurance application for a "{business_type}".

Based on the following answered underwriting questions, generate a complete and realistic application record. Even if a field is not mentioned in the questions, infer it using common industry knowledge and consistency. Do not leave any values blank.

You must fill in the following fields:
{fields_to_fill}

Answered Questions:
{qa_pairs}

Return the application as a well-structured JSON object.
""",
    input_variables=["business_type", "qa_pairs", "fields_to_fill"],
)

from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

# A second chat-model instance with the same settings backs the application chain.
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)
application_chain = LLMChain(prompt=application_prompt, llm=llm)

# Step 5: run function
def generate_bop_application(business_type: str, qa_pairs: str) -> str:
    """Ask the application chain to fill out a BOP application.

    Args:
        business_type: Business category inserted into the prompt.
        qa_pairs: Answered underwriting questions, as one text block.

    Returns:
        The chain's raw "text" output. It is requested as JSON but returned
        unparsed (hence ``str``, not ``dict``); the caller decides whether to
        ``json.loads`` it.
    """
    response = application_chain.invoke({
        "business_type": business_type,
        "qa_pairs": qa_pairs,
        "fields_to_fill": field_descriptions
    })
    return response["text"]  # or return json.loads(response["text"]) if it's valid JSON

# Run the pipeline
the_biz = "Manufacturing technologies"
guidelines = get_guidelines(the_biz)

questions = question_chain.invoke({
    "business_type": the_biz,
    "guidelines": guidelines
})["text"]

answers = answer_chain.invoke({
    "business_type": the_biz,
    "questions": questions
})["text"]

# Drop blank lines BEFORE pairing. Filtering after zip() let a blank line in one
# reply shift every later question onto the wrong answer (or drop the pair).
question_lines = [line.strip() for line in questions.split("\n") if line.strip()]
answer_lines = [line.strip() for line in answers.split("\n") if line.strip()]
if len(question_lines) == len(answer_lines):
    qa_pairs = "\n".join(f"{q} — {a}" for q, a in zip(question_lines, answer_lines))
else:
    # Line counts differ (e.g. multi-line answers): pairing by position would be
    # wrong, so hand both blocks over intact and let the model match them up.
    qa_pairs = f"Questions:\n{questions.strip()}\n\nAnswers:\n{answers.strip()}"

application_json = generate_bop_application(the_biz, qa_pairs)

# === Print the result ===
import json
print("Underwriting Questions:\n", questions, "\n")
print("Simulated Answers:\n", answers, "\n")
try:
    pretty_application = json.dumps(json.loads(application_json), indent=2)
except json.JSONDecodeError:
    # The reply is free text from the model; show it raw rather than lose the
    # result of three LLM calls to a formatting error.
    pretty_application = application_json
print("Final Application JSON:\n", pretty_application)


  llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")
  question_chain = LLMChain(llm=llm, prompt=question_prompt)


Underwriting Questions:
 1. How many locations does your manufacturing technologies business operate from? 
2. Has the building's core systems or roofing been updated in the recent past? If yes, when was it last updated?
3. Does the business involve any emergency response, LPG work, or manufacturing activities? If yes, please elaborate.
4. Does the business maintain a sensitive customer database? If yes, is there an existing cyber liability insurance coverage?
5. What is the estimated value of your inventory? 
6. Does the business have any online sales? If yes, what percentage of your revenue is derived from online sales?
7. How many claims has the business filed in the past three years?
8. Does your business operate in a disaster-prone area? If yes, do you have flood/wind/earthquake coverage?
9. Do you have a safety program in place for your employees? If yes, please provide brief details about it.
10. Does your business involve any high-pressure boiler work or hazardous site exposure

In [None]:
import pandas as pd
import json
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback

# === Reference tables read from the CSV exports ===
coverages_df = pd.read_csv("BOP Coverages - coverages.csv")
sic_codes_df = pd.read_csv("sic-codes.csv")
fields_df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# === Submission fields: distinct element names, first occurrence order, one per line ===
all_submission_fields = fields_df["Element Name"].unique().tolist()
fields_to_fill = "\n".join(all_submission_fields)

# === Chat model, saved FAISS index, and a retriever configured with k=4 ===
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# === Prompt Templates ===
# Underwriter persona: turn retrieved guidelines into screening questions.
question_prompt = PromptTemplate(
    template="""
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate 5 to 10 underwriting questions that would help assess eligibility or risk. Use yes/no and short-answer formats.
""",
    input_variables=["business_type", "guidelines"],
)

# Applicant persona: answer the generated questions.
answer_prompt = PromptTemplate(
    template="""
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Provide realistic, internally consistent answers. Use natural behavior and practical values.
""",
    input_variables=["business_type", "questions"],
)

# Owner persona: free-form description of the business.
owner_prompt = PromptTemplate(
    template="""
You are a business owner who owns a "{business_type}" type of business.

Here are underwriting questions:
{questions}

Now answer:
- What does your business do?
- Who are your customers?
- How many employees do you have (full/part-time)?
- What is your annual payroll?
- Where do you operate (city/neighborhood)?
- Do you have a physical location (size, features)?
- Anything unique or special?
- How do you make money?
- What is your annual gross sales?

Be natural, realistic, and detailed.
""",
    input_variables=["business_type", "questions"],
)

# Final step: combine answers and description into the JSON application.
application_prompt = PromptTemplate(
    template="""
You are completing a Business Owner Policy (BOP) insurance application for a "{business_type}".

Use these answered questions and business owner description to complete it:

Answered Questions:
{qa_pairs}

Business Description:
{business_owner_description}

Do not leave any values blank.

You must fill in the following fields:
{fields_to_fill}

Return the application as a well-structured JSON object.
""",
    input_variables=["business_type", "qa_pairs", "fields_to_fill", "business_owner_description"],
)

# === Chains ===
# One LLMChain per prompt, all sharing the same chat model.
question_chain, answer_chain, owner_chain, application_chain = (
    LLMChain(llm=llm, prompt=prompt)
    for prompt in (question_prompt, answer_prompt, owner_prompt, application_prompt)
)

# === Utility Functions ===
def get_guidelines(business_type: str) -> str:
    """Fetch underwriting-guideline text relevant to a business type.

    Args:
        business_type: Free-text business category used to build the search query.

    Returns:
        The page content of the first three retrieved chunks, separated by
        blank lines.
    """
    # invoke() replaces the deprecated get_relevant_documents() retriever call.
    docs = retriever.invoke(f"Underwriting guidelines for {business_type}")
    return "\n\n".join(doc.page_content for doc in docs[:3])

# === Main Function ===
def _pair_questions_with_answers(questions: str, answers: str) -> str:
    """Line up question lines with answer lines for the application prompt.

    Blank lines are removed before pairing, so a blank line in one reply cannot
    shift later questions onto the wrong answers. If the two replies still have
    different line counts, both blocks are returned intact under headings
    instead of being paired by position.
    """
    question_lines = [line.strip() for line in questions.split("\n") if line.strip()]
    answer_lines = [line.strip() for line in answers.split("\n") if line.strip()]
    if len(question_lines) == len(answer_lines):
        return "\n".join(f"{q} — {a}" for q, a in zip(question_lines, answer_lines))
    return f"Questions:\n{questions.strip()}\n\nAnswers:\n{answers.strip()}"


def _parse_application_json(raw_text: str):
    """Decode the model's reply, tolerating a Markdown fence or surrounding prose.

    Raises:
        json.JSONDecodeError: if no parseable JSON can be found in the reply.
    """
    text = raw_text.strip()
    # A reply may arrive wrapped in ``` or ```json fences; keep only the payload.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Last resort: parse the outermost {...} span and ignore text around it.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def generate_bop_application(business_type: str) -> dict:
    """Generate one synthetic BOP application for a business type.

    Runs four chains in sequence: guideline-based questions, simulated answers,
    an owner's description of the business, and the final application. The
    total token count reported by the OpenAI callback is printed.

    Args:
        business_type: Business category inserted into every prompt and into
            the guideline search query.

    Returns:
        The application decoded from the model's JSON reply.

    Raises:
        json.JSONDecodeError: if the final reply contains no parseable JSON.
    """
    with get_openai_callback() as cb:
        guidelines = get_guidelines(business_type)

        questions = question_chain.invoke({
            "business_type": business_type,
            "guidelines": guidelines
        })["text"]

        answers = answer_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        qa_pairs = _pair_questions_with_answers(questions, answers)

        owner_description = owner_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        final_app = application_chain.invoke({
            "business_type": business_type,
            "qa_pairs": qa_pairs,
            "fields_to_fill": fields_to_fill,
            "business_owner_description": owner_description
        })["text"]

        print(f"Total tokens used: {cb.total_tokens}")
        return _parse_application_json(final_app)

# === Example Run ===
if __name__ == "__main__":
    # Single end-to-end example: generate one application and pretty-print it.
    biz_type = "Manufacturing Technologies"
    app = generate_bop_application(biz_type)
    print(json.dumps(app, indent=2))


Total tokens used: 4082
{
  "name": "Manufacturing Technologies",
  "fein_or_soc_sec": "123-45-6789",
  "business_type": "Industrial equipment and machinery manufacturing",
  "mailing_address": "123 Industrial Park Drive, Lexington, KY 40511",
  "contact_for_inspection": "John Doe, CEO",
  "gl_code": "61224",
  "sic": "3569",
  "nature_of_business": "Manufacturing",
  "description_of_operations": "Design, development, and manufacturing of specialized machinery and equipment used in various industrial sectors.",
  "date_business_started": "01/01/2000",
  "effective_date": "01/01/2022",
  "expiration_date": "01/01/2023",
  "new_renewal": "Renewal",
  "payment_plan": "Annual",
  "total_premium": "$10,000",
  "policy_number": "XYZ789101",
  "general_info_questions": {
    "more_than_four_locations": "No",
    "outdated_systems_or_roofing": "No",
    "high_risk_manufacturing": "No",
    "current_insurance_coverage": "General liability insurance, property insurance, workers' compensation ins

In [None]:
import pandas as pd
import json
import re
import random
import uuid
from datetime import datetime
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback

# === Reference tables read from the CSV exports ===
coverages_df = pd.read_csv("BOP Coverages - coverages.csv")
sic_codes_df = pd.read_csv("sic-codes.csv")
fields_df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# === Submission fields: distinct element names, first occurrence order, one per line ===
all_submission_fields = fields_df["Element Name"].unique().tolist()
fields_to_fill = "\n".join(all_submission_fields)

# === Business categories: non-blank lines of the text file, capped at the first 10 ===
with open("bop_categories.txt") as f:
    bop_categories = list(filter(None, map(str.strip, f)))[:10]

# === Chat model, saved FAISS index, and a retriever configured with k=4 ===
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# === Prompt Templates ===
# Underwriter persona: turn retrieved guidelines into screening questions.
question_prompt = PromptTemplate(
    template="""
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate 5 to 10 underwriting questions that would help assess eligibility or risk. Use yes/no and short-answer formats.
""",
    input_variables=["business_type", "guidelines"],
)

# Applicant persona: answer the generated questions.
answer_prompt = PromptTemplate(
    template="""
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Provide realistic, internally consistent answers. Use natural behavior and practical values.
""",
    input_variables=["business_type", "questions"],
)

# Owner persona: free-form description of the business.
owner_prompt = PromptTemplate(
    template="""
You are a business owner who owns a "{business_type}" type of business.

Here are underwriting questions:
{questions}

Now answer:
- What does your business do?
- Who are your customers?
- How many employees do you have (full/part-time)?
- What is your annual payroll?
- Where do you operate (city/neighborhood)?
- Do you have a physical location (size, features)?
- Anything unique or special?
- How do you make money?
- What is your annual gross sales?

Be natural, realistic, and detailed.
""",
    input_variables=["business_type", "questions"],
)

# Final step: combine answers and description into the JSON application.
application_prompt = PromptTemplate(
    template="""
You are completing a Business Owner Policy (BOP) insurance application for a "{business_type}".

Use these answered questions and business owner description to complete it:

Answered Questions:
{qa_pairs}

Business Description:
{business_owner_description}

Do not leave any values blank.

You must fill in the following fields:
{fields_to_fill}

Return the application as a well-structured JSON object.
""",
    input_variables=["business_type", "qa_pairs", "fields_to_fill", "business_owner_description"],
)

# === Chains ===
# One LLMChain per prompt, all sharing the same chat model.
question_chain, answer_chain, owner_chain, application_chain = (
    LLMChain(llm=llm, prompt=prompt)
    for prompt in (question_prompt, answer_prompt, owner_prompt, application_prompt)
)

# === Utility Functions ===
def get_guidelines(business_type: str) -> str:
    """Fetch underwriting-guideline text relevant to a business type.

    Args:
        business_type: Free-text business category used to build the search query.

    Returns:
        The page content of the first three retrieved chunks, separated by
        blank lines.
    """
    # invoke() replaces the deprecated get_relevant_documents() retriever call.
    docs = retriever.invoke(f"Underwriting guidelines for {business_type}")
    return "\n\n".join(doc.page_content for doc in docs[:3])

def simulate_credit_score() -> dict:
    """Draw a synthetic credit score and its rating band.

    A uniform draw selects the band: below 0.2 gives "Low" (450-599), below 0.6
    gives "Average" (600-699), anything else gives "Good" (700-850). The score
    is then drawn uniformly from the band's inclusive range.

    Returns:
        A dict with keys "score" (int) and "rating" (str).
    """
    draw = random.random()
    if draw >= 0.6:
        low, high, label = 700, 850, "Good"
    elif draw >= 0.2:
        low, high, label = 600, 699, "Average"
    else:
        low, high, label = 450, 599, "Low"
    return {"score": random.randint(low, high), "rating": label}

def simulate_google_reviews(business_name: str) -> dict:
    """Fabricate a Google-reviews summary from canned review texts.

    Args:
        business_name: Accepted for interface symmetry; not used by the
            simulation.

    Returns:
        A dict with "review_count" (int, 5-250), "average_rating" (float,
        1.0-5.0, one decimal) and "reviews" (five sampled review strings,
        drawn with replacement).
    """
    review_count = random.randint(5, 250)
    rating = round(random.uniform(1.0, 5.0), 1)
    all_reviews = [
        "Great customer service and reliable work!",
        "Mediocre experience, not bad but could be better.",
        "I had to wait too long, not coming back.",
        "Wonderful staff and professional results.",
        "Overpriced and disappointing experience."
    ]
    # Pool for the ~10% "badly reviewed business" case. It is listed explicitly:
    # slicing all_reviews[-2:] also picked up the positive "Wonderful staff..."
    # review, so that case was not actually all-negative.
    negative_reviews = [
        "I had to wait too long, not coming back.",
        "Overpriced and disappointing experience.",
    ]
    pool = negative_reviews if random.random() < 0.1 else all_reviews
    samples = random.choices(pool, k=5)
    return {
        "review_count": review_count,
        "average_rating": rating,
        "reviews": samples
    }

def simulate_claims_history(credit_score: int, avg_rating: float) -> dict:
    """Fabricate a claims history whose size depends on credit and reviews.

    Args:
        credit_score: Simulated credit score.
        avg_rating: Simulated average review rating.

    Returns:
        A dict with "num_claims" and "total_losses"; the latter is the claim
        count multiplied by one severity drawn from 1000-10000.
    """
    high_risk = credit_score < 600 or avg_rating < 2.5
    medium_risk = credit_score < 700 or avg_rating < 3.5

    if high_risk:
        claim_count = random.randint(2, 5)
    elif medium_risk:
        claim_count = random.randint(1, 3)
    else:
        claim_count = random.choice([0, 1])

    # One severity figure is drawn and applied to every claim.
    severity = random.randint(1000, 10000)
    return {"num_claims": claim_count, "total_losses": claim_count * severity}

# === Batch Simulation ===
def _application_name(app: dict, default: str) -> str:
    """Return the applicant name from a generated application.

    The exact key "NAME" wins when present; otherwise the first key that equals
    "name" ignoring case and surrounding whitespace is used, because the model
    chooses its own key casing (an earlier run in this notebook produced
    "name"). Falls back to ``default`` when no such key exists.
    """
    if "NAME" in app:
        return app["NAME"]
    for key, value in app.items():
        if isinstance(key, str) and key.strip().lower() == "name":
            return value
    return default


def batch_generate_applications(n_per_category=1, output_dir="/mnt/data"):
    """Generate applications plus simulated third-party data for every category.

    For each entry of ``bop_categories`` this generates ``n_per_category``
    applications and, for each, a simulated credit score, review summary and
    claims history. A failure for one application is printed and skipped so the
    rest of the batch still runs.

    Args:
        n_per_category: Number of applications to generate per category.
        output_dir: Folder that receives "generated_bop_applications.json" and
            "generated_bop_third_party_data.json". Created if missing.
    """
    import os  # local import: this cell's import list does not include os

    # Create the destination up front so a missing or unwritable folder is
    # detected before any API calls are made, not after the batch has run.
    os.makedirs(output_dir, exist_ok=True)

    applications = []
    external_data = []
    for category in bop_categories:
        for _ in range(n_per_category):
            try:
                # The random suffix makes each generated business type distinct.
                business_type = f"{category} - {uuid.uuid4().hex[:6]}"
                app = generate_bop_application(business_type)
                business_name = _application_name(app, business_type)
                credit = simulate_credit_score()
                reviews = simulate_google_reviews(business_name)
                claims = simulate_claims_history(credit["score"], reviews["average_rating"])

                applications.append(app)
                external_data.append({
                    "Business Name": business_name,
                    "Credit Score": credit["score"],
                    "Credit Rating": credit["rating"],
                    "Google Review Count": reviews["review_count"],
                    "Average Review Rating": reviews["average_rating"],
                    "Sample Reviews": reviews["reviews"],
                    "Number of Claims": claims["num_claims"],
                    "Total Losses": claims["total_losses"]
                })
            except Exception as e:
                # Deliberate best-effort: report and continue with the next item.
                print(f"Error generating for {category}: {e}")

    pd.DataFrame(applications).to_json(
        os.path.join(output_dir, "generated_bop_applications.json"), orient="records", indent=2
    )
    pd.DataFrame(external_data).to_json(
        os.path.join(output_dir, "generated_bop_third_party_data.json"), orient="records", indent=2
    )
    print("Batch generation completed.")

# === Main Function ===
def _pair_questions_with_answers(questions: str, answers: str) -> str:
    """Line up question lines with answer lines for the application prompt.

    Blank lines are removed before pairing, so a blank line in one reply cannot
    shift later questions onto the wrong answers. If the two replies still have
    different line counts, both blocks are returned intact under headings
    instead of being paired by position.
    """
    question_lines = [line.strip() for line in questions.split("\n") if line.strip()]
    answer_lines = [line.strip() for line in answers.split("\n") if line.strip()]
    if len(question_lines) == len(answer_lines):
        return "\n".join(f"{q} — {a}" for q, a in zip(question_lines, answer_lines))
    return f"Questions:\n{questions.strip()}\n\nAnswers:\n{answers.strip()}"


def _parse_application_json(raw_text: str):
    """Decode the model's reply, tolerating a Markdown fence or surrounding prose.

    Raises:
        json.JSONDecodeError: if no parseable JSON can be found in the reply.
    """
    text = raw_text.strip()
    # A reply may arrive wrapped in ``` or ```json fences; keep only the payload.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Last resort: parse the outermost {...} span and ignore text around it.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def generate_bop_application(business_type: str) -> dict:
    """Generate one synthetic BOP application for a business type.

    Runs four chains in sequence: guideline-based questions, simulated answers,
    an owner's description of the business, and the final application. The
    total token count reported by the OpenAI callback is printed.

    Args:
        business_type: Business category inserted into every prompt and into
            the guideline search query.

    Returns:
        The application decoded from the model's JSON reply.

    Raises:
        json.JSONDecodeError: if the final reply contains no parseable JSON.
    """
    with get_openai_callback() as cb:
        guidelines = get_guidelines(business_type)

        questions = question_chain.invoke({
            "business_type": business_type,
            "guidelines": guidelines
        })["text"]

        answers = answer_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        qa_pairs = _pair_questions_with_answers(questions, answers)

        owner_description = owner_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        final_app = application_chain.invoke({
            "business_type": business_type,
            "qa_pairs": qa_pairs,
            "fields_to_fill": fields_to_fill,
            "business_owner_description": owner_description
        })["text"]

        print(f"Total tokens used: {cb.total_tokens}")
        return _parse_application_json(final_app)

# === Example Run ===
if __name__ == "__main__":
    # Generate one application per category and write both JSON output files.
    batch_generate_applications(n_per_category=1)


  llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")
  question_chain = LLMChain(llm=llm, prompt=question_prompt)


Total tokens used: 4340
Total tokens used: 3965
Total tokens used: 4180
Total tokens used: 4534
Total tokens used: 3845


KeyboardInterrupt: 

In [None]:
import pandas as pd
import json
import re
import random
import uuid
from datetime import datetime
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback

# === Reference tables read from the CSV exports ===
coverages_df = pd.read_csv("BOP Coverages - coverages.csv")
sic_codes_df = pd.read_csv("sic-codes.csv")
fields_df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# === Submission fields: distinct element names, first occurrence order, one per line ===
all_submission_fields = fields_df["Element Name"].unique().tolist()
fields_to_fill = "\n".join(all_submission_fields)

# === Business categories: non-blank lines of the text file, capped at the first 4 ===
with open("bop_categories.txt") as f:
    bop_categories = list(filter(None, map(str.strip, f)))[:4]

# === Chat model, saved FAISS index, and a retriever configured with k=4 ===
llm = ChatOpenAI(model_name="gpt-4", temperature=0.7)
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))
# Second chat model with a higher temperature, used for review generation.
llm2 = ChatOpenAI(model_name="gpt-4", temperature=0.9)

# === Prompt Templates ===
# Underwriter persona: turn retrieved guidelines into screening questions.
question_prompt = PromptTemplate(
    template="""
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate 5 to 10 underwriting questions that would help assess eligibility or risk. Use yes/no and short-answer formats.
""",
    input_variables=["business_type", "guidelines"],
)

# Applicant persona: answer the generated questions.
answer_prompt = PromptTemplate(
    template="""
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Provide realistic, internally consistent answers. Use natural behavior and practical values.
""",
    input_variables=["business_type", "questions"],
)

# Owner persona: free-form description of the business.
owner_prompt = PromptTemplate(
    template="""
You are a business owner who owns a "{business_type}" type of business.

Here are underwriting questions:
{questions}

Now answer:
- What does your business do?
- Who are your customers?
- How many employees do you have (full/part-time)?
- What is your annual payroll?
- Where do you operate (city/neighborhood)?
- Do you have a physical location (size, features)?
- Anything unique or special?
- How do you make money?
- What is your annual gross sales?

Be natural, realistic, and detailed.
""",
    input_variables=["business_type", "questions"],
)

# Customer persona: one-sentence review in a requested tone.
review_prompt = PromptTemplate(
    template="""
You are writing a short customer review for a business of type "{business_type}".
Tone: {tone}
Write a realistic one-sentence review specific to this type of business.
""",
    input_variables=["business_type", "tone"],
)

# Final step: combine answers and description into the JSON application.
application_prompt = PromptTemplate(
    template="""
You are filling out a Business Owner Policy (BOP) insurance application for a \"{business_type}\".

Based on the following answered underwriting questions and business owner description, generate a complete and realistic application record.
Even if a field is not mentioned in the input, infer it using common industry knowledge and logical consistency.
Do not leave any values blank. If a value is unknown, infer something plausible.

You must fill in the following fields:
{fields_to_fill}

Answered Questions:
{qa_pairs}

Business Description:
{business_owner_description}

Return the application as a well-structured JSON object.
""",
    input_variables=["business_type", "qa_pairs", "fields_to_fill", "business_owner_description"],
)

# === Chains ===
# The four application-pipeline chains share `llm`; reviews use `llm2`.
question_chain, answer_chain, owner_chain, application_chain = (
    LLMChain(llm=llm, prompt=prompt)
    for prompt in (question_prompt, answer_prompt, owner_prompt, application_prompt)
)
review_chain = LLMChain(prompt=review_prompt, llm=llm2)

# === Utility Functions ===
def get_guidelines(business_type: str) -> str:
    """Fetch underwriting-guideline text relevant to a business type.

    Args:
        business_type: Free-text business category used to build the search query.

    Returns:
        The page content of the first three retrieved chunks, separated by
        blank lines.
    """
    # invoke() replaces the deprecated get_relevant_documents() retriever call.
    docs = retriever.invoke(f"Underwriting guidelines for {business_type}")
    return "\n\n".join(doc.page_content for doc in docs[:3])

def simulate_credit_score() -> dict:
    """Draw a synthetic credit score together with its rating band.

    One uniform draw picks the band by cumulative threshold (0.2 for "Low",
    0.6 for "Average", otherwise "Good"); the score is then drawn from that
    band's inclusive range.

    Returns:
        A dict with keys "score" (int) and "rating" (str).
    """
    roll = random.random()
    # (exclusive upper threshold, lowest score, highest score, label)
    bands = (
        (0.2, 450, 599, "Low"),
        (0.6, 600, 699, "Average"),
    )
    for ceiling, lowest, highest, label in bands:
        if roll < ceiling:
            return {"score": random.randint(lowest, highest), "rating": label}
    return {"score": random.randint(700, 850), "rating": "Good"}

def simulate_google_reviews(business_name: str, business_type: str) -> dict:
    """Fabricate a Google-reviews summary with LLM-written sample reviews.

    Args:
        business_name: Accepted but not used by the simulation.
        business_type: Business category passed to the review prompt.

    Returns:
        A dict with "review_count" (int, 5-250), "average_rating" (float,
        1.0-5.0, one decimal) and "reviews" (five generated review strings).
    """
    review_count = random.randint(5, 250)
    rating = round(random.uniform(1.0, 5.0), 1)

    # Roughly one business in ten gets five negative reviews; the rest get a
    # weighted mix of tones.
    if random.random() < 0.1:
        tones = ["negative"] * 5
    else:
        tones = random.choices(["positive", "neutral", "negative"], weights=[0.5, 0.3, 0.2], k=5)

    samples = []
    for tone in tones:
        reply = review_chain.invoke({"business_type": business_type, "tone": tone})
        samples.append(reply["text"].strip())

    return {
        "review_count": review_count,
        "average_rating": rating,
        "reviews": samples
    }

def simulate_claims_history(credit_score: int, avg_rating: float) -> dict:
    """Fabricate a claims history whose size depends on credit and reviews.

    Args:
        credit_score: Simulated credit score.
        avg_rating: Simulated average review rating.

    Returns:
        A dict with "num_claims" and "total_losses"; the latter is the claim
        count multiplied by one severity drawn from 1000-10000.
    """
    if credit_score < 600 or avg_rating < 2.5:
        claim_range = (2, 5)
    elif credit_score < 700 or avg_rating < 3.5:
        claim_range = (1, 3)
    else:
        claim_range = None

    if claim_range is None:
        # Best tier: either no claims or a single one.
        claim_count = random.choice([0, 1])
    else:
        claim_count = random.randint(*claim_range)

    # One severity figure is drawn and applied to every claim.
    per_claim_loss = random.randint(1000, 10000)
    return {
        "num_claims": claim_count,
        "total_losses": claim_count * per_claim_loss,
    }

# === Batch Simulation ===
def _application_name(app: dict, default: str) -> str:
    """Return the applicant name from a generated application.

    The exact key "NAME" wins when present; otherwise the first key that equals
    "name" ignoring case and surrounding whitespace is used, because the model
    chooses its own key casing (an earlier run in this notebook produced
    "name"). Falls back to ``default`` when no such key exists.
    """
    if "NAME" in app:
        return app["NAME"]
    for key, value in app.items():
        if isinstance(key, str) and key.strip().lower() == "name":
            return value
    return default


def batch_generate_applications(n_per_category=1):
    """Generate applications plus simulated third-party data for every category.

    For each entry of ``bop_categories`` this generates ``n_per_category``
    applications and, for each, a simulated credit score, review summary and
    claims history. A failure for one application is printed and skipped so the
    rest of the batch still runs. Results are written to
    "generated_bop_applications.json" and "generated_bop_third_party_data.json"
    in the working directory.

    Args:
        n_per_category: Number of applications to generate per category.
    """
    applications = []
    external_data = []
    for category in bop_categories:
        for _ in range(n_per_category):
            try:
                # The random suffix makes each generated business type distinct.
                business_type = f"{category} - {uuid.uuid4().hex[:6]}"
                app = generate_bop_application(business_type)
                business_name = _application_name(app, business_type)
                credit = simulate_credit_score()
                reviews = simulate_google_reviews(business_name, business_type)
                claims = simulate_claims_history(credit["score"], reviews["average_rating"])

                applications.append(app)
                external_data.append({
                    "Business Name": business_name,
                    "Credit Score": credit["score"],
                    "Credit Rating": credit["rating"],
                    "Google Review Count": reviews["review_count"],
                    "Average Review Rating": reviews["average_rating"],
                    "Sample Reviews": reviews["reviews"],
                    "Number of Claims": claims["num_claims"],
                    "Total Losses": claims["total_losses"]
                })
            except Exception as e:
                # Deliberate best-effort: report and continue with the next item.
                print(f"Error generating for {category}: {e}")

    pd.DataFrame(applications).to_json("generated_bop_applications.json", orient="records", indent=2)
    pd.DataFrame(external_data).to_json("generated_bop_third_party_data.json", orient="records", indent=2)
    print("Batch generation completed.")

# === Main Function ===
def _pair_questions_with_answers(questions: str, answers: str) -> str:
    """Line up question lines with answer lines for the application prompt.

    Blank lines are removed before pairing, so a blank line in one reply cannot
    shift later questions onto the wrong answers. If the two replies still have
    different line counts, both blocks are returned intact under headings
    instead of being paired by position.
    """
    question_lines = [line.strip() for line in questions.split("\n") if line.strip()]
    answer_lines = [line.strip() for line in answers.split("\n") if line.strip()]
    if len(question_lines) == len(answer_lines):
        return "\n".join(f"{q} — {a}" for q, a in zip(question_lines, answer_lines))
    return f"Questions:\n{questions.strip()}\n\nAnswers:\n{answers.strip()}"


def _parse_application_json(raw_text: str):
    """Decode the model's reply, tolerating a Markdown fence or surrounding prose.

    Raises:
        json.JSONDecodeError: if no parseable JSON can be found in the reply.
    """
    text = raw_text.strip()
    # A reply may arrive wrapped in ``` or ```json fences; keep only the payload.
    fenced = re.search(r"```(?:json)?\s*(.*?)```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1).strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Last resort: parse the outermost {...} span and ignore text around it.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])


def generate_bop_application(business_type: str) -> dict:
    """Generate one synthetic BOP application for a business type.

    Runs four chains in sequence: guideline-based questions, simulated answers,
    an owner's description of the business, and the final application. The
    total token count reported by the OpenAI callback is printed.

    Args:
        business_type: Business category inserted into every prompt and into
            the guideline search query.

    Returns:
        The application decoded from the model's JSON reply.

    Raises:
        json.JSONDecodeError: if the final reply contains no parseable JSON.
    """
    with get_openai_callback() as cb:
        guidelines = get_guidelines(business_type)

        questions = question_chain.invoke({
            "business_type": business_type,
            "guidelines": guidelines
        })["text"]

        answers = answer_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        qa_pairs = _pair_questions_with_answers(questions, answers)

        owner_description = owner_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        final_app = application_chain.invoke({
            "business_type": business_type,
            "qa_pairs": qa_pairs,
            "fields_to_fill": fields_to_fill,
            "business_owner_description": owner_description
        })["text"]

        print(f"Total tokens used: {cb.total_tokens}")
        return _parse_application_json(final_app)

# === Example Run ===
if __name__ == "__main__":
    # Generate one application per category and write both JSON output files.
    batch_generate_applications(n_per_category=1)


Total tokens used: 4318
Total tokens used: 4204
Total tokens used: 4448
Total tokens used: 4087
Batch generation completed.


In [13]:
import pandas as pd
import json
import re
import random
import uuid
from datetime import datetime
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback

# === Load Data ===
# Reference tables read from CSV files in the working directory.
coverages_df = pd.read_csv("BOP Coverages - coverages.csv")
sic_codes_df = pd.read_csv("sic-codes.csv")
fields_df = pd.read_csv("BOP Policy Submission Details - BOP Submission Elements.csv")

# === Prepare Submission Fields ===
# Unique element names in first-seen order, plus a newline-joined copy that is
# substituted into the application prompt.
all_submission_fields = (
    fields_df.drop_duplicates(subset="Element Name")["Element Name"].tolist()
)
fields_to_fill = "\n".join(all_submission_fields)

# === Business Categories ===
# One category per non-blank line; only the first four are kept.
with open("bop_categories.txt") as f:
    bop_categories = list(filter(None, map(str.strip, f)))[:4]

# === LLM Setup ===
# Two gpt-4 clients that differ only in sampling temperature (0.7 and 0.9).
llm, llm2 = (
    ChatOpenAI(temperature=temperature, model_name="gpt-4")
    for temperature in (0.7, 0.9)
)
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading; presumably this index was saved earlier by this notebook — only load
# index files you trust.
vectorstore = FAISS.load_local(
    "bop_vectorstore",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# === Prompt Templates ===
# Underwriter role: asks for 5 to 10 underwriting questions given the business
# type and the supplied guideline text.
question_prompt = PromptTemplate(
    input_variables=["business_type", "guidelines"],
    template="""
You are a commercial insurance underwriter.

Given the business type "{business_type}" and the following underwriting guidelines:

{guidelines}

Generate 5 to 10 underwriting questions that would help assess eligibility or risk. Use yes/no and short-answer formats.
"""
)

# Applicant simulation: asks for realistic, internally consistent answers to
# the generated underwriting questions.
answer_prompt = PromptTemplate(
    input_variables=["business_type", "questions"],
    template="""
You are simulating answers to an insurance application for a "{business_type}".

Here are the underwriting questions:
{questions}

Provide realistic, internally consistent answers. Use natural behavior and practical values.
"""
)

# Business-owner role: asks for a free-text business profile (operations,
# customers, staffing, payroll, location, revenue).
owner_prompt = PromptTemplate(
    input_variables=["business_type", "questions"],
    template="""
You are a business owner who owns a "{business_type}" type of business.

Here are underwriting questions:
{questions}

Now answer:
- What does your business do?
- Who are your customers?
- How many employees do you have (full/part-time)?
- What is your annual payroll?
- Where do you operate (city/neighborhood)?
- Do you have a physical location (size, features)?
- Anything unique or special?
- How do you make money?
- What is your annual gross sales?

Be natural, realistic, and detailed.
"""
)

# Customer-review simulation: asks for a one-sentence review in the given tone.
review_prompt = PromptTemplate(
    input_variables=["business_type", "tone"],
    template="""
You are writing a short customer review for a business of type "{business_type}".
Tone: {tone}
Write a realistic one-sentence review specific to this type of business.
"""
)

# Application filling: asks for a JSON record covering every listed field,
# with plausible inferred values for anything the inputs do not mention.
application_prompt = PromptTemplate(
    input_variables=["business_type", "qa_pairs", "fields_to_fill", "business_owner_description"],
    template="""
You are filling out a Business Owner Policy (BOP) insurance application for a "{business_type}".

Based on the following answered underwriting questions and business owner description, generate a complete and realistic application record.
Even if a field is not mentioned in the input, infer it using common industry knowledge and logical consistency.
Do not leave any values blank. If a value is unknown, infer something plausible.

You must fill in the following fields:
{fields_to_fill}

Answered Questions:
{qa_pairs}

Business Description:
{business_owner_description}

Return the application as a well-structured JSON object.
"""
)

# === Chains ===
# The four application-building chains share `llm`; only the review chain
# runs on `llm2`.
question_chain, answer_chain, owner_chain, application_chain = (
    LLMChain(llm=llm, prompt=stage_prompt)
    for stage_prompt in (question_prompt, answer_prompt, owner_prompt, application_prompt)
)
review_chain = LLMChain(llm=llm2, prompt=review_prompt)

# === Utility Functions ===
def create_empty_application_template(path="empty_bop_template.json"):
    """Write a JSON object mapping every submission field to null.

    Args:
        path: Destination file; overwritten if it already exists.
    """
    blank_record = dict.fromkeys(all_submission_fields)
    with open(path, "w") as out_file:
        out_file.write(json.dumps(blank_record, indent=2))
    print(f"Empty application template saved to {path}")

def get_guidelines(business_type: str) -> str:
    """Return guideline text retrieved for ``business_type``.

    Queries the module-level ``retriever`` and joins the page content of the
    first three returned documents with blank lines.

    Args:
        business_type: Business category used to build the retrieval query.

    Returns:
        The concatenated page contents (empty string if nothing is returned).
    """
    # `invoke` is the Runnable entry point already used for the chains and LLMs
    # in this notebook; `get_relevant_documents` is deprecated in LangChain.
    docs = retriever.invoke(f"Underwriting guidelines for {business_type}")
    return "\n\n".join(doc.page_content for doc in docs[:3])

def simulate_credit_score() -> dict:
    """Draw a random credit score together with its rating band.

    A single uniform draw picks the band (20% "Low", 40% "Average",
    40% "Good"); the score is then drawn uniformly within that band.

    Returns:
        ``{"score": int, "rating": str}``.
    """
    band_draw = random.random()
    if band_draw >= 0.6:
        lowest, highest, label = 700, 850, "Good"
    elif band_draw >= 0.2:
        lowest, highest, label = 600, 699, "Average"
    else:
        lowest, highest, label = 450, 599, "Low"
    return {"score": random.randint(lowest, highest), "rating": label}

def simulate_google_reviews(business_name: str, business_type: str) -> dict:
    """Simulate a review summary and five LLM-written sample reviews.

    The review count and average rating are drawn independently at random.
    With probability 0.1 all five samples are negative; otherwise each tone
    is sampled from positive/neutral/negative with weights 0.5/0.3/0.2.

    Args:
        business_name: Accepted for the caller's convenience; not used here.
        business_type: Business category passed to the review prompt.

    Returns:
        ``{"review_count": int, "average_rating": float, "reviews": list[str]}``.
    """
    total_reviews = random.randint(5, 250)
    mean_rating = round(random.uniform(1.0, 5.0), 1)

    if random.random() < 0.1:
        tones = ["negative"] * 5
    else:
        tones = random.choices(
            ["positive", "neutral", "negative"], weights=[0.5, 0.3, 0.2], k=5
        )

    sample_reviews = []
    for review_tone in tones:
        reply = review_chain.invoke({"business_type": business_type, "tone": review_tone})
        sample_reviews.append(reply["text"].strip())

    return {
        "review_count": total_reviews,
        "average_rating": mean_rating,
        "reviews": sample_reviews
    }

def simulate_claims_history(credit_score: int, avg_rating: float) -> dict:
    """Simulate a claim count and total losses from two risk signals.

    A score below 600 or a rating below 2.5 yields 2-5 claims; otherwise a
    score below 700 or a rating below 3.5 yields 1-3 claims; otherwise 0 or 1.
    Total losses are the claim count times one random amount in 1000-10000.

    Args:
        credit_score: Simulated credit score.
        avg_rating: Simulated average review rating.

    Returns:
        ``{"num_claims": int, "total_losses": int}``.
    """
    high_risk = credit_score < 600 or avg_rating < 2.5
    moderate_risk = credit_score < 700 or avg_rating < 3.5

    if high_risk:
        claim_count = random.randint(2, 5)
    elif moderate_risk:
        claim_count = random.randint(1, 3)
    else:
        claim_count = random.choice([0, 1])

    # The per-claim amount is drawn even when there are no claims.
    amount_per_claim = random.randint(1000, 10000)
    return {
        "num_claims": claim_count,
        "total_losses": claim_count * amount_per_claim
    }

def validate_application(app: dict) -> dict:
    """Normalize a generated application and backfill empty fields via the LLM.

    Steps:
      1. Unwrap a top-level ``"Application"`` object when it is a dict.
      2. Canonicalize keys (underscores -> spaces, trimmed, upper-cased) so
         lookups do not depend on how a field name was spelled. When two keys
         collapse to the same canonical form, the first one wins.
      3. Replace an unparseable or greater-than-500,000 ``TOTAL PREMIUM`` with
         ``"$250,000"``.
      4. Ask ``llm2`` for values for every submission field that is missing or
         holds an empty marker; a failed repair is printed and skipped.
      5. Rebuild the record keyed by ``all_submission_fields`` (unmatched
         fields stay ``None``) and renumber ``GENERAL INFO QUESTIONS`` entries
         as ``"Question 1"``, ``"Question 2"``, ...

    Args:
        app: Application dict, optionally nested under ``"Application"``.

    Returns:
        A dict with one entry per name in ``all_submission_fields``.
    """
    max_premium = 500000
    fallback_premium = "$250,000"
    empty_markers = (None, "", "N/A", "Not Applicable")

    def canon(key) -> str:
        # One normalization shared by every lookup below.
        return str(key).replace("_", " ").strip().upper()

    nested = app.get("Application")
    flat_app = nested if isinstance(nested, dict) else app

    normalized_app = {}
    for key, value in flat_app.items():
        normalized_app.setdefault(canon(key), value)

    # Basic field fixing. str() keeps numeric premiums (e.g. 1500) from being
    # treated as unparseable and overwritten with the fallback.
    try:
        premium_text = str(normalized_app.get("TOTAL PREMIUM", "$0"))
        premium_val = float(premium_text.replace("$", "").replace(",", ""))
        if premium_val > max_premium:
            normalized_app["TOTAL PREMIUM"] = fallback_premium
    except ValueError:
        normalized_app["TOTAL PREMIUM"] = fallback_premium

    # Identify missing or empty fields, using the same key form as the data.
    missing_fields = [
        field for field in all_submission_fields
        if normalized_app.get(canon(field)) in empty_markers
    ]
    if missing_fields:
        repair_prompt = f"""
You are an expert in insurance underwriting. Here is a partially completed BOP application with missing or inconsistent fields:

{json.dumps(normalized_app, indent=2)}

Please provide plausible and realistic values to fill in the following fields:
{missing_fields}

Return only the corrected fields in JSON format.
"""
        try:
            raw_reply = llm2.invoke(repair_prompt).content.strip()
            # The reply may wrap the JSON in Markdown fences or prose; parse
            # only the outermost {...} span when one is present.
            start, end = raw_reply.find("{"), raw_reply.rfind("}")
            if start != -1 and end > start:
                raw_reply = raw_reply[start:end + 1]
            fixed_dict = json.loads(raw_reply)
            normalized_app.update({canon(k): v for k, v in fixed_dict.items()})
        except Exception as e:
            # Best effort: keep the unrepaired values rather than abort.
            print(f"Validation fix failed: {e}")

    # Rebuild the record on the canonical field list.
    canonical_app = {
        field: normalized_app.get(canon(field)) for field in all_submission_fields
    }

    # Normalize keys in GENERAL INFO QUESTIONS
    general_questions = canonical_app.get("GENERAL INFO QUESTIONS")
    if isinstance(general_questions, dict):
        canonical_app["GENERAL INFO QUESTIONS"] = {
            f"Question {i}": answer
            for i, answer in enumerate(general_questions.values(), start=1)
        }

    return canonical_app

# === Batch Simulation ===
def batch_generate_applications(n_per_category=1):
    """Generate applications plus simulated third-party data for every category.

    For each entry in ``bop_categories`` this builds ``n_per_category``
    applications with ``generate_application_incrementally``, passes them
    through ``validate_application``, and attaches a simulated credit score,
    reviews and claims history. Results are written to
    ``generated_bop_applications.json`` and
    ``generated_bop_third_party_data.json``.

    A failure while generating one application is printed and skipped so the
    remaining ones are still produced.

    Args:
        n_per_category: Number of applications to generate per category.
    """
    # Make sure the blank template exists on disk; its contents are not used here.
    try:
        with open("empty_bop_template.json"):
            pass
    except FileNotFoundError:
        create_empty_application_template()

    applications = []
    external_data = []
    for category in bop_categories:
        for _ in range(n_per_category):
            try:
                # A short random suffix gives each generated business a distinct label.
                business_type = f"{category} - {uuid.uuid4().hex[:6]}"
                app = generate_application_incrementally(business_type)
                app = validate_application(app)

                # validate_application returns every field as a key, so a
                # .get() default would never apply; fall back explicitly when
                # NAME is empty.
                business_name = app.get("NAME") or business_type

                credit = simulate_credit_score()
                reviews = simulate_google_reviews(business_name, business_type)
                claims = simulate_claims_history(credit["score"], reviews["average_rating"])

                applications.append(app)
                external_data.append({
                    "Business Name": business_name,
                    "Credit Score": credit["score"],
                    "Credit Rating": credit["rating"],
                    "Google Review Count": reviews["review_count"],
                    "Average Review Rating": reviews["average_rating"],
                    "Sample Reviews": reviews["reviews"],
                    "Number of Claims": claims["num_claims"],
                    "Total Losses": claims["total_losses"]
                })
            except Exception as e:
                print(f"Error generating for {category}: {e}")

    pd.DataFrame(applications).to_json("generated_bop_applications.json", orient="records", indent=2)
    pd.DataFrame(external_data).to_json("generated_bop_third_party_data.json", orient="records", indent=2)
    print("Batch generation completed.")

# === Incremental Application Generation ===
def generate_application_incrementally(business_type: str) -> dict:
    """Build an application one field at a time, feeding earlier fields forward.

    After generating underwriting questions, simulated answers and an owner
    description, the function issues one ``llm2`` call per submission field.
    Each call sees the business context plus every field filled so far. Total
    token usage is printed before returning.

    Args:
        business_type: Business category label inserted into every prompt.

    Returns:
        A dict keyed by ``all_submission_fields``; a field whose generation
        failed keeps the value ``None``.
    """
    with get_openai_callback() as cb:
        guidelines = get_guidelines(business_type)
        questions = question_chain.invoke({"business_type": business_type, "guidelines": guidelines})["text"]
        answers = answer_chain.invoke({"business_type": business_type, "questions": questions})["text"]

        # Drop blank lines BEFORE pairing: zipping the raw lines lets a spacer
        # line in only one of the two texts shift every following pair.
        question_lines = [line.strip() for line in questions.splitlines() if line.strip()]
        answer_lines = [line.strip() for line in answers.splitlines() if line.strip()]
        qa_pairs = "\n".join(
            f"{q} — {a}" for q, a in zip(question_lines, answer_lines)
        )

        owner_description = owner_chain.invoke({"business_type": business_type, "questions": questions})["text"]

        application = {field: None for field in all_submission_fields}
        context_base = f"""
Business Type: {business_type}

Owner Description:
{owner_description}

Answered Questions:
{qa_pairs}
"""

        for field in all_submission_fields:
            # Only fields filled so far are shown to the model.
            context = json.dumps({k: v for k, v in application.items() if v is not None}, indent=2)
            prompt = f"""
You are completing a Business Owner Policy (BOP) insurance application.

Given the business details and previously answered fields below, generate the value for the field: "{field}".
If you cannot confidently infer a value, provide a realistic placeholder.

{context_base}

Known Fields:
{context}

Return a single realistic value for "{field}" only.
"""
            try:
                result = llm2.invoke(prompt).content.strip()
                # Strip an echoed "<field>: " prefix when the reply repeats the field name.
                if result.lower().startswith(field.lower()):
                    result = result.split(":", 1)[-1].strip()
                application[field] = result
            except Exception as e:
                # Best effort: leave this field as None and continue with the rest.
                print(f"Error generating field {field}: {e}")

        print(f"Total tokens used: {cb.total_tokens}")
        return application


# === Main Function ===
def generate_bop_application(business_type: str) -> dict:
    """Generate one BOP application for ``business_type`` in a single LLM pass.

    Pipeline: fetch guidelines -> generate underwriting questions -> simulate
    answers -> simulate an owner description -> ask ``application_chain`` for
    a JSON record covering ``fields_to_fill``. Total token usage for the whole
    pipeline is printed before returning.

    Args:
        business_type: Business category label inserted into every prompt.

    Returns:
        The application parsed from the model's JSON reply.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON.
    """
    with get_openai_callback() as cb:
        guidelines = get_guidelines(business_type)

        questions = question_chain.invoke({
            "business_type": business_type,
            "guidelines": guidelines
        })["text"]

        answers = answer_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        # Drop blank lines BEFORE pairing: zipping the raw lines lets a spacer
        # line in only one of the two texts shift every following pair.
        question_lines = [line.strip() for line in questions.splitlines() if line.strip()]
        answer_lines = [line.strip() for line in answers.splitlines() if line.strip()]
        qa_pairs = "\n".join(
            f"{q} — {a}" for q, a in zip(question_lines, answer_lines)
        )

        owner_description = owner_chain.invoke({
            "business_type": business_type,
            "questions": questions
        })["text"]

        final_app = application_chain.invoke({
            "business_type": business_type,
            "qa_pairs": qa_pairs,
            "fields_to_fill": fields_to_fill,
            "business_owner_description": owner_description
        })["text"]

        print(f"Total tokens used: {cb.total_tokens}")

        # The reply may wrap the JSON in Markdown fences or surrounding prose;
        # parse only the outermost {...} span when one is present.
        start, end = final_app.find("{"), final_app.rfind("}")
        if start != -1 and end > start:
            final_app = final_app[start:end + 1]
        return json.loads(final_app)
    # The create_empty_application_template() call that used to follow the
    # `with` block was unreachable (the block always returns or raises) and
    # has been removed.

# === Example Run ===
if __name__ == "__main__":
    # In a notebook kernel __name__ is "__main__", so this runs whenever the
    # cell is executed. Requests one application per category.
    batch_generate_applications(n_per_category=1)


  llm = ChatOpenAI(temperature=0.7, model_name="gpt-4")
  question_chain = LLMChain(llm=llm, prompt=question_prompt)


Empty application template saved to empty_bop_template.json
Total tokens used: 97385
Total tokens used: 98899
Total tokens used: 105540
Total tokens used: 97743
Batch generation completed.
