# Data

In [2]:
!pip install faker

Collecting faker
  Downloading faker-37.4.2-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.4.2-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
    --------------------------------------- 0.0/1.9 MB 660.6 kB/s eta 0:00:03
    --------------------------------------- 0.0/1.9 MB 660.6 kB/s eta 0:00:03
    --------------------------------------- 0.0/1.9 MB 330.3 kB/s eta 0:00:06
   - -------------------------------------- 0.1/1.9 MB 359.3 kB/s eta 0:00:06
   - -------------------------------------- 0.1/1.9 MB 375.8 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.9 MB 347.8 kB/s eta 0:00:06
   -- ------------------------------------- 0.1/1.9 MB 380.5 kB/s eta 0:00:05
   -- ------------------------------------- 0.1/1.9 MB 358.2 kB/s eta 0:00:06
   --- ------------------------------------ 0.2/1.9 MB 367.6 kB/s eta 0:00:05
   --- ------------------------

## Insurance Risk & Claim & Fraudulent Dataset

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize faker
fake = Faker()
np.random.seed(42)

# Parameters
num_records = 10000

# Helper functions
def generate_policy_id(i):
    return f"POL{100000 + i}"

def generate_customer_id(i):
    return f"CUST{100000 + i}"

def generate_gender():
    return random.choices(['Male', 'Female', 'Other'], weights=[0.48, 0.48, 0.04])[0]

def generate_policy_type():
    return random.choice(['Health', 'Auto', 'Life', 'Property'])

def add_noise(val, scale=0.05):
    noise = val * np.random.normal(0, scale)
    return max(val + noise, 0)  # Ensure value is non-negative, = 0 is accepted
 
# Generate data
data = []

for i in range(num_records):
    customer_id = generate_customer_id(i)
    policy_id = generate_policy_id(i)
    age = int(np.clip(np.random.normal(40, 12), 18, 80))
    gender = generate_gender()
    policy_type = generate_policy_type()
    income = max(round(np.random.normal(60000, 15000), 2), 5000)
    asset_age = int(np.clip(np.random.normal(5, 2), 0, 20))  # Vehicle or Property age
    claim_history = np.random.poisson(1.2)
    fraudulent_claim = np.random.choice([0, 1], p=[0.8, 0.2])  # Imbalance
    premium_amount = round(add_noise(1000 + claim_history * 100 + (age/2)), 2)
    claim_amount = round(add_noise(premium_amount * np.random.uniform(0.5, 3.0)), 2)
    location = random.choice(['New York', 'Chicago', 'Los Angeles', 'Houston', 'Dallas'])
    policy_upgrade = np.random.choice([0, 1, 2], p=[0.5, 0.4, 0.1])
    
    #if fraudulent_claim == 1 or claim_history > 2 or claim_amount > 2500:
     #   risk_score = np.random.choice(['Medium', 'High'], p=[0.3, 0.7])
    #else:
     #   risk_score = np.random.choice(['Low', 'Medium'], p=[0.6, 0.4])
    
    risk_index = (
    2 * fraudulent_claim + 
    1.5 * (claim_history > 2) + 
    1.5 * (claim_amount > 2500) +
    1 * (income < 30000) + 
    1 * (policy_type in ['Auto', 'Property']))

    if risk_index >= 4:
        risk_score = 'High'
    elif risk_index >= 2:
        risk_score = 'Medium'
    else:
        risk_score = 'Low'

    
    row = [
        customer_id,
        policy_id,
        age,
        gender,
        policy_type,
        income,
        asset_age,
        claim_history,
        fraudulent_claim,
        premium_amount,
        claim_amount,
        risk_score,
        location,
        policy_upgrade
    ]
    data.append(row)

# Create DataFrame
columns = [
    'Customer_ID',
    'Policy_ID',  
    'Customer_Age',
    'Gender',
    'Policy_Type',
    'Monthly_Income',
    'Vehicle_or_Property_Age',
    'Claim_History',
    'Fraudulent_Claim',
    'Premium_Amount',
    'Claim_Amount',
    'Risk_Score',
    'Location',
    'Policy_Upgrade'
]

df = pd.DataFrame(data, columns=columns)

# Save to CSV
df.to_csv("insurance_class_reg_data.csv", index=False)
print("✅ Synthetic dataset generated and saved as 'insurance_class_reg_data.csv'")

✅ Synthetic dataset generated and saved as 'insurance_class_reg_data.csv'


## Fraudulent Claim Dataset 

In [1]:
from datetime import date
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker
fake = Faker()
random.seed(42)
np.random.seed(42)

# Number of claims
num_claims = 6000

# Generate unique Claim IDs
claim_ids = [f"CLM{100000 + i}" for i in range(num_claims)]

# Generate random Policyholder IDs from CUST100000 to CUST109999
policyholder_ids = [f"CUST{random.randint(100000, 109999)}" for _ in range(num_claims)]

# Convert date strings to datetime.date objects
start_date = date(2023, 1, 1)
end_date = date(2024, 12, 31)

# Generate realistic Claim Dates (random between Jan 1, 2023 and Dec 31 2024)
claim_dates = [fake.date_between(start_date=start_date, end_date=end_date) for _ in range(num_claims)]

# Build DataFrame
df_claims = pd.DataFrame({
    'Claim_ID': claim_ids,
    'Policyholder_ID': policyholder_ids,
    'Claim_Date': claim_dates
})

# Save to CSV (optional)
df_claims.to_csv("claim_base_data.csv", index=False)
print("✅ Generated 6000 claim records using Faker.")

✅ Generated 6000 claim records using Faker.


## Customer Feedback & Sentiment Dataset

In [1]:
import pandas as pd
import random
import uuid

# Define possible values for categorical columns
sentiments = ['Positive', 'Negative', 'Neutral']
ratings_map = {
    'Positive': [4, 5],
    'Negative': [1, 2],
    'Neutral': [3]
}
service_types = ['Claim', 'Policy Purchase', 'Customer Support']

# Some sample review texts to simulate real data
sample_reviews = {
    'Positive': [
        "The claim process was quick and smooth.",
        "Great customer support. Very satisfied.",
        "Easy to buy a policy online. Loved it!",
        "Fast and helpful service!",
        "Everything went better than expected."
    ],
    'Negative': [
        "My claim was denied unfairly. Very disappointed.",
        "Customer service was rude and unhelpful.",
        "The policy terms were not clearly explained.",
        "Had to wait weeks for a response.",
        "Very bad experience with the support team."
    ],
    'Neutral': [
        "Bought the policy without any issues.",
        "Service was okay, nothing special.",
        "Average experience, neither good nor bad.",
        "It was a typical insurance interaction.",
        "Support was fine, not exceptional."
    ]
}

# Generate synthetic data
data = []
for i in range(2000):
    review_id = f"R{str(i+1).zfill(5)}"
    customer_id = f"CUST{str(random.randint(100000, 109999))}"
    sentiment = random.choice(sentiments)
    review_text = random.choice(sample_reviews[sentiment])
    rating = random.choice(ratings_map[sentiment])
    service_type = random.choice(service_types)

    data.append([review_id, customer_id, review_text, sentiment, rating, service_type])

# Create DataFrame
df = pd.DataFrame(data, columns=[
    'Review_ID', 'Customer_ID', 'Review_Text', 'Sentiment_Label', 'Rating', 'Service_Type'
])

# Save to CSV
df.to_csv('insurance_reviews.csv', index=False)

print("✅ Synthetic dataset with 5000 rows created and saved as 'insurance_reviews.csv'")


✅ Synthetic dataset with 5000 rows created and saved as 'insurance_reviews.csv'


## Insurance Multilingual Policy Document Dataset

In [1]:
import pandas as pd
import numpy as np
import random
from pathlib import Path

# reproducibility
RND_SEED = 42
random.seed(RND_SEED)
np.random.seed(RND_SEED)

# ---------- CONFIG ----------
# Path to your dataset
INPUT_CSV = "C://Users//Admin//Documents//Guvi//MDTM38//project//insurance//data//insurance_af_cus_seg.csv"  # <- replace with your file path
OUTPUT_CSV = "policy_texts_en.csv"
# ----------------------------

def safe_num(x, fallback=5000):
    try:
        if pd.isna(x):
            return fallback
        return float(x)
    except:
        return fallback

def norm_policy_type(s):
    if pd.isna(s): 
        return "General"
    s = str(s).strip().lower()
    if "auto" in s or "vehicle" in s or "car" in s:
        return "Auto"
    if "health" in s or "medical" in s:
        return "Health"
    if "life" in s:
        return "Life"
    if "property" in s or "home" in s or "house" in s:
        return "Property"
    return s.title()

def generate_policy_text(row, verbosity=3):
    pid = row.get("policy_id", "UNKNOWN")
    ptype = norm_policy_type(row.get("policy_type", "General"))
    premium = safe_num(row.get("premium_amount", np.nan), fallback=5000)
    monthly_income = safe_num(row.get("monthly_income", np.nan), fallback=4000)
    vehicle_age = row.get("vehicle_or_property_age", None)
    claim_hist = int(row.get("claim_history", 0) if not pd.isna(row.get("claim_history", 0)) else 0)
    ref = premium if premium > 0 else max(1, monthly_income * 12.0)
    lead_phrases = [
        "This policy provides comprehensive coverage for",
        "This insurance covers",
        "Designed to protect against"
    ]
    lead = random.choice(lead_phrases)
    if ptype == "Auto":
        cov_limit = max(50000, int(ref * random.uniform(10, 40)))
        theft_limit = int(cov_limit * random.uniform(0.25, 0.6))
        personal_acc_limit = int(cov_limit * random.uniform(0.05, 0.2))
        deductible = int(max(250, ref * random.uniform(0.01, 0.08)))
        parts = [
            f"{lead} vehicle damage due to accidents, collision, theft, fire, and natural hazards.",
            f"Coverage includes comprehensive repair costs and third-party liability up to ${cov_limit:,}.",
            f"Theft & vandalism cover limit: ${theft_limit:,}.",
            f"Personal accident cover for driver and passengers up to ${personal_acc_limit:,}.",
            f"Standard deductible: ${deductible:,} per claim. Optional add-ons: roadside assistance, zero-depreciation, and engine protection.",
            "Exclusions: intentional damage, driving under influence, and racing-related incidents.",
        ]
    elif ptype == "Health":
        cov_limit = max(100000, int(ref * random.uniform(8, 80)))
        in_patient_limit = int(cov_limit * random.uniform(0.6, 0.95))
        daycare_limit = int(cov_limit * random.uniform(0.05, 0.2))
        deductible = int(max(0, ref * random.uniform(0.01, 0.03)))
        parts = [
            f"{lead} medical expenses including hospitalization, surgery, and day-care procedures.",
            f"Sum insured: up to ${cov_limit:,}; in-patient treatment covered up to ${in_patient_limit:,}.",
            f"Day-care & outpatient limits: up to ${daycare_limit:,} depending on treatment type.",
            "Optional riders: maternity cover, critical illness rider, and pre/post-hospitalization benefits.",
        ]
    elif ptype == "Life":
        death_benefit = max(100000, int(ref * random.uniform(20, 200)))
        critical_illness = int(death_benefit * random.uniform(0.2, 0.6))
        parts = [
            f"{lead} the insured's family against loss of life, providing a death benefit of up to ${death_benefit:,}.",
            f"Optional critical illness benefit: lump-sum up to ${critical_illness:,} for specified conditions.",
        ]
    elif ptype == "Property":
        cov_limit = max(100000, int(ref * random.uniform(10, 120)))
        contents_limit = int(cov_limit * random.uniform(0.2, 0.6))
        structural_limit = int(cov_limit * random.uniform(0.6, 0.95))
        deductible = int(max(500, ref * random.uniform(0.01, 0.05)))
        parts = [
            f"{lead} loss or damage to property from fire, lightning, explosion, burglary, and certain natural disasters.",
            f"Structural damage cover: up to ${structural_limit:,}; contents & valuables covered up to ${contents_limit:,}.",
        ]
    else:
        cov_limit = int(ref * random.uniform(5, 40))
        parts = [
            f"{lead} risks relevant to {ptype} policies, with cover limits tailored to the insured's profile.",
            f"Indicative cover limit: ${cov_limit:,}.",
        ]
    if verbosity <= 1:
        text = " ".join(parts[:2])
    else:
        text = " ".join(parts)
    return text.strip()

if not Path(INPUT_CSV).exists():
    print(f"Input file {INPUT_CSV} not found.")
else:
    df = pd.read_csv(INPUT_CSV)
    print("Loaded dataset with shape:", df.shape)
    want_cols = ["policy_id", "policy_type", "premium_amount", "monthly_income", "vehicle_or_property_age", "claim_history", "num_active_policies"]
    present = [c for c in want_cols if c in df.columns]
    policies = df.drop_duplicates("policy_id")[present].reset_index(drop=True)
    policies["Policy_Text_EN"] = policies.apply(lambda r: generate_policy_text(r, verbosity=3), axis=1)
    policies.to_csv(OUTPUT_CSV, index=False)
    print("Saved generated policy texts to:", OUTPUT_CSV, policies.shape)

Loaded dataset with shape: (17482, 18)
Saved generated policy texts to: policy_texts_en.csv (15998, 8)


## Customer Segmentation Dataset (Unsupervised Learning - Clustering)

In [1]:
import pandas as pd
import numpy as np
import random

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# Step 1: Generate 4000 unique customer_ids
customer_ids = [f"CUST{random.randint(100000, 109999)}" for _ in range(4000)]

# Step 2: Generate extra policies for each (1 or 2 per customer)
extra_data = []
policy_counter = 200000  # Start from a high policy_id to avoid clashes

for cust_id in customer_ids:
    num_policies = random.choice([1, 2])
    
    for _ in range(num_policies):
        policy_id = f"POL{policy_counter}"
        policy_counter += 1
        
        policy_type = random.choice(['Health', 'Auto', 'Life', 'Property'])
        asset_age = int(np.clip(np.random.normal(5, 2), 0, 20))
        claim_history = np.random.poisson(1.2)
        fraudulent_claim = np.random.choice([0, 1], p=[0.8, 0.2])
        premium_amount = round(1000 + claim_history * 100 + (random.randint(25, 60) / 2), 2)

        extra_data.append({
            'customer_id': cust_id,
            'policy_id': policy_id,
            'policy_type': policy_type,
            'vehicle_or_property_age': asset_age,
            'claim_history': claim_history,
            'fraudulent_claim': fraudulent_claim,
            'premium_amount': premium_amount
        })

# Step 3: Create a new DataFrame
df_extra_policies = pd.DataFrame(extra_data)

# Preview
print("✅ Extra policy records generated:", df_extra_policies.shape[0])

# Optional: Save to CSV
df_extra_policies.to_csv("extra_policies.csv", index=False)
print("✅ Generated 4000 Customer -  Extra policy records using Faker.")

✅ Extra policy records generated: 5998
✅ Generated 4000 Customer -  Extra policy records using Faker.
