In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [6]:
df = pd.read_csv('insurance.csv')

In [8]:
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
18,52,80.9,1.8,38.14,True,Kota,business_owner,High
69,46,67.2,1.75,6.034487,False,Lucknow,government_job,Low
86,35,66.0,1.89,37.38,False,Hyderabad,freelancer,Low
94,50,105.4,1.78,10.542289,False,Bangalore,government_job,Low
21,69,92.7,1.84,2.91,False,Jalandhar,retired,High


In [9]:
df_feat = df.copy()

In [10]:
#feature 1 : BMI
df_feat['bmi'] = df_feat['weight'] / (df_feat["height"]/100) ** 2

In [12]:
#feature 2: Age Groups
def age_group(age):
    if age < 25:
        return 'Young'
    elif age <45:
        return 'Adult'
    elif age< 60:
        return 'Middle_aged'
    else:
        return 'Senior'

In [13]:
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [14]:
#feature 3. Lifestyle risk
def lifestyle_risk(row):
    if row['smoker'] and row['bmi'] >30:
        return 'High'
    elif row['smoker'] or row['bmi']>27:
        return 'Medium'
    else:
        return 'Low'

In [15]:
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk, axis=1)

In [16]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [17]:
#feature 4: city tier
def city_tier(city):
    if city in tier_1_cities:
        return 1
    elif city in tier_2_cities:
        return 2
    else:
        return 3

In [18]:
df_feat["city_tier"] = df_feat['city'].apply(city_tier)

In [19]:
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier', 'insurance_premium_category']].sample(5)


Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
82,12.96,unemployed,178748.124741,Adult,Medium,1,Low
54,3.32,retired,210254.234019,Senior,High,2,High
12,17.58,freelancer,300467.112738,Adult,High,2,High
67,2.76,student,170269.291148,Young,Medium,2,Low
64,1.02,retired,371796.490342,Senior,Medium,2,High


In [21]:
#select features and target
X = df_feat[['income_lpa', 'occupation', 'bmi', 'age_group', 'lifestyle_risk', 'city_tier']]
y = df_feat['insurance_premium_category']

In [22]:
X

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,492274.819198,Senior,Medium,2
1,34.28000,freelancer,301890.172893,Adult,Medium,1
2,36.64000,freelancer,211183.819155,Adult,Medium,2
3,3.34000,student,455359.001041,Young,High,1
4,3.94000,retired,242968.750000,Senior,High,2
...,...,...,...,...,...,...
95,19.64000,business_owner,214207.472920,Adult,Medium,2
96,34.01000,private_job,479844.830494,Adult,Medium,1
97,44.86000,freelancer,187654.320988,Middle_aged,Medium,1
98,28.30000,business_owner,305216.761261,Adult,Medium,1


In [23]:
y

0       High
1        Low
2        Low
3     Medium
4       High
       ...  
95       Low
96       Low
97       Low
98       Low
99       Low
Name: insurance_premium_category, Length: 100, dtype: object

In [24]:
#Define categorical and numerical features
categorical_features = ['occupation', 'age_group', 'lifestyle_risk', 'city_tier']
numerical_features = ['income_lpa', 'bmi']

In [None]:
# Encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numerical_features)
    ]
)