In [23]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [42]:
# Load the raw insurance dataset from the current working directory.
df = pd.read_csv('insurance.csv')
# List the distinct occupation labels present in the data.
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [25]:
# Work on a copy so the engineered columns added below do not modify the loaded `df`.
df_feature = df.copy()

In [26]:
# Body-mass index: weight divided by the square of height.
# NOTE(review): the sample rows suggest weight in kg and height in metres — confirm.
df_feature['bmi'] = df_feature['weight'].div(df_feature['height'].pow(2))
df_feature.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi
87,30,82.0,1.6,25.59837,False,Hyderabad,government_job,Low,32.03125
44,59,77.0,1.6,50.0,True,Lucknow,private_job,Medium,30.078125
49,23,106.6,1.58,2.29,False,Kota,student,Medium,42.70149
67,22,56.4,1.82,2.76,False,Jaipur,student,Low,17.026929
27,58,111.4,1.78,34.33,False,Lucknow,private_job,Medium,35.159702


In [27]:
def age_group(age):
    """Map a numeric age to a coarse age-band label.

    Bands (upper bounds are exclusive):
        age < 25  -> 'young'
        age < 45  -> 'adult'
        age < 60  -> 'middle_aged'
        otherwise -> 'senior'
    """
    # (exclusive upper bound, label) pairs in ascending order; first match wins.
    bands = (
        (25, 'young'),
        (45, 'adult'),
        (60, 'middle_aged'),
    )
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    # Nothing matched: 60 and above (or a value that compares False, e.g. NaN).
    return 'senior'

# Derive the age-band label for every row from the numeric `age` column.
df_feature['age_group'] = df_feature['age'].map(age_group)
df_feature.sample(5)


Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category,bmi,age_group
80,56,95.8,1.67,50.0,False,Jalandhar,unemployed,High,34.350461,middle_aged
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low,21.118382,adult
44,59,77.0,1.6,50.0,True,Lucknow,private_job,Medium,30.078125,middle_aged
23,35,70.3,1.78,23.71,False,Mysore,unemployed,Medium,22.187855,adult
94,50,105.4,1.78,10.542289,False,Bangalore,government_job,Low,33.266002,middle_aged


In [28]:
def lifestyle_risk(row):
    """Classify a record's lifestyle risk from its smoking status and BMI.

    `row` is any mapping-like object exposing 'smoker' and 'bmi'
    (e.g. a DataFrame row or a plain dict).

    Returns:
        'high'   - smoker with bmi above 30
        'medium' - any other smoker, or a non-smoker with bmi above 27
        'low'    - everything else
    """
    is_smoker = row['smoker']
    bmi = row['bmi']
    if is_smoker:
        # Smokers are never 'low'; an obese BMI escalates them to 'high'.
        return 'high' if bmi > 30 else 'medium'
    # Non-smokers are only elevated by BMI.
    return 'medium' if bmi > 27 else 'low'

# Row-wise apply (axis=1) because the rule reads both `smoker` and `bmi`;
# this cell must therefore run after the `bmi` column has been created.
df_feature['lifestyle_risk'] = df_feature.apply(lifestyle_risk, axis=1)
    
    

In [29]:
# Cities treated as tier 1 by `city_tier`.
tier_1_cities = [
    "Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune",
]

# Cities treated as tier 2 by `city_tier`; anything in neither list falls to tier 3.
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi",
    "Visakhapatnam", "Coimbatore", "Bhopal", "Nagpur", "Vadodara", "Surat",
    "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi", "Agra",
    "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana",
    "Nashik", "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum",
    "Salem", "Vijayawada", "Tiruchirappalli", "Bhavnagar", "Gwalior", "Dhanbad",
    "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal", "Kolhapur",
    "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri",
]

In [30]:
def city_tier(city):
    """Return the tier number for a city name.

    1 if `city` is in `tier_1_cities`, 2 if it is in `tier_2_cities`,
    otherwise 3. Matching is an exact membership test on the given value.
    """
    # Check the tiers in priority order; the first list containing the city wins.
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3

# Replace the free-text city with its tier number (1, 2 or 3).
df_feature['city_tier'] = df_feature['city'].map(city_tier)

In [31]:
# Preview the engineered features next to the target. Selecting columns by name
# already leaves out every other column, so no drop() is needed beforehand.
df_feature[['income_lpa','occupation','bmi','age_group','lifestyle_risk','city_tier','insurance_premium_category']]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92000,retired,49.227482,senior,medium,2,High
1,34.28000,freelancer,30.189017,adult,medium,1,Low
2,36.64000,freelancer,21.118382,adult,low,2,Low
3,3.34000,student,45.535900,young,high,1,Medium
4,3.94000,retired,24.296875,senior,medium,2,High
...,...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,2,Low
96,34.01000,private_job,47.984483,adult,medium,1,Low
97,44.86000,freelancer,18.765432,middle_aged,low,1,Low
98,28.30000,business_owner,30.521676,adult,medium,1,Low


In [32]:
# Model inputs: the engineered features plus income and occupation.
X = df_feature[['bmi','age_group','lifestyle_risk','city_tier','income_lpa','occupation']]
# Target: the premium category label.
y = df_feature['insurance_premium_category']

In [33]:
# Display the feature matrix as a visual sanity check.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [34]:
# Columns routed to the one-hot encoder. `city_tier` holds integer codes (1-3)
# but is listed here, so it is encoded as a category rather than used as a magnitude.
categorical_features = ['age_group','lifestyle_risk','city_tier','occupation']
# Columns handed to the model unchanged.
numeric_features = ['bmi', 'income_lpa']

In [36]:
# Feature preprocessing: one-hot encode the categorical columns and forward the
# numeric columns untouched.
# handle_unknown='ignore' makes a category that was absent from the training
# split encode as all zeros instead of raising ValueError at predict time
# (the default is handle_unknown='error').
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features)
    ]
)

In [37]:
# End-to-end estimator: preprocessing followed by a seeded random forest, so
# un-encoded feature rows can be passed straight to fit()/predict().
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [38]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y for a small multi-class dataset.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)
# Fit the encoder and the classifier on the training portion only.
pipeline.fit(X_train, y_train)

In [39]:
# Predict on the held-out rows and report plain accuracy.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.8

In [40]:
# Show a few random held-out rows; no seed is set, so the rows differ on each run.
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
44,30.078125,middle_aged,high,2,50.0,private_job
65,37.662982,middle_aged,high,2,38.07,unemployed
56,42.414152,young,high,1,2.86,student
92,18.319942,adult,medium,2,30.0,government_job
33,21.791064,senior,low,1,1.46,retired


In [41]:
# Persist the fitted pipeline (preprocessing + classifier) as a single artifact.
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted
# source, and with a scikit-learn version compatible with the one used here.
with open('ml_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)