In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

: 

In [2]:
# Load the raw insurance records; the path is relative to the current working directory.
df = pd.read_csv('insurance.csv')

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [5]:
# Display the frame's (rows, columns) dimensions.
df.shape

(100, 8)

In [6]:
# Work on a copy so the engineered columns are added without modifying df.
df_feat = df.copy()

In [12]:
# Feature 1: BMI = weight divided by the square of height.
df_feat['bmi'] = df_feat['weight'].div(df_feat['height'].pow(2))

In [14]:
# Feature 2: age group
def age_group(age):
    """Bucket an age into a coarse life-stage label.

    Returns 'young' for ages below 25, 'adult' for ages below 45 and
    'Senior' for everything else. The labels' mixed casing is part of the
    feature values and is kept exactly as-is.
    """
    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((25, 'young'), (45, 'adult')):
        if age < upper_bound:
            return label
    return 'Senior'
# Label every row's age with its age-group bucket.
df_feat['age_group'] = df_feat['age'].map(age_group)

In [23]:
# Feature 3: lifestyle risk
def lifestyle(row):
    """Derive a lifestyle-risk label from a row's 'smoker' and 'bmi' values.

    Smokers with a BMI above 30 are 'High', smokers with a BMI above 27 are
    'Medium', and every other row (including all non-smokers) is 'low'.
    The labels' mixed casing is part of the feature values and is kept as-is.
    """
    # Non-smokers are always 'low'; their BMI is never consulted.
    if not row['smoker']:
        return 'low'

    bmi = row['bmi']
    if bmi > 30:
        return 'High'
    if bmi > 27:
        return 'Medium'
    return 'low'
# Evaluate the lifestyle rule once per row (each row is passed in as a Series).
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle, axis='columns')


In [24]:
# Reference lists for the city-tier feature: city_tier() maps a city found in
# the first list to 1, in the second list to 2, and anything else to 3.
tier_1_cities = [
    'Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata', 'Hyderabad', 'Pune',
    # Legacy misspelling retained so data that already uses it still resolves to tier 1.
    'Hydrabad',
]
# Each tier-2 city is listed exactly once.
tier_2_cities = [
    'Jaipur', 'Chandigarh', 'Indore', 'Lucknow', 'Patna', 'Nashik', 'Visakhapatnam', 'Coimbatore',
    'Bhopal', 'Nagpur', 'Vadodara', 'Surat', 'Rajkot', 'Jodhpur', 'Raipur', 'Amritsar', 'Varanasi',
    'Udaipur', 'Aurangabad', 'Hubli', 'Belgaum', 'Gaya', 'Salem', 'Vijayawada', 'Tiruchirappalli',
    'Bhavnagar', 'Gwalior', 'Noida', 'Guntur', 'Siliguri', 'Warangal', 'Kolhapur', 'Jalandhar',
    'Dhanbad', 'Bilaspur', 'Mysore', 'Thrissur', 'Meerut', 'Jhansi', 'Bokaro', 'Aligarh', 'Kanpur',
    'Cuttack', 'Jammu', 'Shimla', 'Dehradun', 'Ranchi', 'Allahabad', 'Howrah', 'Pondicherry',
    'Jamshedpur', 'Durgapur', 'Kochi', 'Madurai', 'Guwahati', 'Faridabad', 'Gandhinagar', 'Secunderabad',
    'Moradabad', 'Karnal', 'Ajmer', 'Rohtak', 'Tirupati', 'Kottayam', 'Saharanpur',
    'Muzaffarpur', 'Rourkela', 'Anantapur', 'Tumkur', 'Thoothukudi', 'Nellore', 'Vellore', 'Agra'
]


In [25]:
# Feature 4: city tier
def city_tier(city):
    """Return the tier number of a city.

    A city listed in tier_1_cities maps to 1, one listed in tier_2_cities
    maps to 2, and a city found in neither list maps to 3.
    """
    # Tier 1 is checked before tier 2, so it wins if a name were in both.
    for tier, members in ((1, tier_1_cities), (2, tier_2_cities)):
        if city in members:
            return tier
    return 3
# Translate every row's city name into its tier number.
df_feat['city_tier'] = df_feat['city'].map(city_tier)


In [26]:
# Show the engineered model inputs next to the target column.
# Selecting columns by name already leaves out age, weight, height, smoker and
# city, so no separate drop() of those columns is needed.
df_feat[['income_lpa','occupation','bmi','age_group','lifestyle_risk','city_tier','insurance_premium_category']]

Unnamed: 0,income_lpa,occupation,bmi,age_group,lifestyle_risk,city_tier,insurance_premium_category
0,2.92000,retired,49.227482,Senior,low,2,High
1,34.28000,freelancer,30.189017,adult,low,1,Low
2,36.64000,freelancer,21.118382,adult,low,2,Low
3,3.34000,student,45.535900,young,High,1,Medium
4,3.94000,retired,24.296875,Senior,low,2,High
...,...,...,...,...,...,...,...
95,19.64000,business_owner,21.420747,adult,low,2,Low
96,34.01000,private_job,47.984483,adult,low,1,Low
97,44.86000,freelancer,18.765432,Senior,low,3,Low
98,28.30000,business_owner,30.521676,adult,low,1,Low


In [27]:
# Separate the model inputs (x) from the label to predict (y).
x = df_feat.loc[:, ['bmi','age_group','lifestyle_risk','city_tier','income_lpa','occupation']]
y = df_feat.loc[:, 'insurance_premium_category']

In [30]:
# Columns to be one-hot encoded. city_tier holds the integers 1-3 but is
# listed with the categorical columns rather than the numeric ones.
categorical_features = [
    'age_group',
    'lifestyle_risk',
    'occupation',
    'city_tier',
]
# Columns to be standard-scaled.
numeric_features = [
    'bmi',
    'income_lpa',
]

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [37]:
# Column-wise preprocessing: one-hot encode the categorical columns and
# standardise the numeric ones.
preprocessor = ColumnTransformer(
    [
        # handle_unknown='ignore' encodes a category that was absent when the
        # encoder was fitted as all zeros instead of raising at predict time.
        ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('StandardScaler', StandardScaler(), numeric_features)
    ]
)

In [39]:
# Create a pipeline with preprocessing and a random forest classifier.
# random_state seeds the forest's randomness so that re-running the notebook
# reproduces the same fitted model and accuracy.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [40]:
# Hold out 20% of the rows for evaluation (fixed seed), then fit the
# pipeline on the remaining rows.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
pipeline.fit(x_train, y_train)

In [42]:
# Score the fitted pipeline on the held-out rows and report plain accuracy.
y_pred = pipeline.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.5


In [43]:
# Spot-check a few held-out rows; the fixed seed keeps the displayed sample
# the same on every run.
x_test.sample(5, random_state=42)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
10,22.949982,adult,low,1,32.78,business_owner
73,32.121628,Senior,High,1,2.22,retired
31,15.258742,adult,low,2,11.77,private_job
33,21.791064,Senior,low,1,1.46,retired
70,36.694215,Senior,low,2,0.57,retired


In [44]:
import pickle

# Serialise the fitted pipeline (preprocessing + classifier) to disk.
with open('insurance_premium_prediction.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)