In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
df = pd.read_csv('12-insurance_data.csv')
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
30,35,89.6,1.73,32.97,False,Delhi,business_owner,Low
22,57,106.4,1.83,30.0,False,Chandigarh,government_job,Low
47,55,116.4,1.87,8.34,False,Chandigarh,private_job,Medium
58,72,95.9,1.79,3.31,True,Indore,retired,High
41,64,59.8,1.63,3.87,False,Mumbai,retired,Medium


Now we will do feature engineering and extract new features from the existing columns like bmi, age group, lifestyle risk

In [5]:
df_feat = df.copy()

In [6]:
df_feat['bmi'] = df_feat['weight']/(df_feat['height']**2)

In [7]:
def age_group(age):
    if age < 25:
        return 'young'
    elif age < 45:
        return 'adult'
    elif age < 60:
        return 'middle_aged'
    return 'senior'

In [8]:
df_feat['age_group'] = df_feat['age'].apply(age_group)

In [9]:
def lifestyle_risk(row):
    if row['smoker'] and row['bmi']>30:
        return 'high'
    elif row['smoker'] and row['bmi']>27:
        return 'medium'
    else:
        return 'low'

In [10]:
df_feat['lifestyle_risk'] = df_feat.apply(lifestyle_risk,axis=1)

In [11]:
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [12]:
def city_tier(city):
    if city in tier_1_cities:
        return 1
    if city in tier_2_cities:
        return 2
    else:
        return 3

In [15]:
df_feat['city_tier'] = df_feat['city'].apply(city_tier)

In [16]:
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
Y = df_feat["insurance_premium_category"]

In [17]:
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,low,2,2.92000,retired
1,30.189017,adult,low,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,low,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,low,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,low,1,28.30000,business_owner


In [18]:
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), categorical_features),
        ("num", "passthrough", numeric_features)
    ]
)

In [30]:
pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',RandomForestClassifier(random_state=42))
])

In [31]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)
pipeline.fit(X_train,Y_train)

In [33]:
y_pred = pipeline.predict(X_test)
accuracy_score(Y_test,y_pred)

0.55

In [35]:
import pickle

pickle_model_path = '12-model.pkl'
with open(pickle_model_path,'wb') as f:
    pickle.dump(pipeline,f)

NOW WE COMPLETED THE MODEL BUILDING

NOW WE HAVE TO BUILD THE API(send all the details to the model using POST then find all the features that we have engineered and then find the output)