In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
# NOTE(review): this silences every warning category for the whole session, which also
# hides any pandas deprecation / FutureWarning messages raised by later cells.
warnings.filterwarnings("ignore")
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option("display.max_columns", None)

In [36]:
# Load the cleaned dataset and convert the policy start column to datetime in one
# chained expression, so `df` is only ever bound to the fully prepared frame.
df = pd.read_csv("cleaned_df.csv").assign(
    **{"Policy Start Date": lambda frame: pd.to_datetime(frame["Policy Start Date"])}
)

In [37]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2000000, 31)

In [38]:
# Preview the first rows of the loaded frame (`head` defaults to five rows).
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0,0,0,0,0,1,0,0,0,0,0,0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
# Per-column profile of `df`: dtype, number of missing values, number of distinct
# values and the distinct values themselves, with the most-missing columns first.
column_profile = pd.DataFrame(
    {
        "Column": df.columns,
        "Data Type": [df[name].dtype for name in df.columns],
        "Nulls": [df[name].isnull().sum() for name in df.columns],
        "No. of Uniques": [df[name].nunique() for name in df.columns],
        "Uniques": [df[name].unique() for name in df.columns],
    }
)

column_profile.sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
19,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
1,Gender,object,0,2,"[Female, Male]"
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
3,Marital Status,object,0,3,"[Married, Divorced, Single]"
4,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
2,Annual Income,float64,0,97970,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
7,Health Score,float64,0,933976,"[22.59876067181393, 15.569730989408043, 47.177..."
8,Location,object,0,3,"[Urban, Rural, Suburban]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


---
#
# **Feature Engineering**
---

# Health Conscious Level

**Exercise Frequency** --> `['Weekly', 'Monthly', 'Daily', 'Rarely']`

**Smoking Status** --> `['No', 'Yes']`

**Health Score** 
- Poor --> (< 16.285503904803008)
- Average --> (>= 16.285503904803008 & < 33.959695457149195)
- Good --> (>= 33.959695457149195)

**Age** 
- Poor --> (< 30)
- Average --> (>= 30 & < 53)
- Good --> (>= 53)

In [40]:
# Empty scratch frame; the per-component score columns assigned in the cells that
# follow (smoke, ex, age, health) are collected here before being summed.
health_min_df = pd.DataFrame()

In [41]:
# Smoking component of the health-conscious score: smokers score 0, non-smokers 1.
# `.map` is used instead of `.replace` so the result is numeric without relying on
# pandas' deprecated implicit downcast of a replaced object column, and so that an
# unexpected category becomes NaN rather than surviving as a string in a score column.
health_min_df["smoke"] = df["Smoking Status"].map({"Yes": 0, "No": 1})

In [42]:
# Exercise component of the health-conscious score, ordinal from 0 (Rarely) to 3 (Daily).
# `.map` is used instead of `.replace` so the result is numeric without relying on
# pandas' deprecated implicit downcast of a replaced object column, and so that an
# unexpected category becomes NaN rather than surviving as a string in a score column.
health_min_df["ex"] = df["Exercise Frequency"].map(
    {"Rarely": 0, "Monthly": 1, "Weekly": 2, "Daily": 3}
)

In [43]:
# Age band as an ordinal label. With right=False the intervals are left-closed:
# [0, 30) -> 0, [30, 53) -> 1, [53, inf) -> 2. Values below 0 fall outside every bin.
age_edges = [0, 30, 53, float("inf")]
age_codes = [0, 1, 2]

health_min_df["age"] = pd.cut(df["Age"], bins=age_edges, labels=age_codes, right=False)

In [44]:
# Health Score band as an ordinal label. With right=False the intervals are left-closed:
# [0, lower) -> 0, [lower, upper) -> 1, [upper, inf) -> 2.
health_edges = [0, 16.285503904803008, 33.959695457149195, float("inf")]
health_codes = [0, 1, 2]

health_min_df["health"] = pd.cut(
    df["Health Score"], bins=health_edges, labels=health_codes, right=False
)

In [45]:
# Row-wise total of every component column held in `health_min_df`.
health_conscious_total = health_min_df.sum(axis="columns")
df["Health Conscious Level"] = health_conscious_total

In [46]:
# Multiplicative variant of the health-conscious feature:
# smoking weight * exercise weight * Age * Health Score.
# `.map` (not `.replace`) keeps both weights numeric without relying on pandas'
# deprecated implicit downcast, and turns an unexpected category into NaN instead of
# leaving a string that would be repeated or raise when multiplied.
smoking_weight = df["Smoking Status"].map({"Yes": 2, "No": 4})
exercise_weight = df["Exercise Frequency"].map(
    {"Rarely": 2, "Monthly": 4, "Weekly": 8, "Daily": 16}
)
df["Health Conscious Level1"] = smoking_weight * exercise_weight * df["Age"] * df["Health Score"]

In [47]:
# Preview three rows to confirm the two health-conscious columns were appended.
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138


#
---
#

# Money Per Head

In [48]:
# Annual income divided by the number of dependents; a dependents count of 0 is
# swapped for 1 first so the division never has a zero denominator.
dependents = df["Number of Dependents"]
dependents_or_one = dependents.mask(dependents == 0, 1)
df["Money Per Head"] = df["Annual Income"].div(dependents_or_one)

In [49]:
# Preview three rows to confirm the "Money Per Head" column was appended.
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0


#
---
#

# Money Handling Level

In [50]:
# Interaction feature: product of annual income and credit score.
df["Money Handling Level"] = df["Annual Income"].mul(df["Credit Score"])

In [51]:
# Ratio feature: annual income per credit-score point.
df["Money Handling Level1"] = df["Annual Income"].div(df["Credit Score"])

In [52]:
# Preview three rows to confirm the two "Money Handling" columns were appended.
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494


#
---
#

# Growth

In [53]:
# Education rank (High School=1 ... PhD=4) multiplied by annual income.
# `.map` (not `.replace`) keeps the rank numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected education label into NaN instead of
# leaving a string in the arithmetic.
education_rank = df["Education Level"].map(
    {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
)
df["Growth"] = education_rank * df["Annual Income"]

In [54]:
# Annual income divided by education rank (High School=1 ... PhD=4).
# `.map` (not `.replace`) keeps the rank numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected education label into NaN instead of
# leaving a string in the arithmetic.
education_rank = df["Education Level"].map(
    {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
)
df["Growth1"] = df["Annual Income"] / education_rank

In [55]:
# Preview three rows to confirm the "Growth" and "Growth1" columns were appended.
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494,25602.0,25602.0


#
---
#

# Determinstic

In [56]:
# Annual income scaled by the reciprocal of age (computed as income * (1 / age)).
inverse_age = df["Age"].rdiv(1)
df["Determinstic"] = df["Annual Income"].mul(inverse_age)

In [57]:
# Preview three rows to confirm the "Determinstic" column was appended.
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494,25602.0,25602.0,1113.130435


#
---
#

# Some Dummy Features

In [58]:
# Name of the weekday on which each policy started.
policy_start = df["Policy Start Date"]
df["Day_Name"] = policy_start.dt.day_name()

In [59]:
# Credit score divided by the number of previous claims; a claim count of 0 is
# swapped for 1 first so the division never has a zero denominator.
claims = df["Previous Claims"]
claims_or_one = claims.mask(claims == 0, 1)
df["Credit by Score"] = df["Credit Score"].div(claims_or_one)

In [60]:
# Interaction feature: credit score multiplied by insurance duration.
df["CreditInsurance"] = df["Credit Score"].mul(df["Insurance Duration"])

In [61]:
# Additive health-risk score = smoking term + inactivity term + (100 - Health Score) / 20.
#
# The previous version compared against 'Smoker', 'Low' and 'Medium', none of which
# occur in this dataset (Smoking Status is Yes/No; Exercise Frequency is
# Rarely/Monthly/Weekly/Daily), so both lifestyle terms were always 0.
# The comparisons now use the categories that actually exist:
#   - smoking:    "Yes" -> 1, anything else -> 0
#   - inactivity: "Rarely" -> 1, "Monthly" -> 0.5, anything else -> 0
#     (Rarely/Monthly are taken as the counterparts of the old Low/Medium labels).
smoker_risk = (df["Smoking Status"] == "Yes").astype(int)
inactivity_risk = df["Exercise Frequency"].map({"Rarely": 1.0, "Monthly": 0.5}).fillna(0.0)
health_deficit = (100 - df["Health Score"]) / 20

df["Health_Risk_Score"] = smoker_risk + inactivity_risk + health_deficit

In [62]:
# Two pairwise interactions built on Health Score: one with credit score, one with age.
health_score = df["Health Score"]
df["Credit_Health_Score"] = df["Credit Score"].mul(health_score)
df["Health_Age_Interaction"] = health_score.mul(df["Age"])

#
---
#

# Customer Feedback

In [63]:
# Annual income weighted by customer feedback (Poor=2, Average=4, Good=8).
# `.map` (not `.replace`) keeps the weight numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected feedback label into NaN instead of
# leaving a string in the arithmetic.
feedback_weight = df["Customer Feedback"].map({"Poor": 2, "Average": 4, "Good": 8})
df["Feedback1"] = df["Annual Income"] * feedback_weight

In [64]:
# Credit score weighted by customer feedback (Poor=2, Average=4, Good=8).
# `.map` (not `.replace`) keeps the weight numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected feedback label into NaN instead of
# leaving a string in the arithmetic.
feedback_weight = df["Customer Feedback"].map({"Poor": 2, "Average": 4, "Good": 8})
df["Feedback2"] = df["Credit Score"] * feedback_weight

In [65]:
# Previous claims weighted by customer feedback (Poor=2, Average=4, Good=8).
# `.map` (not `.replace`) keeps the weight numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected feedback label into NaN instead of
# leaving a string in the arithmetic.
feedback_weight = df["Customer Feedback"].map({"Poor": 2, "Average": 4, "Good": 8})
df["Feedback3"] = df["Previous Claims"] * feedback_weight

In [66]:
# Health score weighted by customer feedback (Poor=2, Average=4, Good=8).
# `.map` (not `.replace`) keeps the weight numeric without relying on pandas' deprecated
# implicit downcast, and turns an unexpected feedback label into NaN instead of
# leaving a string in the arithmetic.
feedback_weight = df["Customer Feedback"].map({"Poor": 2, "Average": 4, "Good": 8})
df["Feedback4"] = df["Health Score"] * feedback_weight

#
---
# Saving the DataFrame

In [67]:
# Write the feature-engineered frame to CSV; index=False leaves the row index out of the file.
df.to_csv("EDAed_df.csv", index=False)