In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [3]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [4]:
df = pd.concat([train, test])
df.shape

(2000000, 21)

# Setting all columns to appropriate datatypes

In [5]:
df[["Previous Claims", "Number of Dependents", "Vehicle Age", "Age", "Insurance Duration"]] = df[["Previous Claims", "Number of Dependents", "Vehicle Age", "Age", "Insurance Duration"]].astype("object")

In [6]:
df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])

In [7]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

In [8]:
pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
20,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
11,Previous Claims,object,606831,10,"[2.0, 1.0, 0.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ..."
7,Occupation,object,597200,3,"[Self-Employed, nan, Employed, Unemployed]"
13,Credit Score,float64,229333,550,"[372.0, 694.0, nan, 367.0, 598.0, 614.0, 807.0..."
5,Number of Dependents,object,182802,5,"[1.0, 3.0, 2.0, 0.0, 4.0, nan]"
16,Customer Feedback,object,130100,3,"[Poor, Average, Good, nan]"
8,Health Score,float64,123525,811360,"[22.59876067181393, 15.569730989408043, 47.177..."
3,Annual Income,float64,74809,97540,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Age,object,31194,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
4,Marital Status,object,30865,3,"[Married, Divorced, Single, nan]"


# EDA - Filling Nulls

In [9]:
new_train = df.iloc[:1200000, :]

In [10]:
df.drop(columns=["id"], inplace=True)
new_train.drop(columns=["id"], inplace=True)

In [11]:
new_train.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0


### 

# No Nulls DF

In [12]:
no_nulls_df = df.dropna()

In [13]:
no_nulls_df.shape

(384004, 20)

#
# Filling creating IsNulls for null columns

In [14]:
for i in df.isnull().sum()[(df.isnull().sum() > 0) & (df.isnull().sum().index != "Premium Amount")].index:
    print(i)
    df[f"IsNull_{i}"] = df[i].isnull().astype(int)

Age
Annual Income
Marital Status
Number of Dependents
Occupation
Health Score
Previous Claims
Vehicle Age
Credit Score
Insurance Duration
Customer Feedback


In [15]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0,0,0,0,0,1,0,0,0,0,0,0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,50.0,Female,38782.0,Married,1.0,Bachelor's,,14.498639,Rural,Premium,,8.0,309.0,2.0,2021-07-09 15:21:39.184157,Average,Yes,Daily,Condo,,0,0,0,0,1,0,1,0,0,0,0
799996,,Female,73462.0,Single,0.0,Master's,,8.145748,Rural,Basic,2.0,0.0,,2.0,2023-03-28 15:21:39.250151,Good,No,Daily,Apartment,,1,0,0,0,1,0,0,0,1,0,0
799997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,,10.0,,6.0,2019-09-30 15:21:39.132191,Poor,No,Monthly,Apartment,,0,0,0,0,0,0,1,0,1,0,0
799998,34.0,Female,45661.0,Single,3.0,Master's,,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,2022-05-09 15:21:39.253660,Average,No,Weekly,Condo,,0,0,0,0,1,0,0,0,0,0,0


#
---
#

# Encoding the Categoricals

In [25]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

d = pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

In [21]:
df[['Age', 'Vehicle Age', 'Insurance Duration', 'Previous Claims', 'Number of Dependents']] = df[['Age', 'Vehicle Age', 'Insurance Duration',  'Previous Claims', 'Number of Dependents']].astype(float)

In [27]:
d[d['Data Type'] == 'object']

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
6,Occupation,object,597200,3,"[Self-Employed, nan, Employed, Unemployed]"
15,Customer Feedback,object,130100,3,"[Poor, Average, Good, nan]"
3,Marital Status,object,30865,3,"[Married, Divorced, Single, nan]"
8,Location,object,0,3,"[Urban, Rural, Suburban]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
1,Gender,object,0,2,"[Female, Male]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"
17,Exercise Frequency,object,0,4,"[Weekly, Monthly, Daily, Rarely]"
16,Smoking Status,object,0,2,"[No, Yes]"
18,Property Type,object,0,3,"[House, Apartment, Condo]"


#
---
#

In [29]:
df['Occupation'].replace({'Self-Employed' : 0, 'Employed' : 1, 'Unemployed' : 2}, inplace=True)

In [31]:
df['Customer Feedback'].replace({'Poor' : 0, 'Average' : 1, 'Good' : 2}, inplace=True)

In [33]:
df['Marital Status'].replace({'Single' : 0, 'Married' : 1, 'Divorced' : 2}, inplace=True)

In [34]:
df['Location'].replace({'Rural' : 0, 'Suburban' : 1, 'Urban' : 2}, inplace=True)

In [35]:
df['Education Level'].replace({'High School' : 0, "Bachelor's" : 1, "Master's" : 2, 'PhD' : 3}, inplace=True)

In [36]:
df['Gender'].replace({'Male' : 0, "Female" : 1}, inplace=True)

In [37]:
df['Policy Type'].replace({'Basic' : 0, 'Comprehensive' : 1, 'Premium' : 2}, inplace=True)

In [38]:
df['Exercise Frequency'].replace({'Rarely' : 0, "Monthly" : 1, "Weekly" : 2, 'Daily' : 3}, inplace=True)

In [39]:
df['Smoking Status'].replace({'No' : 0, "Yes" : 1}, inplace=True)

In [40]:
df['Property Type'].replace({'Apartment' : 0, 'House' : 1, 'Condo' : 2}, inplace=True)

In [41]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
19,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
10,Previous Claims,float64,606831,10,"[2.0, 1.0, 0.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ..."
6,Occupation,float64,597200,3,"[0.0, nan, 1.0, 2.0]"
12,Credit Score,float64,229333,550,"[372.0, 694.0, nan, 367.0, 598.0, 614.0, 807.0..."
4,Number of Dependents,float64,182802,5,"[1.0, 3.0, 2.0, 0.0, 4.0, nan]"
15,Customer Feedback,float64,130100,3,"[0.0, 1.0, 2.0, nan]"
7,Health Score,float64,123525,811360,"[22.59876067181393, 15.569730989408043, 47.177..."
2,Annual Income,float64,74809,97540,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
0,Age,float64,31194,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
3,Marital Status,float64,30865,3,"[1.0, 2.0, 0.0, nan]"


#
---
#

# **Multiple Imputation by Chained Equations (MICE)**

In [53]:
X = df.drop(columns=["Premium Amount", "Policy Start Date"])

In [54]:
X

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback
0,19.0,1,10049.0,1.0,1.0,1,0.0,22.598761,2,2,2.0,17.0,372.0,5.0,0.0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
1,39.0,1,31678.0,2.0,3.0,2,,15.569731,0,1,1.0,12.0,694.0,2.0,1.0,1,1,1,0,0,0,0,1,0,0,0,0,0,0
2,23.0,0,25602.0,2.0,3.0,0,0.0,47.177549,1,2,1.0,14.0,,3.0,2.0,1,2,1,0,0,0,0,0,0,0,0,1,0,0
3,21.0,0,141855.0,1.0,2.0,1,,10.938144,0,0,1.0,0.0,367.0,1.0,0.0,1,3,0,0,0,0,0,1,0,0,0,0,0,0
4,21.0,0,39651.0,0.0,1.0,1,0.0,20.376094,0,2,0.0,8.0,598.0,4.0,0.0,1,2,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,50.0,1,38782.0,1.0,1.0,1,,14.498639,0,2,,8.0,309.0,2.0,1.0,1,3,2,0,0,0,0,1,0,1,0,0,0,0
799996,,1,73462.0,0.0,0.0,2,,8.145748,0,0,2.0,0.0,,2.0,2.0,0,3,0,1,0,0,0,1,0,0,0,1,0,0
799997,26.0,1,35178.0,0.0,0.0,2,1.0,6.636583,2,1,,10.0,,6.0,0.0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
799998,34.0,1,45661.0,0.0,3.0,2,,15.937248,2,2,2.0,17.0,467.0,7.0,1.0,0,2,2,0,0,0,0,1,0,0,0,0,0,0


In [55]:
from miceforest import ImputationKernel

In [56]:
X.reset_index(inplace=True)

In [57]:
imp = ImputationKernel(data=X)

In [59]:
imp.mice(20)

In [61]:
new_df = imp.complete_data()

In [63]:
new_df.isnull().sum()

index                          0
Age                            0
Gender                         0
Annual Income                  0
Marital Status                 0
Number of Dependents           0
Education Level                0
Occupation                     0
Health Score                   0
Location                       0
Policy Type                    0
Previous Claims                0
Vehicle Age                    0
Credit Score                   0
Insurance Duration             0
Customer Feedback              0
Smoking Status                 0
Exercise Frequency             0
Property Type                  0
IsNull_Age                     0
IsNull_Annual Income           0
IsNull_Marital Status          0
IsNull_Number of Dependents    0
IsNull_Occupation              0
IsNull_Health Score            0
IsNull_Previous Claims         0
IsNull_Vehicle Age             0
IsNull_Credit Score            0
IsNull_Insurance Duration      0
IsNull_Customer Feedback       0
dtype: int

In [76]:
new_df["Premium Amount"] = df["Premium Amount"].values
new_df["Policy Start Date"] = df["Policy Start Date"].values

In [78]:
new_df

Unnamed: 0,index,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Premium Amount,Policy Start Date
0,0,19.0,1,10049.0,1.0,1.0,1,0.0,22.598761,2,2,2.0,17.0,372.0,5.0,0.0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,2869.0,2023-12-23 15:21:39.134960
1,1,39.0,1,31678.0,2.0,3.0,2,1.0,15.569731,0,1,1.0,12.0,694.0,2.0,1.0,1,1,1,0,0,0,0,1,0,0,0,0,0,0,1483.0,2023-06-12 15:21:39.111551
2,2,23.0,0,25602.0,2.0,3.0,0,0.0,47.177549,1,2,1.0,14.0,787.0,3.0,2.0,1,2,1,0,0,0,0,0,0,0,0,1,0,0,567.0,2023-09-30 15:21:39.221386
3,3,21.0,0,141855.0,1.0,2.0,1,0.0,10.938144,0,0,1.0,0.0,367.0,1.0,0.0,1,3,0,0,0,0,0,1,0,0,0,0,0,0,765.0,2024-06-12 15:21:39.226954
4,4,21.0,0,39651.0,0.0,1.0,1,0.0,20.376094,0,2,0.0,8.0,598.0,4.0,0.0,1,2,1,0,0,0,0,0,0,0,0,0,0,0,2022.0,2021-12-01 15:21:39.252145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,799995,50.0,1,38782.0,1.0,1.0,1,0.0,14.498639,0,2,1.0,8.0,309.0,2.0,1.0,1,3,2,0,0,0,0,1,0,1,0,0,0,0,,2021-07-09 15:21:39.184157
1999996,799996,48.0,1,73462.0,0.0,0.0,2,2.0,8.145748,0,0,2.0,0.0,726.0,2.0,2.0,0,3,0,1,0,0,0,1,0,0,0,1,0,0,,2023-03-28 15:21:39.250151
1999997,799997,26.0,1,35178.0,0.0,0.0,2,1.0,6.636583,2,1,1.0,10.0,779.0,6.0,0.0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,,2019-09-30 15:21:39.132191
1999998,799998,34.0,1,45661.0,0.0,3.0,2,0.0,15.937248,2,2,2.0,17.0,467.0,7.0,1.0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,,2022-05-09 15:21:39.253660


In [79]:
df = new_df.copy()

In [83]:
df['Occupation'].replace({0: 'Self-Employed', 1: 'Employed', 2: 'Unemployed'}, inplace=True)

In [85]:
df['Customer Feedback'].replace({0: 'Poor', 1 : 'Average', 2 : 'Good'}, inplace=True)

In [86]:
df['Marital Status'].replace({0 : 'Single', 1 : 'Married', 2 : 'Divorced'}, inplace=True)

In [87]:
df['Location'].replace({0 : 'Rural', 1 : 'Suburban', 2 : 'Urban'}, inplace=True)

In [88]:
df['Education Level'].replace({0 : 'High School', 1 : "Bachelor's", 2 : "Master's", 3 : 'PhD'}, inplace=True)

In [89]:
df['Gender'].replace({0 : 'Male', 1 : "Female"}, inplace=True)

In [90]:
df['Policy Type'].replace({0 : 'Basic', 1 : 'Comprehensive', 2 : 'Premium'}, inplace=True)

In [91]:
df['Exercise Frequency'].replace({0 : 'Rarely', 1 : "Monthly", 2 : "Weekly", 3 : 'Daily'}, inplace=True)

In [92]:
df['Smoking Status'].replace({0 : 'No', 1 : "Yes"}, inplace=True)

In [93]:
df['Property Type'].replace({0 : 'Apartment', 1 : 'House', 2 : 'Condo'}, inplace=True)

In [94]:
df.head()

Unnamed: 0,index,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Premium Amount,Policy Start Date
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,0,0,0,0,0,0,0,0,0,0,0,2869.0,2023-12-23 15:21:39.134960
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,0,0,0,0,1,0,0,0,0,0,0,1483.0,2023-06-12 15:21:39.111551
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,787.0,3.0,Good,Yes,Weekly,House,0,0,0,0,0,0,0,0,1,0,0,567.0,2023-09-30 15:21:39.221386
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,0,0,0,0,1,0,0,0,0,0,0,765.0,2024-06-12 15:21:39.226954
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,0,0,0,0,0,0,0,0,0,0,0,2022.0,2021-12-01 15:21:39.252145


#
---
#

# After Treating Nulls with most related columns using KNN-Imputer

In [95]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

In [96]:
pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
30,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
0,index,int64,0,1200000,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
2,Gender,object,0,2,"[Female, Male]"
1,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
4,Marital Status,object,0,3,"[Married, Divorced, Single]"
5,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
6,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
3,Annual Income,float64,0,97540,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
8,Health Score,float64,0,811360,"[22.59876067181393, 15.569730989408043, 47.177..."
9,Location,object,0,3,"[Urban, Rural, Suburban]"


In [97]:
df.to_csv("cleaned_df_using_mice.csv", index=False)