In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [2]:
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [3]:
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [4]:
df = pd.concat([train, test])
df.shape

(2000000, 21)

# Setting all columns to appropriate datatypes

In [5]:
df[["Previous Claims", "Number of Dependents", "Vehicle Age", "Age", "Insurance Duration"]] = df[["Previous Claims", "Number of Dependents", "Vehicle Age", "Age", "Insurance Duration"]].astype("object")

In [6]:
df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])

In [7]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

In [8]:
pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
20,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
11,Previous Claims,object,606831,10,"[2.0, 1.0, 0.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ..."
7,Occupation,object,597200,3,"[Self-Employed, nan, Employed, Unemployed]"
13,Credit Score,float64,229333,550,"[372.0, 694.0, nan, 367.0, 598.0, 614.0, 807.0..."
5,Number of Dependents,object,182802,5,"[1.0, 3.0, 2.0, 0.0, 4.0, nan]"
16,Customer Feedback,object,130100,3,"[Poor, Average, Good, nan]"
8,Health Score,float64,123525,811360,"[22.59876067181393, 15.569730989408043, 47.177..."
3,Annual Income,float64,74809,97540,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Age,object,31194,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
4,Marital Status,object,30865,3,"[Married, Divorced, Single, nan]"


# EDA - Filling Nulls

In [9]:
new_train = df.iloc[:1200000, :]

In [10]:
df.drop(columns=["id"], inplace=True)
new_train.drop(columns=["id"], inplace=True)

In [11]:
new_train.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0


### 

# No Nulls DF

In [12]:
no_nulls_df = df.dropna()

In [13]:
no_nulls_df.shape

(384004, 20)

# Filling Nulls of - Insurace duration

In [14]:
df[df["Insurance Duration"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
711358,64.0,Male,30206.0,Married,3.0,Master's,Employed,49.551038,Suburban,Basic,0.0,18.0,581.0,,2022-04-06 15:21:39.203442,Poor,Yes,Rarely,Apartment,1044.0
627445,53.0,Male,5933.0,Married,3.0,Master's,,25.162516,Rural,Basic,,0.0,673.0,,2023-02-10 15:21:39.199447,Poor,No,Weekly,House,
706315,23.0,Male,35357.0,Single,1.0,Master's,,19.908571,Urban,Basic,2.0,14.0,481.0,,2020-01-25 15:21:39.223390,Poor,No,Monthly,Apartment,


In [15]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=200)
imputing_df = imputing_df[["Health Score", "Annual Income", "Vehicle Age", "Insurance Duration"]]

filler.fit(imputing_df)

In [16]:
fill_nas_df = df[df["Insurance Duration"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Vehicle Age", "Insurance Duration"]])[:, -1]

In [17]:
null_predictions.round().astype(int)

df.loc[df["Insurance Duration"].isnull(), "Insurance Duration"] = null_predictions.round().astype(int)

df[df["Insurance Duration"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Vehicle Age

In [18]:
df[df["Vehicle Age"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
15629,25.0,Female,638.0,Divorced,3.0,PhD,Employed,13.494674,Suburban,Comprehensive,2.0,,467.0,6.0,2022-06-20 15:21:39.288099,Poor,Yes,Daily,Condo,909.0
53843,36.0,Male,7735.0,Married,3.0,PhD,Employed,10.592719,Suburban,Basic,0.0,,534.0,2.0,2024-08-08 15:21:39.226954,Average,No,Rarely,Condo,461.0
134847,52.0,Male,20287.0,Divorced,4.0,High School,Unemployed,4.788881,Suburban,Premium,,,406.0,6.0,2023-10-16 15:21:39.226954,Good,No,Daily,House,2719.0
412847,32.0,Male,1035.0,Single,1.0,PhD,,17.491565,Urban,Comprehensive,2.0,,733.0,3.0,2020-04-24 15:21:39.264504,Average,Yes,Rarely,House,508.0
595207,58.0,Female,,Married,2.0,Bachelor's,Self-Employed,34.596578,Urban,Comprehensive,3.0,,311.0,5.0,2022-03-21 15:21:39.271453,Poor,Yes,Monthly,Condo,1628.0
1068825,61.0,Male,45279.0,Single,1.0,PhD,Employed,15.26862,Suburban,Basic,1.0,,622.0,1.0,2020-03-13 15:21:39.274993,Poor,Yes,Rarely,Condo,962.0
317121,45.0,Male,37156.0,Single,0.0,Bachelor's,,33.430437,Suburban,Premium,0.0,,482.0,2.0,2022-08-23 15:21:39.155231,Good,No,Weekly,Apartment,
664839,50.0,Female,24619.0,Married,,Master's,Unemployed,49.647543,Urban,Comprehensive,0.0,,534.0,2.0,2021-07-08 15:21:39.186212,Good,No,Daily,House,
672636,49.0,Male,93714.0,Divorced,,High School,Self-Employed,43.410646,Suburban,Premium,,,427.0,4.0,2023-09-11 15:21:39.271453,Average,Yes,Daily,Apartment,


In [19]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=300)
imputing_df = imputing_df[["Health Score", "Insurance Duration", "Vehicle Age"]]

filler.fit(imputing_df)

In [20]:
fill_nas_df = df[df["Vehicle Age"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Insurance Duration", "Vehicle Age"]])[:, -1]

In [21]:
print(null_predictions.round().astype(int))

df.loc[df["Vehicle Age"].isnull(), "Vehicle Age"] = null_predictions.round().astype(int)

df[df["Vehicle Age"].isnull()]

[ 8 10 12  5  3  7  7  8  5]


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Marital Status	

In [22]:
df[df["Marital Status"].isnull()].isnull().sum()

Age                       514
Gender                      0
Annual Income            1600
Marital Status          30865
Number of Dependents     3124
Education Level             0
Occupation               9185
Health Score             1531
Location                    0
Policy Type                 0
Previous Claims          8897
Vehicle Age                 0
Credit Score             3739
Insurance Duration          0
Policy Start Date           0
Customer Feedback        2443
Smoking Status              0
Exercise Frequency          0
Property Type               0
Premium Amount          12336
dtype: int64

In [23]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=3000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Marital Status"]]

encoder = {imputing_df["Marital Status"].unique()[i] : i for i in range(len(imputing_df["Marital Status"].unique()))}

imputing_df["Marital Status"] = imputing_df["Marital Status"].replace(encoder)

filler.fit(imputing_df)

In [24]:
fill_nas_df = df[df["Marital Status"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Marital Status"]])[:, -1]

In [25]:
null_predictions = null_predictions.round().astype(int)

def find_key(x, encoder_list):
  for key, value in encoder_list:
    if value == x:
      return key
  return None

result = pd.Series(null_predictions).apply(lambda x: find_key(x, list(encoder.items())))

In [26]:
result

0          Single
1        Divorced
2          Single
3         Married
4        Divorced
           ...   
30860      Single
30861     Married
30862      Single
30863      Single
30864     Married
Length: 30865, dtype: object

In [27]:
df.loc[df["Marital Status"].isnull(), "Marital Status"] = result.to_numpy()

In [28]:
df[df["Marital Status"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Age

In [29]:
df[df["Age"].isnull()].isnull().sum()

Age                     31194
Gender                      0
Annual Income            1221
Marital Status              0
Number of Dependents     2871
Education Level             0
Occupation               9538
Health Score             1796
Location                    0
Policy Type                 0
Previous Claims          9726
Vehicle Age                 0
Credit Score             3630
Insurance Duration          0
Policy Start Date           0
Customer Feedback        2176
Smoking Status              0
Exercise Frequency          0
Property Type               0
Premium Amount          12489
dtype: int64

In [30]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=300)
imputing_df = imputing_df[["Health Score", "Annual Income", "Age"]]

filler.fit(imputing_df)

In [31]:
fill_nas_df = df[df["Age"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Age"]])[:, -1]

In [32]:
print(null_predictions.round().astype(int))

df.loc[df["Age"].isnull(), "Age"] = null_predictions.round().astype(int)

df[df["Age"].isnull()]

[31 45 45 ... 43 41 56]


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Annual Income

In [33]:
df[df["Annual Income"].isnull()].isnull().sum()

Age                         0
Gender                      0
Annual Income           74809
Marital Status              0
Number of Dependents     6679
Education Level             0
Occupation              22095
Health Score             4032
Location                    0
Policy Type                 0
Previous Claims         21545
Vehicle Age                 0
Credit Score             5547
Insurance Duration          0
Policy Start Date           0
Customer Feedback        6512
Smoking Status              0
Exercise Frequency          0
Property Type               0
Premium Amount          29860
dtype: int64

In [34]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=1000)
imputing_df = imputing_df[["Credit Score", "Annual Income", "Previous Claims", "Annual Income"]]

filler.fit(imputing_df)

In [35]:
fill_nas_df = df[df["Annual Income"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Credit Score", "Annual Income", "Previous Claims", "Annual Income"]])[:, -1]

In [36]:
print(null_predictions.round().astype(int))

df.loc[df["Annual Income"].isnull(), "Annual Income"] = null_predictions.round().astype(int)

df[df["Annual Income"].isnull()]

[ 19880  28338  53337 ...  46658 135599  36805]


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Health Score	

In [37]:
df[df["Health Score"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents     10401
Education Level              0
Occupation               35923
Health Score            123525
Location                     0
Policy Type                  0
Previous Claims          38281
Vehicle Age                  0
Credit Score             15584
Insurance Duration           0
Policy Start Date            0
Customer Feedback         8184
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount           49449
dtype: int64

In [38]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=1000)
imputing_df = imputing_df[["Age", "Annual Income", "Credit Score", "Health Score"]]

filler.fit(imputing_df)

In [39]:
fill_nas_df = df[df["Health Score"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Age", "Annual Income", "Credit Score", "Health Score"]])[:, -1]

In [40]:
print(null_predictions)

df.loc[df["Health Score"].isnull(), "Health Score"] = null_predictions

df[df["Health Score"].isnull()]

[22.08165228 23.88282447 33.06941967 ... 39.41867678 21.61650901
 16.53846934]


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Customer Feedback

In [41]:
df[df["Customer Feedback"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents     12597
Education Level              0
Occupation               38887
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims          38058
Vehicle Age                  0
Credit Score             15810
Insurance Duration           0
Policy Start Date            0
Customer Feedback       130100
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount           52276
dtype: int64

In [42]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=3000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Credit Score", "Customer Feedback"]]

encoder = {imputing_df["Customer Feedback"].unique()[i] : i for i in range(len(imputing_df["Customer Feedback"].unique()))}

imputing_df["Customer Feedback"] = imputing_df["Customer Feedback"].replace(encoder)

filler.fit(imputing_df)

In [43]:
fill_nas_df = df[df["Customer Feedback"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Credit Score", "Customer Feedback"]])[:, -1]

In [44]:
null_predictions = null_predictions.round().astype(int)

def find_key(x, encoder_list):
    for key, value in encoder_list:
        if value == x:
            return key
    return None

result = pd.Series(null_predictions).apply(lambda x: find_key(x, list(encoder.items())))

In [45]:
result

0         Average
1            Poor
2         Average
3         Average
4         Average
           ...   
130095    Average
130096    Average
130097    Average
130098       Poor
130099    Average
Length: 130100, dtype: object

In [46]:
df.loc[df["Customer Feedback"].isnull(), "Customer Feedback"] = result.to_numpy()

In [47]:
df[df["Customer Feedback"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Number of Dependents

In [48]:
df[df["Number of Dependents"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents    182802
Education Level              0
Occupation               54256
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims          54141
Vehicle Age                  0
Credit Score             20947
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount           73130
dtype: int64

In [49]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=2000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Number of Dependents"]]

filler.fit(imputing_df)

In [50]:
fill_nas_df = df[df["Number of Dependents"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Number of Dependents"]])[:, -1]

In [51]:
null_predictions.round().astype(int)

df.loc[df["Number of Dependents"].isnull(), "Number of Dependents"] = null_predictions.round().astype(int)

df[df["Number of Dependents"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Credit Score

In [52]:
df[df["Credit Score"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation               68117
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims          68210
Vehicle Age                  0
Credit Score            229333
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount           91451
dtype: int64

In [53]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=5000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Previous Claims", "Credit Score"]]

filler.fit(imputing_df)

In [54]:
fill_nas_df = df[df["Credit Score"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Previous Claims", "Credit Score"]])[:, -1]

In [55]:
null_predictions.round().astype(int)

df.loc[df["Credit Score"].isnull(), "Credit Score"] = null_predictions.round().astype(int)

df[df["Credit Score"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


In [56]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,50.0,Female,38782.0,Married,1.0,Bachelor's,,14.498639,Rural,Premium,,8.0,309.0,2.0,2021-07-09 15:21:39.184157,Average,Yes,Daily,Condo,
799996,56,Female,73462.0,Single,0.0,Master's,,8.145748,Rural,Basic,2.0,0.0,452.0,2.0,2023-03-28 15:21:39.250151,Good,No,Daily,Apartment,
799997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,,10.0,764.0,6.0,2019-09-30 15:21:39.132191,Poor,No,Monthly,Apartment,
799998,34.0,Female,45661.0,Single,3.0,Master's,,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,2022-05-09 15:21:39.253660,Average,No,Weekly,Condo,


# Filling Nulls of - Occupation

In [57]:
df[df["Occupation"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation              597200
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         181354
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount          239125
dtype: int64

In [58]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=5000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Occupation"]]

encoder = {imputing_df["Occupation"].unique()[i] : i for i in range(len(imputing_df["Occupation"].unique()))}

imputing_df["Occupation"] = imputing_df["Occupation"].replace(encoder)

filler.fit(imputing_df)

In [59]:
fill_nas_df = df[df["Occupation"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Occupation"]])[:, -1]

In [60]:
null_predictions = null_predictions.round().astype(int)

def find_key(x, encoder_list):
    for key, value in encoder_list:
        if value == x:
            return key
    return None

result = pd.Series(null_predictions).apply(lambda x: find_key(x, list(encoder.items())))

In [61]:
result

0         Self-Employed
1         Self-Employed
2              Employed
3         Self-Employed
4            Unemployed
              ...      
597195    Self-Employed
597196         Employed
597197    Self-Employed
597198    Self-Employed
597199         Employed
Length: 597200, dtype: object

In [62]:
df.loc[df["Occupation"].isnull(), "Occupation"] = result.to_numpy()

In [63]:
df[df["Occupation"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# Filling Nulls of - Previous Claims

In [64]:
df[df["Previous Claims"].isnull()].isnull().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation                   0
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims         606831
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount          242802
dtype: int64

In [65]:
from sklearn.impute import KNNImputer

filler = KNNImputer(weights="distance")

imputing_df = no_nulls_df.sample(n=2000)
imputing_df = imputing_df[["Health Score", "Annual Income", "Credit Score", "Previous Claims"]]

filler.fit(imputing_df)

In [66]:
fill_nas_df = df[df["Previous Claims"].isnull()]
null_predictions = filler.transform(fill_nas_df[["Health Score", "Annual Income", "Credit Score", "Previous Claims"]])[:, -1]

In [67]:
null_predictions.round().astype(int)

df.loc[df["Previous Claims"].isnull(), "Previous Claims"] = null_predictions.round().astype(int)

df[df["Previous Claims"].isnull()]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount


# After Treating Nulls with most related columns using KNN-Imputer

In [68]:
nulls = []
nuniques = []
uniques = []
types = []

for i in df.columns:
    nulls.append(df[i].isnull().sum())
    nuniques.append(df[i].nunique())
    uniques.append(df[i].unique())
    types.append(df[i].dtype)

In [69]:
pd.DataFrame(
    {
        "Column" : df.columns,
        "Data Type" : types,
        "Nulls" : nulls,
        "No. of Uniques" : nuniques,
        "Uniques" : uniques
    }
).sort_values(by="Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
19,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
0,Age,object,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
2,Annual Income,float64,0,97952,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Gender,object,0,2,"[Female, Male]"
3,Marital Status,object,0,3,"[Married, Divorced, Single]"
4,Number of Dependents,object,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
6,Occupation,object,0,3,"[Self-Employed, Employed, Unemployed]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
8,Location,object,0,3,"[Urban, Rural, Suburban]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


In [70]:
df.to_csv("cleaned_df.csv", index=False)