In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
# Notebook-wide configuration.
# NOTE: this silences every warning category for the rest of the session,
# including pandas deprecation and chained-assignment warnings.
warnings.simplefilter("ignore")
# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [25]:
# Load the training split from the working directory and preview its first rows.
train = pd.read_csv(filepath_or_buffer="train.csv")
train.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [3]:
# Load the test split from the working directory and preview its first rows.
test = pd.read_csv(filepath_or_buffer="test.csv")
test.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [4]:
# Stack the train rows on top of the test rows so both splits go through the
# same preprocessing. Row labels are kept as-is (no ignore_index), so the
# labels of the two splits overlap; positional access (.iloc) is unaffected.
frames_to_stack = [train, test]
df = pd.concat(frames_to_stack, axis=0)
df.shape

(2000000, 21)

# Setting all columns to appropriate datatypes

In [5]:
# Re-type the discrete, count-like columns as generic objects
# (presumably so they are handled as categorical values downstream - confirm).
discrete_cols = [
    "Previous Claims",
    "Number of Dependents",
    "Vehicle Age",
    "Age",
    "Insurance Duration",
]
df[discrete_cols] = df[discrete_cols].astype(object)

In [6]:
# Parse the policy start timestamps into a datetime column.
policy_start = pd.to_datetime(df["Policy Start Date"])
df["Policy Start Date"] = policy_start

In [7]:
# Per-column profile of the combined frame, kept as parallel lists with one
# entry per column in df.columns order: missing-value count, number of
# distinct values, the distinct values themselves, and the dtype.
nulls = [df[col].isnull().sum() for col in df.columns]
nuniques = [df[col].nunique() for col in df.columns]
uniques = [df[col].unique() for col in df.columns]
types = [df[col].dtype for col in df.columns]

In [8]:
# Assemble the per-column profile lists into one table and show the columns
# with the most missing values first.
profile_columns = {
    "Column": df.columns,
    "Data Type": types,
    "Nulls": nulls,
    "No. of Uniques": nuniques,
    "Uniques": uniques,
}
pd.DataFrame(profile_columns).sort_values("Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
20,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
11,Previous Claims,object,606831,10,"[2.0, 1.0, 0.0, nan, 3.0, 4.0, 5.0, 6.0, 7.0, ..."
7,Occupation,object,597200,3,"[Self-Employed, nan, Employed, Unemployed]"
13,Credit Score,float64,229333,550,"[372.0, 694.0, nan, 367.0, 598.0, 614.0, 807.0..."
5,Number of Dependents,object,182802,5,"[1.0, 3.0, 2.0, 0.0, 4.0, nan]"
16,Customer Feedback,object,130100,3,"[Poor, Average, Good, nan]"
8,Health Score,float64,123525,811360,"[22.59876067181393, 15.569730989408043, 47.177..."
3,Annual Income,float64,74809,97540,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Age,object,31194,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
4,Marital Status,object,30865,3,"[Married, Divorced, Single, nan]"


# EDA - Filling Nulls

In [9]:
# The training rows are the first len(train) rows of the combined frame,
# because df was built by concatenating train before test. Deriving the count
# from train avoids a hard-coded row number that silently goes stale if the
# input file changes.
# .copy() gives new_train its own data, so later in-place edits act on an
# independent frame instead of a slice of df (which can raise
# SettingWithCopyWarning).
new_train = df.iloc[:len(train)].copy()

In [10]:
# Remove the row identifier column from both frames.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it once "id" is already gone is a
# no-op rather than a KeyError. It also avoids mutating a slice of df in place.
df = df.drop(columns=["id"], errors="ignore")
new_train = new_train.drop(columns=["id"], errors="ignore")

In [11]:
# Preview the first three training rows.
new_train.iloc[:3]

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0


### 

#
---
#

In [19]:
# Replace missing values in every feature column with the sentinel -1.
# "Premium Amount" is the target and is left untouched, so its missing values
# remain.
# The filled column is assigned back explicitly: calling
# df[col].fillna(..., inplace=True) is chained assignment through an inplace
# method, which is deprecated and does not update df under pandas
# Copy-on-Write.
for col in df.columns:
    if col == "Premium Amount":
        continue
    df[col] = df[col].fillna(-1)

In [20]:
# Count the remaining missing values per column.
df.isna().sum()

Age                          0
Gender                       0
Annual Income                0
Marital Status               0
Number of Dependents         0
Education Level              0
Occupation                   0
Health Score                 0
Location                     0
Policy Type                  0
Previous Claims              0
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Customer Feedback            0
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount          800000
dtype: int64

# After Treating Nulls (missing feature values are filled with the sentinel -1; no KNN-Imputer step appears in this notebook)

In [21]:
# Recompute the per-column profile of df as parallel lists, one entry per
# column in df.columns order: missing-value count, number of distinct values,
# the distinct values themselves, and the dtype.
nulls = [df[col].isnull().sum() for col in df.columns]
nuniques = [df[col].nunique() for col in df.columns]
uniques = [df[col].unique() for col in df.columns]
types = [df[col].dtype for col in df.columns]

In [22]:
# Assemble the per-column profile lists into one table and show the columns
# with the most missing values first.
profile_columns = {
    "Column": df.columns,
    "Data Type": types,
    "Nulls": nulls,
    "No. of Uniques": nuniques,
    "Uniques": uniques,
}
pd.DataFrame(profile_columns).sort_values("Nulls", ascending=False)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
19,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
0,Age,float64,0,48,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
2,Annual Income,float64,0,97541,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Gender,object,0,2,"[Female, Male]"
3,Marital Status,object,0,4,"[Married, Divorced, Single, -1]"
4,Number of Dependents,float64,0,6,"[1.0, 3.0, 2.0, 0.0, 4.0, -1.0]"
6,Occupation,object,0,4,"[Self-Employed, -1, Employed, Unemployed]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
8,Location,object,0,3,"[Urban, Rural, Suburban]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


In [23]:
# Persist the combined frame to CSV without writing the row index.
df.to_csv(path_or_buf="cleaned_df.csv", index=False)