In [21]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 150)

In [22]:
df = pd.read_csv("EDAed_df.csv")

df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])

In [23]:
df.shape

(2000000, 38)

In [24]:
df.isnull().sum()

Age                             0
Gender                          0
Annual Income                   0
Marital Status                  0
Number of Dependents            0
Education Level                 0
Occupation                      0
Health Score                    0
Location                        0
Policy Type                     0
Previous Claims                 0
Vehicle Age                     0
Credit Score                    0
Insurance Duration              0
Policy Start Date               0
Customer Feedback               0
Smoking Status                  0
Exercise Frequency              0
Property Type                   0
Premium Amount             800000
Health Conscious Level          0
Health Conscious Level1         0
Money Per Head                  0
Money Handling Level            0
Money Handling Level1           0
Growth                          0
Growth1                         0
Determinstic                    0
Day_Name                        0
Credit by Scor

In [25]:
train = df.iloc[:1200000, :]
train.shape

(1200000, 38)

In [26]:
test = df.iloc[1200000:, :]
test.shape

(800000, 38)

In [27]:
test.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4
1200000,28.0,Female,2310.0,Single,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,2.0,19.0,493.0,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House,,2,3430.775431,577.5,1138830.0,4.685598,4620.0,1155.0,82.5,Sunday,246.5,493.0,4.617101,3775.38457,214.423464,4620.0,986.0,4.0,15.315962
1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,1.0,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment,,1,1659.291012,63015.5,46883532.0,338.793011,378093.0,42010.333333,4065.516129,Monday,372.0,2976.0,4.330931,4977.873036,414.822753,1008248.0,2976.0,8.0,107.051033
1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,1.0,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo,,3,9157.302066,17092.0,13998348.0,20.869353,68368.0,4273.0,363.659574,Wednesday,819.0,7371.0,3.782274,19946.357425,1144.662758,68368.0,3276.0,4.0,97.418107


#
---
#

# Adding Dates columns

In [28]:
df["Policy Start Date - Day"] = df["Policy Start Date"].dt.day
df["Policy Start Date - Month"] = df["Policy Start Date"].dt.month
df["Policy Start Date - Year"] = df["Policy Start Date"].dt.year

In [29]:
df["Policy Start Date - Quarter"] = df["Policy Start Date"].dt.year.astype(str) + " Q" + df["Policy Start Date"].dt.quarter.astype(str)

In [30]:
df["Sin_Date"] = np.sin(2 * np.pi * df["Policy Start Date"].astype('int64'))
df["Cos_Date"] = np.cos(2 * np.pi * df["Policy Start Date"].astype('int64'))

In [31]:
df["Sin_Year"] = np.sin(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))
df["Cos_Year"] = np.cos(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))

In [32]:
df["Sin_Month"] = np.sin(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))
df["Cos_Month"] = np.cos(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))

In [33]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,1665.0,2.641123,26183.539855,1085.083634,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0


#
---
#

In [34]:
data = df.copy()

#
---
#

In [35]:
df.drop(columns="Policy Start Date", inplace=True)

In [36]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,1665.0,2.641123,26183.539855,1085.083634,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0


In [37]:
df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]] = df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]].astype("O")

In [38]:
def show_nulls(df):
    nulls = []
    nuniques = []
    uniques = []
    types = []
    
    for i in df.columns:
        nulls.append(df[i].isnull().sum())
        nuniques.append(df[i].nunique())
        uniques.append(df[i].unique())
        types.append(df[i].dtype)
    
    
    return pd.DataFrame(
        {
            "Column" : df.columns,
            "Data Type" : types,
            "Nulls" : nulls,
            "No. of Uniques" : nuniques,
            "Uniques" : uniques
        }
    ).sort_values(by="Nulls", ascending=False)

In [39]:
df["Health Conscious Level"] = df["Health Conscious Level"].astype("O")

In [40]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
1,Gender,object,0,2,"[Female, Male]"
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
3,Marital Status,object,0,3,"[Married, Divorced, Single]"
4,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
2,Annual Income,float64,0,97952,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
7,Health Score,float64,0,934000,"[22.59876067181393, 15.569730989408043, 47.177..."
8,Location,object,0,3,"[Urban, Rural, Suburban]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


#
---
#

In [41]:
def do_magic(target_column, *columns: list):
    for i in columns:
        df[f"{i}_MIN_{target_column}"] = df.groupby(by=i)[target_column].transform("min")
        df[f"{i}_MEAN_{target_column}"] = df.groupby(by=i)[target_column].transform("mean")
        df[f"{i}_MEDIAN_{target_column}"] = df.groupby(by=i)[target_column].transform("median")
        df[f"{i}_STD_{target_column}"] = df.groupby(by=i)[target_column].transform("std")
        df[f"{i}_MAX_{target_column}"] = df.groupby(by=i)[target_column].transform("max")

In [42]:
do_magic("Premium Amount", "Number of Dependents", "Occupation", "Education Level", "Previous Claims", "Health Conscious Level", "Insurance Duration")

In [43]:
df.isnull().sum()

Age                                                  0
Gender                                               0
Annual Income                                        0
Marital Status                                       0
Number of Dependents                                 0
Education Level                                      0
Occupation                                           0
Health Score                                         0
Location                                             0
Policy Type                                          0
Previous Claims                                      0
Vehicle Age                                          0
Credit Score                                         0
Insurance Duration                                   0
Customer Feedback                                    0
Smoking Status                                       0
Exercise Frequency                                   0
Property Type                                        0
Premium Am

#
---
#

In [44]:
def return_splits(ddf, feature_name, target_name):
    return [ddf[ddf[feature_name] == i][target_name] for i in ddf[feature_name].unique()]

def give_stats_analysis(df, target_column_name):
    ddf = df.copy()
    ddf = ddf.dropna()

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []
    count = 0

    target = ddf[target_column_name]
    for i in ddf.columns:
        features.append(i)
        feature = ddf[i]
        
        if (feature.dtype == "O" and (target.dtype == "float" or target.dtype == "int")) or (target.dtype == "O" and (feature.dtype == "float" or feature.dtype == "int")):
            stat, pval, *_ = kruskal(*return_splits(ddf, feature.name, target.name))
            tests.append("Kruskal-Wallis")
            stats.append(stat)
            pvals.append(pval)
            
        
        elif (feature.dtype == "float" or feature.dtype == "int") and (target.dtype == "float" or target.dtype == "int"):
            stat, pval, *_ = spearmanr(feature, target)
            tests.append("SpearmanR")
            stats.append(stat)
            pvals.append(pval)

        elif feature.dtype == "O" and target.dtype == "O":
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            tests.append("Chi-Square")
            stats.append(stat)
            pvals.append(pval)
        
        else:
            tests.append(np.nan)
            stats.append(np.nan)
            pvals.append(np.nan)
        
        if pval <= 0.025:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(100, "-")+"✅")
    
    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*ddf.shape[1],
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

# H0 :- There is ***No Relationship*** among the given two columns
# H1 :- There is ***Relationship*** among the given two columns

### ***Health-related indicators***
- [x] Health Score
- [x] Smoking Status
- [x] Exercise Frequency
### ***Demographic information***
- [x] Age
- [x] Gender
- [x] Marital Status
- [x] Number of Dependents
- [x] Occupation
### ***Policy details***
- [x] Policy Type
- [x] Policy Start Date
- [x] Insurance Duration
### ***Financial factors***
- [x] Annual Income
- [x] Credit Score.
### ***Premium calculation***
- [x] Premium Amount

In [45]:
stats_result = give_stats_analysis(df.iloc[:1200000, :], "Premium Amount")
stats_result

Age ■■■ Premium Amount------------------------------------------------------------------------------✅
Gender ■■■ Premium Amount---------------------------------------------------------------------------✅
Annual Income ■■■ Premium Amount--------------------------------------------------------------------✅
Marital Status ■■■ Premium Amount-------------------------------------------------------------------✅
Number of Dependents ■■■ Premium Amount-------------------------------------------------------------✅
Education Level ■■■ Premium Amount------------------------------------------------------------------✅
Occupation ■■■ Premium Amount-----------------------------------------------------------------------✅
Health Score ■■■ Premium Amount---------------------------------------------------------------------✅
Location ■■■ Premium Amount-------------------------------------------------------------------------✅
Policy Type ■■■ Premium Amount----------------------------------------------------

Unnamed: 0,Feature,Target,Statistic Test,Test Statistic,P-Value,Verdict
2,Annual Income,Premium Amount,SpearmanR,-0.060743,0.0,There is Relationship
12,Credit Score,Premium Amount,SpearmanR,-0.037141,0.0,There is Relationship
26,Determinstic,Premium Amount,SpearmanR,-0.055946,0.0,There is Relationship
28,Credit by Score,Premium Amount,SpearmanR,-0.05753,0.0,There is Relationship
21,Money Per Head,Premium Amount,SpearmanR,-0.052062,0.0,There is Relationship
22,Money Handling Level,Premium Amount,SpearmanR,-0.070907,0.0,There is Relationship
23,Money Handling Level1,Premium Amount,SpearmanR,-0.047643,0.0,There is Relationship
18,Premium Amount,Premium Amount,SpearmanR,1.0,0.0,There is Relationship
24,Growth,Premium Amount,SpearmanR,-0.054155,0.0,There is Relationship
25,Growth1,Premium Amount,SpearmanR,-0.052737,0.0,There is Relationship


# <ins>Key Premium as per Reseach Papers and as per dataset.</ins>
### `Strikeoff features are said by research and dataset too. But unstrike ones are not impactful to determine premium amount as per dataset but as per research it should be. We need to find why like so in these features`

- ### ~~Age~~
- ### Gender
- ### ~~Health Score~~
- ### Smoking Status
- ### Exercise Frequency
- ### ~~Occupation~~
- ### Policy Type
- ### ~~Previous Claims~~
- ### ~~Annual Income~~
- ### Insurance Duration
- ### ~~Credit Score~~

#
---
#

In [46]:
cols = ["Gender", "Smoking Status", "Exercise Frequency", "Policy Type", "Insurance Duration"]

In [47]:
# fig, axs = plt.subplots(2, 3, figsize=(20, 8))
# for col, ax in zip(cols, axs.flatten()):
#     sns.boxplot(y=df["Premium Amount"], x=df[col], color="mediumblue", ax=ax)

In [48]:
useless_columns = stats_result[stats_result["P-Value"] >= 0.05]["Feature"]
useless_columns

58          Education Level_MEAN_Premium Amount
59        Education Level_MEDIAN_Premium Amount
8                                      Location
5                               Education Level
42                                     Cos_Date
11                                  Vehicle Age
9                                   Policy Type
51      Number of Dependents_MAX_Premium Amount
61           Education Level_MAX_Premium Amount
17                                Property Type
16                           Exercise Frequency
70    Health Conscious Level_STD_Premium Amount
15                               Smoking Status
27                                     Day_Name
60           Education Level_STD_Premium Amount
37                      Policy Start Date - Day
1                                        Gender
41                                     Sin_Date
13                           Insurance Duration
Name: Feature, dtype: object

In [49]:
meaningless_df = df[useless_columns]
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Location,Education Level,Cos_Date,Vehicle Age,Policy Type,Number of Dependents_MAX_Premium Amount,Education Level_MAX_Premium Amount,Property Type,Exercise Frequency,Health Conscious Level_STD_Premium Amount,Smoking Status,Day_Name,Education Level_STD_Premium Amount,Policy Start Date - Day,Gender,Sin_Date,Insurance Duration
0,1102.698438,873.0,Urban,Bachelor's,-0.220691,17.0,Premium,4994.0,4988.0,House,Weekly,864.672928,No,Saturday,864.866296,23,Female,-0.975344,5.0
1,1102.113989,871.0,Rural,Master's,0.050489,12.0,Comprehensive,4997.0,4997.0,House,Monthly,863.957863,Yes,Monday,866.235322,12,Female,-0.998725,2.0
2,1104.78749,876.0,Suburban,High School,0.101192,14.0,Premium,4997.0,4999.0,House,Weekly,864.672928,Yes,Saturday,865.951488,30,Male,-0.994867,3.0


In [50]:
# df = df[stats_result[stats_result["P-Value"] < 0.05]["Feature"]]
# df.head(3)

# Compressing Meaningless DF's information in a component using PCA

In [51]:
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Location,Education Level,Cos_Date,Vehicle Age,Policy Type,Number of Dependents_MAX_Premium Amount,Education Level_MAX_Premium Amount,Property Type,Exercise Frequency,Health Conscious Level_STD_Premium Amount,Smoking Status,Day_Name,Education Level_STD_Premium Amount,Policy Start Date - Day,Gender,Sin_Date,Insurance Duration
0,1102.698438,873.0,Urban,Bachelor's,-0.220691,17.0,Premium,4994.0,4988.0,House,Weekly,864.672928,No,Saturday,864.866296,23,Female,-0.975344,5.0
1,1102.113989,871.0,Rural,Master's,0.050489,12.0,Comprehensive,4997.0,4997.0,House,Monthly,863.957863,Yes,Monday,866.235322,12,Female,-0.998725,2.0
2,1104.78749,876.0,Suburban,High School,0.101192,14.0,Premium,4997.0,4999.0,House,Weekly,864.672928,Yes,Saturday,865.951488,30,Male,-0.994867,3.0


## Encoding Columns

In [52]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

###
---
###

In [53]:
meaningless_df["Location"].unique()

array(['Urban', 'Rural', 'Suburban'], dtype=object)

In [54]:
a = OrdinalEncoder(categories=[['Rural', 'Suburban', 'Urban']])

b = pd.DataFrame({"ENCODED_Location" : a.fit_transform(meaningless_df[["Location"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Location", inplace=True)

###
---
###

In [55]:
meaningless_df["Education Level"].unique()

array(["Bachelor's", "Master's", 'High School', 'PhD'], dtype=object)

In [56]:
a = OrdinalEncoder(categories=[['High School', "Bachelor's", "Master's", 'PhD']])

b = pd.DataFrame({"ENCODED_Education Level" : a.fit_transform(meaningless_df[["Education Level"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Education Level", inplace=True)

###
---
###

In [57]:
meaningless_df["Policy Type"].unique()

array(['Premium', 'Comprehensive', 'Basic'], dtype=object)

In [58]:
a = OrdinalEncoder(categories=[['Basic', 'Comprehensive', 'Premium']])

b = pd.DataFrame({"ENCODED_Policy Type" : a.fit_transform(meaningless_df[["Policy Type"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Policy Type", inplace=True)

###
---
###

In [59]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Property Type"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Property Type", inplace=True)

###
---
###

In [60]:
meaningless_df["Exercise Frequency"].unique()

array(['Weekly', 'Monthly', 'Daily', 'Rarely'], dtype=object)

In [61]:
a = OrdinalEncoder(categories=[['Rarely', 'Monthly', 'Weekly', 'Daily']])

b = pd.DataFrame({"ENCODED_Exercise Frequency" : a.fit_transform(meaningless_df[["Exercise Frequency"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Exercise Frequency", inplace=True)

###
---
###

In [62]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Smoking Status"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Smoking Status", inplace=True)

###
---
###

In [63]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Gender"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Gender", inplace=True)

###
---
###

In [64]:
meaningless_df["Policy Start Date - Day"] = meaningless_df["Policy Start Date - Day"].astype(int)

#
---
#

In [65]:
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Cos_Date,Vehicle Age,Number of Dependents_MAX_Premium Amount,Education Level_MAX_Premium Amount,Health Conscious Level_STD_Premium Amount,Day_Name,Education Level_STD_Premium Amount,Policy Start Date - Day,Sin_Date,Insurance Duration,ENCODED_Location,ENCODED_Education Level,ENCODED_Policy Type,Property Type_Condo,Property Type_House,ENCODED_Exercise Frequency,Smoking Status_Yes,Gender_Male
0,1102.698438,873.0,-0.220691,17.0,4994.0,4988.0,864.672928,Saturday,864.866296,23,-0.975344,5.0,2.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0
1,1102.113989,871.0,0.050489,12.0,4997.0,4997.0,863.957863,Monday,866.235322,12,-0.998725,2.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0
2,1104.78749,876.0,0.101192,14.0,4997.0,4999.0,864.672928,Saturday,865.951488,30,-0.994867,3.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0


In [66]:
meaningless_df.dtypes

Education Level_MEAN_Premium Amount          float64
Education Level_MEDIAN_Premium Amount        float64
Cos_Date                                     float64
Vehicle Age                                  float64
Number of Dependents_MAX_Premium Amount      float64
Education Level_MAX_Premium Amount           float64
Health Conscious Level_STD_Premium Amount    float64
Day_Name                                      object
Education Level_STD_Premium Amount           float64
Policy Start Date - Day                        int64
Sin_Date                                     float64
Insurance Duration                           float64
ENCODED_Location                             float64
ENCODED_Education Level                      float64
ENCODED_Policy Type                          float64
Property Type_Condo                          float64
Property Type_House                          float64
ENCODED_Exercise Frequency                   float64
Smoking Status_Yes                           f

###
---
###

# Doing PCA on this `meaningless_df`

In [67]:
# from sklearn.decomposition import PCA

In [68]:
# pca = PCA(n_components=3)
# pca_df = pd.DataFrame(pca.fit_transform(meaningless_df), columns=['PC1_Meaningless_df', "PC2_Meaningless_df", "PC3_Meaningless_df"])
# pca_df

In [69]:
# pca.explained_variance_ratio_

###
---
###

# Combining 2 PCs of Meaningless_columns to the df

In [70]:
# df = pd.concat([df, pca_df.iloc[:, :2]], axis=1)

In [71]:
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1100.812035,872.0,859.965806,4996.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1097.640837,861.0,863.957863,4999.0,20.0,1106.883166,878.0,863.675409,4997.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,1665.0,2.641123,26183.539855,1085.083634,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1104.78749,876.0,865.951488,4999.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1101.733536,872.0,865.791213,4997.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,3,7350.432875,70927.5,52060785.0,386.525886,283710.0,70927.5,6755.0,Wednesday,367.0,367.0,4.453093,4014.298906,229.701027,283710.0,734.0,2.0,21.876288,12,6,2024,2024 Q2,0.111402,0.993775,1.585375e-14,1.0,-1.469576e-15,1.0,20.0,1107.670467,874.0,867.07961,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1099.109456,868.0,865.407894,4997.0,20.0,1097.042977,861.0,865.431191,4988.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0,3,6846.367459,39651.0,23711298.0,66.30602,79302.0,19825.5,1888.142857,Wednesday,598.0,2392.0,3.981195,12184.903989,427.897966,79302.0,1196.0,0.0,40.752187,1,12,2021,2021 Q4,-0.996246,0.086565,-1.468363e-13,1.0,-2.939152e-15,1.0,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1080.669491,855.0,847.585613,4999.0,20.0,1099.109456,868.0,865.407894,4997.0,20.0,1104.723079,872.0,866.377508,4991.0


###
---
###

# Encoding of column in `df`

In [72]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
65,Previous Claims_STD_Premium Amount,float64,9,9,"[898.5782192074151, 856.283141891759, 847.5856..."
2,Annual Income,float64,0,97952,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
1,Gender,object,0,2,"[Female, Male]"
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
6,Occupation,object,0,3,"[Self-Employed, Employed, Unemployed]"
7,Health Score,float64,0,934000,"[22.59876067181393, 15.569730989408043, 47.177..."
8,Location,object,0,3,"[Urban, Rural, Suburban]"
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


### Policy Start Date - Year	

In [73]:
df["Policy Start Date - Year"].unique()

array([2023, 2024, 2021, 2022, 2020, 2019], dtype=object)

In [74]:
a = OrdinalEncoder(categories=[[2019, 2020, 2021, 2022, 2023, 2024]])

b = pd.DataFrame({"ENCODED_Policy Start Date - Year" : a.fit_transform(df[["Policy Start Date - Year"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Year", inplace=True)

### Policy Start Date - Quarter

In [75]:
sorted(list(df["Policy Start Date - Quarter"].unique()))

['2019 Q3',
 '2019 Q4',
 '2020 Q1',
 '2020 Q2',
 '2020 Q3',
 '2020 Q4',
 '2021 Q1',
 '2021 Q2',
 '2021 Q3',
 '2021 Q4',
 '2022 Q1',
 '2022 Q2',
 '2022 Q3',
 '2022 Q4',
 '2023 Q1',
 '2023 Q2',
 '2023 Q3',
 '2023 Q4',
 '2024 Q1',
 '2024 Q2',
 '2024 Q3']

In [76]:
a = OrdinalEncoder(categories=[['2019 Q3', '2019 Q4', '2020 Q1', '2020 Q2', '2020 Q3', '2020 Q4', '2021 Q1', '2021 Q2', '2021 Q3',
                 '2021 Q4', '2022 Q1', '2022 Q2', '2022 Q3', '2022 Q4', '2023 Q1', '2023 Q2', '2023 Q3', '2023 Q4', '2024 Q1', '2024 Q2', '2024 Q3']])

b = pd.DataFrame({"ENCODED_Policy Start Date - Quarter" : a.fit_transform(df[["Policy Start Date - Quarter"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Quarter", inplace=True)

### Policy Start Date - Month

In [77]:
df["Policy Start Date - Month"] = df["Policy Start Date - Month"].astype(int)

### Customer Feedback

In [78]:
df["Customer Feedback"].unique()

array(['Poor', 'Average', 'Good'], dtype=object)

In [79]:
a = OrdinalEncoder(categories=[['Poor', 'Average', 'Good']])

b = pd.DataFrame({"ENCODED_Customer Feedback" : a.fit_transform(df[["Customer Feedback"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Customer Feedback", inplace=True)

### Occupation

In [80]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Occupation"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Occupation", inplace=True)

### Marital Status

In [81]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Marital Status"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Marital Status", inplace=True)

In [82]:
df

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,ENCODED_Policy Start Date - Year,ENCODED_Policy Start Date - Quarter,ENCODED_Customer Feedback,ENCODED_Occupation_Self-Employed,ENCODED_Occupation_Unemployed,ENCODED_Marital Status_Married,ENCODED_Marital Status_Single
0,19.0,Female,10049.0,1.0,Bachelor's,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,No,Weekly,House,2869.0,4,13740.046488,10049.000000,3738228.0,27.013441,20098.0,5024.500000,528.894737,Saturday,186.0,1860.0,3.870062,8406.738970,429.376453,20098.0,744.0,4.0,45.197521,23,12,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1100.812035,872.0,859.965806,4996.0,4.0,17.0,0.0,1.0,0.0,1.0,0.0
1,39.0,Female,31678.0,3.0,Master's,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.256410,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1097.640837,861.0,863.957863,4999.0,20.0,1106.883166,878.0,863.675409,4997.0,4.0,15.0,1.0,1.0,0.0,0.0,0.0
2,23.0,Male,25602.0,3.0,High School,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Yes,Weekly,House,567.0,4,17361.338138,8534.000000,14209110.0,46.129730,25602.0,25602.000000,1113.130435,Saturday,555.0,1665.0,2.641123,26183.539855,1085.083634,204816.0,4440.0,8.0,377.420394,30,9,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1104.787490,876.0,865.951488,4999.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1101.733536,872.0,865.791213,4997.0,4.0,16.0,2.0,1.0,0.0,0.0,0.0
3,21.0,Male,141855.0,2.0,Bachelor's,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Yes,Daily,Apartment,765.0,3,7350.432875,70927.500000,52060785.0,386.525886,283710.0,70927.500000,6755.000000,Wednesday,367.0,367.0,4.453093,4014.298906,229.701027,283710.0,734.0,2.0,21.876288,12,6,0.111402,0.993775,1.585375e-14,1.0,-1.469576e-15,1.0,20.0,1107.670467,874.0,867.079610,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1099.109456,868.0,865.407894,4997.0,20.0,1097.042977,861.0,865.431191,4988.0,5.0,19.0,0.0,1.0,0.0,1.0,0.0
4,21.0,Male,39651.0,1.0,Bachelor's,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Yes,Weekly,House,2022.0,3,6846.367459,39651.000000,23711298.0,66.306020,79302.0,19825.500000,1888.142857,Wednesday,598.0,2392.0,3.981195,12184.903989,427.897966,79302.0,1196.0,0.0,40.752187,1,12,-0.996246,0.086565,-1.468363e-13,1.0,-2.939152e-15,1.0,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1080.669491,855.0,847.585613,4999.0,20.0,1099.109456,868.0,865.407894,4997.0,20.0,1104.723079,872.0,866.377508,4991.0,2.0,9.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,50.0,Female,38782.0,1.0,Bachelor's,14.498639,Rural,Premium,1.0,8.0,309.0,2.0,Yes,Daily,Condo,,4,23197.822227,38782.000000,11983638.0,125.508091,77564.0,19391.000000,775.640000,Friday,309.0,618.0,4.275068,4480.079418,724.931945,155128.0,1236.0,4.0,57.994556,9,7,0.645845,-0.763468,-1.468363e-13,1.0,-1.714506e-15,1.0,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1106.883166,878.0,863.675409,4997.0,2.0,8.0,1.0,0.0,0.0,1.0,0.0
1999996,56.0,Female,73462.0,0.0,Master's,8.145748,Rural,Basic,2.0,0.0,452.0,2.0,No,Daily,Apartment,,6,29194.361129,73462.000000,33204824.0,162.526549,220386.0,24487.333333,1311.821429,Tuesday,226.0,904.0,4.592713,3681.878133,456.161893,587696.0,3616.0,16.0,65.165985,28,3,0.681828,0.731513,-6.447061e-13,1.0,-7.347881e-16,1.0,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1104.461730,878.0,863.186210,4996.0,20.0,1106.883166,878.0,863.675409,4997.0,4.0,14.0,2.0,1.0,0.0,0.0,1.0
1999997,26.0,Female,35178.0,0.0,Master's,6.636583,Urban,Comprehensive,2.0,10.0,764.0,6.0,No,Monthly,Apartment,,2,2760.818699,35178.000000,26875992.0,46.044503,105534.0,11726.000000,1353.000000,Monday,382.0,4584.0,4.668171,5070.349727,172.551169,70356.0,1528.0,4.0,13.273167,30,9,-0.709843,0.704360,3.510335e-13,1.0,-2.204364e-15,1.0,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1097.640837,861.0,863.957863,4999.0,20.0,1104.558441,873.0,866.550287,4999.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1999998,34.0,Female,45661.0,3.0,Master's,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,No,Weekly,Condo,,4,17339.725601,15220.333333,21323687.0,97.775161,136983.0,15220.333333,1342.970588,Monday,233.5,3269.0,4.203138,7442.694720,541.866425,182644.0,1868.0,8.0,63.748991,9,5,0.561062,-0.827774,-1.305266e-12,1.0,-1.224647e-15,1.0,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1102.480047,871.0,864.672928,4991.0,20.0,1105.964399,877.0,866.222189,4988.0,3.0,11.0,1.0,1.0,0.0,0.0,1.0


#
---
#

In [83]:
def return_splits(ddf, feature_name, target_name):
    return [ddf[ddf[feature_name] == i][target_name] for i in ddf[feature_name].unique()]

def give_stats_analysis(df, target_column_name):
    ddf = df.copy()
    ddf = ddf.dropna()

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []
    count = 0

    target = ddf[target_column_name]
    for i in ddf.columns:
        features.append(i)
        feature = ddf[i]
        
        if (feature.dtype == "O" and (target.dtype == "float" or target.dtype == "int")) or (target.dtype == "O" and (feature.dtype == "float" or feature.dtype == "int")):
            stat, pval, *_ = kruskal(*return_splits(ddf, feature.name, target.name))
            tests.append("Kruskal-Wallis")
            stats.append(stat)
            pvals.append(pval)
            
        
        elif (feature.dtype == "float" or feature.dtype == "int") and (target.dtype == "float" or target.dtype == "int"):
            stat, pval, *_ = spearmanr(feature, target)
            tests.append("SpearmanR")
            stats.append(stat)
            pvals.append(pval)

        elif feature.dtype == "O" and target.dtype == "O":
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            tests.append("Chi-Square")
            stats.append(stat)
            pvals.append(pval)
        
        else:
            tests.append(np.nan)
            stats.append(np.nan)
            pvals.append(np.nan)
        
        if pval <= 0.05:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(50, "-")+"✅")
    
    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*ddf.shape[1],
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

In [84]:
stats_df = give_stats_analysis(df, "Premium Amount")

Age ■■■ Premium Amount----------------------------✅
Gender ■■■ Premium Amount-------------------------✅
Annual Income ■■■ Premium Amount------------------✅
Number of Dependents ■■■ Premium Amount-----------✅
Education Level ■■■ Premium Amount----------------✅
Health Score ■■■ Premium Amount-------------------✅
Location ■■■ Premium Amount-----------------------✅
Policy Type ■■■ Premium Amount--------------------✅
Previous Claims ■■■ Premium Amount----------------✅
Vehicle Age ■■■ Premium Amount--------------------✅
Credit Score ■■■ Premium Amount-------------------✅
Insurance Duration ■■■ Premium Amount-------------✅
Smoking Status ■■■ Premium Amount-----------------✅
Exercise Frequency ■■■ Premium Amount-------------✅
Property Type ■■■ Premium Amount------------------✅
Premium Amount ■■■ Premium Amount-----------------✅
Health Conscious Level ■■■ Premium Amount---------✅
Health Conscious Level1 ■■■ Premium Amount--------✅
Money Per Head ■■■ Premium Amount-----------------✅
Money Handli

In [85]:
wanted_columns = stats_df[stats_df["P-Value"] <= 0.05]["Feature"]

In [86]:
df = df[wanted_columns]

#
---
#

# Spliting Data

In [87]:
train = df.iloc[:1200000, :]
test = df.iloc[1200000:, :]

train.shape, test.shape

((1200000, 50), (800000, 50))

In [88]:
X = train.drop(columns="Premium Amount")
Y = train["Premium Amount"]

In [89]:
from sklearn.model_selection import train_test_split

In [90]:
x_train, x_validate, y_train, y_validate = train_test_split(X, Y, test_size=10000)

In [91]:
x_validate.shape

(10000, 49)

In [92]:
test.drop(columns="Premium Amount", inplace=True)

In [93]:
test.shape

(800000, 49)

##
---
##

# Scaling on `df` 

In [94]:
# fig, axs = plt.subplots(3, 6, figsize=(20, 9))

# for i, ax in zip(x_train.columns, axs.flatten()):
#     sns.kdeplot(x_train[i], ax=ax, color="darkgray", fill=True)

# plt.tight_layout()
# plt.show()

In [95]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

In [96]:
def do_scaling(scaler):
    var_cols = list(map(lambda x : x.replace(" ", "_"), x_train.columns))
    scalers = {}
    cols = x_train.select_dtypes("number").columns
    
    for i in range(len(cols)):
        scalers[f"SCALER_{var_cols[i]}"] = scaler
        
        x_train[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].fit_transform(x_train[[cols[i]]]).flatten()
        x_train.drop(columns=cols[i], inplace=True)

        x_validate[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(x_validate[[cols[i]]]).flatten()
        x_validate.drop(columns=cols[i], inplace=True)

        test[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(test[[cols[i]]]).flatten()
        test.drop(columns=cols[i], inplace=True)
    
    return scalers

In [97]:
scaler_objects = do_scaling(RobustScaler())
scaler_objects

{'SCALER_Annual_Income': RobustScaler(),
 'SCALER_Credit_Score': RobustScaler(),
 'SCALER_Growth1': RobustScaler(),
 'SCALER_Money_Handling_Level1': RobustScaler(),
 'SCALER_Growth': RobustScaler(),
 'SCALER_Money_Per_Head': RobustScaler(),
 'SCALER_Money_Handling_Level': RobustScaler(),
 'SCALER_Credit_by_Score': RobustScaler(),
 'SCALER_Feedback1': RobustScaler(),
 'SCALER_Determinstic': RobustScaler(),
 'SCALER_Previous_Claims_MEDIAN_Premium_Amount': RobustScaler(),
 'SCALER_Previous_Claims_MEAN_Premium_Amount': RobustScaler(),
 'SCALER_Previous_Claims': RobustScaler(),
 'SCALER_Previous_Claims_STD_Premium_Amount': RobustScaler(),
 'SCALER_Previous_Claims_MAX_Premium_Amount': RobustScaler(),
 'SCALER_Feedback3': RobustScaler(),
 'SCALER_Health_Score': RobustScaler(),
 'SCALER_Health_Risk_Score': RobustScaler(),
 'SCALER_Feedback2': RobustScaler(),
 'SCALER_CreditInsurance': RobustScaler(),
 'SCALER_Sin_Year': RobustScaler(),
 'SCALER_Health_Age_Interaction': RobustScaler(),
 'SCALER

In [98]:
x_train.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_Growth1,SCALER_Money_Handling_Level1,SCALER_Growth,SCALER_Money_Per_Head,SCALER_Money_Handling_Level,SCALER_Credit_by_Score,SCALER_Feedback1,SCALER_Determinstic,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Marital_Status_Single,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Number_of_Dependents
503085,2,0.130038,0.413793,-0.000546,0.054205,0.388106,0.132312,0.296045,-0.582114,0.169414,0.213843,28.5,24.535366,1.0,4.862885,-3.0,0.5,-0.495008,0.495008,0.173913,0.254968,-1.0,-0.42653,0.0,0.1,-0.253423,-0.394952,2.449294e-16,-0.2,0.575111,-1.175476,-2.5,0.0,-0.739699,0.0,0.802161,-0.261775,0.833333,0.0,0.0,0.0,0.0,0.0,0.222222,0.0,-0.217391,-2.155882,0.0,0.872245
399928,4,0.130393,-1.030172,0.264131,0.659991,0.080421,0.132601,-0.172122,-0.543089,0.86098,0.052884,-1.0,0.0,0.0,0.0,0.0,0.5,-0.589405,0.589405,0.216504,-0.642295,1.507418,-0.345143,-1.0,-0.9,0.209848,0.272109,-8.330074e-15,1.0,0.575111,0.0,0.0,0.0,-0.557755,0.0,0.802161,-0.78894,-0.5,-0.683947,-0.333333,0.4,0.0,1.0,0.333333,0.0,0.173913,0.0,0.0,0.0
734435,3,0.291972,0.051724,0.425583,0.294089,0.205826,1.042436,0.352634,0.273171,1.142343,0.092843,-1.0,0.0,0.0,0.0,0.0,0.5,0.409616,-0.409616,1.107365,0.097488,0.0,0.828632,0.333333,0.6,1.507988,-0.042354,-4.898587e-16,0.4,-0.429599,-0.818739,-0.75,-1.0,-0.739699,0.0,-0.65174,0.485134,0.5,0.0,0.0,0.0,0.0,0.0,0.222222,1.0,0.434783,-2.155882,-1.0,0.872245


In [99]:
x_validate.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_Growth1,SCALER_Money_Handling_Level1,SCALER_Growth,SCALER_Money_Per_Head,SCALER_Money_Handling_Level,SCALER_Credit_by_Score,SCALER_Feedback1,SCALER_Determinstic,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Marital_Status_Single,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Number_of_Dependents
182701,6,0.037002,-0.142241,-0.179189,0.10889,0.55165,0.056537,0.023066,-0.79187,0.08841,-0.023021,28.5,24.535366,1.0,4.862885,-3.0,0.5,0.26092,-0.26092,-0.055013,0.648669,-0.246291,0.459845,-0.666667,-0.7,0.237708,2.94013,0.0,0.0,0.575111,0.481364,1.75,0.0,0.260301,1.0,0.802161,0.225785,0.333333,-0.683947,-0.333333,0.4,0.0,1.0,0.0,1.0,0.173913,0.82098,0.0,0.0
1039735,6,0.836783,0.931034,0.220391,0.451052,1.793114,1.929896,1.477412,0.936585,2.091043,0.353693,0.0,-1.0,-1.0,-1.0,1.0,-0.5,-0.487455,0.487455,1.831411,1.700787,-0.246291,7.8e-05,-0.666667,-0.6,0.342323,1.906871,-4.898587e-16,0.4,-0.429599,0.481364,1.75,-1.0,-1.753579,-2.75,-0.65174,-0.082339,0.333333,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.73913,-1.066031,-1.0,0.872245
503479,4,0.421682,-0.073276,0.555193,0.465065,0.306498,-0.072094,0.43659,-0.765854,1.368212,0.069125,28.5,24.535366,1.0,4.862885,-3.0,1.5,-0.171398,0.171398,1.004437,0.478815,-0.246291,0.437282,-0.666667,-0.7,0.753012,-0.152291,4.898587e-16,-0.4,-0.655226,0.0,0.0,-1.571429,0.277595,1.25,0.0,-0.126676,-0.5,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.826087,0.159496,1.0,0.872245


In [100]:
test.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_Growth1,SCALER_Money_Handling_Level1,SCALER_Growth,SCALER_Money_Per_Head,SCALER_Money_Handling_Level,SCALER_Credit_by_Score,SCALER_Feedback1,SCALER_Determinstic,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Marital_Status_Single,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Number_of_Dependents
1200000,2,-0.600464,-0.452586,-0.466158,-0.521806,-0.486817,-0.488346,-0.619504,-0.908943,-0.494072,-0.509998,28.5,24.535366,1.0,4.862885,-3.0,0.0,-0.963429,0.963429,-0.620231,-0.861267,0.0,-0.854232,0.333333,0.5,-0.698557,-0.443382,0.0,0.0,-0.655226,-1.175476,-2.5,-1.571429,-1.48387,-2.75,0.0,-0.908487,0.833333,-0.683947,-0.333333,0.4,0.0,1.0,-0.666667,1.0,-0.565217,-0.133238,1.0,0.0
1200001,1,2.778045,-0.974138,1.763417,4.653089,3.47089,2.289022,1.58137,-0.500813,5.471435,3.468024,-1.0,0.0,0.0,0.0,0.0,0.5,-0.639501,0.639501,0.262644,0.069741,1.0,-0.615625,0.666667,0.9,0.144752,-0.517447,4.898587e-16,-0.4,0.575111,-0.945703,0.0,0.0,0.260301,1.0,0.802161,-0.80171,-2.166667,-0.683947,-0.333333,0.4,0.0,1.0,0.0,0.0,-0.434783,0.82098,0.0,0.0
1200002,3,-0.196805,0.952586,-0.296001,-0.27114,0.188723,0.246252,-0.000806,0.952846,-0.115158,-0.229191,-1.0,0.0,0.0,0.0,0.0,0.0,-0.018453,0.018453,0.395741,1.71766,0.0,0.253363,0.333333,0.5,0.056198,-0.203957,4.898587e-16,-0.4,-0.429599,-0.818739,-0.75,-1.0,-1.753579,-2.75,-0.65174,0.527437,0.5,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.26087,-1.066031,-1.0,0.872245


#
---
#

# Joining All Data

In [101]:
train = pd.concat([pd.concat([x_train, y_train], axis=1), pd.concat([x_validate, y_validate], axis=1)]).sort_index()
train.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_Growth1,SCALER_Money_Handling_Level1,SCALER_Growth,SCALER_Money_Per_Head,SCALER_Money_Handling_Level,SCALER_Credit_by_Score,SCALER_Feedback1,SCALER_Determinstic,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Marital_Status_Single,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Number_of_Dependents,Premium Amount
0,4,-0.389132,-0.974138,-0.25499,-0.175976,-0.322796,-0.067034,-0.494442,-1.105691,-0.402072,-0.064163,28.5,24.535366,1.0,4.862885,-3.0,0.0,-0.117824,0.117824,-0.727595,-0.348706,0.0,-0.598296,0.333333,0.7,-0.42386,-0.012354,-1.469576e-15,1.2,0.570401,0.0,0.0,0.571429,-0.739699,0.0,0.726836,-0.497239,-0.5,-0.683947,-0.333333,0.4,0.0,1.0,0.222222,0.0,-0.956522,-2.155882,-0.5,0.0,2869.0
1,2,0.201502,0.413793,0.04706,0.112611,0.471303,-0.044334,0.383426,0.546341,0.231635,0.218843,-1.0,0.0,0.0,0.0,0.0,0.0,-0.515647,0.515647,0.173913,-0.525684,0.0,-0.386547,0.333333,0.5,-0.266832,-0.38372,0.0,0.0,0.0,-1.175476,-2.5,0.0,0.458999,1.5,-0.273164,-0.284248,0.833333,-0.683947,-0.333333,0.4,0.0,1.0,0.333333,0.0,-0.086957,-0.783022,0.5,0.0,1483.0
2,4,0.035582,-0.185345,0.867975,0.120111,-0.26447,-0.134425,0.009334,0.094309,0.695881,0.51934,-1.0,0.0,0.0,0.0,0.0,0.5,1.273265,-1.273265,0.912156,-0.421822,0.0,0.182425,0.333333,0.6,2.630225,0.139051,-7.347881e-16,0.6,0.0,0.0,0.0,0.0,-0.557755,0.0,-0.273164,1.081276,-0.5,-0.683947,-0.333333,0.4,0.0,1.0,0.333333,0.0,-0.782609,0.0,0.5,0.0,567.0


In [102]:
test.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_Growth1,SCALER_Money_Handling_Level1,SCALER_Growth,SCALER_Money_Per_Head,SCALER_Money_Handling_Level,SCALER_Credit_by_Score,SCALER_Feedback1,SCALER_Determinstic,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Marital_Status_Single,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Number_of_Dependents
1200000,2,-0.600464,-0.452586,-0.466158,-0.521806,-0.486817,-0.488346,-0.619504,-0.908943,-0.494072,-0.509998,28.5,24.535366,1.0,4.862885,-3.0,0.0,-0.963429,0.963429,-0.620231,-0.861267,0.0,-0.854232,0.333333,0.5,-0.698557,-0.443382,0.0,0.0,-0.655226,-1.175476,-2.5,-1.571429,-1.48387,-2.75,0.0,-0.908487,0.833333,-0.683947,-0.333333,0.4,0.0,1.0,-0.666667,1.0,-0.565217,-0.133238,1.0,0.0
1200001,1,2.778045,-0.974138,1.763417,4.653089,3.47089,2.289022,1.58137,-0.500813,5.471435,3.468024,-1.0,0.0,0.0,0.0,0.0,0.5,-0.639501,0.639501,0.262644,0.069741,1.0,-0.615625,0.666667,0.9,0.144752,-0.517447,4.898587e-16,-0.4,0.575111,-0.945703,0.0,0.0,0.260301,1.0,0.802161,-0.80171,-2.166667,-0.683947,-0.333333,0.4,0.0,1.0,0.0,0.0,-0.434783,0.82098,0.0,0.0
1200002,3,-0.196805,0.952586,-0.296001,-0.27114,0.188723,0.246252,-0.000806,0.952846,-0.115158,-0.229191,-1.0,0.0,0.0,0.0,0.0,0.0,-0.018453,0.018453,0.395741,1.71766,0.0,0.253363,0.333333,0.5,0.056198,-0.203957,4.898587e-16,-0.4,-0.429599,-0.818739,-0.75,-1.0,-1.753579,-2.75,-0.65174,0.527437,0.5,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.26087,-1.066031,-1.0,0.872245


In [103]:
df = pd.concat([train, test])

#
---
#

# Download the `Model Ready df`

In [104]:
df.to_csv("trainable_df.csv", index=False)