In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 150)

In [2]:
df = pd.read_csv("EDAed_df.csv")

df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])

In [3]:
df.shape

(2000000, 49)

In [4]:
df.isnull().sum()

Age                                 0
Gender                              0
Annual Income                       0
Marital Status                      0
Number of Dependents                0
Education Level                     0
Occupation                          0
Health Score                        0
Location                            0
Policy Type                         0
Previous Claims                     0
Vehicle Age                         0
Credit Score                        0
Insurance Duration                  0
Policy Start Date                   0
Customer Feedback                   0
Smoking Status                      0
Exercise Frequency                  0
Property Type                       0
Premium Amount                 800000
IsNull_Age                          0
IsNull_Annual Income                0
IsNull_Marital Status               0
IsNull_Number of Dependents         0
IsNull_Occupation                   0
IsNull_Health Score                 0
IsNull_Previ

In [5]:
train = df.iloc[:1200000, :]
train.shape

(1200000, 49)

In [6]:
test = df.iloc[1200000:, :]
test.shape

(800000, 49)

In [7]:
test.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4
1200000,28.0,Female,2310.0,Divorced,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,2.0,19.0,551.0,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House,,0,0,1,0,0,0,1,0,1,0,0,2,3430.775431,577.5,1272810.0,4.192377,4620.0,1155.0,82.5,Sunday,275.5,551.0,4.617101,4219.54746,214.423464,4620.0,1102.0,4.0,15.315962
1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,1.0,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment,,0,0,0,0,0,0,1,0,0,0,0,1,1659.291012,63015.5,46883532.0,338.793011,378093.0,42010.333333,4065.516129,Monday,372.0,2976.0,4.330931,4977.873036,414.822753,1008248.0,2976.0,8.0,107.051033
1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,1.0,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo,,0,0,0,0,0,0,1,0,0,0,0,3,9157.302066,17092.0,13998348.0,20.869353,68368.0,4273.0,363.659574,Wednesday,819.0,7371.0,3.782274,19946.357425,1144.662758,68368.0,3276.0,4.0,97.418107


#
---
#

# Adding Dates columns

In [8]:
df["Policy Start Date - Day"] = df["Policy Start Date"].dt.day
df["Policy Start Date - Month"] = df["Policy Start Date"].dt.month
df["Policy Start Date - Year"] = df["Policy Start Date"].dt.year

In [9]:
df["Policy Start Date - Quarter"] = df["Policy Start Date"].dt.year.astype(str) + " Q" + df["Policy Start Date"].dt.quarter.astype(str)

In [10]:
df["Sin_Date"] = np.sin(2 * np.pi * df["Policy Start Date"].astype('int64'))
df["Cos_Date"] = np.cos(2 * np.pi * df["Policy Start Date"].astype('int64'))

In [11]:
df["Sin_Year"] = np.sin(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))
df["Cos_Year"] = np.cos(2 * np.pi * df["Policy Start Date - Year"].astype('int64'))

In [12]:
df["Sin_Month"] = np.sin(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))
df["Cos_Month"] = np.cos(2 * np.pi * df["Policy Start Date - Month"].astype('int64'))

In [13]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494,25602.0,25602.0,1113.130435,Saturday,632.0,1896.0,2.641123,29816.21115,1085.083634,204816.0,5056.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0


#
---
#

In [14]:
data = df.copy()

#
---
#

In [15]:
df.drop(columns="Policy Start Date", inplace=True)

In [16]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494,25602.0,25602.0,1113.130435,Saturday,632.0,1896.0,2.641123,29816.21115,1085.083634,204816.0,5056.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0


In [17]:
df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]] = df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]].astype("O")

In [18]:
def show_nulls(df):
    nulls = []
    nuniques = []
    uniques = []
    types = []
    
    for i in df.columns:
        nulls.append(df[i].isnull().sum())
        nuniques.append(df[i].nunique())
        uniques.append(df[i].unique())
        types.append(df[i].dtype)
    
    
    return pd.DataFrame(
        {
            "Column" : df.columns,
            "Data Type" : types,
            "Nulls" : nulls,
            "No. of Uniques" : nuniques,
            "Uniques" : uniques
        }
    ).sort_values(by="Nulls", ascending=False)

In [19]:
df["Health Conscious Level"] = df["Health Conscious Level"].astype("O")

In [20]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
2,Annual Income,float64,0,97970,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
3,Marital Status,object,0,3,"[Married, Divorced, Single]"
4,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
6,Occupation,object,0,3,"[Self-Employed, Unemployed, Employed]"
7,Health Score,float64,0,933976,"[22.59876067181393, 15.569730989408043, 47.177..."
8,Location,object,0,3,"[Urban, Rural, Suburban]"
1,Gender,object,0,2,"[Female, Male]"


#
---
#

In [21]:
def do_magic(target_column, *columns: list):
    for i in columns:
        df[f"{i}_MIN_{target_column}"] = df.groupby(by=i)[target_column].transform("min")
        df[f"{i}_MEAN_{target_column}"] = df.groupby(by=i)[target_column].transform("mean")
        df[f"{i}_MEDIAN_{target_column}"] = df.groupby(by=i)[target_column].transform("median")
        df[f"{i}_STD_{target_column}"] = df.groupby(by=i)[target_column].transform("std")
        df[f"{i}_MAX_{target_column}"] = df.groupby(by=i)[target_column].transform("max")

In [22]:
do_magic("Premium Amount", "Number of Dependents", "Occupation", "Education Level", "Previous Claims", "Health Conscious Level", "Insurance Duration")

In [23]:
df.isnull().sum()

Age                                                  0
Gender                                               0
Annual Income                                        0
Marital Status                                       0
Number of Dependents                                 0
Education Level                                      0
Occupation                                           0
Health Score                                         0
Location                                             0
Policy Type                                          0
Previous Claims                                      0
Vehicle Age                                          0
Credit Score                                         0
Insurance Duration                                   0
Customer Feedback                                    0
Smoking Status                                       0
Exercise Frequency                                   0
Property Type                                        0
Premium Am

#
---
#

In [24]:
def return_splits(ddf, feature_name, target_name):
    return [ddf[ddf[feature_name] == i][target_name] for i in ddf[feature_name].unique()]

def give_stats_analysis(df, target_column_name):
    ddf = df.copy()
    ddf = ddf.dropna()

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []
    count = 0

    target = ddf[target_column_name]
    for i in ddf.columns:
        features.append(i)
        feature = ddf[i]
        
        if (feature.dtype == "O" and (target.dtype == "float" or target.dtype == "int")) or (target.dtype == "O" and (feature.dtype == "float" or feature.dtype == "int")):
            stat, pval, *_ = kruskal(*return_splits(ddf, feature.name, target.name))
            tests.append("Kruskal-Wallis")
            stats.append(stat)
            pvals.append(pval)
            
        
        elif (feature.dtype == "float" or feature.dtype == "int") and (target.dtype == "float" or target.dtype == "int"):
            stat, pval, *_ = spearmanr(feature, target)
            tests.append("SpearmanR")
            stats.append(stat)
            pvals.append(pval)

        elif feature.dtype == "O" and target.dtype == "O":
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            tests.append("Chi-Square")
            stats.append(stat)
            pvals.append(pval)
        
        else:
            tests.append(np.nan)
            stats.append(np.nan)
            pvals.append(np.nan)
        
        if pval <= 0.025:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(100, "-")+"✅")
    
    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*ddf.shape[1],
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

# H0 :- There is ***No Relationship*** among the given two columns
# H1 :- There is ***Relationship*** among the given two columns

### ***Health-related indicators***
- [x] Health Score
- [x] Smoking Status
- [x] Exercise Frequency
### ***Demographic information***
- [x] Age
- [x] Gender
- [x] Marital Status
- [x] Number of Dependents
- [x] Occupation
### ***Policy details***
- [x] Policy Type
- [x] Policy Start Date
- [x] Insurance Duration
### ***Financial factors***
- [x] Annual Income
- [x] Credit Score.
### ***Premium calculation***
- [x] Premium Amount

In [25]:
stats_result = give_stats_analysis(df.iloc[:1200000, :], "Premium Amount")
stats_result

Age ■■■ Premium Amount------------------------------------------------------------------------------✅
Gender ■■■ Premium Amount---------------------------------------------------------------------------✅
Annual Income ■■■ Premium Amount--------------------------------------------------------------------✅
Marital Status ■■■ Premium Amount-------------------------------------------------------------------✅
Number of Dependents ■■■ Premium Amount-------------------------------------------------------------✅
Education Level ■■■ Premium Amount------------------------------------------------------------------✅
Occupation ■■■ Premium Amount-----------------------------------------------------------------------✅
Health Score ■■■ Premium Amount---------------------------------------------------------------------✅
Location ■■■ Premium Amount-------------------------------------------------------------------------✅
Policy Type ■■■ Premium Amount----------------------------------------------------

Unnamed: 0,Feature,Target,Statistic Test,Test Statistic,P-Value,Verdict
2,Annual Income,Premium Amount,SpearmanR,-0.061831,0.0,There is Relationship
12,Credit Score,Premium Amount,SpearmanR,-0.036687,0.0,There is Relationship
20,IsNull_Annual Income,Premium Amount,SpearmanR,-0.065399,0.0,There is Relationship
18,Premium Amount,Premium Amount,SpearmanR,1.0,0.0,There is Relationship
35,Growth,Premium Amount,SpearmanR,-0.055,0.0,There is Relationship
34,Money Handling Level1,Premium Amount,SpearmanR,-0.048668,0.0,There is Relationship
33,Money Handling Level,Premium Amount,SpearmanR,-0.072097,0.0,There is Relationship
32,Money Per Head,Premium Amount,SpearmanR,-0.053422,0.0,There is Relationship
44,Feedback1,Premium Amount,SpearmanR,-0.053714,0.0,There is Relationship
39,Credit by Score,Premium Amount,SpearmanR,-0.05485,0.0,There is Relationship


# <ins>Key Premium as per Reseach Papers and as per dataset.</ins>
### `Strikeoff features are said by research and dataset too. But unstrike ones are not impactful to determine premium amount as per dataset but as per research it should be. We need to find why like so in these features`

- ### ~~Age~~
- ### Gender
- ### ~~Health Score~~
- ### Smoking Status
- ### Exercise Frequency
- ### ~~Occupation~~
- ### Policy Type
- ### ~~Previous Claims~~
- ### ~~Annual Income~~
- ### Insurance Duration
- ### ~~Credit Score~~

#
---
#

In [26]:
cols = ["Gender", "Smoking Status", "Exercise Frequency", "Policy Type", "Insurance Duration"]

In [27]:
# fig, axs = plt.subplots(2, 3, figsize=(20, 8))
# for col, ax in zip(cols, axs.flatten()):
#     sns.boxplot(y=df["Premium Amount"], x=df[col], color="mediumblue", ax=ax)

In [28]:
useless_columns = stats_result[stats_result["P-Value"] >= 0.05]["Feature"]
useless_columns

69          Education Level_MEAN_Premium Amount
70        Education Level_MEDIAN_Premium Amount
8                                      Location
62      Number of Dependents_MAX_Premium Amount
5                               Education Level
4                          Number of Dependents
53                                     Cos_Date
11                                  Vehicle Age
81    Health Conscious Level_STD_Premium Amount
19                                   IsNull_Age
9                                   Policy Type
72           Education Level_MAX_Premium Amount
17                                Property Type
3                                Marital Status
16                           Exercise Frequency
15                               Smoking Status
38                                     Day_Name
26                           IsNull_Vehicle Age
71           Education Level_STD_Premium Amount
28                    IsNull_Insurance Duration
48                      Policy Start Dat

In [29]:
meaningless_df = df[useless_columns]
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Location,Number of Dependents_MAX_Premium Amount,Education Level,Number of Dependents,Cos_Date,Vehicle Age,Health Conscious Level_STD_Premium Amount,IsNull_Age,Policy Type,Education Level_MAX_Premium Amount,Property Type,Marital Status,Exercise Frequency,Smoking Status,Day_Name,IsNull_Vehicle Age,Education Level_STD_Premium Amount,IsNull_Insurance Duration,Policy Start Date - Day,Gender,Sin_Date,Insurance Duration
0,1102.698438,873.0,Urban,4994.0,Bachelor's,1.0,-0.220691,17.0,864.569091,0,Premium,4988.0,House,Married,Weekly,No,Saturday,0,864.866296,0,23,Female,-0.975344,5.0
1,1102.113989,871.0,Rural,4997.0,Master's,3.0,0.050489,12.0,865.103831,0,Comprehensive,4997.0,House,Divorced,Monthly,Yes,Monday,0,866.235322,0,12,Female,-0.998725,2.0
2,1104.78749,876.0,Suburban,4997.0,High School,3.0,0.101192,14.0,864.569091,0,Premium,4999.0,House,Divorced,Weekly,Yes,Saturday,0,865.951488,0,30,Male,-0.994867,3.0


In [30]:
# df = df[stats_result[stats_result["P-Value"] < 0.05]["Feature"]]
# df.head(3)

# Compressing Meaningless DF's information in a component using PCA

In [31]:
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Location,Number of Dependents_MAX_Premium Amount,Education Level,Number of Dependents,Cos_Date,Vehicle Age,Health Conscious Level_STD_Premium Amount,IsNull_Age,Policy Type,Education Level_MAX_Premium Amount,Property Type,Marital Status,Exercise Frequency,Smoking Status,Day_Name,IsNull_Vehicle Age,Education Level_STD_Premium Amount,IsNull_Insurance Duration,Policy Start Date - Day,Gender,Sin_Date,Insurance Duration
0,1102.698438,873.0,Urban,4994.0,Bachelor's,1.0,-0.220691,17.0,864.569091,0,Premium,4988.0,House,Married,Weekly,No,Saturday,0,864.866296,0,23,Female,-0.975344,5.0
1,1102.113989,871.0,Rural,4997.0,Master's,3.0,0.050489,12.0,865.103831,0,Comprehensive,4997.0,House,Divorced,Monthly,Yes,Monday,0,866.235322,0,12,Female,-0.998725,2.0
2,1104.78749,876.0,Suburban,4997.0,High School,3.0,0.101192,14.0,864.569091,0,Premium,4999.0,House,Divorced,Weekly,Yes,Saturday,0,865.951488,0,30,Male,-0.994867,3.0


## Encoding Columns

In [32]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

###
---
###

In [33]:
meaningless_df["Location"].unique()

array(['Urban', 'Rural', 'Suburban'], dtype=object)

In [34]:
a = OrdinalEncoder(categories=[['Rural', 'Suburban', 'Urban']])

b = pd.DataFrame({"ENCODED_Location" : a.fit_transform(meaningless_df[["Location"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Location", inplace=True)

###
---
###

In [35]:
meaningless_df["Education Level"].unique()

array(["Bachelor's", "Master's", 'High School', 'PhD'], dtype=object)

In [36]:
a = OrdinalEncoder(categories=[['High School', "Bachelor's", "Master's", 'PhD']])

b = pd.DataFrame({"ENCODED_Education Level" : a.fit_transform(meaningless_df[["Education Level"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Education Level", inplace=True)

###
---
###

In [37]:
meaningless_df["Policy Type"].unique()

array(['Premium', 'Comprehensive', 'Basic'], dtype=object)

In [38]:
a = OrdinalEncoder(categories=[['Basic', 'Comprehensive', 'Premium']])

b = pd.DataFrame({"ENCODED_Policy Type" : a.fit_transform(meaningless_df[["Policy Type"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Policy Type", inplace=True)

###
---
###

In [39]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Property Type"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Property Type", inplace=True)

###
---
###

In [40]:
meaningless_df["Exercise Frequency"].unique()

array(['Weekly', 'Monthly', 'Daily', 'Rarely'], dtype=object)

In [41]:
a = OrdinalEncoder(categories=[['Rarely', 'Monthly', 'Weekly', 'Daily']])

b = pd.DataFrame({"ENCODED_Exercise Frequency" : a.fit_transform(meaningless_df[["Exercise Frequency"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Exercise Frequency", inplace=True)

###
---
###

In [42]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Smoking Status"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Smoking Status", inplace=True)

###
---
###

In [43]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Gender"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Gender", inplace=True)

###
---
###

In [44]:
meaningless_df["Policy Start Date - Day"] = meaningless_df["Policy Start Date - Day"].astype(int)

#
---
#

In [45]:
meaningless_df.head(3)

Unnamed: 0,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Number of Dependents_MAX_Premium Amount,Number of Dependents,Cos_Date,Vehicle Age,Health Conscious Level_STD_Premium Amount,IsNull_Age,Education Level_MAX_Premium Amount,Marital Status,Day_Name,IsNull_Vehicle Age,Education Level_STD_Premium Amount,IsNull_Insurance Duration,Policy Start Date - Day,Sin_Date,Insurance Duration,ENCODED_Location,ENCODED_Education Level,ENCODED_Policy Type,Property Type_Condo,Property Type_House,ENCODED_Exercise Frequency,Smoking Status_Yes,Gender_Male
0,1102.698438,873.0,4994.0,1.0,-0.220691,17.0,864.569091,0,4988.0,Married,Saturday,0,864.866296,0,23,-0.975344,5.0,2.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0
1,1102.113989,871.0,4997.0,3.0,0.050489,12.0,865.103831,0,4997.0,Divorced,Monday,0,866.235322,0,12,-0.998725,2.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0
2,1104.78749,876.0,4997.0,3.0,0.101192,14.0,864.569091,0,4999.0,Divorced,Saturday,0,865.951488,0,30,-0.994867,3.0,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0


In [46]:
meaningless_df.dtypes

Education Level_MEAN_Premium Amount          float64
Education Level_MEDIAN_Premium Amount        float64
Number of Dependents_MAX_Premium Amount      float64
Number of Dependents                         float64
Cos_Date                                     float64
Vehicle Age                                  float64
Health Conscious Level_STD_Premium Amount    float64
IsNull_Age                                     int64
Education Level_MAX_Premium Amount           float64
Marital Status                                object
Day_Name                                      object
IsNull_Vehicle Age                             int64
Education Level_STD_Premium Amount           float64
IsNull_Insurance Duration                      int64
Policy Start Date - Day                        int64
Sin_Date                                     float64
Insurance Duration                           float64
ENCODED_Location                             float64
ENCODED_Education Level                      f

###
---
###

# Doing PCA on this `meaningless_df`

In [47]:
# from sklearn.decomposition import PCA

In [48]:
# pca = PCA(n_components=3)
# pca_df = pd.DataFrame(pca.fit_transform(meaningless_df), columns=['PC1_Meaningless_df', "PC2_Meaningless_df", "PC3_Meaningless_df"])
# pca_df

In [49]:
# pca.explained_variance_ratio_

###
---
###

# Combining 2 PCs of Meaningless_columns to the df

In [50]:
# df = pd.concat([df, pca_df.iloc[:, :2]], axis=1)

In [51]:
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,1860.0,3.870062,8406.73897,429.376453,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0,20.0,1104.678891,874.0,865.235996,4994.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1151.583106,907.0,898.40295,4988.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1100.812035,872.0,859.965806,4996.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Unemployed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0,20.0,1104.006551,875.0,864.955881,4997.0,20.0,1103.361209,872.0,867.02349,4997.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1098.15965,862.0,865.103831,4999.0,20.0,1106.883166,878.0,863.675409,4997.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,Good,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.0,16180464.0,40.509494,25602.0,25602.0,1113.130435,Saturday,632.0,1896.0,2.641123,29816.21115,1085.083634,204816.0,5056.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0,20.0,1104.006551,875.0,864.955881,4997.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1104.78749,876.0,865.951488,4999.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1101.733099,872.0,865.787949,4997.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,0,0,0,0,1,0,0,0,0,0,0,3,7350.432875,70927.5,52060785.0,386.525886,283710.0,70927.5,6755.0,Wednesday,367.0,367.0,4.453093,4014.298906,229.701027,283710.0,734.0,2.0,21.876288,12,6,2024,2024 Q2,0.111402,0.993775,1.585375e-14,1.0,-1.469576e-15,1.0,20.0,1108.443461,876.0,866.852628,4997.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1099.009424,867.0,865.547081,4997.0,20.0,1097.042977,861.0,865.431191,4988.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0,0,0,0,0,0,0,0,0,0,0,0,3,6846.367459,39651.0,23711298.0,66.30602,79302.0,19825.5,1888.142857,Wednesday,598.0,2392.0,3.981195,12184.903989,427.897966,79302.0,1196.0,0.0,40.752187,1,12,2021,2021 Q4,-0.996246,0.086565,-1.468363e-13,1.0,-2.939152e-15,1.0,20.0,1104.678891,874.0,865.235996,4994.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1082.452746,855.0,851.671355,4999.0,20.0,1099.009424,867.0,865.547081,4997.0,20.0,1104.723079,872.0,866.377508,4991.0


###
---
###

# Encoding of column in `df`

In [52]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
76,Previous Claims_STD_Premium Amount,float64,9,9,"[898.4029501785653, 853.1562175615868, 851.671..."
2,Annual Income,float64,0,97970,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
4,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
6,Occupation,object,0,3,"[Self-Employed, Unemployed, Employed]"
1,Gender,object,0,2,"[Female, Male]"
7,Health Score,float64,0,933976,"[22.59876067181393, 15.569730989408043, 47.177..."
8,Location,object,0,3,"[Urban, Rural, Suburban]"


### Policy Start Date - Year	

In [53]:
df["Policy Start Date - Year"].unique()

array([2023, 2024, 2021, 2022, 2020, 2019], dtype=object)

In [54]:
a = OrdinalEncoder(categories=[[2019, 2020, 2021, 2022, 2023, 2024]])

b = pd.DataFrame({"ENCODED_Policy Start Date - Year" : a.fit_transform(df[["Policy Start Date - Year"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Year", inplace=True)

### Policy Start Date - Quarter

In [55]:
sorted(list(df["Policy Start Date - Quarter"].unique()))

['2019 Q3',
 '2019 Q4',
 '2020 Q1',
 '2020 Q2',
 '2020 Q3',
 '2020 Q4',
 '2021 Q1',
 '2021 Q2',
 '2021 Q3',
 '2021 Q4',
 '2022 Q1',
 '2022 Q2',
 '2022 Q3',
 '2022 Q4',
 '2023 Q1',
 '2023 Q2',
 '2023 Q3',
 '2023 Q4',
 '2024 Q1',
 '2024 Q2',
 '2024 Q3']

In [56]:
a = OrdinalEncoder(categories=[['2019 Q3', '2019 Q4', '2020 Q1', '2020 Q2', '2020 Q3', '2020 Q4', '2021 Q1', '2021 Q2', '2021 Q3',
                 '2021 Q4', '2022 Q1', '2022 Q2', '2022 Q3', '2022 Q4', '2023 Q1', '2023 Q2', '2023 Q3', '2023 Q4', '2024 Q1', '2024 Q2', '2024 Q3']])

b = pd.DataFrame({"ENCODED_Policy Start Date - Quarter" : a.fit_transform(df[["Policy Start Date - Quarter"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Quarter", inplace=True)

### Policy Start Date - Month

In [57]:
df["Policy Start Date - Month"] = df["Policy Start Date - Month"].astype(int)

### Customer Feedback

In [58]:
df["Customer Feedback"].unique()

array(['Poor', 'Average', 'Good'], dtype=object)

In [59]:
a = OrdinalEncoder(categories=[['Poor', 'Average', 'Good']])

b = pd.DataFrame({"ENCODED_Customer Feedback" : a.fit_transform(df[["Customer Feedback"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Customer Feedback", inplace=True)

### Occupation

In [60]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Occupation"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Occupation", inplace=True)

### Marital Status

In [61]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Marital Status"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Marital Status", inplace=True)

In [62]:
df

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Property Type,Premium Amount,IsNull_Age,IsNull_Annual Income,IsNull_Marital Status,IsNull_Number of Dependents,IsNull_Occupation,IsNull_Health Score,IsNull_Previous Claims,IsNull_Vehicle Age,IsNull_Credit Score,IsNull_Insurance Duration,IsNull_Customer Feedback,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,CreditInsurance,Health_Risk_Score,Credit_Health_Score,Health_Age_Interaction,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Sin_Date,Cos_Date,Sin_Year,Cos_Year,Sin_Month,Cos_Month,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Health Conscious Level_MIN_Premium Amount,Health Conscious Level_MEAN_Premium Amount,Health Conscious Level_MEDIAN_Premium Amount,Health Conscious Level_STD_Premium Amount,Health Conscious Level_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,ENCODED_Policy Start Date - Year,ENCODED_Policy Start Date - Quarter,ENCODED_Customer Feedback,ENCODED_Occupation_Self-Employed,ENCODED_Occupation_Unemployed,ENCODED_Marital Status_Married,ENCODED_Marital Status_Single
0,19.0,Female,10049.0,1.0,Bachelor's,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,No,Weekly,House,2869.0,0,0,0,0,0,0,0,0,0,0,0,4,13740.046488,10049.000000,3738228.0,27.013441,20098.0,5024.500000,528.894737,Saturday,186.0,1860.0,3.870062,8406.738970,429.376453,20098.0,744.0,4.0,45.197521,23,12,-0.975344,-0.220691,-6.447061e-13,1.0,-2.939152e-15,1.0,20.0,1104.678891,874.0,865.235996,4994.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1151.583106,907.0,898.402950,4988.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1100.812035,872.0,859.965806,4996.0,4.0,17.0,0.0,1.0,0.0,1.0,0.0
1,39.0,Female,31678.0,3.0,Master's,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Yes,Monthly,House,1483.0,0,0,0,0,1,0,0,0,0,0,0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.256410,Monday,694.0,1388.0,4.221513,10805.393307,607.219509,126712.0,2776.0,4.0,62.278924,12,6,-0.998725,0.050489,-6.447061e-13,1.0,-1.469576e-15,1.0,20.0,1104.006551,875.0,864.955881,4997.0,20.0,1103.361209,872.0,867.023490,4997.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1098.159650,862.0,865.103831,4999.0,20.0,1106.883166,878.0,863.675409,4997.0,4.0,15.0,1.0,0.0,1.0,0.0,0.0
2,23.0,Male,25602.0,3.0,High School,47.177549,Suburban,Premium,1.0,14.0,632.0,3.0,Yes,Weekly,House,567.0,0,0,0,0,0,0,0,0,1,0,0,4,17361.338138,8534.000000,16180464.0,40.509494,25602.0,25602.000000,1113.130435,Saturday,632.0,1896.0,2.641123,29816.211150,1085.083634,204816.0,5056.0,8.0,377.420394,30,9,-0.994867,0.101192,-6.447061e-13,1.0,-2.204364e-15,1.0,20.0,1104.006551,875.0,864.955881,4997.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1104.787490,876.0,865.951488,4999.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1101.733099,872.0,865.787949,4997.0,4.0,16.0,2.0,1.0,0.0,0.0,0.0
3,21.0,Male,141855.0,2.0,Bachelor's,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Yes,Daily,Apartment,765.0,0,0,0,0,1,0,0,0,0,0,0,3,7350.432875,70927.500000,52060785.0,386.525886,283710.0,70927.500000,6755.000000,Wednesday,367.0,367.0,4.453093,4014.298906,229.701027,283710.0,734.0,2.0,21.876288,12,6,0.111402,0.993775,1.585375e-14,1.0,-1.469576e-15,1.0,20.0,1108.443461,876.0,866.852628,4997.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1099.009424,867.0,865.547081,4997.0,20.0,1097.042977,861.0,865.431191,4988.0,5.0,19.0,0.0,1.0,0.0,1.0,0.0
4,21.0,Male,39651.0,1.0,Bachelor's,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Yes,Weekly,House,2022.0,0,0,0,0,0,0,0,0,0,0,0,3,6846.367459,39651.000000,23711298.0,66.306020,79302.0,19825.500000,1888.142857,Wednesday,598.0,2392.0,3.981195,12184.903989,427.897966,79302.0,1196.0,0.0,40.752187,1,12,-0.996246,0.086565,-1.468363e-13,1.0,-2.939152e-15,1.0,20.0,1104.678891,874.0,865.235996,4994.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1082.452746,855.0,851.671355,4999.0,20.0,1099.009424,867.0,865.547081,4997.0,20.0,1104.723079,872.0,866.377508,4991.0,2.0,9.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,50.0,Female,38782.0,1.0,Bachelor's,14.498639,Rural,Premium,0.0,8.0,309.0,2.0,Yes,Daily,Condo,,0,0,0,0,1,0,1,0,0,0,0,4,23197.822227,38782.000000,11983638.0,125.508091,77564.0,19391.000000,775.640000,Friday,309.0,618.0,4.275068,4480.079418,724.931945,155128.0,1236.0,0.0,57.994556,9,7,0.645845,-0.763468,-1.468363e-13,1.0,-1.714506e-15,1.0,20.0,1104.678891,874.0,865.235996,4994.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1082.452746,855.0,851.671355,4999.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1106.883166,878.0,863.675409,4997.0,2.0,8.0,1.0,1.0,0.0,1.0,0.0
1999996,35.0,Female,73462.0,0.0,Master's,8.145748,Rural,Basic,2.0,0.0,462.0,2.0,No,Daily,Apartment,,1,0,0,0,1,0,0,0,1,0,0,5,18246.475706,73462.000000,33939444.0,159.008658,220386.0,24487.333333,2098.914286,Tuesday,231.0,924.0,4.592713,3763.335614,285.101183,587696.0,3696.0,16.0,65.165985,28,3,0.681828,0.731513,-6.447061e-13,1.0,-7.347881e-16,1.0,20.0,1097.649985,867.0,862.779627,4999.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1151.583106,907.0,898.402950,4988.0,20.0,1102.591279,872.0,864.171931,4994.0,20.0,1106.883166,878.0,863.675409,4997.0,4.0,14.0,2.0,1.0,0.0,0.0,1.0
1999997,26.0,Female,35178.0,0.0,Master's,6.636583,Urban,Comprehensive,1.0,10.0,698.0,6.0,No,Monthly,Apartment,,0,0,0,0,0,0,1,0,1,0,0,2,2760.818699,35178.000000,24554244.0,50.398281,105534.0,11726.000000,1353.000000,Monday,698.0,4188.0,4.668171,4632.335221,172.551169,70356.0,1396.0,2.0,13.273167,30,9,-0.709843,0.704360,3.510335e-13,1.0,-2.204364e-15,1.0,20.0,1097.649985,867.0,862.779627,4999.0,20.0,1105.160880,876.0,862.882837,4994.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.632645,855.0,853.156218,4997.0,20.0,1098.159650,862.0,865.103831,4999.0,20.0,1104.558441,873.0,866.550287,4999.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1999998,34.0,Female,45661.0,3.0,Master's,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,No,Weekly,Condo,,0,0,0,0,1,0,0,0,0,0,0,4,17339.725601,15220.333333,21323687.0,97.775161,136983.0,15220.333333,1342.970588,Monday,233.5,3269.0,4.203138,7442.694720,541.866425,182644.0,1868.0,8.0,63.748991,9,5,0.561062,-0.827774,-1.305266e-12,1.0,-1.224647e-15,1.0,20.0,1104.006551,875.0,864.955881,4997.0,20.0,1100.430574,870.0,865.079864,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1151.583106,907.0,898.402950,4988.0,20.0,1102.677039,871.0,864.569091,4991.0,20.0,1105.964863,877.0,866.225414,4988.0,3.0,11.0,1.0,1.0,0.0,0.0,1.0


#
---
#

In [63]:
def return_splits(ddf, feature_name, target_name):
    return [ddf[ddf[feature_name] == i][target_name] for i in ddf[feature_name].unique()]

def give_stats_analysis(df, target_column_name):
    ddf = df.copy()
    ddf = ddf.dropna()

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []
    count = 0

    target = ddf[target_column_name]
    for i in ddf.columns:
        features.append(i)
        feature = ddf[i]
        
        if (feature.dtype == "O" and (target.dtype == "float" or target.dtype == "int")) or (target.dtype == "O" and (feature.dtype == "float" or feature.dtype == "int")):
            stat, pval, *_ = kruskal(*return_splits(ddf, feature.name, target.name))
            tests.append("Kruskal-Wallis")
            stats.append(stat)
            pvals.append(pval)
            
        
        elif (feature.dtype == "float" or feature.dtype == "int") and (target.dtype == "float" or target.dtype == "int"):
            stat, pval, *_ = spearmanr(feature, target)
            tests.append("SpearmanR")
            stats.append(stat)
            pvals.append(pval)

        elif feature.dtype == "O" and target.dtype == "O":
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            tests.append("Chi-Square")
            stats.append(stat)
            pvals.append(pval)
        
        else:
            tests.append(np.nan)
            stats.append(np.nan)
            pvals.append(np.nan)
        
        if pval <= 0.05:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(50, "-")+"✅")
    
    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*ddf.shape[1],
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

In [64]:
stats_df = give_stats_analysis(df, "Premium Amount")

Age ■■■ Premium Amount----------------------------✅
Gender ■■■ Premium Amount-------------------------✅
Annual Income ■■■ Premium Amount------------------✅
Number of Dependents ■■■ Premium Amount-----------✅
Education Level ■■■ Premium Amount----------------✅
Health Score ■■■ Premium Amount-------------------✅
Location ■■■ Premium Amount-----------------------✅
Policy Type ■■■ Premium Amount--------------------✅
Previous Claims ■■■ Premium Amount----------------✅
Vehicle Age ■■■ Premium Amount--------------------✅
Credit Score ■■■ Premium Amount-------------------✅
Insurance Duration ■■■ Premium Amount-------------✅
Smoking Status ■■■ Premium Amount-----------------✅
Exercise Frequency ■■■ Premium Amount-------------✅
Property Type ■■■ Premium Amount------------------✅
Premium Amount ■■■ Premium Amount-----------------✅
IsNull_Age ■■■ Premium Amount---------------------✅
IsNull_Annual Income ■■■ Premium Amount-----------✅
IsNull_Marital Status ■■■ Premium Amount----------✅
IsNull_Numbe

In [65]:
wanted_columns = stats_df[stats_df["P-Value"] <= 0.05]["Feature"]

In [66]:
df = df[wanted_columns]

#
---
#

# Spliting Data

In [67]:
train = df.iloc[:1200000, :]
test = df.iloc[1200000:, :]

train.shape, test.shape

((1200000, 56), (800000, 56))

In [68]:
X = train.drop(columns="Premium Amount")
Y = train["Premium Amount"]

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
x_train, x_validate, y_train, y_validate = train_test_split(X, Y, test_size=10000)

In [71]:
x_validate.shape

(10000, 55)

In [72]:
test.drop(columns="Premium Amount", inplace=True)

In [73]:
test.shape

(800000, 55)

##
---
##

# Scaling on `df` 

In [74]:
# fig, axs = plt.subplots(3, 6, figsize=(20, 9))

# for i, ax in zip(x_train.columns, axs.flatten()):
#     sns.kdeplot(x_train[i], ax=ax, color="darkgray", fill=True)

# plt.tight_layout()
# plt.show()

In [75]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

In [76]:
def do_scaling(scaler):
    var_cols = list(map(lambda x : x.replace(" ", "_"), x_train.columns))
    scalers = {}
    cols = x_train.select_dtypes("number").columns
    
    for i in range(len(cols)):
        scalers[f"SCALER_{var_cols[i]}"] = scaler
        
        x_train[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].fit_transform(x_train[[cols[i]]]).flatten()
        x_train.drop(columns=cols[i], inplace=True)

        x_validate[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(x_validate[[cols[i]]]).flatten()
        x_validate.drop(columns=cols[i], inplace=True)

        test[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(test[[cols[i]]]).flatten()
        test.drop(columns=cols[i], inplace=True)
    
    return scalers

In [77]:
scaler_objects = do_scaling(RobustScaler())
scaler_objects

{'SCALER_Annual_Income': RobustScaler(),
 'SCALER_Credit_Score': RobustScaler(),
 'SCALER_IsNull_Annual_Income': RobustScaler(),
 'SCALER_Money_Handling_Level': RobustScaler(),
 'SCALER_Money_Handling_Level1': RobustScaler(),
 'SCALER_Money_Per_Head': RobustScaler(),
 'SCALER_Growth': RobustScaler(),
 'SCALER_Credit_by_Score': RobustScaler(),
 'SCALER_Determinstic': RobustScaler(),
 'SCALER_Growth1': RobustScaler(),
 'SCALER_Feedback1': RobustScaler(),
 'SCALER_Previous_Claims_MEDIAN_Premium_Amount': RobustScaler(),
 'SCALER_IsNull_Health_Score': RobustScaler(),
 'SCALER_Previous_Claims_MEAN_Premium_Amount': RobustScaler(),
 'SCALER_Previous_Claims': RobustScaler(),
 'SCALER_Previous_Claims_STD_Premium_Amount': RobustScaler(),
 'SCALER_IsNull_Customer_Feedback': RobustScaler(),
 'SCALER_Previous_Claims_MAX_Premium_Amount': RobustScaler(),
 'SCALER_Feedback3': RobustScaler(),
 'SCALER_IsNull_Previous_Claims': RobustScaler(),
 'SCALER_IsNull_Marital_Status': RobustScaler(),
 'SCALER_Heal

In [78]:
x_train.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount
573536,5,-0.419399,-0.39485,0.0,-0.458139,-0.316579,-0.383179,-0.249424,-0.045307,-0.416154,-0.367209,-0.417294,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,-0.5,0.0,0.0,0.912276,-0.912276,-0.604775,0.666168,-1.0,0.0,1.36557,0.0,0.1,-0.088336,1.0,0.0,0.803688,0.0,0.0,0.0,0.2,0.0,0.125,0.0,-1.753579,-2.75,0.0,0.59327,-0.333333,0.4,-0.619544,0.0,-0.666667,1.0,0.434783,-1.064823,0.0
201688,5,0.668703,0.613734,0.0,1.064318,0.420001,1.657081,1.528954,0.71521,0.179995,0.134682,0.641878,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.332284,-0.332284,0.259947,0.623551,-1.0,0.0,1.181338,0.0,0.0,0.284678,0.0,0.0,0.700256,7.347881e-16,-0.6,0.0,0.2,-0.904347,-0.875,-4.197114,0.0,0.25,0.0,0.76975,0.0,0.0,0.0,0.0,0.555556,0.0,0.956522,0.282128,0.469401
94858,5,1.424482,-0.682403,0.0,0.941713,2.074093,0.618032,1.08125,-0.975728,0.617392,1.554785,3.130469,52.0,0.0,57.590036,1.0,30.472002,0.0,-4.5,1.5,0.0,0.0,-0.390487,0.390487,0.505747,0.275888,-0.246291,1.0,0.225692,-0.666667,-0.8,0.470885,0.0,0.0,0.163735,7.347881e-16,-0.6,0.0,0.2,0.0,0.125,0.0,0.260301,1.0,0.0,-0.548877,-0.333333,0.4,-0.619544,0.0,0.0,1.0,0.956522,0.822189,0.0


In [79]:
x_validate.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount
249167,5,-0.505042,-1.141631,0.0,-0.583793,-0.318576,-0.254917,-0.348775,-0.608414,-0.477381,-0.424184,-0.384381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.435929,-0.435929,-0.463307,-0.418692,-0.246291,0.0,0.855713,-0.666667,-0.7,0.351714,0.0,0.0,1.621382,4.898587e-16,-0.4,0.0,0.2,0.095653,0.0,0.540228,-0.739699,0.0,0.0,-0.286039,-0.333333,0.4,-0.619544,0.0,0.222222,1.0,0.434783,-2.154674,0.0
1046989,3,0.29774,0.540773,0.0,0.551197,0.15654,1.052793,0.582507,0.660194,0.224716,0.109872,0.317528,0.0,0.0,-1.0,-1.0,-1.0,0.0,1.0,-0.5,0.0,0.0,-0.847052,0.847052,0.229885,1.128224,-1.0,0.0,-0.610559,0.0,0.1,-0.478096,0.0,1.0,-0.305756,4.898587e-16,-0.4,-0.976617,-0.8,-0.904347,-0.875,-4.197114,0.260301,1.0,0.5,-0.616866,-0.333333,0.4,-0.619544,0.0,0.0,1.0,0.086957,0.822189,0.0
139600,5,-0.475802,0.04721,0.0,-0.474068,-0.41442,-0.413806,-0.314855,0.288026,-0.450928,-0.404732,-0.358815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.293775,0.293775,0.026525,0.784299,1.0,0.0,0.028193,0.666667,0.8,-0.120245,0.0,0.0,0.692204,7.347881e-16,-0.6,0.0,0.2,0.0,0.125,0.0,0.260301,1.0,0.0,-0.187369,0.666667,-0.6,0.380456,0.0,0.0,0.0,0.347826,0.822189,-0.530599


In [80]:
test.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount
1200000,2,-0.602845,-0.2103,0.0,-0.617124,-0.524925,-0.491358,-0.486643,-0.797735,-0.510231,-0.46825,-0.497491,52.0,0.0,57.590036,1.0,30.472002,0.0,-4.5,0.0,1.0,1.0,-0.967877,0.967877,-0.566755,-0.837009,0.0,1.0,-0.855997,0.333333,0.5,-0.696361,0.0,0.0,-0.443551,0.0,0.0,-1.208313,-1.8,-1.073045,-1.25,-0.459772,-1.48387,-2.75,0.833333,-0.872912,-0.333333,0.4,-0.619544,0.0,-0.666667,1.0,-0.565217,-0.13203,0.0
1200001,1,2.778065,-0.978541,0.0,1.573065,4.61255,2.288055,3.459856,-0.485437,3.458248,1.759921,5.497467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,-0.644056,0.644056,0.261715,0.069533,1.0,0.0,-0.618038,0.666667,0.9,0.142877,0.0,0.0,-0.51736,4.898587e-16,-0.4,-1.12816,0.0,0.631238,0.25,3.658057,0.260301,1.0,-2.166667,-0.805739,-0.333333,0.4,-0.619544,0.0,0.0,1.0,-0.434783,0.822189,0.0
1200002,3,-0.198899,0.939914,0.0,-0.006054,-0.268866,0.243781,0.186984,0.961165,-0.230098,-0.298201,-0.116706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.023212,0.023212,0.394341,1.712523,0.0,0.0,0.248595,0.333333,0.5,0.05475,0.0,0.0,-0.204952,4.898587e-16,-0.4,-0.976617,-0.8,-0.904347,-0.875,-4.197114,-1.753579,-2.75,0.5,0.520175,0.0,0.0,0.0,0.0,-0.666667,0.0,0.26087,-1.064823,0.469401


#
---
#

# Joining All Data

In [81]:
train = pd.concat([pd.concat([x_train, y_train], axis=1), pd.concat([x_validate, y_validate], axis=1)]).sort_index()
train.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,Premium Amount
0,4,-0.391362,-0.978541,0.0,-0.498737,-0.174529,-0.069736,-0.323087,-1.087379,-0.065465,-0.257215,-0.405037,52.0,0.0,57.590036,1.0,30.472002,0.0,-4.5,0.0,0.0,0.0,-0.12255,0.12255,-0.725022,-0.347664,0.0,0.0,-0.600756,0.333333,0.7,-0.42299,0.0,0.0,-0.014009,-1.469576e-15,1.2,0.023383,0.0,0.095653,0.0,0.540228,-0.739699,0.0,-0.5,-0.502009,-0.333333,0.4,-0.619544,0.0,0.222222,1.0,-0.956522,-2.154674,0.0,2869.0
1,2,0.199691,0.403433,0.0,0.377435,0.111549,-0.047019,0.468764,0.556634,0.216862,0.044644,0.231799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.520242,0.520242,0.173298,-0.524112,0.0,0.0,-0.38958,0.333333,0.5,-0.266721,0.0,1.0,-0.384095,0.0,0.0,-1.208313,-1.8,0.0,0.125,0.0,0.458999,1.5,0.833333,-0.289535,0.0,0.0,0.0,0.0,0.333333,0.0,-0.086957,-0.781814,0.469401,1483.0
2,4,0.033653,0.137339,0.0,0.098729,0.03269,-0.137176,-0.264926,0.355987,0.516638,0.865042,0.698337,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.268081,-1.268081,1.181256,-0.334206,0.0,1.0,0.177849,0.333333,0.6,2.61635,0.0,0.0,0.136874,-7.347881e-16,0.6,0.023383,0.0,0.0,0.125,0.0,-0.557842,0.0,-0.5,1.394451,-0.333333,0.4,-0.619544,0.0,0.333333,1.0,-0.782609,0.0,0.0,567.0


In [82]:
test.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount
1200000,2,-0.602845,-0.2103,0.0,-0.617124,-0.524925,-0.491358,-0.486643,-0.797735,-0.510231,-0.46825,-0.497491,52.0,0.0,57.590036,1.0,30.472002,0.0,-4.5,0.0,1.0,1.0,-0.967877,0.967877,-0.566755,-0.837009,0.0,1.0,-0.855997,0.333333,0.5,-0.696361,0.0,0.0,-0.443551,0.0,0.0,-1.208313,-1.8,-1.073045,-1.25,-0.459772,-1.48387,-2.75,0.833333,-0.872912,-0.333333,0.4,-0.619544,0.0,-0.666667,1.0,-0.565217,-0.13203,0.0
1200001,1,2.778065,-0.978541,0.0,1.573065,4.61255,2.288055,3.459856,-0.485437,3.458248,1.759921,5.497467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.0,-0.644056,0.644056,0.261715,0.069533,1.0,0.0,-0.618038,0.666667,0.9,0.142877,0.0,0.0,-0.51736,4.898587e-16,-0.4,-1.12816,0.0,0.631238,0.25,3.658057,0.260301,1.0,-2.166667,-0.805739,-0.333333,0.4,-0.619544,0.0,0.0,1.0,-0.434783,0.822189,0.0
1200002,3,-0.198899,0.939914,0.0,-0.006054,-0.268866,0.243781,0.186984,0.961165,-0.230098,-0.298201,-0.116706,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.023212,0.023212,0.394341,1.712523,0.0,0.0,0.248595,0.333333,0.5,0.05475,0.0,0.0,-0.204952,4.898587e-16,-0.4,-0.976617,-0.8,-0.904347,-0.875,-4.197114,-1.753579,-2.75,0.5,0.520175,0.0,0.0,0.0,0.0,-0.666667,0.0,0.26087,-1.064823,0.469401


In [83]:
df = pd.concat([train, test])

#
---
#

# Download the `Model Ready df`

In [84]:
df.to_csv("trainable_df.csv", index=False)