In [219]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb

from sklearn.preprocessing import PowerTransformer


import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 150)

In [220]:
df = pd.read_csv("EDAed_df.csv")

df["Policy Start Date"] = pd.to_datetime(df["Policy Start Date"])

In [221]:
df.shape

(2000000, 34)

In [222]:
df.isnull().sum()

Age                             0
Gender                          0
Annual Income                   0
Marital Status                  0
Number of Dependents            0
Education Level                 0
Occupation                      0
Health Score                    0
Location                        0
Policy Type                     0
Previous Claims                 0
Vehicle Age                     0
Credit Score                    0
Insurance Duration              0
Policy Start Date               0
Customer Feedback               0
Smoking Status                  0
Exercise Frequency              0
Property Type                   0
Premium Amount             800000
Health Conscious Level          0
Health Conscious Level1         0
Money Per Head                  0
Money Handling Level            0
Money Handling Level1           0
Growth                          0
Growth1                         0
Determinstic                    0
Day_Name                        0
Credit by Scor

In [223]:
train = df.iloc[:1200000, :]
train.shape

(1200000, 34)

In [224]:
test = df.iloc[1200000:, :]
test.shape

(800000, 34)

In [225]:
test.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4
1200000,28.0,Female,2310.0,Single,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,2.0,19.0,493.0,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House,,2,3430.775431,577.5,1138830.0,4.685598,4620.0,1155.0,82.5,Sunday,246.5,4620.0,986.0,4.0,15.315962
1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,1.0,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment,,1,1659.291012,63015.5,46883532.0,338.793011,378093.0,42010.333333,4065.516129,Monday,372.0,1008248.0,2976.0,8.0,107.051033
1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,1.0,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo,,3,9157.302066,17092.0,13998348.0,20.869353,68368.0,4273.0,363.659574,Wednesday,819.0,68368.0,3276.0,4.0,97.418107


#
---
#

# Adding Dates columns

In [226]:
df["Policy Start Date - Day"] = df["Policy Start Date"].dt.day
df["Policy Start Date - Month"] = df["Policy Start Date"].dt.month
df["Policy Start Date - Year"] = df["Policy Start Date"].dt.year

In [227]:
df["Policy Start Date - Quarter"] = df["Policy Start Date"].dt.year.astype(str) + " Q" + df["Policy Start Date"].dt.quarter.astype(str)

In [228]:
df["Sin_Date"] = np.sin(2 * np.pi * df["Policy Start Date"].astype('int64'))
df["Cos_Date"] = np.cos(2 * np.pi * df["Policy Start Date"].astype('int64'))

In [229]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192


#
---
#

In [230]:
data = df.copy()

#
---
#

In [231]:
df.drop(columns="Policy Start Date", inplace=True)

In [232]:
df.head(3)

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192


In [233]:
df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]] = df[["Policy Start Date - Day", "Policy Start Date - Month", "Policy Start Date - Year"]].astype("O")

In [234]:
def show_nulls(df):
    nulls = []
    nuniques = []
    uniques = []
    types = []
    
    for i in df.columns:
        nulls.append(df[i].isnull().sum())
        nuniques.append(df[i].nunique())
        uniques.append(df[i].unique())
        types.append(df[i].dtype)
    
    
    return pd.DataFrame(
        {
            "Column" : df.columns,
            "Data Type" : types,
            "Nulls" : nulls,
            "No. of Uniques" : nuniques,
            "Uniques" : uniques
        }
    ).sort_values(by="Nulls", ascending=False)

In [235]:
df["Health Conscious Level"] = df["Health Conscious Level"].astype("O")

In [236]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
1,Gender,object,0,2,"[Female, Male]"
0,Age,float64,0,47,"[19.0, 39.0, 23.0, 21.0, 29.0, 41.0, 48.0, 44...."
3,Marital Status,object,0,3,"[Married, Divorced, Single]"
4,Number of Dependents,float64,0,5,"[1.0, 3.0, 2.0, 0.0, 4.0]"
5,Education Level,object,0,4,"[Bachelor's, Master's, High School, PhD]"
2,Annual Income,float64,0,97952,"[10049.0, 31678.0, 25602.0, 141855.0, 39651.0,..."
6,Occupation,object,0,3,"[Self-Employed, Employed, Unemployed]"
7,Health Score,float64,0,934000,"[22.59876067181393, 15.569730989408043, 47.177..."
9,Policy Type,object,0,3,"[Premium, Comprehensive, Basic]"


#
---
#

In [237]:
def do_magic(target_column, *columns: list):
    for i in columns:
        df[f"{i}_MIN_{target_column}"] = df.groupby(by=i)[target_column].transform("min")
        df[f"{i}_MEAN_{target_column}"] = df.groupby(by=i)[target_column].transform("mean")
        df[f"{i}_MEDIAN_{target_column}"] = df.groupby(by=i)[target_column].transform("median")
        df[f"{i}_STD_{target_column}"] = df.groupby(by=i)[target_column].transform("std")
        df[f"{i}_MAX_{target_column}"] = df.groupby(by=i)[target_column].transform("max")

In [238]:
do_magic("Premium Amount", "Number of Dependents", "Occupation", "Education Level", "Previous Claims", "Insurance Duration", "Health Conscious Level1",	"Money Per Head", "Money Handling Level", "Money Handling Level1", "Growth", "Growth1",	"Determinstic")

In [239]:
df

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,Health Conscious Level1_MIN_Premium Amount,Health Conscious Level1_MEAN_Premium Amount,Health Conscious Level1_MEDIAN_Premium Amount,Health Conscious Level1_STD_Premium Amount,Health Conscious Level1_MAX_Premium Amount,Money Per Head_MIN_Premium Amount,Money Per Head_MEAN_Premium Amount,Money Per Head_MEDIAN_Premium Amount,Money Per Head_STD_Premium Amount,Money Per Head_MAX_Premium Amount,Money Handling Level_MIN_Premium Amount,Money Handling Level_MEAN_Premium Amount,Money Handling Level_MEDIAN_Premium Amount,Money Handling Level_STD_Premium Amount,Money Handling Level_MAX_Premium Amount,Money Handling Level1_MIN_Premium Amount,Money Handling Level1_MEAN_Premium Amount,Money Handling Level1_MEDIAN_Premium Amount,Money Handling Level1_STD_Premium Amount,Money Handling Level1_MAX_Premium Amount,Growth_MIN_Premium Amount,Growth_MEAN_Premium Amount,Growth_MEDIAN_Premium Amount,Growth_STD_Premium Amount,Growth_MAX_Premium Amount,Growth1_MIN_Premium Amount,Growth1_MEAN_Premium Amount,Growth1_MEDIAN_Premium Amount,Growth1_STD_Premium Amount,Growth1_MAX_Premium Amount,Determinstic_MIN_Premium Amount,Determinstic_MEAN_Premium Amount,Determinstic_MEDIAN_Premium Amount,Determinstic_STD_Premium Amount,Determinstic_MAX_Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.000000,3738228.0,27.013441,20098.0,5024.500000,528.894737,Saturday,186.0,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1100.812035,872.0,859.965806,4996.0,2869.0,2869.0,2869.0,,2869.0,26.0,1082.905660,823.0,878.580154,4950.0,2869.0,2869.0,2869.0,,2869.0,2869.0,2869.0,2869.0,,2869.0,25.0,1099.000000,974.0,838.709037,3693.0,25.0,1012.700000,958.5,717.596138,2898.0,479.0,1159.000000,644.0,1147.569896,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.256410,Monday,694.0,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1106.883166,878.0,863.675409,4997.0,1483.0,1483.0,1483.0,,1483.0,344.0,913.500000,913.5,805.394624,1483.0,1483.0,1483.0,1483.0,,1483.0,1483.0,1483.0,1483.0,,1483.0,250.0,1310.666667,1483.0,985.862228,2199.0,250.0,866.500000,866.5,871.862661,1483.0,1483.0,1483.000000,1483.0,,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.000000,14209110.0,46.129730,25602.0,25602.000000,1113.130435,Saturday,555.0,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1104.787490,876.0,865.951488,4999.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1101.733536,872.0,865.791213,4997.0,567.0,567.0,567.0,,567.0,20.0,919.357143,742.0,773.215219,3572.0,567.0,567.0,567.0,,567.0,567.0,567.0,567.0,,567.0,22.0,1135.168675,881.0,819.725130,3243.0,22.0,1133.086420,881.0,822.937288,3243.0,37.0,870.444444,705.0,783.233699,2619.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,3,7350.432875,70927.500000,52060785.0,386.525886,283710.0,70927.500000,6755.000000,Wednesday,367.0,283710.0,734.0,2.0,21.876288,12,6,2024,2024 Q2,0.111402,0.993775,20.0,1107.670467,874.0,867.079610,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1097.042977,861.0,865.431191,4988.0,765.0,765.0,765.0,,765.0,765.0,893.500000,893.5,181.726443,1022.0,765.0,765.0,765.0,,765.0,765.0,765.0,765.0,,765.0,765.0,1701.666667,2045.0,820.751688,2295.0,765.0,1405.000000,1405.0,905.096680,2045.0,765.0,765.000000,765.0,,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0,3,6846.367459,39651.000000,23711298.0,66.306020,79302.0,19825.500000,1888.142857,Wednesday,598.0,79302.0,1196.0,0.0,40.752187,1,12,2021,2021 Q4,-0.996246,0.086565,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1080.669491,855.0,847.585613,4999.0,20.0,1104.723079,872.0,866.377508,4991.0,2022.0,2022.0,2022.0,0.0,2022.0,271.0,1111.818182,964.0,527.594886,2022.0,2022.0,2022.0,2022.0,,2022.0,2022.0,2022.0,2022.0,,2022.0,48.0,970.000000,879.0,526.211364,2022.0,538.0,1062.200000,921.5,451.404180,2022.0,2022.0,2103.333333,2022.0,140.873466,2266.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,50.0,Female,38782.0,Married,1.0,Bachelor's,Employed,14.498639,Rural,Premium,1.0,8.0,309.0,2.0,Average,Yes,Daily,Condo,,4,23197.822227,38782.000000,11983638.0,125.508091,77564.0,19391.000000,775.640000,Friday,309.0,155128.0,1236.0,4.0,57.994556,9,7,2021,2021 Q3,0.645845,-0.763468,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1106.883166,878.0,863.675409,4997.0,,,,,,23.0,1144.529412,883.0,783.855135,4056.0,,,,,,,,,,,24.0,1126.484536,866.0,784.481306,3850.0,24.0,1094.354167,843.5,804.074741,3850.0,33.0,982.777778,793.0,795.568944,2840.0
1999996,56.0,Female,73462.0,Single,0.0,Master's,Self-Employed,8.145748,Rural,Basic,2.0,0.0,452.0,2.0,Good,No,Daily,Apartment,,6,29194.361129,73462.000000,33204824.0,162.526549,220386.0,24487.333333,1311.821429,Tuesday,226.0,587696.0,3616.0,16.0,65.165985,28,3,2023,2023 Q1,0.681828,0.731513,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1106.883166,878.0,863.675409,4997.0,,,,,,122.0,707.111111,287.0,774.748257,2275.0,,,,,,,,,,,265.0,1133.333333,860.0,1032.501009,2275.0,265.0,1133.333333,860.0,1032.501009,2275.0,287.0,287.000000,287.0,,287.0
1999997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,2.0,10.0,764.0,6.0,Poor,No,Monthly,Apartment,,2,2760.818699,35178.000000,26875992.0,46.044503,105534.0,11726.000000,1353.000000,Monday,382.0,70356.0,1528.0,4.0,13.273167,30,9,2019,2019 Q3,-0.709843,0.704360,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1104.558441,873.0,866.550287,4999.0,,,,,,27.0,1113.365385,888.5,762.397831,4054.0,664.0,1148.0,1057.5,495.437181,1813.0,664.0,1148.0,1057.5,495.437181,1813.0,119.0,1109.831579,903.0,697.169271,3524.0,28.0,1138.514019,906.0,747.570207,3700.0,177.0,1172.428571,507.0,1165.448391,3167.0
1999998,34.0,Female,45661.0,Single,3.0,Master's,Self-Employed,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,Average,No,Weekly,Condo,,4,17339.725601,15220.333333,21323687.0,97.775161,136983.0,15220.333333,1342.970588,Monday,233.5,182644.0,1868.0,8.0,63.748991,9,5,2022,2022 Q2,0.561062,-0.827774,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1105.964399,877.0,866.222189,4988.0,,,,,,776.0,776.000000,776.0,,776.0,,,,,,,,,,,1297.0,1297.000000,1297.0,,1297.0,1297.0,1297.000000,1297.0,,1297.0,,,,,


#
---
#

In [240]:
def return_splits(ddf, feature_name, target_name):
    return [ddf[ddf[feature_name] == i][target_name] for i in ddf[feature_name].unique()]

def give_stats_analysis(df, target_column_name):
    ddf = df.copy()
    ddf = ddf.dropna()

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []
    count = 0

    target = ddf[target_column_name]
    for i in ddf.columns:
        features.append(i)
        feature = ddf[i]
        
        if (feature.dtype == "O" and (target.dtype == "float" or target.dtype == "int")) or (target.dtype == "O" and (feature.dtype == "float" or feature.dtype == "int")):
            stat, pval, *_ = kruskal(*return_splits(ddf, feature.name, target.name))
            tests.append("Kruskal-Wallis")
            stats.append(stat)
            pvals.append(pval)
            
        
        elif (feature.dtype == "float" or feature.dtype == "int") and (target.dtype == "float" or target.dtype == "int"):
            stat, pval, *_ = spearmanr(feature, target)
            tests.append("SpearmanR")
            stats.append(stat)
            pvals.append(pval)

        elif feature.dtype == "O" and target.dtype == "O":
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            tests.append("Chi-Square")
            stats.append(stat)
            pvals.append(pval)
        
        else:
            tests.append(np.nan)
            stats.append(np.nan)
            pvals.append(np.nan)
        
        if pval <= 0.025:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(100, "-")+"✅")
    
    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*ddf.shape[1],
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

# H0 :- There is ***No Relationship*** among the given two columns
# H1 :- There is ***Relationship*** among the given two columns

### ***Health-related indicators***
- [x] Health Score
- [x] Smoking Status
- [x] Exercise Frequency
### ***Demographic information***
- [x] Age
- [x] Gender
- [x] Marital Status
- [x] Number of Dependents
- [x] Occupation
### ***Policy details***
- [x] Policy Type
- [x] Policy Start Date
- [x] Insurance Duration
### ***Financial factors***
- [x] Annual Income
- [x] Credit Score.
### ***Premium calculation***
- [x] Premium Amount

In [241]:
stats_result = give_stats_analysis(df.iloc[:1200000, :], "Premium Amount")
stats_result

Age ■■■ Premium Amount------------------------------------------------------------------------------✅
Gender ■■■ Premium Amount---------------------------------------------------------------------------✅
Annual Income ■■■ Premium Amount--------------------------------------------------------------------✅
Marital Status ■■■ Premium Amount-------------------------------------------------------------------✅
Number of Dependents ■■■ Premium Amount-------------------------------------------------------------✅
Education Level ■■■ Premium Amount------------------------------------------------------------------✅
Occupation ■■■ Premium Amount-----------------------------------------------------------------------✅
Health Score ■■■ Premium Amount---------------------------------------------------------------------✅
Location ■■■ Premium Amount-------------------------------------------------------------------------✅
Policy Type ■■■ Premium Amount----------------------------------------------------

Unnamed: 0,Feature,Target,Statistic Test,Test Statistic,P-Value,Verdict
18,Premium Amount,Premium Amount,SpearmanR,1.0,0.0,There is Relationship
66,Health Conscious Level1_MEDIAN_Premium Amount,Premium Amount,SpearmanR,0.747842,0.0,There is Relationship
67,Health Conscious Level1_STD_Premium Amount,Premium Amount,SpearmanR,0.274432,0.0,There is Relationship
65,Health Conscious Level1_MEAN_Premium Amount,Premium Amount,SpearmanR,0.74892,0.0,There is Relationship
64,Health Conscious Level1_MIN_Premium Amount,Premium Amount,SpearmanR,0.613048,0.0,There is Relationship
68,Health Conscious Level1_MAX_Premium Amount,Premium Amount,SpearmanR,0.628621,0.0,There is Relationship
81,Money Handling Level1_MEDIAN_Premium Amount,Premium Amount,SpearmanR,0.689001,0.0,There is Relationship
83,Money Handling Level1_MAX_Premium Amount,Premium Amount,SpearmanR,0.376211,0.0,There is Relationship
78,Money Handling Level_MAX_Premium Amount,Premium Amount,SpearmanR,0.371457,0.0,There is Relationship
79,Money Handling Level1_MIN_Premium Amount,Premium Amount,SpearmanR,0.523549,0.0,There is Relationship


# <ins>Key Premium as per Reseach Papers and as per dataset.</ins>
### `Strikeoff features are said by research and dataset too. But unstrike ones are not impactful to determine premium amount as per dataset but as per research it should be. We need to find why like so in these features`

- ### ~~Age~~
- ### Gender
- ### ~~Health Score~~
- ### Smoking Status
- ### Exercise Frequency
- ### ~~Occupation~~
- ### Policy Type
- ### ~~Previous Claims~~
- ### ~~Annual Income~~
- ### Insurance Duration
- ### ~~Credit Score~~

#
---
#

In [242]:
cols = ["Gender", "Smoking Status", "Exercise Frequency", "Policy Type", "Insurance Duration"]

In [243]:
# fig, axs = plt.subplots(2, 3, figsize=(20, 8))
# for col, ax in zip(cols, axs.flatten()):
#     sns.boxplot(y=df["Premium Amount"], x=df[col], color="mediumblue", ax=ax)

In [244]:
useless_columns = stats_result[stats_result["P-Value"] >= 0.05]["Feature"]
useless_columns

62         Insurance Duration_STD_Premium Amount
42       Number of Dependents_STD_Premium Amount
9                                    Policy Type
48                 Occupation_MAX_Premium Amount
45                Occupation_MEAN_Premium Amount
46              Occupation_MEDIAN_Premium Amount
11                                   Vehicle Age
43       Number of Dependents_MAX_Premium Amount
34                     Policy Start Date - Month
38                                      Cos_Date
1                                         Gender
20                       Health Conscious Level1
36                   Policy Start Date - Quarter
33                       Policy Start Date - Day
63         Insurance Duration_MAX_Premium Amount
53            Education Level_MAX_Premium Amount
27                                      Day_Name
58            Previous Claims_MAX_Premium Amount
57            Previous Claims_STD_Premium Amount
10                               Previous Claims
55           Previou

In [245]:
meaningless_df = df[useless_columns]
meaningless_df.head(3)

Unnamed: 0,Insurance Duration_STD_Premium Amount,Number of Dependents_STD_Premium Amount,Policy Type,Occupation_MAX_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Vehicle Age,Number of Dependents_MAX_Premium Amount,Policy Start Date - Month,Cos_Date,Gender,Health Conscious Level1,Policy Start Date - Quarter,Policy Start Date - Day,Insurance Duration_MAX_Premium Amount,Education Level_MAX_Premium Amount,Day_Name,Previous Claims_MAX_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims,Previous Claims_MEAN_Premium Amount,Smoking Status,Number of Dependents,Exercise Frequency,Marital Status,Insurance Duration_MEAN_Premium Amount,Customer Feedback,Education Level,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Insurance Duration,Insurance Duration_MEDIAN_Premium Amount,Location,Number of Dependents_MEAN_Premium Amount,Policy Start Date - Year,Number of Dependents_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Property Type,Feedback3,Health Conscious Level,Sin_Date
0,859.965806,866.838892,Premium,4999.0,1099.847641,870.0,17.0,4994.0,12,-0.220691,Female,13740.046488,2023 Q4,23,4996.0,4988.0,Saturday,4991.0,898.578219,2.0,1157.177084,No,1.0,Weekly,Married,1100.812035,Poor,Bachelor's,1102.698438,873.0,5.0,872.0,Urban,1107.625281,2023,878.0,864.866296,House,4.0,4,-0.975344
1,863.675409,863.643171,Comprehensive,4999.0,1099.847641,870.0,12.0,4997.0,6,0.050489,Female,4857.756069,2023 Q2,12,4997.0,4997.0,Monday,4997.0,856.283142,1.0,1083.665634,Yes,3.0,Monthly,Divorced,1106.883166,Average,Master's,1102.113989,871.0,2.0,878.0,Rural,1102.153646,2023,874.0,866.235322,House,4.0,2,-0.998725
2,865.791213,863.643171,Premium,4999.0,1099.847641,870.0,14.0,4997.0,9,0.101192,Male,17361.338138,2023 Q3,30,4997.0,4999.0,Saturday,4997.0,856.283142,1.0,1083.665634,Yes,3.0,Weekly,Divorced,1101.733536,Good,High School,1104.78749,876.0,3.0,872.0,Suburban,1102.153646,2023,874.0,865.951488,House,8.0,4,-0.994867


In [246]:
# df = df[stats_result[stats_result["P-Value"] < 0.05]["Feature"]]
# df.head(3)

# Compressing Meaningless DF's information in a component using PCA

In [247]:
meaningless_df.head(3)

Unnamed: 0,Insurance Duration_STD_Premium Amount,Number of Dependents_STD_Premium Amount,Policy Type,Occupation_MAX_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Vehicle Age,Number of Dependents_MAX_Premium Amount,Policy Start Date - Month,Cos_Date,Gender,Health Conscious Level1,Policy Start Date - Quarter,Policy Start Date - Day,Insurance Duration_MAX_Premium Amount,Education Level_MAX_Premium Amount,Day_Name,Previous Claims_MAX_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims,Previous Claims_MEAN_Premium Amount,Smoking Status,Number of Dependents,Exercise Frequency,Marital Status,Insurance Duration_MEAN_Premium Amount,Customer Feedback,Education Level,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Insurance Duration,Insurance Duration_MEDIAN_Premium Amount,Location,Number of Dependents_MEAN_Premium Amount,Policy Start Date - Year,Number of Dependents_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Property Type,Feedback3,Health Conscious Level,Sin_Date
0,859.965806,866.838892,Premium,4999.0,1099.847641,870.0,17.0,4994.0,12,-0.220691,Female,13740.046488,2023 Q4,23,4996.0,4988.0,Saturday,4991.0,898.578219,2.0,1157.177084,No,1.0,Weekly,Married,1100.812035,Poor,Bachelor's,1102.698438,873.0,5.0,872.0,Urban,1107.625281,2023,878.0,864.866296,House,4.0,4,-0.975344
1,863.675409,863.643171,Comprehensive,4999.0,1099.847641,870.0,12.0,4997.0,6,0.050489,Female,4857.756069,2023 Q2,12,4997.0,4997.0,Monday,4997.0,856.283142,1.0,1083.665634,Yes,3.0,Monthly,Divorced,1106.883166,Average,Master's,1102.113989,871.0,2.0,878.0,Rural,1102.153646,2023,874.0,866.235322,House,4.0,2,-0.998725
2,865.791213,863.643171,Premium,4999.0,1099.847641,870.0,14.0,4997.0,9,0.101192,Male,17361.338138,2023 Q3,30,4997.0,4999.0,Saturday,4997.0,856.283142,1.0,1083.665634,Yes,3.0,Weekly,Divorced,1101.733536,Good,High School,1104.78749,876.0,3.0,872.0,Suburban,1102.153646,2023,874.0,865.951488,House,8.0,4,-0.994867


## Encoding Columns

In [248]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

###
---
###

In [249]:
meaningless_df["Location"].unique()

array(['Urban', 'Rural', 'Suburban'], dtype=object)

In [250]:
a = OrdinalEncoder(categories=[['Rural', 'Suburban', 'Urban']])

b = pd.DataFrame({"ENCODED_Location" : a.fit_transform(meaningless_df[["Location"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Location", inplace=True)

###
---
###

In [251]:
meaningless_df["Education Level"].unique()

array(["Bachelor's", "Master's", 'High School', 'PhD'], dtype=object)

In [252]:
a = OrdinalEncoder(categories=[['High School', "Bachelor's", "Master's", 'PhD']])

b = pd.DataFrame({"ENCODED_Education Level" : a.fit_transform(meaningless_df[["Education Level"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Education Level", inplace=True)

###
---
###

In [253]:
meaningless_df["Policy Type"].unique()

array(['Premium', 'Comprehensive', 'Basic'], dtype=object)

In [254]:
a = OrdinalEncoder(categories=[['Basic', 'Comprehensive', 'Premium']])

b = pd.DataFrame({"ENCODED_Policy Type" : a.fit_transform(meaningless_df[["Policy Type"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Policy Type", inplace=True)

###
---
###

In [255]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Property Type"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Property Type", inplace=True)

###
---
###

In [256]:
meaningless_df["Exercise Frequency"].unique()

array(['Weekly', 'Monthly', 'Daily', 'Rarely'], dtype=object)

In [257]:
a = OrdinalEncoder(categories=[['Rarely', 'Monthly', 'Weekly', 'Daily']])

b = pd.DataFrame({"ENCODED_Exercise Frequency" : a.fit_transform(meaningless_df[["Exercise Frequency"]]).flatten()})

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Exercise Frequency", inplace=True)

###
---
###

In [258]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Smoking Status"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Smoking Status", inplace=True)

###
---
###

In [259]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(meaningless_df[["Gender"]]),
        columns=a.get_feature_names_out()
    )

meaningless_df = pd.concat([meaningless_df, b], axis=1)
meaningless_df.drop(columns="Gender", inplace=True)

###
---
###

In [260]:
meaningless_df["Policy Start Date - Day"] = meaningless_df["Policy Start Date - Day"].astype(int)

#
---
#

In [261]:
meaningless_df.head(3)

Unnamed: 0,Insurance Duration_STD_Premium Amount,Number of Dependents_STD_Premium Amount,Occupation_MAX_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Vehicle Age,Number of Dependents_MAX_Premium Amount,Policy Start Date - Month,Cos_Date,Health Conscious Level1,Policy Start Date - Quarter,Policy Start Date - Day,Insurance Duration_MAX_Premium Amount,Education Level_MAX_Premium Amount,Day_Name,Previous Claims_MAX_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims,Previous Claims_MEAN_Premium Amount,Number of Dependents,Marital Status,Insurance Duration_MEAN_Premium Amount,Customer Feedback,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Insurance Duration,Insurance Duration_MEDIAN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Policy Start Date - Year,Number of Dependents_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Feedback3,Health Conscious Level,Sin_Date,ENCODED_Location,ENCODED_Education Level,ENCODED_Policy Type,Property Type_Condo,Property Type_House,ENCODED_Exercise Frequency,Smoking Status_Yes,Gender_Male
0,859.965806,866.838892,4999.0,1099.847641,870.0,17.0,4994.0,12,-0.220691,13740.046488,2023 Q4,23,4996.0,4988.0,Saturday,4991.0,898.578219,2.0,1157.177084,1.0,Married,1100.812035,Poor,1102.698438,873.0,5.0,872.0,1107.625281,2023,878.0,864.866296,4.0,4,-0.975344,2.0,1.0,2.0,0.0,1.0,2.0,0.0,0.0
1,863.675409,863.643171,4999.0,1099.847641,870.0,12.0,4997.0,6,0.050489,4857.756069,2023 Q2,12,4997.0,4997.0,Monday,4997.0,856.283142,1.0,1083.665634,3.0,Divorced,1106.883166,Average,1102.113989,871.0,2.0,878.0,1102.153646,2023,874.0,866.235322,4.0,2,-0.998725,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0
2,865.791213,863.643171,4999.0,1099.847641,870.0,14.0,4997.0,9,0.101192,17361.338138,2023 Q3,30,4997.0,4999.0,Saturday,4997.0,856.283142,1.0,1083.665634,3.0,Divorced,1101.733536,Good,1104.78749,876.0,3.0,872.0,1102.153646,2023,874.0,865.951488,8.0,4,-0.994867,1.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0


In [262]:
meaningless_df.dtypes

Insurance Duration_STD_Premium Amount         float64
Number of Dependents_STD_Premium Amount       float64
Occupation_MAX_Premium Amount                 float64
Occupation_MEAN_Premium Amount                float64
Occupation_MEDIAN_Premium Amount              float64
Vehicle Age                                   float64
Number of Dependents_MAX_Premium Amount       float64
Policy Start Date - Month                      object
Cos_Date                                      float64
Health Conscious Level1                       float64
Policy Start Date - Quarter                    object
Policy Start Date - Day                         int64
Insurance Duration_MAX_Premium Amount         float64
Education Level_MAX_Premium Amount            float64
Day_Name                                       object
Previous Claims_MAX_Premium Amount            float64
Previous Claims_STD_Premium Amount            float64
Previous Claims                               float64
Previous Claims_MEAN_Premium

###
---
###

# Doing PCA on this `meaningless_df`

In [263]:
# from sklearn.decomposition import PCA

In [264]:
# pca = PCA(n_components=3)
# pca_df = pd.DataFrame(pca.fit_transform(meaningless_df), columns=['PC1_Meaningless_df', "PC2_Meaningless_df", "PC3_Meaningless_df"])
# pca_df

In [265]:
# pca.explained_variance_ratio_

###
---
###

# Combining 2 PCs of Meaningless_columns to the df

In [266]:
# df = pd.concat([df, pca_df.iloc[:, :2]], axis=1)

In [267]:
df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Policy Start Date - Year,Policy Start Date - Quarter,Sin_Date,Cos_Date,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,Health Conscious Level1_MIN_Premium Amount,Health Conscious Level1_MEAN_Premium Amount,Health Conscious Level1_MEDIAN_Premium Amount,Health Conscious Level1_STD_Premium Amount,Health Conscious Level1_MAX_Premium Amount,Money Per Head_MIN_Premium Amount,Money Per Head_MEAN_Premium Amount,Money Per Head_MEDIAN_Premium Amount,Money Per Head_STD_Premium Amount,Money Per Head_MAX_Premium Amount,Money Handling Level_MIN_Premium Amount,Money Handling Level_MEAN_Premium Amount,Money Handling Level_MEDIAN_Premium Amount,Money Handling Level_STD_Premium Amount,Money Handling Level_MAX_Premium Amount,Money Handling Level1_MIN_Premium Amount,Money Handling Level1_MEAN_Premium Amount,Money Handling Level1_MEDIAN_Premium Amount,Money Handling Level1_STD_Premium Amount,Money Handling Level1_MAX_Premium Amount,Growth_MIN_Premium Amount,Growth_MEAN_Premium Amount,Growth_MEDIAN_Premium Amount,Growth_STD_Premium Amount,Growth_MAX_Premium Amount,Growth1_MIN_Premium Amount,Growth1_MEAN_Premium Amount,Growth1_MEDIAN_Premium Amount,Growth1_STD_Premium Amount,Growth1_MAX_Premium Amount,Determinstic_MIN_Premium Amount,Determinstic_MEAN_Premium Amount,Determinstic_MEDIAN_Premium Amount,Determinstic_STD_Premium Amount,Determinstic_MAX_Premium Amount
0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,Poor,No,Weekly,House,2869.0,4,13740.046488,10049.0,3738228.0,27.013441,20098.0,5024.5,528.894737,Saturday,186.0,20098.0,744.0,4.0,45.197521,23,12,2023,2023 Q4,-0.975344,-0.220691,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1100.812035,872.0,859.965806,4996.0,2869.0,2869.0,2869.0,,2869.0,26.0,1082.90566,823.0,878.580154,4950.0,2869.0,2869.0,2869.0,,2869.0,2869.0,2869.0,2869.0,,2869.0,25.0,1099.0,974.0,838.709037,3693.0,25.0,1012.7,958.5,717.596138,2898.0,479.0,1159.0,644.0,1147.569896,2869.0
1,39.0,Female,31678.0,Divorced,3.0,Master's,Self-Employed,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Average,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.25641,Monday,694.0,126712.0,2776.0,4.0,62.278924,12,6,2023,2023 Q2,-0.998725,0.050489,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1106.883166,878.0,863.675409,4997.0,1483.0,1483.0,1483.0,,1483.0,344.0,913.5,913.5,805.394624,1483.0,1483.0,1483.0,1483.0,,1483.0,1483.0,1483.0,1483.0,,1483.0,250.0,1310.666667,1483.0,985.862228,2199.0,250.0,866.5,866.5,871.862661,1483.0,1483.0,1483.0,1483.0,,1483.0
2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Good,Yes,Weekly,House,567.0,4,17361.338138,8534.0,14209110.0,46.12973,25602.0,25602.0,1113.130435,Saturday,555.0,204816.0,4440.0,8.0,377.420394,30,9,2023,2023 Q3,-0.994867,0.101192,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1104.78749,876.0,865.951488,4999.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1101.733536,872.0,865.791213,4997.0,567.0,567.0,567.0,,567.0,20.0,919.357143,742.0,773.215219,3572.0,567.0,567.0,567.0,,567.0,567.0,567.0,567.0,,567.0,22.0,1135.168675,881.0,819.72513,3243.0,22.0,1133.08642,881.0,822.937288,3243.0,37.0,870.444444,705.0,783.233699,2619.0
3,21.0,Male,141855.0,Married,2.0,Bachelor's,Self-Employed,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Poor,Yes,Daily,Apartment,765.0,3,7350.432875,70927.5,52060785.0,386.525886,283710.0,70927.5,6755.0,Wednesday,367.0,283710.0,734.0,2.0,21.876288,12,6,2024,2024 Q2,0.111402,0.993775,20.0,1107.670467,874.0,867.07961,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1097.042977,861.0,865.431191,4988.0,765.0,765.0,765.0,,765.0,765.0,893.5,893.5,181.726443,1022.0,765.0,765.0,765.0,,765.0,765.0,765.0,765.0,,765.0,765.0,1701.666667,2045.0,820.751688,2295.0,765.0,1405.0,1405.0,905.09668,2045.0,765.0,765.0,765.0,,765.0
4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Poor,Yes,Weekly,House,2022.0,3,6846.367459,39651.0,23711298.0,66.30602,79302.0,19825.5,1888.142857,Wednesday,598.0,79302.0,1196.0,0.0,40.752187,1,12,2021,2021 Q4,-0.996246,0.086565,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1080.669491,855.0,847.585613,4999.0,20.0,1104.723079,872.0,866.377508,4991.0,2022.0,2022.0,2022.0,0.0,2022.0,271.0,1111.818182,964.0,527.594886,2022.0,2022.0,2022.0,2022.0,,2022.0,2022.0,2022.0,2022.0,,2022.0,48.0,970.0,879.0,526.211364,2022.0,538.0,1062.2,921.5,451.40418,2022.0,2022.0,2103.333333,2022.0,140.873466,2266.0


###
---
###

# Encoding of column in `df`

In [268]:
show_nulls(df)

Unnamed: 0,Column,Data Type,Nulls,No. of Uniques,Uniques
67,Health Conscious Level1_STD_Premium Amount,float64,1885549,9656,"[nan, 0.0, 84.14570696119915, 410.829039869384..."
82,Money Handling Level1_STD_Premium Amount,float64,1485965,42938,"[nan, 1520.9866863322636, 588.3128419472075, 3..."
77,Money Handling Level_STD_Premium Amount,float64,1442565,47583,"[nan, 1520.9866863322636, 588.3128419472075, 3..."
18,Premium Amount,float64,800000,4794,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
97,Determinstic_STD_Premium Amount,float64,749931,119553,"[1147.5698962009533, nan, 783.2336993374186, 1..."
65,Health Conscious Level1_MEAN_Premium Amount,float64,735960,10644,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
64,Health Conscious Level1_MIN_Premium Amount,float64,735960,4773,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
68,Health Conscious Level1_MAX_Premium Amount,float64,735960,4792,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
66,Health Conscious Level1_MEDIAN_Premium Amount,float64,735960,7372,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 3202.0,..."
81,Money Handling Level1_MEDIAN_Premium Amount,float64,542030,7892,"[2869.0, 1483.0, 567.0, 765.0, 2022.0, 2126.5,..."


### Policy Start Date - Year	

In [269]:
df["Policy Start Date - Year"].unique()

array([2023, 2024, 2021, 2022, 2020, 2019], dtype=object)

In [270]:
a = OrdinalEncoder(categories=[[2019, 2020, 2021, 2022, 2023, 2024]])

b = pd.DataFrame({"ENCODED_Policy Start Date - Year" : a.fit_transform(df[["Policy Start Date - Year"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Year", inplace=True)

### Policy Start Date - Quarter

In [271]:
sorted(list(df["Policy Start Date - Quarter"].unique()))

['2019 Q3',
 '2019 Q4',
 '2020 Q1',
 '2020 Q2',
 '2020 Q3',
 '2020 Q4',
 '2021 Q1',
 '2021 Q2',
 '2021 Q3',
 '2021 Q4',
 '2022 Q1',
 '2022 Q2',
 '2022 Q3',
 '2022 Q4',
 '2023 Q1',
 '2023 Q2',
 '2023 Q3',
 '2023 Q4',
 '2024 Q1',
 '2024 Q2',
 '2024 Q3']

In [272]:
a = OrdinalEncoder(categories=[['2019 Q3', '2019 Q4', '2020 Q1', '2020 Q2', '2020 Q3', '2020 Q4', '2021 Q1', '2021 Q2', '2021 Q3',
                 '2021 Q4', '2022 Q1', '2022 Q2', '2022 Q3', '2022 Q4', '2023 Q1', '2023 Q2', '2023 Q3', '2023 Q4', '2024 Q1', '2024 Q2', '2024 Q3']])

b = pd.DataFrame({"ENCODED_Policy Start Date - Quarter" : a.fit_transform(df[["Policy Start Date - Quarter"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Policy Start Date - Quarter", inplace=True)

### Policy Start Date - Month

In [273]:
df["Policy Start Date - Month"] = df["Policy Start Date - Month"].astype(int)

### Customer Feedback

In [274]:
df["Customer Feedback"].unique()

array(['Poor', 'Average', 'Good'], dtype=object)

In [275]:
a = OrdinalEncoder(categories=[['Poor', 'Average', 'Good']])

b = pd.DataFrame({"ENCODED_Customer Feedback" : a.fit_transform(df[["Customer Feedback"]]).flatten()})

df = pd.concat([df, b], axis=1)
df.drop(columns="Customer Feedback", inplace=True)

### Occupation

In [276]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Occupation"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Occupation", inplace=True)

### Marital Status

In [277]:
a = OneHotEncoder(drop="first", sparse_output=False)

b = pd.DataFrame(
        a.fit_transform(df[["Marital Status"]]),
        columns="ENCODED_" + a.get_feature_names_out()
    )

df = pd.concat([df, b], axis=1)
df.drop(columns="Marital Status", inplace=True)

In [279]:
df

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Health Conscious Level,Health Conscious Level1,Money Per Head,Money Handling Level,Money Handling Level1,Growth,Growth1,Determinstic,Day_Name,Credit by Score,Feedback1,Feedback2,Feedback3,Feedback4,Policy Start Date - Day,Policy Start Date - Month,Sin_Date,Cos_Date,Number of Dependents_MIN_Premium Amount,Number of Dependents_MEAN_Premium Amount,Number of Dependents_MEDIAN_Premium Amount,Number of Dependents_STD_Premium Amount,Number of Dependents_MAX_Premium Amount,Occupation_MIN_Premium Amount,Occupation_MEAN_Premium Amount,Occupation_MEDIAN_Premium Amount,Occupation_STD_Premium Amount,Occupation_MAX_Premium Amount,Education Level_MIN_Premium Amount,Education Level_MEAN_Premium Amount,Education Level_MEDIAN_Premium Amount,Education Level_STD_Premium Amount,Education Level_MAX_Premium Amount,Previous Claims_MIN_Premium Amount,Previous Claims_MEAN_Premium Amount,Previous Claims_MEDIAN_Premium Amount,Previous Claims_STD_Premium Amount,Previous Claims_MAX_Premium Amount,Insurance Duration_MIN_Premium Amount,Insurance Duration_MEAN_Premium Amount,Insurance Duration_MEDIAN_Premium Amount,Insurance Duration_STD_Premium Amount,Insurance Duration_MAX_Premium Amount,Health Conscious Level1_MIN_Premium Amount,Health Conscious Level1_MEAN_Premium Amount,Health Conscious Level1_MEDIAN_Premium Amount,Health Conscious Level1_STD_Premium Amount,Health Conscious Level1_MAX_Premium Amount,Money Per Head_MIN_Premium Amount,Money Per Head_MEAN_Premium Amount,Money Per Head_MEDIAN_Premium Amount,Money Per Head_STD_Premium Amount,Money Per Head_MAX_Premium Amount,Money Handling Level_MIN_Premium Amount,Money Handling Level_MEAN_Premium Amount,Money Handling Level_MEDIAN_Premium Amount,Money Handling Level_STD_Premium Amount,Money Handling Level_MAX_Premium Amount,Money Handling Level1_MIN_Premium Amount,Money Handling Level1_MEAN_Premium Amount,Money Handling Level1_MEDIAN_Premium Amount,Money Handling Level1_STD_Premium Amount,Money Handling Level1_MAX_Premium Amount,Growth_MIN_Premium Amount,Growth_MEAN_Premium Amount,Growth_MEDIAN_Premium Amount,Growth_STD_Premium Amount,Growth_MAX_Premium Amount,Growth1_MIN_Premium Amount,Growth1_MEAN_Premium Amount,Growth1_MEDIAN_Premium Amount,Growth1_STD_Premium Amount,Growth1_MAX_Premium Amount,Determinstic_MIN_Premium Amount,Determinstic_MEAN_Premium Amount,Determinstic_MEDIAN_Premium Amount,Determinstic_STD_Premium Amount,Determinstic_MAX_Premium Amount,ENCODED_Policy Start Date - Year,ENCODED_Policy Start Date - Quarter,ENCODED_Customer Feedback,ENCODED_Occupation_Self-Employed,ENCODED_Occupation_Unemployed,ENCODED_Marital Status_Married,ENCODED_Marital Status_Single
0,19.0,Female,10049.0,1.0,Bachelor's,22.598761,Urban,Premium,2.0,17.0,372.0,5.0,No,Weekly,House,2869.0,4,13740.046488,10049.000000,3738228.0,27.013441,20098.0,5024.500000,528.894737,Saturday,186.0,20098.0,744.0,4.0,45.197521,23,12,-0.975344,-0.220691,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1100.812035,872.0,859.965806,4996.0,2869.0,2869.0,2869.0,,2869.0,26.0,1082.905660,823.0,878.580154,4950.0,2869.0,2869.0,2869.0,,2869.0,2869.0,2869.0,2869.0,,2869.0,25.0,1099.000000,974.0,838.709037,3693.0,25.0,1012.700000,958.5,717.596138,2898.0,479.0,1159.000000,644.0,1147.569896,2869.0,4.0,17.0,0.0,1.0,0.0,1.0,0.0
1,39.0,Female,31678.0,3.0,Master's,15.569731,Rural,Comprehensive,1.0,12.0,694.0,2.0,Yes,Monthly,House,1483.0,2,4857.756069,10559.333333,21984532.0,45.645533,95034.0,10559.333333,812.256410,Monday,694.0,126712.0,2776.0,4.0,62.278924,12,6,-0.998725,0.050489,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1106.883166,878.0,863.675409,4997.0,1483.0,1483.0,1483.0,,1483.0,344.0,913.500000,913.5,805.394624,1483.0,1483.0,1483.0,1483.0,,1483.0,1483.0,1483.0,1483.0,,1483.0,250.0,1310.666667,1483.0,985.862228,2199.0,250.0,866.500000,866.5,871.862661,1483.0,1483.0,1483.000000,1483.0,,1483.0,4.0,15.0,1.0,1.0,0.0,0.0,0.0
2,23.0,Male,25602.0,3.0,High School,47.177549,Suburban,Premium,1.0,14.0,555.0,3.0,Yes,Weekly,House,567.0,4,17361.338138,8534.000000,14209110.0,46.129730,25602.0,25602.000000,1113.130435,Saturday,555.0,204816.0,4440.0,8.0,377.420394,30,9,-0.994867,0.101192,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1104.787490,876.0,865.951488,4999.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1101.733536,872.0,865.791213,4997.0,567.0,567.0,567.0,,567.0,20.0,919.357143,742.0,773.215219,3572.0,567.0,567.0,567.0,,567.0,567.0,567.0,567.0,,567.0,22.0,1135.168675,881.0,819.725130,3243.0,22.0,1133.086420,881.0,822.937288,3243.0,37.0,870.444444,705.0,783.233699,2619.0,4.0,16.0,2.0,1.0,0.0,0.0,0.0
3,21.0,Male,141855.0,2.0,Bachelor's,10.938144,Rural,Basic,1.0,0.0,367.0,1.0,Yes,Daily,Apartment,765.0,3,7350.432875,70927.500000,52060785.0,386.525886,283710.0,70927.500000,6755.000000,Wednesday,367.0,283710.0,734.0,2.0,21.876288,12,6,0.111402,0.993775,20.0,1107.670467,874.0,867.079610,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1097.042977,861.0,865.431191,4988.0,765.0,765.0,765.0,,765.0,765.0,893.500000,893.5,181.726443,1022.0,765.0,765.0,765.0,,765.0,765.0,765.0,765.0,,765.0,765.0,1701.666667,2045.0,820.751688,2295.0,765.0,1405.000000,1405.0,905.096680,2045.0,765.0,765.000000,765.0,,765.0,5.0,19.0,0.0,1.0,0.0,1.0,0.0
4,21.0,Male,39651.0,1.0,Bachelor's,20.376094,Rural,Premium,0.0,8.0,598.0,4.0,Yes,Weekly,House,2022.0,3,6846.367459,39651.000000,23711298.0,66.306020,79302.0,19825.500000,1888.142857,Wednesday,598.0,79302.0,1196.0,0.0,40.752187,1,12,-0.996246,0.086565,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1080.669491,855.0,847.585613,4999.0,20.0,1104.723079,872.0,866.377508,4991.0,2022.0,2022.0,2022.0,0.0,2022.0,271.0,1111.818182,964.0,527.594886,2022.0,2022.0,2022.0,2022.0,,2022.0,2022.0,2022.0,2022.0,,2022.0,48.0,970.000000,879.0,526.211364,2022.0,538.0,1062.200000,921.5,451.404180,2022.0,2022.0,2103.333333,2022.0,140.873466,2266.0,2.0,9.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,50.0,Female,38782.0,1.0,Bachelor's,14.498639,Rural,Premium,1.0,8.0,309.0,2.0,Yes,Daily,Condo,,4,23197.822227,38782.000000,11983638.0,125.508091,77564.0,19391.000000,775.640000,Friday,309.0,155128.0,1236.0,4.0,57.994556,9,7,0.645845,-0.763468,20.0,1107.625281,878.0,866.838892,4994.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.698438,873.0,864.866296,4988.0,20.0,1083.665634,853.0,856.283142,4997.0,20.0,1106.883166,878.0,863.675409,4997.0,,,,,,23.0,1144.529412,883.0,783.855135,4056.0,,,,,,,,,,,24.0,1126.484536,866.0,784.481306,3850.0,24.0,1094.354167,843.5,804.074741,3850.0,33.0,982.777778,793.0,795.568944,2840.0,2.0,8.0,1.0,0.0,0.0,1.0,0.0
1999996,56.0,Female,73462.0,0.0,Master's,8.145748,Rural,Basic,2.0,0.0,452.0,2.0,No,Daily,Apartment,,6,29194.361129,73462.000000,33204824.0,162.526549,220386.0,24487.333333,1311.821429,Tuesday,226.0,587696.0,3616.0,16.0,65.165985,28,3,0.681828,0.731513,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1106.883166,878.0,863.675409,4997.0,,,,,,122.0,707.111111,287.0,774.748257,2275.0,,,,,,,,,,,265.0,1133.333333,860.0,1032.501009,2275.0,265.0,1133.333333,860.0,1032.501009,2275.0,287.0,287.000000,287.0,,287.0,4.0,14.0,2.0,1.0,0.0,0.0,1.0
1999997,26.0,Female,35178.0,0.0,Master's,6.636583,Urban,Comprehensive,2.0,10.0,764.0,6.0,No,Monthly,Apartment,,2,2760.818699,35178.000000,26875992.0,46.044503,105534.0,11726.000000,1353.000000,Monday,382.0,70356.0,1528.0,4.0,13.273167,30,9,-0.709843,0.704360,20.0,1098.032667,867.0,862.433346,4999.0,20.0,1105.604643,876.0,863.360484,4994.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1104.558441,873.0,866.550287,4999.0,,,,,,27.0,1113.365385,888.5,762.397831,4054.0,664.0,1148.0,1057.5,495.437181,1813.0,664.0,1148.0,1057.5,495.437181,1813.0,119.0,1109.831579,903.0,697.169271,3524.0,28.0,1138.514019,906.0,747.570207,3700.0,177.0,1172.428571,507.0,1165.448391,3167.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1999998,34.0,Female,45661.0,3.0,Master's,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,No,Weekly,Condo,,4,17339.725601,15220.333333,21323687.0,97.775161,136983.0,15220.333333,1342.970588,Monday,233.5,182644.0,1868.0,8.0,63.748991,9,5,0.561062,-0.827774,20.0,1102.153646,874.0,863.643171,4997.0,20.0,1099.847641,870.0,864.000609,4999.0,20.0,1102.113989,871.0,866.235322,4997.0,20.0,1157.177084,912.0,898.578219,4991.0,20.0,1105.964399,877.0,866.222189,4988.0,,,,,,776.0,776.000000,776.0,,776.0,,,,,,,,,,,1297.0,1297.000000,1297.0,,1297.0,1297.0,1297.000000,1297.0,,1297.0,,,,,,3.0,11.0,1.0,1.0,0.0,0.0,1.0


#
---
#

# Spliting Data

In [None]:
train = df.iloc[:1200000, :]
test = df.iloc[1200000:, :]

train.shape, test.shape

In [None]:
X = train.drop(columns="Premium Amount")
Y = train["Premium Amount"]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_validate, y_train, y_validate = train_test_split(X, Y, test_size=10000)

In [None]:
x_validate.shape

In [216]:
test.drop(columns="Premium Amount", inplace=True)

In [None]:
test.shape

##
---
##

# Scaling on `df` 

In [None]:
fig, axs = plt.subplots(3, 6, figsize=(20, 9))

for i, ax in zip(x_train.columns, axs.flatten()):
    sns.kdeplot(x_train[i], ax=ax, color="darkgray", fill=True)

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

In [None]:
def do_scaling(scaler):
    var_cols = list(map(lambda x : x.replace(" ", "_"), x_train.columns))
    scalers = {}
    cols = x_train.columns
    
    for i in range(len(cols)):
        scalers[f"SCALER_{var_cols[i]}"] = scaler
        
        x_train[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].fit_transform(x_train[[cols[i]]]).flatten()
        x_train.drop(columns=cols[i], inplace=True)

        x_validate[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(x_validate[[cols[i]]]).flatten()
        x_validate.drop(columns=cols[i], inplace=True)

        test[f"SCALER_{var_cols[i]}"] = scalers[f"SCALER_{var_cols[i]}"].transform(test[[cols[i]]]).flatten()
        test.drop(columns=cols[i], inplace=True)
    
    return scalers

In [None]:
scaler_objects = do_scaling(RobustScaler())
scaler_objects

In [None]:
x_train.head(3)

In [None]:
x_validate.head(3)

In [None]:
test.head(3)

#
---
#

# Joining All Data

In [None]:
train = pd.concat([pd.concat([x_train, y_train], axis=1), pd.concat([x_validate, y_validate], axis=1)]).sort_index()
train.head(3)

In [None]:
test.head(3)

In [None]:
df = pd.concat([train, test])

#
---
#

# Download the `Model Ready df`

In [None]:
df.to_csv("trainable_df.csv", index=False)