#
---
# Gist of this Notebook

<ol>
<li><strong>Data Loading:</strong> Loads `trainable_df.csv` into a pandas DataFrame.</li>

<li><strong>Exploration:</strong>
    <ul>
        <li>Displays initial data.</li>
        <li>Uses <code><strong>give_stats_analysis</strong></code> to assess relationships between features and "Premium Amount" using statistical tests (Kruskal-Wallis, Spearman, Chi-Square).</li>
         <li>Checks for missing values.</li>
    </ul>
</li>

 <li><strong>Data Splitting:</strong>
    <ul>
        <li>Splits data into train and test sets.</li>
        <li>Separates features (X) and target (Y), and log-transforms Y.</li>
        <li>Further splits train into train/validate sets for model testing.</li>
   </ul>
</li>

 <li><strong>Feature Prep:</strong>
   <ul>
        <li>Drops target from test set.</li>
        <li>Fills missing values in "SCALER_Previous_Claims_STD_Premium_Amount" with medians for both train and test data.</li>
    </ul>
 </li>

<li><strong>Hyperparameter Optimization:</strong> Uses <code><strong>optuna</strong></code> to tune hyperparameters for LightGBM and XGBoost models using RMSE as the optimization metric.</li>

<li><strong>Cross-Validation:</strong>
    <ul>
         <li>Implements KFold (10 splits).</li>
         <li>Trains a <code><strong>VotingRegressor</strong></code> that combines <code><strong>LightGBM</strong></code> and <code><strong>XGBoost</strong></code>. A stacking variant (<code><strong>StackingRegressor</strong></code> with an XGBoost model as the final estimator) is kept in the notebook but is commented out.</li>
        <li>Calculates and prints RMSLE per fold.</li>
        <li>Generates test predictions from each fold, and stores them in a DataFrame.</li>
    </ul>
</li>

<li><strong>Saving the Trained Models and Predictions:</strong>
    <ul>
        <li>Saves models using pickle.</li>
        <li>Saves test predictions from all folds.</li>
    </ul>
</li>

<li><strong>Final Prediction:</strong> Averages predictions from all folds and exponentiates them.</li>

 <li><strong>Submission:</strong> Generates and saves submission to a CSV file.</li>

</ol>

#####
---
#

In [82]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb
import optuna
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
import pickle

from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [83]:
df = pd.read_csv("trainable_df.csv")

In [84]:
df

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_Q3_Premium_Amount,SCALER_Previous_Claims_Q1_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_Total_Nulls,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_Q1_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_Q1_Premium_Amount,SCALER_Number_of_Dependents_Q3_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Health_Conscious_Level_Q3_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_Q1_Premium_Amount,SCALER_Insurance_Duration_Q3_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_Q3_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_Q1_Prem
ium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,Premium Amount
0,4,-0.391440,-0.978541,0.0,-0.498898,-0.174525,-0.069667,-0.323100,-1.087379,-0.065546,-0.257316,-0.405109,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,0.00,0.0,0.0,-0.122520,0.122520,-0.725022,-0.347664,0.000000,0.0,-0.600823,-0.5,0.333333,0.7,-0.422895,0.0,0.0,-0.014100,-1.469576e-15,1.2,0.00,0.023383,0.0,0.095653,0.000,0.666667,0.000000,0.540228,0.285714,-0.739699,0.00,0.0,-0.125,-0.500000,-0.502004,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.222222,1.0,-0.956522,-2.154674,0.000000,2869.0
1,2,0.199710,0.403433,0.0,0.377333,0.111512,-0.046949,0.468784,0.556634,0.216782,0.044581,0.231841,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,0.00,0.0,0.0,-0.520202,0.520202,0.173298,-0.524112,0.000000,0.0,-0.389619,0.0,0.333333,0.5,-0.266654,0.0,1.0,-0.384119,0.000000e+00,0.0,-0.75,-1.208313,-1.8,0.000000,0.125,0.000000,0.200000,0.000000,-1.714286,0.458999,1.50,3.0,0.625,0.833333,-0.289457,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.333333,0.0,-0.086957,-0.781814,0.469401,1483.0
2,4,0.033645,0.137339,0.0,0.098608,0.032665,-0.137109,-0.264937,0.355987,0.516558,0.865083,0.698462,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,0.50,0.0,0.0,1.268077,-1.268077,1.181256,-0.334206,0.000000,1.0,0.177887,0.0,0.333333,0.6,2.615906,0.0,0.0,0.136757,-7.347881e-16,0.6,0.00,0.023383,0.0,0.000000,0.125,0.000000,0.200000,0.000000,0.285714,-0.557842,0.00,-1.0,-0.875,-0.500000,1.395108,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,-0.782609,0.000000,0.000000,567.0
3,3,3.210998,-1.000000,0.0,1.821665,5.344662,2.640390,2.462612,-0.501618,6.137843,3.337361,1.169803,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,-0.25,0.0,0.0,-0.782244,0.782244,-0.729443,-0.905794,1.000000,0.0,-0.837955,0.0,0.666667,0.9,-0.636212,0.0,1.0,-0.280279,0.000000e+00,0.0,-1.00,-0.976617,-0.8,0.631238,0.250,0.666667,0.666667,3.658057,-0.714286,-1.483870,-2.75,-8.0,-0.625,0.500000,-0.891221,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.666667,1.0,-0.869565,-0.132030,0.000000,765.0
4,3,0.417623,-0.008584,0.0,0.460256,0.428689,1.248090,0.302536,0.245955,1.288743,0.550004,-0.051403,0.0,0.0,-1.000000,-1.0,-1.000000,-1.0,0.0,0.0,1.0,-0.50,0.0,0.0,-0.248272,0.248272,-0.525199,-0.148785,0.753709,0.0,-0.602579,-0.5,-0.333333,-0.1,-0.463556,0.0,0.0,-0.301277,-1.469576e-15,1.2,-1.00,-0.976617,-0.8,0.095653,0.000,0.666667,0.000000,0.540228,-0.714286,0.032506,0.00,0.0,0.000,0.500000,-0.167217,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.333333,1.0,-0.869565,0.218186,0.000000,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,4,0.393872,-1.248927,0.0,-0.102934,1.337552,1.209406,0.284170,-0.689320,0.180299,0.526304,0.401608,0.0,0.0,-1.000000,-1.0,-1.000000,-1.0,0.0,0.0,1.0,-0.50,1.0,0.0,-0.580801,0.580801,-0.507515,-0.811963,0.753709,0.0,-0.249825,0.5,-0.333333,-0.2,-0.305842,0.0,1.0,0.379893,-2.449294e-16,0.2,0.00,0.023383,0.0,0.095653,0.000,0.666667,0.000000,0.540228,0.285714,0.458999,1.50,3.0,0.625,-0.500000,-0.849948,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,0.391304,-0.781814,0.000000,
1999996,5,1.341724,-0.592275,0.0,0.951435,1.851848,2.753215,1.793438,-0.941748,1.498745,0.804284,2.985924,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,1.50,0.0,0.0,-0.940229,0.940229,0.580018,-0.697570,0.000000,1.0,-0.772163,1.0,0.333333,0.4,-0.240246,0.0,1.0,0.173630,7.347881e-16,-0.6,0.00,0.000000,0.2,-0.904347,-0.875,-0.333333,-0.800000,-4.197114,0.000000,0.458999,1.50,3.0,0.625,0.000000,-0.913459,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,-0.260870,-0.781814,0.000000,
1999997,2,0.295370,0.420601,0.0,0.500736,0.184476,1.048971,0.579742,0.569579,0.755552,0.108217,-0.104850,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,-0.25,1.0,0.0,-1.025614,1.025614,-0.436782,0.522617,1.507418,1.0,-0.905826,0.5,-1.000000,-1.0,-0.714903,0.0,0.0,-0.471473,-7.347881e-16,0.6,-0.75,-1.208313,-1.8,-0.904347,-0.875,-0.333333,-0.800000,-4.197114,-1.714286,0.000000,0.25,0.0,0.500,0.833333,-0.836457,0.857143,0.380456,-0.6,0.666667,1.0,0.0,0.555556,0.0,-0.652174,0.282128,-0.530599,
1999998,4,0.581885,-0.570815,0.0,0.345597,0.911799,0.160539,0.912079,-0.933657,0.745560,0.298815,0.565999,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,0.50,0.0,0.0,-0.499409,0.499409,-0.228117,0.179065,-1.000000,0.0,-0.467231,0.0,0.000000,0.1,-0.253207,0.0,1.0,0.135856,2.449294e-16,-0.2,0.00,0.023383,0.0,0.000000,0.125,0.000000,0.200000,0.000000,0.285714,0.277687,1.25,0.0,0.500,-0.500000,-0.587428,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.666667,1.0,-0.304348,0.161898,0.000000,


#
---
#

# Just Checking: Feature vs. Target Relationships

In [85]:
def return_splits(ddf, feature_name, target_name):
    """Split the target column into one Series per distinct feature value.

    The returned list follows the order in which the distinct values of
    ``ddf[feature_name]`` are first encountered (``Series.unique()`` order).
    """
    feature_values = ddf[feature_name]
    groups = []
    for level in feature_values.unique():
        # Boolean mask selecting the rows that belong to this level.
        belongs_to_level = feature_values == level
        groups.append(ddf[belongs_to_level][target_name])
    return groups

def give_stats_analysis(df, target_column_name):
    """Test every column of ``df`` for association with the target column.

    Rows containing any missing value are dropped first. For each column
    (the target itself included) the test is chosen from the dtype pair:

    * object vs. numeric  -> Kruskal-Wallis (numeric values grouped by category)
    * numeric vs. numeric -> Spearman rank correlation
    * object vs. object   -> Chi-square test on the contingency table
    * anything else       -> no test; test name, statistic, p-value and
      verdict are all NaN

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the target column.
    target_column_name : str
        Name of the target column in ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per column with "Feature", "Target", "Statistic Test",
        "Test Statistic", "P-Value" and "Verdict", sorted by ascending
        p-value (rows without a p-value come last).
    """
    # dropna() already returns a new frame, so the caller's data is untouched.
    ddf = df.dropna()
    target = ddf[target_column_name]

    # is_numeric_dtype covers every numeric width (float32, int32, ...),
    # not only the platform default that `dtype == "float"` matches.
    target_numeric = pd.api.types.is_numeric_dtype(target)
    target_object = target.dtype == "O"

    features = []
    tests = []
    stats = []
    pvals = []
    verdict = []

    for column in ddf.columns:
        feature = ddf[column]
        feature_numeric = pd.api.types.is_numeric_dtype(feature)
        feature_object = feature.dtype == "O"

        # Reset per column so an untested column can never inherit the
        # previous column's statistic or p-value.
        test_name, stat, pval = np.nan, np.nan, np.nan

        if feature_object and target_numeric:
            # Numeric target grouped by the categorical feature.
            stat, pval, *_ = kruskal(*return_splits(ddf, column, target_column_name))
            test_name = "Kruskal-Wallis"
        elif target_object and feature_numeric:
            # Numeric feature grouped by the categorical target.
            stat, pval, *_ = kruskal(*return_splits(ddf, target_column_name, column))
            test_name = "Kruskal-Wallis"
        elif feature_numeric and target_numeric:
            stat, pval, *_ = spearmanr(feature, target)
            test_name = "SpearmanR"
        elif feature_object and target_object:
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))
            test_name = "Chi-Square"

        features.append(column)
        tests.append(test_name)
        stats.append(stat)
        pvals.append(pval)

        if pd.isna(pval):
            # No test was run (or the test was undefined): no verdict either way.
            verdict.append(np.nan)
        elif pval <= 0.05:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(50, "-")+"✅")

    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name]*len(features),
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

In [86]:
give_stats_analysis(df, "Premium Amount")

Health Conscious Level ■■■ Premium Amount---------✅
SCALER_Annual_Income ■■■ Premium Amount-----------✅
SCALER_Credit_Score ■■■ Premium Amount------------✅
SCALER_IsNull_Annual_Income ■■■ Premium Amount----✅
SCALER_Money_Handling_Level ■■■ Premium Amount----✅
SCALER_Money_Handling_Level1 ■■■ Premium Amount---✅
SCALER_Money_Per_Head ■■■ Premium Amount----------✅
SCALER_Growth ■■■ Premium Amount------------------✅
SCALER_Credit_by_Score ■■■ Premium Amount---------✅
SCALER_Determinstic ■■■ Premium Amount------------✅
SCALER_Growth1 ■■■ Premium Amount-----------------✅
SCALER_Feedback1 ■■■ Premium Amount---------------✅
SCALER_Previous_Claims_MEDIAN_Premium_Amount ■■■ Premium Amount✅
SCALER_IsNull_Health_Score ■■■ Premium Amount-----✅
SCALER_Previous_Claims_MEAN_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claims ■■■ Premium Amount---------✅
SCALER_Previous_Claims_STD_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claims_Q3_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claim

Unnamed: 0,Feature,Target,Statistic Test,Test Statistic,P-Value,Verdict
1,SCALER_Annual_Income,Premium Amount,SpearmanR,-0.061831,0.0,There is Relationship
2,SCALER_Credit_Score,Premium Amount,SpearmanR,-0.036687,0.0,There is Relationship
3,SCALER_IsNull_Annual_Income,Premium Amount,SpearmanR,-0.065399,0.0,There is Relationship
4,SCALER_Money_Handling_Level,Premium Amount,SpearmanR,-0.072097,0.0,There is Relationship
6,SCALER_Money_Per_Head,Premium Amount,SpearmanR,-0.053422,0.0,There is Relationship
5,SCALER_Money_Handling_Level1,Premium Amount,SpearmanR,-0.048668,0.0,There is Relationship
7,SCALER_Growth,Premium Amount,SpearmanR,-0.055,0.0,There is Relationship
8,SCALER_Credit_by_Score,Premium Amount,SpearmanR,-0.05485,0.0,There is Relationship
12,SCALER_Previous_Claims_MEDIAN_Premium_Amount,Premium Amount,SpearmanR,0.037268,0.0,There is Relationship
9,SCALER_Determinstic,Premium Amount,SpearmanR,-0.056869,0.0,There is Relationship


In [87]:
df.isnull().sum()

Health Conscious Level                                      0
SCALER_Annual_Income                                        0
SCALER_Credit_Score                                         0
SCALER_IsNull_Annual_Income                                 0
SCALER_Money_Handling_Level                                 0
SCALER_Money_Handling_Level1                                0
SCALER_Money_Per_Head                                       0
SCALER_Growth                                               0
SCALER_Credit_by_Score                                      0
SCALER_Determinstic                                         0
SCALER_Growth1                                              0
SCALER_Feedback1                                            0
SCALER_Previous_Claims_MEDIAN_Premium_Amount                0
SCALER_IsNull_Health_Score                                  0
SCALER_Previous_Claims_MEAN_Premium_Amount                  0
SCALER_Previous_Claims                                      0
SCALER_P

#
---
#

# Removing Unrelated Columns

In [88]:
# df.drop(columns=["SCALER_ENCODED_Marital_Status_Married", "SCALER_ENCODED_Customer_Feedback", "SCALER_PC2_Meaningless_df", "SCALER_PC1_Meaningless_df", "SCALER_ENCODED_Occupation_Unemployed"], inplace=True)

#
---
#

# Splitting Data

In [89]:
# Number of leading rows of `df` that form the training split; every row
# after that position goes to the test split.
N_TRAIN_ROWS = 1_200_000

train = df.iloc[:N_TRAIN_ROWS, :]
test = df.iloc[N_TRAIN_ROWS:, :]

train.shape, test.shape

((1200000, 67), (800000, 67))

In [90]:
X = train.drop(columns="Premium Amount")
Y = train["Premium Amount"]
Y_log = np.log1p(Y)

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
# Hold out 10,000 training rows for the hyper-parameter search objectives.
# A fixed seed keeps the split (and every score computed on it) reproducible
# across kernel restarts.
# NOTE(review): the target here is the raw Y, while the cross-validation
# further down trains on Y_log -- confirm which scale the tuning should use.
x_train, x_validate, y_train, y_validate = train_test_split(X, Y, test_size=10000, random_state=42)

In [93]:
x_validate.shape

(10000, 66)

In [94]:
# Rebind instead of dropping in place: `test` is a slice of `df`, and
# errors="ignore" lets this cell be re-run after the column is already gone.
test = test.drop(columns="Premium Amount", errors="ignore")

In [95]:
test.shape

(800000, 66)

In [96]:
x_validate.head(3)

Unnamed: 0,Health Conscious Level,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_Q3_Premium_Amount,SCALER_Previous_Claims_Q1_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_Total_Nulls,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_Q1_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_Q1_Premium_Amount,SCALER_Number_of_Dependents_Q3_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Health_Conscious_Level_Q3_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_Q1_Premium_Amount,SCALER_Insurance_Duration_Q3_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_Q3_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_Q1_Prem
ium_Amount,SCALER_Previous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount
445066,6,-0.63775,1.004292,0.0,-0.636884,-0.570143,-0.470844,-0.491652,-0.339806,-0.566018,-0.517236,-0.475619,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,1.5,0.0,0.0,0.812871,-0.812871,1.895668,-0.419439,-0.246291,0.0,0.70168,-0.5,-0.666667,-0.7,2.027157,0.0,0.0,1.447951,2.449294e-16,-0.2,1.0,0.700173,1.6,0.095653,0.0,0.666667,0.0,0.540228,0.142857,0.458999,1.5,3.0,0.625,0.333333,1.644963,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,-0.086957,-0.781814,0.0
651327,6,-0.598502,0.094421,0.0,-0.604548,-0.528195,-0.406919,-0.457085,-0.682848,-0.546021,-0.486414,-0.406985,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,1.5,0.0,0.0,1.044823,-1.044823,1.145889,-0.345421,1.0,0.0,1.610342,-0.5,0.666667,0.9,2.327157,0.0,0.0,0.940717,4.898587e-16,-0.4,1.0,0.700173,1.6,-0.904347,-0.875,-0.333333,-0.8,-4.197114,0.142857,-0.557842,0.0,-1.0,-0.875,0.333333,1.135815,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,0.521739,0.0,0.0
617153,4,1.524079,-1.055794,1.0,0.683855,2.885928,1.266609,0.311328,-0.543689,1.30848,3.839529,3.304812,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,-0.5,0.0,0.0,1.538377,-1.538377,0.198055,0.148037,-1.0,0.0,1.480709,1.0,0.0,0.3,2.965504,1.0,1.0,0.140738,-9.797174e-16,0.8,0.0,0.023383,0.0,0.631238,0.25,0.666667,0.666667,3.658057,0.285714,-1.753579,-2.75,-8.0,-0.5,-0.5,0.382809,0.0,0.0,0.0,0.0,0.0,0.0,-0.666667,0.0,0.043478,-1.064823,0.469401


In [97]:
pd.set_option("display.max_rows", 100)

In [98]:
x_validate.iloc[0]

Health Conscious Level                                 6.000000e+00
SCALER_Annual_Income                                  -6.377501e-01
SCALER_Credit_Score                                    1.004292e+00
SCALER_IsNull_Annual_Income                            0.000000e+00
SCALER_Money_Handling_Level                           -6.368836e-01
SCALER_Money_Handling_Level1                          -5.701433e-01
SCALER_Money_Per_Head                                 -4.708440e-01
SCALER_Growth                                         -4.916517e-01
SCALER_Credit_by_Score                                -3.398058e-01
SCALER_Determinstic                                   -5.660184e-01
SCALER_Growth1                                        -5.172362e-01
SCALER_Feedback1                                      -4.756186e-01
SCALER_Previous_Claims_MEDIAN_Premium_Amount           5.200000e+01
SCALER_IsNull_Health_Score                             0.000000e+00
SCALER_Previous_Claims_MEAN_Premium_Amount      

#
---
#

# Base Model Building

In [99]:
# from xgboost import XGBRegressor
# from sklearn.metrics import root_mean_squared_log_error

In [100]:
# model = XGBRegressor(verbosity=3)

In [101]:
# model.fit(x_train, np.log(y_train))

In [102]:
# prediction = model.predict(x_validate)

In [103]:
# prediction[prediction < 0] = 0

In [104]:
# root_mean_squared_log_error(y_validate, prediction)

In [105]:
# prediction = pd.DataFrame({"id" : test.index, "Premium Amount" : model.predict(test)})
# prediction

In [106]:
# prediction.to_csv("Dileep's_Submission_Take3.csv", index=False)

#
---
#

# Hyperparameter Tuning with RandomizedSearchCV -- abandoned, it takes too much time

In [107]:
# from sklearn.model_selection import RandomizedSearchCV

In [108]:
# esimator = XGBRegressor(n_jobs=-1, verbosity=3)

In [109]:
# param_grid = {
#     "n_estimators" : range(50, 1000),
#     "max_depth" : range(5, 30)
# }

In [110]:
# cv_model = RandomizedSearchCV(estimator=esimator, param_distributions=param_grid, verbose=3, n_jobs=-1, cv=2, n_iter=60)

In [111]:
# cv_model.fit(x_train, y_train)

#
---
#

# Optuna Hyper-parameter Tuning

In [112]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM: return the RMSE on the validation split.

    Hyper-parameters are sampled from ``trial``. The model is fitted on the
    notebook-level ``x_train`` / ``y_train`` and evaluated on
    ``x_validate`` / ``y_validate``, using LightGBM's ``early_stopping``
    callback with ``stopping_rounds=200``.
    """
    # Tunable part of the configuration (suggest_* calls kept in the
    # original order).
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
    }
    # Settings that are the same for every trial.
    fixed = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'device': 'gpu',
        'gpu_platform_id': 1,
        'gpu_device_id': 0,
        'verbose': 3,
    }

    regressor = LGBMRegressor(**fixed, **sampled)
    stop_callback = early_stopping(stopping_rounds=200)
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_validate, y_validate)],
        eval_metric='rmse',
        callbacks=[stop_callback],
    )

    validation_preds = regressor.predict(x_validate)
    return root_mean_squared_error(y_validate, validation_preds)

# study_lgbm = optuna.create_study(direction='minimize')
# study_lgbm.optimize(objective_lgbm, n_trials=30)
# print("Best parameters for LightGBM:", study_lgbm.best_params)

In [113]:
# study_lgbm.best_params

In [114]:
def objective_xgb(trial):
    """Optuna objective for XGBoost: return the RMSE on the validation split.

    Hyper-parameters are sampled from ``trial``. The model is fitted on the
    notebook-level ``x_train`` / ``y_train`` with
    ``x_validate`` / ``y_validate`` as the evaluation set, then scored on
    that same validation split.
    """
    # Tunable part of the configuration (suggest_* calls kept in the
    # original order).
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
    }
    # GPU settings shared by every trial.
    gpu_settings = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
    }

    regressor = XGBRegressor(**sampled, **gpu_settings)
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_validate, y_validate)],
        verbose=3,
    )

    validation_preds = regressor.predict(x_validate)
    return root_mean_squared_error(y_validate, validation_preds)

# study_xgb = optuna.create_study(direction='minimize')
# study_xgb.optimize(objective_xgb, n_trials=30)
# print("Best parameters for XGBoost:", study_xgb.best_params)

In [115]:
# study_xgb.best_params

#
---
#

# Let's do Cross-Validation

In [116]:
# Fill the gaps in this column with its own median. The result is assigned
# back to the column: calling fillna(inplace=True) on a selected column is a
# chained in-place update, which pandas deprecates and which leaves X
# unchanged under Copy-on-Write.
X["SCALER_Previous_Claims_STD_Premium_Amount"] = X["SCALER_Previous_Claims_STD_Premium_Amount"].fillna(
    X["SCALER_Previous_Claims_STD_Premium_Amount"].median()
)

In [117]:
# Same imputation for the test rows, using the test column's own median.
# DataFrame.fillna with a {column: value} mapping returns a new frame, so
# this neither relies on a chained in-place update nor writes into a slice
# of `df`.
test = test.fillna({
    "SCALER_Previous_Claims_STD_Premium_Amount": test["SCALER_Previous_Claims_STD_Premium_Amount"].median()
})

In [118]:
# ===============================1.04 RMSLE================================
# ===============Best Parameters for KNN Imputer Data======================
# Fixed hyper-parameters passed to LGBMRegressor in the cross-validation loop.
# NOTE(review): presumably the best_params of the Optuna LightGBM study on the
# KNN-imputed data (per the banner) -- confirm.
lgb_params = {
    'n_estimators': 470,
     'learning_rate': 0.055186215312218706,
     'num_leaves': 131,
     'max_depth': 11,
     'min_child_samples': 100,
     'subsample': 0.7456958918734639,
     'colsample_bytree': 0.8071815211022829,
     'reg_alpha': 5.4821727206510005,
     'reg_lambda': 6.187384822358707
 }

# CatBoost settings, currently disabled (the CatBoost model is commented out
# of the ensemble as well).
# cat_params = {
#     'iterations': 3000,
#     'learning_rate': 0.038365175314273574,
#     'depth': 11,
#     'l2_leaf_reg': 3.596285147607088,
#     'bagging_temperature': 0.2618728648567565
# }

# Fixed hyper-parameters passed to XGBRegressor in the cross-validation loop.
xgb_params = {
    'n_estimators': 711,
     'learning_rate': 0.018352261556416866,
     'max_depth': 9,
     'min_child_weight': 6,
     'subsample': 0.9301085945058237,
     'colsample_bytree': 0.6532668613763417,
     'reg_alpha': 9.92413527770934,
     'reg_lambda': 9.101727260657976
}
# ===============Best Parameters for KNN Imputer Data======================
# =========================================================================

# ===============================WASTE - 1.06 RMSLE=========================
# ===============Best Parmeters for MICE Imputer Data=======================
# lgb_params = {
#     'n_estimators': 1385,
#      'learning_rate': 0.14257294852559838,
#      'num_leaves': 60,
#      'max_depth': 9,
#      'min_child_samples': 40,
#      'subsample': 0.9935190832160928,
#      'colsample_bytree': 0.9966322334638195,
#      'reg_alpha': 0.23095019319145593,
#      'reg_lambda': 4.253578962871361
#  }

# # cat_params = {
# #     'iterations': 3000, 
# #     'learning_rate': 0.038365175314273574, 
# #     'depth': 11, 
# #     'l2_leaf_reg': 3.596285147607088, 
# #     'bagging_temperature': 0.2618728648567565
# # }

# xgb_params = {
#     'n_estimators': 2846,
#      'learning_rate': 0.01624734031067095,
#      'max_depth': 9,
#      'min_child_weight': 7,
#      'subsample': 0.927649566424982,
#      'colsample_bytree': 0.6223559022447491,
#      'reg_alpha': 7.0497248789196085,
#      'reg_lambda': 0.9338395072495587
# }
# ===============Best Parmeters for MICE Imputer Data=======================
# =========================================================================



In [119]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score


# 10-fold cross-validation of a LightGBM + XGBoost voting ensemble trained on
# the log1p-transformed target (Y_log). Each fold:
#   1. fits a fresh ensemble on the fold's training rows,
#   2. reports RMSLE on the held-out rows (both sides mapped back with expm1),
#   3. stores the fitted ensemble and its test-set predictions, which stay on
#      the log1p scale.
splitter = KFold(n_splits=10)
models = []
folds_df = pd.DataFrame()

for fold_no, (train_index, test_index) in enumerate(splitter.split(X, Y_log), start=1):
    fold_xtrain, fold_ytrain = X.iloc[train_index, :], Y_log.iloc[train_index]
    fold_xtest, fold_ytest = X.iloc[test_index, :], Y_log.iloc[test_index]

    # Base learners (CatBoost is currently left out of the ensemble).
    lgbm_model = LGBMRegressor(**lgb_params, device='gpu', gpu_platform_id=1, gpu_device_id=0)
    xgb_model = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')

    model = VotingRegressor(estimators=[('lgbm', lgbm_model), ('xgb', xgb_model)])
    model.fit(fold_xtrain, fold_ytrain)

    # Clip at 0 before inverting the transform so expm1 never yields a
    # negative premium for the metric.
    fold_pred = np.maximum(0, model.predict(fold_xtest))
    fold_rmsle = root_mean_squared_log_error(np.expm1(fold_ytest), np.expm1(fold_pred))
    print(f"Fold {fold_no} RMSLE: {fold_rmsle}")

    models.append(model)
    folds_df[f"Fold {fold_no}"] = model.predict(test)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4917
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 65
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 54 dense feature groups (57.68 MB) transferred to GPU in 0.101891 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 6.593622
Fold 1 RMSLE: 1.0420928618713494
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4917
[LightGBM] [Info] Number of data points in the train set: 1080000, number of used features: 65
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU

In [None]:
# from lightgbm import LGBMRegressor
# # from catboost import CatBoostRegressor
# from sklearn.linear_model import LinearRegression, Lasso
# from xgboost import XGBRegressor
# from sklearn.ensemble import StackingRegressor
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score


# splitter = KFold(n_splits=10)
# models = []


# count = 1

# folds_df = pd.DataFrame()

# for train_index, test_index in splitter.split(X, Y_log):
#     fold_xtrain  = X.iloc[train_index, :]
#     fold_ytrain  = Y_log.iloc[train_index]
#     fold_xtest  = X.iloc[test_index, :]
#     fold_ytest  = Y_log.iloc[test_index]

#     # LightGBM model
#     lgbm_model = LGBMRegressor(**lgb_params, device='gpu', gpu_platform_id=1, gpu_device_id=0)

#     # CatBoost model
#     # cat_model = CatBoostRegressor(**cat_params, verbose=3, task_type= 'GPU')

#     # XGBoost model
#     xgb_model = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')
#     xgb_model1 = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')

#     # Linear Regression
#     lir_model = Lasso()

#     # Voting Regressor
#     # vote_model = VotingRegressor(
#     #     estimators=[
#     #         ('lgbm', lgbm_model),
#     #         # ('cat', cat_model),
#     #         ('xgb', xgb_model)
#     #     ]
#     # )
    
#     model = StackingRegressor(
#         estimators=[
#             ('lgbm', lgbm_model),
#             # ('cat', cat_model),
#             ('xgb', xgb_model)
#         ], final_estimator=xgb_model1
#     )

#     model.fit(fold_xtrain, fold_ytrain)
#     fold_pred = np.maximum(0, model.predict(fold_xtest))
#     fold_rmsle = root_mean_squared_log_error(np.expm1(fold_ytest), np.expm1(fold_pred))
#     print(f"Fold {count} RMSLE: {fold_rmsle}")
#     models.append(model)

#     folds_df[f"Fold {count}"] = model.predict(test)
#     count += 1

In [None]:
# splitter = KFold(n_splits=10)

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor
# from xgboost import XGBRegressor
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score
# gb_model = GradientBoostingRegressor(verbose=3, loss="absolute_error", learning_rate=0.5, n_estimators=150)
# xg_model = XGBRegressor(verbose=3, loss="absolute_error", learning_rate=0.075, n_estimators=100)

# count = 1

# folds_df = pd.DataFrame()

# for train_index, test_index in splitter.split(X, Y_log):
#     fold_xtrain  = X.iloc[train_index, :]
#     fold_ytrain  = Y_log.iloc[train_index]
#     fold_xtest  = X.iloc[test_index, :]
#     fold_ytest  = Y_log.iloc[test_index]

#     # print(fold_xtrain.shape, fold_ytrain.shape, fold_xtest.shape, fold_ytest.shape)

#     fold_ypred = xg_model.fit(fold_xtrain, fold_ytrain).predict(fold_xtest)
#     print(f"The Root mean squared log error for FOLD - {count} is --> {root_mean_squared_log_error(fold_ytest, fold_ypred)}")
    
#     folds_df[f"Fold {count}"] = xg_model.predict(test)
#     count += 1

In [38]:
# Persist the list of fitted fold ensembles (`models`) as an uncompressed pickle.
# NOTE: pickle files can execute arbitrary code when loaded -- only load
# files that come from a trusted source.
with open('models.pkl', 'wb') as file:
    pickle.dump(models, file)

print("Cross-validation models saved!")

Cross-validation models saved!


In [39]:
import gzip
import pickle

# Write the same `models` list again, this time as a gzip-compressed pickle.
with gzip.open('models.pkl.gz', 'wb') as f:
    pickle.dump(models, f)


In [None]:
# with gzip.open('models.pkl.gz', 'rb') as f:
#     models = pickle.load(f)

In [38]:
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score

# print(f"The Root mean squared log error is --> {np.exp(root_mean_squared_log_error(fold_ytest, fold_ypred))}")
# print(f"The R-Square is --> {r2_score(fold_ytest, fold_ypred)}")


In [39]:
folds_df.to_csv("folds_prediction.csv", index=False)

#
---
#

In [40]:
preds = pd.read_csv("folds_prediction.csv")
preds

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Fold 6,Fold 7,Fold 8,Fold 9,Fold 10
0,6.551424,6.485476,6.601262,6.532599,6.573359,6.519237,6.472776,6.641038,6.532362,6.469150
1,6.693403,6.735639,6.719050,6.685140,6.733575,6.682848,6.725292,6.681686,6.720510,6.686640
2,6.669284,6.659231,6.636197,6.655928,6.655478,6.664165,6.689860,6.691477,6.633185,6.643531
3,6.687991,6.700724,6.685077,6.689615,6.694585,6.698281,6.695249,6.703917,6.709956,6.718604
4,6.601223,6.586610,6.594584,6.616701,6.613587,6.607813,6.592059,6.613125,6.597745,6.591645
...,...,...,...,...,...,...,...,...,...,...
799995,6.902434,6.892521,6.884256,6.886408,6.910605,6.916957,6.910586,6.871274,6.879500,6.883066
799996,6.268355,6.142203,6.171807,6.120068,6.172171,6.267633,6.156420,6.191467,6.206396,6.273045
799997,6.682500,6.664797,6.689899,6.653399,6.677367,6.664535,6.613153,6.649033,6.648409,6.661296
799998,6.718887,6.719099,6.686544,6.719136,6.712388,6.706915,6.681868,6.679136,6.714805,6.706521


In [41]:
# Average the per-fold predictions row by row (still on the log1p scale),
# then invert the transform with expm1.
mean_log_pred = preds.mean(axis=1)
pred = np.expm1(mean_log_pred)

In [42]:
prediction = pd.DataFrame({"id" : test.index, "Premium Amount" : pred})
prediction

Unnamed: 0,id,Premium Amount
0,1200000,689.812419
1,1200001,816.604097
2,1200002,779.421176
3,1200003,810.106907
4,1200004,735.205335
...,...,...
799995,1999995,985.102819
799996,1999996,490.251595
799997,1999997,779.893634
799998,1999998,815.094305


In [43]:
prediction.to_csv("Dileep's_Submission_Take34.csv", index=False)

In [None]:
# Submission "Take 20" was made after feature engineering.
# Rank before -- 821
# Rank after  -- 757

#
---
#

# Let's implement... `STACKING`

In [None]:
# Impute only the column the median was computed from. A frame-wide
# X.fillna(<scalar>) would write this one column's median into missing cells
# of every other column as well.
# NOTE(review): if other columns can contain NaN they need their own
# imputation -- confirm against df.isnull().sum().
X["SCALER_Previous_Claims_STD_Premium_Amount"] = X["SCALER_Previous_Claims_STD_Premium_Amount"].fillna(
    X["SCALER_Previous_Claims_STD_Premium_Amount"].median()
)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_log_error

In [None]:
# Candidate base / final estimators for the stacking experiment.
lir_model = LinearRegression()
knn_model = KNeighborsRegressor()
rf_model = RandomForestRegressor(verbose=3, n_estimators=40)
gb_model = GradientBoostingRegressor(verbose=3, loss="absolute_error", learning_rate=0.5, n_estimators=250)
# XGBoost's constructor option is `verbosity` (0-3); `verbose` is not one of
# its parameters, so the original setting had no effect on logging.
xgb_model = XGBRegressor(verbosity=3)
ridge_model = Ridge()
# NOTE(review): IsolationForest is an outlier detector, not a regressor, and
# the stacking cell below does not use it -- confirm whether it is needed.
if_model = IsolationForest(verbose=3)

In [None]:
stacked_model = StackingRegressor(estimators=[
                    ("rf_model", rf_model),
                    # ("knn_model", knn_model),
                    ("xgb_model", xgb_model),
                    ("gb_model", gb_model),
                    # ("lir_model", lir_model)
                ], final_estimator=lir_model, verbose=3)

In [None]:
gb_model.fit(X, np.log(Y))

In [None]:
prediction = gb_model.predict(x_validate)

In [None]:
# prediction[prediction < 0] = prediction.mean()

In [None]:
# gb_model was fitted on np.log(Y), so `prediction` is in log space; map it
# back with np.exp before comparing against the raw-scale y_validate.
# NOTE(review): the x_validate rows come from X, which gb_model was fitted on
# in full, so this is an in-sample score rather than a true hold-out score.
root_mean_squared_log_error(y_validate, np.exp(prediction))

In [None]:
# Impute only the column the median was computed from. A frame-wide
# test.fillna(<scalar>) would write this one column's median into missing
# cells of every other column as well. Rebinding also avoids an in-place
# write into a slice of `df`.
test = test.fillna({
    "SCALER_Previous_Claims_STD_Premium_Amount": test["SCALER_Previous_Claims_STD_Premium_Amount"].median()
})

In [None]:
prediction = pd.DataFrame({"id" : test.index, "Premium Amount" : np.exp(gb_model.predict(test))})
prediction

In [None]:
prediction.to_csv("Dileep's_Submission_Take14.csv", index=False)