#
---
# Gist of this Notebook : 

<ol>
<li><strong>Data Loading:</strong> Loads `trainable_df.csv` into a pandas DataFrame.</li>

<li><strong>Exploration:</strong>
    <ul>
        <li>Displays initial data.</li>
        <li>Uses <code><strong>give_stats_analysis</strong></code> to assess relationships between features and "Premium Amount" using statistical tests (Kruskal-Wallis, Spearman, Chi-Square).</li>
         <li>Checks for missing values.</li>
    </ul>
</li>

 <li><strong>Data Splitting:</strong>
    <ul>
        <li>Splits data into train and test sets.</li>
        <li>Separates features (X) and target (Y), and log-transforms Y.</li>
        <li>Further splits train into train/validate sets for model testing.</li>
   </ul>
</li>

 <li><strong>Feature Prep:</strong>
   <ul>
        <li>Drops target from test set.</li>
        <li>Fills missing values in "SCALER_Previous_Claims_STD_Premium_Amount" with medians for both train and test data.</li>
    </ul>
 </li>

<li><strong>Hyperparameter Optimization:</strong> Uses <code><strong>optuna</strong></code> to tune hyperparameters for LightGBM and XGBoost models using RMSE as the optimization metric.</li>

<li><strong>Cross-Validation:</strong>
    <ul>
         <li>Implements KFold (5 splits in the active cell; an earlier, commented-out experiment used 10).</li>
         <li>Trains a <code><strong>VotingRegressor</strong></code> that combines <code><strong>LightGBM</strong></code> and <code><strong>XGBoost</strong></code>. A stacking variant with an XGBoost model as the final estimator is kept as a commented-out experiment.</li>
        <li>Calculates and prints RMSLE per fold.</li>
        <li>Generates test predictions from each fold, and stores them in a DataFrame.</li>
    </ul>
</li>

<li><strong>Saving the Trained Models and Predictions:</strong>
    <ul>
        <li>Optionally saves the fold models using pickle (these cells are currently commented out).</li>
        <li>Saves test predictions from all folds.</li>
    </ul>
</li>

<li><strong>Final Prediction:</strong> Averages predictions from all folds and exponentiates them.</li>

 <li><strong>Submission:</strong> Generates and saves submission to a CSV file.</li>

</ol>

#####
---
#

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import *
import xgboost as xgb
import optuna
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
import pickle

from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)

In [5]:
# Load the prepared feature table (features plus the "Premium Amount" target) from the working directory.
df = pd.read_csv("trainable_df.csv")

In [6]:
# Preview the loaded frame (pandas shows only the head and tail rows of a large frame).
df

Unnamed: 0,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_Q3_Premium_Amount,SCALER_Previous_Claims_Q1_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_Total_Nulls,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_Q1_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_Q1_Premium_Amount,SCALER_Number_of_Dependents_Q3_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Health_Conscious_Level_Q3_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_Q1_Premium_Amount,SCALER_Insurance_Duration_Q3_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_Q3_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_Q1_Premium_Amount,SCALER_Previ
ous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Occupation_STD_Premium_Amount,Premium Amount
0,-0.391365,-0.978541,0.0,-0.498819,-0.174499,-0.069588,-0.323148,-1.087379,-0.065466,-0.257208,-0.405166,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,0.00,0.0,0.0,-0.122232,0.122232,-0.725022,-0.347664,0.000000,0.0,-0.600771,-0.5,0.333333,0.7,-0.422984,0.0,0.0,-0.014187,-1.469576e-15,1.2,0.0,0.00,0.023383,0.0,0.095653,0.000,0.666667,0.000000,0.540228,0.285714,-0.739699,0.00,0.0,-0.125,-0.500000,-0.501901,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.222222,1.0,-0.956522,-2.154674,0.000000,2869.0
1,0.199672,0.403433,0.0,0.377393,0.111528,-0.046870,0.468853,0.556634,0.216832,0.044643,0.231761,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,0.00,0.0,0.0,-0.519872,0.519872,0.173298,-0.524112,0.000000,0.0,-0.389589,0.0,0.333333,0.5,-0.266712,0.0,1.0,-0.384303,0.000000e+00,0.0,-1.0,-0.75,-1.208313,-1.8,0.000000,0.125,0.000000,0.200000,0.000000,-1.714286,0.458999,1.50,3.0,0.625,0.833333,-0.289410,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.333333,0.0,-0.086957,-0.781814,0.469401,1483.0
2,0.033638,0.137339,0.0,0.098674,0.032683,-0.137028,-0.264976,0.355987,0.516577,0.865019,0.698365,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,0.50,0.0,0.0,1.268215,-1.268215,1.181256,-0.334206,0.000000,1.0,0.177856,0.0,0.333333,0.6,2.616414,0.0,0.0,0.136708,-7.347881e-16,0.6,0.0,0.00,0.023383,0.0,0.000000,0.125,0.000000,0.200000,0.000000,0.285714,-0.557842,0.00,-1.0,-0.875,-0.500000,1.394712,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,-0.782609,0.000000,0.000000,567.0
3,3.210384,-1.000000,0.0,1.821694,5.344500,2.640407,2.462977,-0.501618,6.137265,3.336917,1.169690,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,-0.25,0.0,0.0,-0.781885,0.781885,-0.729443,-0.905794,1.000000,0.0,-0.837878,0.0,0.666667,0.9,-0.636343,0.0,1.0,-0.280435,0.000000e+00,0.0,-0.5,-1.00,-0.976617,-0.8,0.631238,0.250,0.666667,0.666667,3.658057,-0.714286,-1.483870,-2.75,-8.0,-0.625,0.500000,-0.891017,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.666667,1.0,-0.869565,-0.132030,0.000000,765.0
4,0.417543,-0.008584,0.0,0.460315,0.428694,1.248139,0.302581,0.245955,1.288679,0.549988,-0.051473,0.0,0.0,-1.000000,-1.0,-1.000000,-1.0,0.0,0.0,1.0,-0.50,0.0,0.0,-0.247971,0.247971,-0.525199,-0.148785,0.753709,0.0,-0.602527,-0.5,-0.333333,-0.1,-0.463653,0.0,0.0,-0.301439,-1.469576e-15,1.2,-0.5,-1.00,-0.976617,-0.8,0.095653,0.000,0.666667,0.000000,0.540228,-0.714286,0.032506,0.00,0.0,0.000,0.500000,-0.167203,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.333333,1.0,-0.869565,0.218186,0.000000,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,0.393797,-1.248927,0.0,-0.102863,1.337526,1.209456,0.284212,-0.689320,0.180353,0.526292,0.401522,0.0,0.0,-1.000000,-1.0,-1.000000,-1.0,0.0,0.0,1.0,-0.50,1.0,0.0,-0.580465,0.580465,-0.507515,-0.811963,0.753709,0.0,-0.249810,0.5,-0.333333,-0.2,-0.305908,0.0,1.0,0.379908,-2.449294e-16,0.2,0.0,0.00,0.023383,0.0,0.095653,0.000,0.666667,0.000000,0.540228,0.285714,0.458999,1.50,3.0,0.625,-0.500000,-0.849755,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,0.391304,-0.781814,0.000000,
1999996,1.341467,-0.592275,0.0,0.951484,1.851805,2.753229,1.793703,-0.941748,1.498660,0.804228,2.985746,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,1.50,0.0,0.0,-0.939854,0.939854,0.580018,-0.697570,0.000000,1.0,-0.772093,1.0,0.333333,0.4,-0.240299,0.0,1.0,0.173591,7.347881e-16,-0.6,0.5,0.00,0.000000,0.2,-0.904347,-0.875,-0.333333,-0.800000,-4.197114,0.000000,0.458999,1.50,3.0,0.625,0.000000,-0.913249,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,0.333333,1.0,-0.260870,-0.781814,0.000000,
1999997,0.295314,0.420601,0.0,0.500794,0.184489,1.049025,0.579828,0.569579,0.755546,0.108269,-0.104918,0.0,0.0,0.000000,0.0,0.000000,0.0,-1.0,0.0,0.0,-0.25,1.0,0.0,-1.025229,1.025229,-0.436782,0.522617,1.507418,1.0,-0.905742,0.5,-1.000000,-1.0,-0.715050,0.0,0.0,-0.471680,-7.347881e-16,0.6,-1.0,-0.75,-1.208313,-1.8,-0.904347,-0.875,-0.333333,-0.800000,-4.197114,-1.714286,0.000000,0.25,0.0,0.500,0.833333,-0.836267,0.857143,0.380456,-0.6,0.666667,1.0,0.0,0.555556,0.0,-0.652174,0.282128,-0.530599,
1999998,0.581773,-0.570815,0.0,0.345658,0.911788,0.160613,0.912214,-0.933657,0.745554,0.298838,0.565907,52.0,0.0,57.590036,1.0,30.472002,32.5,9.0,0.0,-4.5,0.50,0.0,0.0,-0.499081,0.499081,-0.228117,0.179065,-1.000000,0.0,-0.467194,0.0,0.000000,0.1,-0.253263,0.0,1.0,0.135808,2.449294e-16,-0.2,0.0,0.00,0.023383,0.0,0.000000,0.125,0.000000,0.200000,0.000000,0.285714,0.277687,1.25,0.0,0.500,-0.500000,-0.587304,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.666667,1.0,-0.304348,0.161898,0.000000,


#
---
#

# Just Checking!

In [7]:
def return_splits(ddf, feature_name, target_name):
    """Split the target column into one Series per distinct feature value.

    Groups are returned in the order the values first appear in
    ``ddf[feature_name]``; each group keeps the original row index.
    """
    groups = []
    for level in ddf[feature_name].unique():
        row_mask = ddf[feature_name] == level
        groups.append(ddf.loc[row_mask, target_name])
    return groups

def give_stats_analysis(df, target_column_name, alpha=0.05):
    """Screen every column of ``df`` for a statistical association with the target.

    Rows containing any missing value are dropped first (``df`` itself is not
    modified). For each column the test is chosen from the dtype pairing:

    * categorical vs. numeric      -> Kruskal-Wallis H-test
    * numeric vs. numeric          -> Spearman rank correlation
    * categorical vs. categorical  -> Chi-square test of independence

    A column whose dtype pairing matches none of these (e.g. datetimes), or
    whose test produces no p-value, is reported with NaN results and the
    verdict "Not Tested". The target column itself is part of the loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the features and the target column.
    target_column_name : str
        Name of the target column in ``df``.
    alpha : float, default 0.05
        Significance level; ``p <= alpha`` is reported as a relationship.

    Returns
    -------
    pandas.DataFrame
        One row per column with "Feature", "Target", "Statistic Test",
        "Test Statistic", "P-Value" and "Verdict", sorted by ascending P-Value.
    """
    # dtype predicates instead of string comparison: `dtype == "int"` depends on
    # the platform's default integer width and misses int32/float32 columns.
    is_numeric = pd.api.types.is_numeric_dtype

    def is_categorical(series):
        return (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )

    # dropna() already returns a new frame, so no explicit copy is needed.
    ddf = df.dropna()
    target = ddf[target_column_name]

    features, tests, stats, pvals, verdict = [], [], [], [], []

    for column in ddf.columns:
        feature = ddf[column]
        # Reset per column so an untested column can never inherit the
        # previous column's p-value (or hit an unbound name on the first one).
        test_name, stat, pval = np.nan, np.nan, np.nan

        if is_categorical(feature) and is_numeric(target):
            test_name = "Kruskal-Wallis"
            groups = return_splits(ddf, column, target_column_name)
            if len(groups) >= 2:  # the H-test needs at least two groups
                stat, pval, *_ = kruskal(*groups)

        elif is_numeric(feature) and is_categorical(target):
            # The numeric variable is always the one being grouped.
            test_name = "Kruskal-Wallis"
            groups = return_splits(ddf, target_column_name, column)
            if len(groups) >= 2:
                stat, pval, *_ = kruskal(*groups)

        elif is_numeric(feature) and is_numeric(target):
            test_name = "SpearmanR"
            stat, pval, *_ = spearmanr(feature, target)

        elif is_categorical(feature) and is_categorical(target):
            test_name = "Chi-Square"
            stat, pval, *_ = chi2_contingency(pd.crosstab(feature, target))

        features.append(column)
        tests.append(test_name)
        stats.append(stat)
        pvals.append(pval)

        if pd.isna(pval):
            verdict.append("Not Tested")
        elif pval <= alpha:
            verdict.append("There is Relationship")
        else:
            verdict.append("There is NO Relationship")

        print(f"{feature.name} ■■■ {target_column_name}".ljust(50, "-")+"✅")

    return pd.DataFrame({
        "Feature" : features,
        "Target" : [target_column_name] * len(features),
        "Statistic Test" : tests,
        "Test Statistic" : stats,
        "P-Value" : pvals,
        "Verdict" : verdict
    }).sort_values(by="P-Value")

In [8]:
# Run the association screen of every column against the target; the function drops rows with any NaN first.
give_stats_analysis(df, "Premium Amount")

SCALER_Annual_Income ■■■ Premium Amount-----------✅
SCALER_Credit_Score ■■■ Premium Amount------------✅
SCALER_IsNull_Annual_Income ■■■ Premium Amount----✅
SCALER_Money_Handling_Level ■■■ Premium Amount----✅
SCALER_Money_Handling_Level1 ■■■ Premium Amount---✅
SCALER_Money_Per_Head ■■■ Premium Amount----------✅
SCALER_Growth ■■■ Premium Amount------------------✅
SCALER_Credit_by_Score ■■■ Premium Amount---------✅
SCALER_Determinstic ■■■ Premium Amount------------✅
SCALER_Growth1 ■■■ Premium Amount-----------------✅
SCALER_Feedback1 ■■■ Premium Amount---------------✅
SCALER_Previous_Claims_MEDIAN_Premium_Amount ■■■ Premium Amount✅
SCALER_IsNull_Health_Score ■■■ Premium Amount-----✅
SCALER_Previous_Claims_MEAN_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claims ■■■ Premium Amount---------✅
SCALER_Previous_Claims_STD_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claims_Q3_Premium_Amount ■■■ Premium Amount✅
SCALER_Previous_Claims_Q1_Premium_Amount ■■■ Premium Amount✅
SCALER_IsNul

Unnamed: 0,Feature,Target,Statistic Test,Test Statistic,P-Value,Verdict
0,SCALER_Annual_Income,Premium Amount,SpearmanR,-0.061831,0.000000,There is Relationship
1,SCALER_Credit_Score,Premium Amount,SpearmanR,-0.036687,0.000000,There is Relationship
2,SCALER_IsNull_Annual_Income,Premium Amount,SpearmanR,-0.065399,0.000000,There is Relationship
3,SCALER_Money_Handling_Level,Premium Amount,SpearmanR,-0.072097,0.000000,There is Relationship
4,SCALER_Money_Handling_Level1,Premium Amount,SpearmanR,-0.048668,0.000000,There is Relationship
...,...,...,...,...,...,...
61,SCALER_Insurance_Duration_MAX_Premium_Amount,Premium Amount,SpearmanR,0.002807,0.002103,There is Relationship
62,SCALER_ENCODED_Occupation_Self-Employed,Premium Amount,SpearmanR,-0.002737,0.002717,There is Relationship
63,SCALER_Age,Premium Amount,SpearmanR,-0.002377,0.009204,There is Relationship
64,SCALER_Insurance_Duration_STD_Premium_Amount,Premium Amount,SpearmanR,0.002200,0.015957,There is Relationship


In [9]:
# Count missing values per column.
df.isna().sum()

SCALER_Annual_Income                                 0
SCALER_Credit_Score                                  0
SCALER_IsNull_Annual_Income                          0
SCALER_Money_Handling_Level                          0
SCALER_Money_Handling_Level1                         0
                                                 ...  
SCALER_ENCODED_Occupation_Self-Employed              0
SCALER_Age                                           0
SCALER_Insurance_Duration_STD_Premium_Amount         0
SCALER_Occupation_STD_Premium_Amount                 0
Premium Amount                                  800000
Length: 67, dtype: int64

#
---
#

# Removing unrelated columns

In [10]:
# df.drop(columns=["SCALER_ENCODED_Marital_Status_Married", "SCALER_ENCODED_Customer_Feedback", "SCALER_PC2_Meaningless_df", "SCALER_PC1_Meaningless_df", "SCALER_ENCODED_Occupation_Unemployed"], inplace=True)

#
---
#

# Splitting Data

In [11]:
# Positional split: the first 1,200,000 rows form the training frame, the remainder the test frame.
train, test = df.iloc[:1_200_000], df.iloc[1_200_000:]

train.shape, test.shape

((1200000, 67), (800000, 67))

In [12]:
# Target on its original scale, the feature matrix without it, and the log1p-transformed target.
Y = train["Premium Amount"]
X = train.drop("Premium Amount", axis=1)
Y_log = np.log1p(Y)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 10,000 random rows for hyperparameter tuning. The split is seeded so
# that tuning results are reproducible across kernel restarts. Note that Y is
# the target on its original (un-logged) scale.
x_train, x_validate, y_train, y_validate = train_test_split(X, Y, test_size=10000, random_state=42)

In [15]:
# Sanity check: the hold-out set should contain the 10,000 rows requested via test_size.
x_validate.shape

(10000, 66)

In [16]:
# Drop the target column from the test frame. Rebinding to a new frame avoids
# mutating a slice of `df` in place, and errors="ignore" keeps the cell
# idempotent when it is re-run.
test = test.drop(columns="Premium Amount", errors="ignore")

In [17]:
# Confirm the target column is gone: the test frame should now have the same column count as X.
test.shape

(800000, 66)

In [18]:
# Peek at a few hold-out rows; the index shows they were sampled from across the training frame.
x_validate.head(3)

Unnamed: 0,SCALER_Annual_Income,SCALER_Credit_Score,SCALER_IsNull_Annual_Income,SCALER_Money_Handling_Level,SCALER_Money_Handling_Level1,SCALER_Money_Per_Head,SCALER_Growth,SCALER_Credit_by_Score,SCALER_Determinstic,SCALER_Growth1,SCALER_Feedback1,SCALER_Previous_Claims_MEDIAN_Premium_Amount,SCALER_IsNull_Health_Score,SCALER_Previous_Claims_MEAN_Premium_Amount,SCALER_Previous_Claims,SCALER_Previous_Claims_STD_Premium_Amount,SCALER_Previous_Claims_Q3_Premium_Amount,SCALER_Previous_Claims_Q1_Premium_Amount,SCALER_IsNull_Customer_Feedback,SCALER_Previous_Claims_MAX_Premium_Amount,SCALER_Feedback3,SCALER_IsNull_Previous_Claims,SCALER_IsNull_Marital_Status,SCALER_Health_Score,SCALER_Health_Risk_Score,SCALER_Feedback2,SCALER_CreditInsurance,SCALER_Sin_Year,SCALER_IsNull_Credit_Score,SCALER_Health_Age_Interaction,SCALER_Total_Nulls,SCALER_ENCODED_Policy_Start_Date_-_Year,SCALER_ENCODED_Policy_Start_Date_-_Quarter,SCALER_Feedback4,SCALER_IsNull_Number_of_Dependents,SCALER_IsNull_Occupation,SCALER_Health_Conscious_Level1,SCALER_Sin_Month,SCALER_Policy_Start_Date_-_Month,SCALER_Health_Conscious_Level,SCALER_Health_Conscious_Level_Q1_Premium_Amount,SCALER_Health_Conscious_Level_MEAN_Premium_Amount,SCALER_Health_Conscious_Level_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_MEAN_Premium_Amount,SCALER_Number_of_Dependents_MEDIAN_Premium_Amount,SCALER_Number_of_Dependents_Q1_Premium_Amount,SCALER_Number_of_Dependents_Q3_Premium_Amount,SCALER_Number_of_Dependents_STD_Premium_Amount,SCALER_Health_Conscious_Level_Q3_Premium_Amount,SCALER_Insurance_Duration_MEAN_Premium_Amount,SCALER_Insurance_Duration_MEDIAN_Premium_Amount,SCALER_Insurance_Duration_Q1_Premium_Amount,SCALER_Insurance_Duration_Q3_Premium_Amount,SCALER_Health_Conscious_Level_MAX_Premium_Amount,SCALER_Credit_Health_Score,SCALER_Occupation_Q3_Premium_Amount,SCALER_Occupation_MEAN_Premium_Amount,SCALER_Occupation_MAX_Premium_Amount,SCALER_Occupation_MEDIAN_Premium_Amount,SCALER_Occupation_Q1_Premium_Amount,SCALER_Previ
ous_Claims_MIN_Premium_Amount,SCALER_Insurance_Duration_MAX_Premium_Amount,SCALER_ENCODED_Occupation_Self-Employed,SCALER_Age,SCALER_Insurance_Duration_STD_Premium_Amount,SCALER_Occupation_STD_Premium_Amount
1176823,0.561962,-0.48927,0.0,0.370399,0.830207,0.483242,0.414296,-0.116505,0.555505,0.694101,0.548582,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-0.497391,0.497391,-0.194518,-0.679626,-0.246291,0.0,-0.371185,-0.5,-0.666667,-0.8,-0.252169,0.0,0.0,-0.379136,7.347881e-16,-0.6,-1.0,-0.75,-1.208313,-1.8,0.631238,0.25,0.666667,0.666667,3.658057,-1.714286,0.458999,1.5,3.0,0.625,0.833333,-0.559192,0.857143,0.380456,-0.6,0.666667,1.0,0.0,0.333333,0.0,-0.086957,-0.781814,-0.530599
514827,-0.643694,-0.506438,0.0,-0.659469,-0.563235,-0.480637,-0.526951,-0.12945,-0.564377,-0.48678,-0.505759,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.638706,-0.638706,-0.201592,0.398505,-0.246291,0.0,0.130785,-0.5,-0.666667,-0.5,0.482749,0.0,0.0,0.807281,-1.469576e-15,1.2,0.5,0.0,0.0,0.2,-0.904347,-0.875,-0.333333,-0.8,-4.197114,0.0,0.260301,1.0,-1.0,0.625,0.0,0.292664,0.857143,0.380456,-0.6,0.666667,1.0,0.0,0.0,0.0,-0.521739,0.822189,-0.530599
153758,0.764421,-0.184549,0.0,0.721784,0.853476,0.259792,1.67738,0.113269,0.494052,0.182452,0.100198,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0,1.0,-0.5,0.0,0.0,-0.712185,0.712185,-0.56145,-0.210093,0.753709,0.0,-0.416959,-0.5,-0.333333,-0.3,-0.613799,0.0,0.0,0.192217,0.0,0.0,0.0,0.0,0.023383,0.0,0.0,0.125,0.0,0.2,0.0,0.285714,0.032506,0.0,0.0,0.0,-0.5,-0.646115,-0.142857,-0.619544,0.4,-0.333333,0.0,0.0,-0.333333,1.0,0.304348,0.218186,0.0


In [19]:
# Allow up to 100 rows in displayed output so a full per-feature Series is not truncated.
pd.set_option("display.max_rows", 100)

In [20]:
# Show every feature value of the first hold-out row.
x_validate.iloc[0]

SCALER_Annual_Income                                   5.619620e-01
SCALER_Credit_Score                                   -4.892704e-01
SCALER_IsNull_Annual_Income                            0.000000e+00
SCALER_Money_Handling_Level                            3.703993e-01
SCALER_Money_Handling_Level1                           8.302072e-01
SCALER_Money_Per_Head                                  4.832416e-01
SCALER_Growth                                          4.142957e-01
SCALER_Credit_by_Score                                -1.165049e-01
SCALER_Determinstic                                    5.555046e-01
SCALER_Growth1                                         6.941008e-01
SCALER_Feedback1                                       5.485817e-01
SCALER_Previous_Claims_MEDIAN_Premium_Amount           0.000000e+00
SCALER_IsNull_Health_Score                             0.000000e+00
SCALER_Previous_Claims_MEAN_Premium_Amount             0.000000e+00
SCALER_Previous_Claims                          

#
---
#

# Base Model Building

In [21]:
# from xgboost import XGBRegressor
# from sklearn.metrics import root_mean_squared_log_error

In [22]:
# model = XGBRegressor(verbosity=3)

In [23]:
# model.fit(x_train, np.log(y_train))

In [24]:
# prediction = model.predict(x_validate)

In [25]:
# prediction[prediction < 0] = 0

In [26]:
# root_mean_squared_log_error(y_validate, prediction)

In [27]:
# prediction = pd.DataFrame({"id" : test.index, "Premium Amount" : model.predict(test)})
# prediction

In [28]:
# prediction.to_csv("Dileep's_Submission_Take3.csv", index=False)

#
---
#

# Hyperparameter tuning with RandomizedSearchCV -- abandoned because it takes too much time

In [29]:
# from sklearn.model_selection import RandomizedSearchCV

In [30]:
# esimator = XGBRegressor(n_jobs=-1, verbosity=3)

In [31]:
# param_grid = {
#     "n_estimators" : range(50, 1000),
#     "max_depth" : range(5, 30)
# }

In [32]:
# cv_model = RandomizedSearchCV(estimator=esimator, param_distributions=param_grid, verbose=3, n_jobs=-1, cv=2, n_iter=60)

In [33]:
# cv_model.fit(x_train, y_train)

#
---
#

# Optuna Hyper-parameter Tuning

In [34]:
def objective_lgbm(trial):
    """Optuna objective for LightGBM: hold-out RMSE measured in log1p space.

    Reads the module-level ``x_train``, ``y_train``, ``x_validate`` and
    ``y_validate``. The target is log1p-transformed before fitting and scoring
    so the study optimizes the same quantity the final models are trained on
    (``Y_log``) and evaluated with (RMSLE). Tuning on the raw target would
    select hyperparameters for a different loss.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE between log1p(y_validate) and the model's predictions (lower is better).
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 300, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM only performs row subsampling when the bagging frequency is
        # non-zero; without this the tuned `subsample` value has no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'device': 'gpu',
        'gpu_platform_id': 1,
        'gpu_device_id': 0,
        'verbose': 3,
    }

    # Same transform as Y_log in the cross-validation stage.
    y_train_log = np.log1p(y_train)
    y_validate_log = np.log1p(y_validate)

    model = LGBMRegressor(**params)
    model.fit(x_train, y_train_log,
              eval_set=[(x_validate, y_validate_log)],
              eval_metric='rmse',
              callbacks=[early_stopping(stopping_rounds=200)])
    preds = model.predict(x_validate)
    return root_mean_squared_error(y_validate_log, preds)

# study_lgbm = optuna.create_study(direction='minimize')
# study_lgbm.optimize(objective_lgbm, n_trials=30)
# print("Best parameters for LightGBM:", study_lgbm.best_params)

In [35]:
# study_lgbm.best_params

In [36]:
def objective_xgb(trial):
    """Optuna objective for XGBoost: hold-out RMSE measured in log1p space.

    Reads the module-level ``x_train``, ``y_train``, ``x_validate`` and
    ``y_validate``. The target is log1p-transformed before fitting and scoring
    so the study optimizes the same quantity the final models are trained on
    (``Y_log``) and evaluated with (RMSLE), rather than raw-scale RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE between log1p(y_validate) and the model's predictions (lower is better).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0),
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
    }

    # Same transform as Y_log in the cross-validation stage.
    y_train_log = np.log1p(y_train)
    y_validate_log = np.log1p(y_validate)

    model = XGBRegressor(**params)
    model.fit(x_train, y_train_log,
              eval_set=[(x_validate, y_validate_log)],
              verbose=3)
    preds = model.predict(x_validate)
    return root_mean_squared_error(y_validate_log, preds)

# study_xgb = optuna.create_study(direction='minimize')
# study_xgb.optimize(objective_xgb, n_trials=30)
# print("Best parameters for XGBoost:", study_xgb.best_params)

In [37]:
# study_xgb.best_params

#
---
#

# Let's do Cross-Validation

In [38]:
# Median-impute the one feature that still has gaps. Assigning the result back
# (instead of calling fillna(inplace=True) on a column selection) guarantees X
# is actually updated; the chained in-place form only modifies a temporary
# object under pandas Copy-on-Write.
STD_PREMIUM_COL = "SCALER_Previous_Claims_STD_Premium_Amount"
X[STD_PREMIUM_COL] = X[STD_PREMIUM_COL].fillna(X[STD_PREMIUM_COL].median())

In [39]:
# Median-impute the same feature in the test frame, using the test frame's own
# median as before. assign() returns a new frame, so the result does not depend
# on chained in-place mutation of a column selection, which is a no-op under
# pandas Copy-on-Write.
STD_PREMIUM_COL = "SCALER_Previous_Claims_STD_Premium_Amount"
test = test.assign(**{STD_PREMIUM_COL: test[STD_PREMIUM_COL].fillna(test[STD_PREMIUM_COL].median())})

In [40]:
# ===============================1.04 RMSLE================================
# ===============Best Parmeters for KNN Imputer Data=======================
# Keyword arguments unpacked into LGBMRegressor in the cross-validation loop.
lgb_params = dict(
    n_estimators=470,
    learning_rate=0.055186215312218706,
    num_leaves=131,
    max_depth=11,
    min_child_samples=100,
    subsample=0.7456958918734639,
    colsample_bytree=0.8071815211022829,
    reg_alpha=5.4821727206510005,
    reg_lambda=6.187384822358707,
)

# cat_params = {
#     'iterations': 3000,
#     'learning_rate': 0.038365175314273574,
#     'depth': 11,
#     'l2_leaf_reg': 3.596285147607088,
#     'bagging_temperature': 0.2618728648567565
# }

# Keyword arguments unpacked into XGBRegressor in the cross-validation loop.
xgb_params = dict(
    n_estimators=711,
    learning_rate=0.018352261556416866,
    max_depth=9,
    min_child_weight=6,
    subsample=0.9301085945058237,
    colsample_bytree=0.6532668613763417,
    reg_alpha=9.92413527770934,
    reg_lambda=9.101727260657976,
)
# ===============Best Parmeters for KNN Imputer Data=======================
# =========================================================================

# ===============================WASTE - 1.06 RMSLE=========================
# ===============Best Parmeters for MICE Imputer Data=======================
# lgb_params = {
#     'n_estimators': 1385,
#      'learning_rate': 0.14257294852559838,
#      'num_leaves': 60,
#      'max_depth': 9,
#      'min_child_samples': 40,
#      'subsample': 0.9935190832160928,
#      'colsample_bytree': 0.9966322334638195,
#      'reg_alpha': 0.23095019319145593,
#      'reg_lambda': 4.253578962871361
#  }

# # cat_params = {
# #     'iterations': 3000, 
# #     'learning_rate': 0.038365175314273574, 
# #     'depth': 11, 
# #     'l2_leaf_reg': 3.596285147607088, 
# #     'bagging_temperature': 0.2618728648567565
# # }

# xgb_params = {
#     'n_estimators': 2846,
#      'learning_rate': 0.01624734031067095,
#      'max_depth': 9,
#      'min_child_weight': 7,
#      'subsample': 0.927649566424982,
#      'colsample_bytree': 0.6223559022447491,
#      'reg_alpha': 7.0497248789196085,
#      'reg_lambda': 0.9338395072495587
# }
# ===============Best Parmeters for MICE Imputer Data=======================
# =========================================================================



In [41]:
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score


# 5-fold cross-validation of a LightGBM + XGBoost voting ensemble fitted on the
# log1p target. Every fold model is kept in `models`, and its predictions for
# the test frame become one column of `folds_df` (still in log1p space).
# The loop variables of the last fold (model, fold_xtrain, fold_ytrain,
# fold_xtest, fold_ytest) remain defined afterwards and are reused by the
# "Insight about model" cells.
splitter = KFold(n_splits=5)
models = []
fold_scores = []

folds_df = pd.DataFrame()

for count, (train_index, test_index) in enumerate(splitter.split(X, Y_log), start=1):
    fold_xtrain  = X.iloc[train_index, :]
    fold_ytrain  = Y_log.iloc[train_index]
    fold_xtest  = X.iloc[test_index, :]
    fold_ytest  = Y_log.iloc[test_index]

    # LightGBM model
    lgbm_model = LGBMRegressor(**lgb_params, device='gpu', gpu_platform_id=1, gpu_device_id=0)

    # CatBoost model
    # cat_model = CatBoostRegressor(**cat_params, verbose=3, task_type= 'GPU')

    # XGBoost model
    xgb_model = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')

    # Voting Regressor
    model = VotingRegressor(
        estimators=[
            ('lgbm', lgbm_model),
            # ('cat', cat_model),
            ('xgb', xgb_model)
        ]
    )

    model.fit(fold_xtrain, fold_ytrain)

    # Clip log-space predictions at 0 so expm1 never yields a negative premium.
    fold_pred = np.maximum(0, model.predict(fold_xtest))
    fold_rmsle = root_mean_squared_log_error(np.expm1(fold_ytest), np.expm1(fold_pred))
    print(f"Fold {count} RMSLE: {fold_rmsle}")
    fold_scores.append(fold_rmsle)
    models.append(model)

    # Apply the same clipping to the stored test predictions so the submitted
    # values are produced exactly like the ones that were scored above.
    folds_df[f"Fold {count}"] = np.maximum(0, model.predict(test))

print(f"Mean RMSLE over {len(fold_scores)} folds: {np.mean(fold_scores)}")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4923
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 65
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 54 dense feature groups (51.27 MB) transferred to GPU in 0.065125 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 6.593915
Fold 1 RMSLE: 1.0469741822055718
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4923
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 65
[LightGBM] [Info] Using requested OpenCL platform 1 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3050 6GB Laptop GPU, 

In [42]:
# Inspect the fitted fold ensembles collected by the cross-validation loop.
models

[VotingRegressor(estimators=[('lgbm',
                              LGBMRegressor(colsample_bytree=0.8071815211022829,
                                            device='gpu', gpu_device_id=0,
                                            gpu_platform_id=1,
                                            learning_rate=0.055186215312218706,
                                            max_depth=11, min_child_samples=100,
                                            n_estimators=470, num_leaves=131,
                                            reg_alpha=5.4821727206510005,
                                            reg_lambda=6.187384822358707,
                                            subsample=0.7456958918734639)),
                             ('xgb',
                              XGBRegressor(base_s...
                                           importance_type=None,
                                           interaction_constraints=None,
                                           learning_

In [43]:
# from lightgbm import LGBMRegressor
# # from catboost import CatBoostRegressor
# from sklearn.linear_model import LinearRegression, Lasso
# from xgboost import XGBRegressor
# from sklearn.ensemble import StackingRegressor
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score


# splitter = KFold(n_splits=10)
# models = []


# count = 1

# folds_df = pd.DataFrame()

# for train_index, test_index in splitter.split(X, Y_log):
#     fold_xtrain  = X.iloc[train_index, :]
#     fold_ytrain  = Y_log.iloc[train_index]
#     fold_xtest  = X.iloc[test_index, :]
#     fold_ytest  = Y_log.iloc[test_index]

#     # LightGBM model
#     lgbm_model = LGBMRegressor(**lgb_params, device='gpu', gpu_platform_id=1, gpu_device_id=0)

#     # CatBoost model
#     # cat_model = CatBoostRegressor(**cat_params, verbose=3, task_type= 'GPU')

#     # XGBoost model
#     xgb_model = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')
#     xgb_model1 = XGBRegressor(**xgb_params, tree_method="gpu_hist", predictor= 'gpu_predictor')

#     # Linear Regression
#     lir_model = Lasso()

#     # Voting Regressor
#     # vote_model = VotingRegressor(
#     #     estimators=[
#     #         ('lgbm', lgbm_model),
#     #         # ('cat', cat_model),
#     #         ('xgb', xgb_model)
#     #     ]
#     # )
    
#     model = StackingRegressor(
#         estimators=[
#             ('lgbm', lgbm_model),
#             # ('cat', cat_model),
#             ('xgb', xgb_model)
#         ], final_estimator=xgb_model1
#     )

#     model.fit(fold_xtrain, fold_ytrain)
#     fold_pred = np.maximum(0, model.predict(fold_xtest))
#     fold_rmsle = root_mean_squared_log_error(np.expm1(fold_ytest), np.expm1(fold_pred))
#     print(f"Fold {count} RMSLE: {fold_rmsle}")
#     models.append(model)

#     folds_df[f"Fold {count}"] = model.predict(test)
#     count += 1

In [44]:
# splitter = KFold(n_splits=10)

In [45]:
# from sklearn.ensemble import GradientBoostingRegressor
# from xgboost import XGBRegressor
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score
# gb_model = GradientBoostingRegressor(verbose=3, loss="absolute_error", learning_rate=0.5, n_estimators=150)
# xg_model = XGBRegressor(verbose=3, loss="absolute_error", learning_rate=0.075, n_estimators=100)

# count = 1

# folds_df = pd.DataFrame()

# for train_index, test_index in splitter.split(X, Y_log):
#     fold_xtrain  = X.iloc[train_index, :]
#     fold_ytrain  = Y_log.iloc[train_index]
#     fold_xtest  = X.iloc[test_index, :]
#     fold_ytest  = Y_log.iloc[test_index]

#     # print(fold_xtrain.shape, fold_ytrain.shape, fold_xtest.shape, fold_ytest.shape)

#     fold_ypred = xg_model.fit(fold_xtrain, fold_ytrain).predict(fold_xtest)
#     print(f"The Root mean squared log error for FOLD - {count} is --> {root_mean_squared_log_error(fold_ytest, fold_ypred)}")
    
#     folds_df[f"Fold {count}"] = xg_model.predict(test)
#     count += 1

In [46]:
# with open('models.pkl', 'wb') as file:
#     pickle.dump(models, file)

# print("Cross-validation models saved!")

In [47]:
# import gzip
# import pickle

# with gzip.open('models.pkl.gz', 'wb') as f:
#     pickle.dump(models, f)


In [48]:
# with gzip.open('models.pkl.gz', 'rb') as f:
#     models = pickle.load(f)

In [49]:
# from sklearn.metrics import root_mean_squared_log_error, root_mean_squared_error, r2_score

# print(f"The Root mean squared log error is --> {np.exp(root_mean_squared_log_error(fold_ytest, fold_ypred))}")
# print(f"The R-Square is --> {r2_score(fold_ytest, fold_ypred)}")


In [50]:
# Persist the per-fold test predictions (one column per fold, log1p scale) so the
# submission can be rebuilt without retraining.
folds_df.to_csv("folds_prediction.csv", index=False)

#
---
#

In [51]:
# Reload the per-fold test predictions written above; values are still on the log1p scale.
preds = pd.read_csv("folds_prediction.csv")
preds

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5
0,6.479038,6.498585,6.523505,6.619140,6.556611
1,6.744839,6.693970,6.713507,6.693515,6.736408
2,6.663790,6.624553,6.653802,6.690502,6.654886
3,6.704652,6.684525,6.689175,6.681214,6.727800
4,6.582340,6.596033,6.617133,6.598980,6.602212
...,...,...,...,...,...
799995,6.884297,6.874234,6.919119,6.900542,6.862596
799996,6.295704,6.186181,6.209846,6.239298,6.249433
799997,6.637500,6.652418,6.642992,6.663728,6.668590
799998,6.702969,6.711475,6.707851,6.699588,6.704911


In [52]:
# Average the fold predictions row-wise in log space, then map back to the premium scale.
pred = np.expm1(preds.mean(axis=1))

In [53]:
# Build the submission table. The id is taken from the test frame's index, i.e. the
# row position in the original combined frame.
# NOTE(review): this assumes those positions equal the competition ids -- confirm.
prediction = pd.DataFrame({"id" : test.index, "Premium Amount" : pred})
prediction

Unnamed: 0,id,Premium Amount
0,1200000,688.092783
1,1200001,824.878693
2,1200002,777.607255
3,1200003,809.355753
4,1200004,733.609849
...,...,...
799995,1999995,979.593223
799996,1999996,509.858430
799997,1999997,774.141466
799998,1999998,815.771169


In [54]:
# Write the submission file (id, Premium Amount) without the DataFrame index.
prediction.to_csv("Dileep's_Submission_Take34.csv", index=False)

In [55]:
# Take 20 did after Feature engineering
# Before -- 821 Rank
# After -- 757

#
---
#

# Insight about model

In [56]:
# fold_xtrain, fold_ytrain, fold_xtest, fold_ytest

In [58]:
from sklearn.metrics import r2_score, root_mean_squared_error, root_mean_squared_log_error, mean_absolute_percentage_error

In [59]:
# Map the fold targets back to the original scale with expm1.
# NOTE: the variables are overwritten, so re-running this cell applies expm1 a second time.
fold_ytrain, fold_ytest = np.expm1(fold_ytrain), np.expm1(fold_ytest)

In [61]:
# R^2 on the original scale: model outputs are passed through expm1 before scoring.
test_pred = np.expm1(model.predict(fold_xtest))
print(f" R2 for test ---> {r2_score(fold_ytest, test_pred)}")
train_pred = np.expm1(model.predict(fold_xtrain))
print(f" R2 for train  ---> {r2_score(fold_ytrain, train_pred)}")

 R2 for test ---> -0.14123185693346763
 R2 for train  ---> -0.10389365735741918


In [62]:
# RMSE on the original scale: model outputs are passed through expm1 before scoring.
test_pred = np.expm1(model.predict(fold_xtest))
print(f" RMSE for test ---> {root_mean_squared_error(fold_ytest, test_pred)}")
train_pred = np.expm1(model.predict(fold_xtrain))
print(f" RMSE for train  ---> {root_mean_squared_error(fold_ytrain, train_pred)}")

 RMSE for test ---> 920.2918947446881
 RMSE for train  ---> 909.7475119128519


In [63]:
# RMSLE computed from original-scale targets and expm1-transformed model outputs.
test_pred = np.expm1(model.predict(fold_xtest))
print(f" RMSLE for test ---> {root_mean_squared_log_error(fold_ytest, test_pred)}")
train_pred = np.expm1(model.predict(fold_xtrain))
print(f" RMSLE for train  ---> {root_mean_squared_log_error(fold_ytrain, train_pred)}")

 RMSLE for test ---> 1.0422635785116325
 RMSLE for train  ---> 1.0206258341407601


In [64]:
# MAPE on the original scale: model outputs are passed through expm1 before scoring.
test_pred = np.expm1(model.predict(fold_xtest))
print(f" MAPE for test ---> {mean_absolute_percentage_error(fold_ytest, test_pred)}")
train_pred = np.expm1(model.predict(fold_xtrain))
print(f" MAPE for train  ---> {mean_absolute_percentage_error(fold_ytrain, train_pred)}")

 MAPE for test ---> 1.8285050087414068
 MAPE for train  ---> 1.7343184646536198


In [None]:
# Feature importances of the LightGBM member of the first stored ensemble,
# sorted from most to least important.
lgbm_estimator = models[0].named_estimators_["lgbm"]
importance_columns = {
    "Features": lgbm_estimator.feature_name_,
    "Importance": lgbm_estimator.feature_importances_,
}
a = pd.DataFrame(importance_columns)
a = a.sort_values("Importance", ascending=False)

a

In [60]:
# a.to_csv("featuer_importance1.csv", index=False)

#
---
#

# Let's implement... `STACKING`

In [None]:
# Impute missing values in the claims/premium feature with that column's own median.
# A scalar passed to DataFrame.fillna fills NaNs in *every* column, which would push
# this one column's median into unrelated features; the dict form limits the fill
# to the intended column while still updating X in place.
claims_col = "SCALER_Previous_Claims_STD_Premium_Amount"
X.fillna({claims_col: X[claims_col].median()}, inplace=True)

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, IsolationForest
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_log_error

In [None]:
# Candidate estimators for the stacking experiment.
lir_model = LinearRegression()
knn_model = KNeighborsRegressor()
rf_model = RandomForestRegressor(verbose=3, n_estimators=40)
gb_model = GradientBoostingRegressor(verbose=3, loss="absolute_error", learning_rate=0.5, n_estimators=250)
# XGBoost's constructor keyword for log level is `verbosity` (0-3); `verbose` is not a
# constructor parameter and is only forwarded as an unused booster parameter.
xgb_model = XGBRegressor(verbosity=3)
ridge_model = Ridge()
# NOTE(review): IsolationForest is an outlier detector, not a regressor; it is not used
# by the stacking ensemble defined in this section.
if_model = IsolationForest(verbose=3)

In [None]:
# Base learners combined by the stacking ensemble; the linear regression is the final estimator.
base_estimators = [
    ("rf_model", rf_model),
    # ("knn_model", knn_model),
    ("xgb_model", xgb_model),
    ("gb_model", gb_model),
    # ("lir_model", lir_model)
]
stacked_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=lir_model,
    verbose=3,
)

In [None]:
# Fit the gradient-boosting model on the natural log of the target.
# NOTE(review): if Y was already log-transformed earlier in the notebook, this takes a second log — confirm.
log_target = np.log(Y)
gb_model.fit(X, log_target)

In [None]:
# Predict on the validation features; outputs are on the log scale because the model was fit on np.log(Y).
prediction = gb_model.predict(X=x_validate)

In [None]:
# prediction[prediction < 0] = prediction.mean()

In [None]:
# gb_model was fit on np.log(Y), so `prediction` is on the log scale. Map it back with
# np.exp (as the submission cell does) before scoring; this also guarantees the
# non-negative inputs that RMSLE requires.
# NOTE(review): assumes y_validate is on the same scale as Y — confirm.
root_mean_squared_log_error(y_validate, np.exp(prediction))

In [None]:
# Impute missing values in the claims/premium feature of the test set with that column's
# own median. A scalar passed to DataFrame.fillna fills NaNs in *every* column; the dict
# form limits the fill to the intended column while still updating `test` in place.
claims_col = "SCALER_Previous_Claims_STD_Premium_Amount"
test.fillna({claims_col: test[claims_col].median()}, inplace=True)

In [None]:
# Undo the log applied at fit time with np.exp, then build the submission table
# using the test index as the "id" column.
premium_pred = np.exp(gb_model.predict(test))
prediction = pd.DataFrame({"id": test.index, "Premium Amount": premium_pred})
prediction

In [None]:
# Save the gradient-boosting submission without the DataFrame index.
prediction.to_csv(path_or_buf="Dileep's_Submission_Take14.csv", index=False)