# Regression with an Insurance Dataset
## Playground Series - Season 4, Episode 12
Submissions are evaluated using the Root Mean Squared Logarithmic Error (RMSLE).

In [30]:
# Mount Google Drive
from google.colab import drive
import sys

drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/MyDrive')

Mounted at /content/drive


In [31]:
!pip install scikit-optimize
!pip install shap
!pip install xgboost
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import shap
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from google.colab import files

pd.set_option('future.no_silent_downcasting', True)



## Load Train and Test data from google drive

In [32]:
train_set = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/train.csv')
test_set = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/test.csv')

train_set.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [33]:
train_set.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1181295.0,1155051.0,1090328.0,1125924.0,835971.0,1199994.0,1062118.0,1199999.0,1200000.0
mean,599999.5,41.14556,32745.22,2.009934,25.61391,1.002689,9.569889,592.9244,5.018219,1102.545
std,346410.3,13.53995,32179.51,1.417338,12.20346,0.98284,5.776189,149.9819,2.594331,864.9989
min,0.0,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,299999.8,30.0,8001.0,1.0,15.91896,0.0,5.0,468.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.57865,1.0,10.0,595.0,5.0,872.0
75%,899999.2,53.0,44634.0,3.0,34.52721,2.0,15.0,721.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.97591,9.0,19.0,849.0,9.0,4999.0


## Inspect the data further

In [34]:
for col in train_set.columns:
  if train_set[col].dtype == 'object':
    print(col, train_set[col].unique())

for col in train_set.columns:
    print(col, train_set[col].isna().sum())

for col in test_set.columns:
  if train_set[col].dtype == 'object':
    print(col, test_set[col].unique())

for col in test_set.columns:
    print(col, test_set[col].isna().sum())

Gender ['Female' 'Male']
Marital Status ['Married' 'Divorced' 'Single' nan]
Education Level ["Bachelor's" "Master's" 'High School' 'PhD']
Occupation ['Self-Employed' nan 'Employed' 'Unemployed']
Location ['Urban' 'Rural' 'Suburban']
Policy Type ['Premium' 'Comprehensive' 'Basic']
Policy Start Date ['2023-12-23 15:21:39.134960' '2023-06-12 15:21:39.111551'
 '2023-09-30 15:21:39.221386' ... '2021-04-28 15:21:39.129190'
 '2019-11-14 15:21:39.201446' '2020-10-19 15:21:39.118178']
Customer Feedback ['Poor' 'Average' 'Good' nan]
Smoking Status ['No' 'Yes']
Exercise Frequency ['Weekly' 'Monthly' 'Daily' 'Rarely']
Property Type ['House' 'Apartment' 'Condo']
id 0
Age 18705
Gender 0
Annual Income 44949
Marital Status 18529
Number of Dependents 109672
Education Level 0
Occupation 358075
Health Score 74076
Location 0
Policy Type 0
Previous Claims 364029
Vehicle Age 6
Credit Score 137882
Insurance Duration 1
Policy Start Date 0
Customer Feedback 77824
Smoking Status 0
Exercise Frequency 0
Property 

# Feature Engineering

In [35]:
# Age
train_set['Age'] = train_set['Age']

def categorize_age(age):
    if age < 25:
        return '18-25'
    elif age < 40:
        return '25-40'
    elif age < 60:
        return '40-60'
    elif age >= 60:
        return '60+'
    else:
        return 'nan'


train_set['Age Cat'] = train_set['Age'].apply(categorize_age)
train_set = pd.get_dummies(train_set, columns=['Age Cat'], drop_first=False, dtype=float)
train_set['Age'] = train_set['Age'] / 100
train_set['Age'] = train_set['Age'].replace('nan', None).fillna(train_set['Age'].mode()[0])

# Gender
train_set['Gender'] = train_set['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)

# Annual Income
train_set['Annual Income isna'] = train_set['Annual Income'].isna().astype(int)
train_set['Annual Income'] = train_set['Annual Income'].fillna(train_set['Annual Income'].mean())
train_set['Monthly Income'] = train_set['Annual Income'] / 12
train_set['Annual Income SQRT'] = np.sqrt(train_set['Annual Income'])
train_set['Annual Income Log'] = np.log(train_set['Annual Income'])
train_set['Annual Income SQ'] = train_set['Annual Income'] ** 2

# Marital Status
train_set['Marital Status'] = train_set['Marital Status'].fillna('nan')
train_set['Marital Status Int'] = train_set['Marital Status'].replace({'Married': 1, 'Divorced': 0.3, 'Single': 0.2, 'nan': 0}).astype(float)
train_set = pd.get_dummies(train_set, columns=['Marital Status'], drop_first=False, dtype=float)

# Number of Dependents
train_set['Number of Dependents isna'] = train_set['Number of Dependents'].isna().astype(int)
train_set['Number of Dependents'] = train_set['Number of Dependents'].fillna(0)
train_set['Number of Dependents'] = train_set['Number of Dependents'] / train_set['Number of Dependents'].max()

# Education Level
train_set['Education Level'] = train_set['Education Level'].fillna('nan')
train_set['Education'] = train_set['Education Level'].copy()
train_set = pd.get_dummies(train_set, columns=['Education'], drop_first=False, dtype=float)
train_set['Education Level'] = train_set['Education Level'].replace({'nan': -1, 'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}).astype(int)

# Occupation
train_set['Occupation'] = train_set['Occupation'].fillna('nan')
train_set = pd.get_dummies(train_set, columns=['Occupation'], drop_first=False, dtype=float)

# Health Score
train_set['Health Score'] = train_set['Health Score'].fillna(-100)
train_set['Health Score'] = train_set['Health Score'] / 100

# Location
train_set = pd.get_dummies(train_set, columns=['Location'], drop_first=False, dtype=float)

# Policy Type
train_set = pd.get_dummies(train_set, columns=['Policy Type'], drop_first=False, dtype=float)

# Previous Claims
train_set['Previous Claims Isna'] = train_set['Previous Claims'].isna().astype(int)
train_set['Previous Claims'] = train_set['Previous Claims'].fillna(-100)

# Vehicle Age
train_set['Vehicle Age'] = train_set['Vehicle Age'].fillna(train_set['Vehicle Age'].mode()[0])

# Policy Start Date
train_set['Policy Start Date'] = pd.to_datetime(train_set['Policy Start Date'])
train_set['Policy Start Date Year'] = train_set['Policy Start Date'].dt.year
train_set['Policy Start Date Month'] = train_set['Policy Start Date'].dt.month
train_set['Policy Start Date Day'] = train_set['Policy Start Date'].dt.day
train_set['Policy Start Date Weekday'] = train_set['Policy Start Date'].dt.weekday

# Smocking Status
train_set['Smoking Status'] = train_set['Smoking Status'].replace({'Yes': 1, 'No': 0}).astype(int)

# Exercise Frequency
train_set['Exercise Frequency'] = train_set['Exercise Frequency'].replace({'Daily': 1, 'Weekly': 0.5, 'Monthly': 0.3, 'Rarely': 0}).astype(float)

# Property Type
train_set['Property Type Int'] = train_set['Property Type'].replace({'Condo': 1, 'House': 0.4, 'Apartment': 0.2}).astype(float)
train_set = pd.get_dummies(train_set, columns=['Property Type'], drop_first=False, dtype=float)

# Customer Feedback
train_set['Customer Feedback'] = train_set['Customer Feedback'].fillna('nan')
train_set = pd.get_dummies(train_set, columns=['Customer Feedback'], drop_first=False, dtype=float)

# Credit Score
train_set['Credit Score isna'] = train_set['Credit Score'].isna().astype(int)
train_set['Credit Score'] = train_set['Credit Score'].fillna(train_set['Credit Score'].mean())

# Insurance Duration
train_set['Insurance Duration'] = train_set['Insurance Duration'].fillna(train_set['Insurance Duration'].mode()[0])

train_set.drop(columns=['Policy Start Date'], inplace=True)

# Repeat the process for the Test set

In [36]:
# Age
test_set['Age'] = test_set['Age']

def categorize_age(age):
    if age < 25:
        return '18-25'
    elif age < 40:
        return '25-40'
    elif age < 60:
        return '40-60'
    elif age >= 60:
        return '60+'
    else:
        return 'nan'

test_set['Age Cat'] = test_set['Age'].apply(categorize_age)
test_set = pd.get_dummies(test_set, columns=['Age Cat'], drop_first=False, dtype=float)
test_set['Age'] = test_set['Age'] / 100
test_set['Age'] = test_set['Age'].replace('nan', None).fillna(train_set['Age'].mode()[0])

# Gender
test_set['Gender'] = test_set['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)

# Annual Income
test_set['Annual Income isna'] = test_set['Annual Income'].isna().astype(int)
test_set['Annual Income'] = test_set['Annual Income'].fillna(train_set['Annual Income'].mean())
test_set['Monthly Income'] = test_set['Annual Income'] / 12
test_set['Annual Income SQRT'] = np.sqrt(test_set['Annual Income'])
test_set['Annual Income Log'] = np.log(test_set['Annual Income'])
test_set['Annual Income SQ'] = test_set['Annual Income'] ** 2

# Marital Status
test_set['Marital Status'] = test_set['Marital Status'].fillna('nan')
test_set['Marital Status Int'] = test_set['Marital Status'].replace({'Married': 1, 'Divorced': 0.3, 'Single': 0.2, 'nan': 0}).astype(float)
test_set = pd.get_dummies(test_set, columns=['Marital Status'], drop_first=False, dtype=float)

# Number of Dependents
test_set['Number of Dependents isna'] = test_set['Number of Dependents'].isna().astype(int)
test_set['Number of Dependents'] = test_set['Number of Dependents'].fillna(0)
test_set['Number of Dependents'] = test_set['Number of Dependents'] / test_set['Number of Dependents'].max()

# Education Level
test_set['Education Level'] = test_set['Education Level'].fillna('nan')
test_set['Education'] = test_set['Education Level'].copy()
test_set = pd.get_dummies(test_set, columns=['Education'], drop_first=False, dtype=float)
test_set['Education Level'] = test_set['Education Level'].replace({'nan': -1, 'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}).astype(int)

# Occupation
test_set['Occupation'] = test_set['Occupation'].fillna('nan')
test_set = pd.get_dummies(test_set, columns=['Occupation'], drop_first=False, dtype=float)

# Health Score
test_set['Health Score'] = test_set['Health Score'].fillna(-100)
test_set['Health Score'] = test_set['Health Score'] / 100

# Location
test_set = pd.get_dummies(test_set, columns=['Location'], drop_first=False, dtype=float)

# Policy Type
test_set = pd.get_dummies(test_set, columns=['Policy Type'], drop_first=False, dtype=float)

# Previous Claims
test_set['Previous Claims Isna'] = test_set['Previous Claims'].isna().astype(int)
test_set['Previous Claims'] = test_set['Previous Claims'].fillna(-100)

# Vehicle Age
test_set['Vehicle Age'] = test_set['Vehicle Age'].fillna(train_set['Vehicle Age'].mode()[0])

# Policy Start Date
test_set['Policy Start Date'] = pd.to_datetime(test_set['Policy Start Date'])
test_set['Policy Start Date Year'] = test_set['Policy Start Date'].dt.year
test_set['Policy Start Date Month'] = test_set['Policy Start Date'].dt.month
test_set['Policy Start Date Day'] = test_set['Policy Start Date'].dt.day
test_set['Policy Start Date Weekday'] = test_set['Policy Start Date'].dt.weekday

# Smocking Status
test_set['Smoking Status'] = test_set['Smoking Status'].replace({'Yes': 1, 'No': 0}).astype(int)

# Exercise Frequency
test_set['Exercise Frequency'] = test_set['Exercise Frequency'].replace({'Daily': 1, 'Weekly': 0.5, 'Monthly': 0.3, 'Rarely': 0}).astype(float)

# Property Type
test_set['Property Type Int'] = test_set['Property Type'].replace({'Condo': 1, 'House': 0.4, 'Apartment': 0.2}).astype(float)
test_set = pd.get_dummies(test_set, columns=['Property Type'], drop_first=False, dtype=float)

# Customer Feedback
test_set['Customer Feedback'] = test_set['Customer Feedback'].fillna('nan')
test_set = pd.get_dummies(test_set, columns=['Customer Feedback'], drop_first=False, dtype=float)

# Credit Score
test_set['Credit Score isna'] = test_set['Credit Score'].isna().astype(int)
test_set['Credit Score'] = test_set['Credit Score'].fillna(train_set['Credit Score'].mean())

# Insurance Duration
test_set['Insurance Duration'] = test_set['Insurance Duration'].fillna(train_set['Insurance Duration'].mode()[0])

test_set.drop(columns=['Policy Start Date'], inplace=True)

In [37]:
train_set.info()
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 58 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   id                         1200000 non-null  int64  
 1   Age                        1200000 non-null  float64
 2   Gender                     1200000 non-null  int64  
 3   Annual Income              1200000 non-null  float64
 4   Number of Dependents       1200000 non-null  float64
 5   Education Level            1200000 non-null  int64  
 6   Health Score               1200000 non-null  float64
 7   Previous Claims            1200000 non-null  float64
 8   Vehicle Age                1200000 non-null  float64
 9   Credit Score               1200000 non-null  float64
 10  Insurance Duration         1200000 non-null  float64
 11  Smoking Status             1200000 non-null  int64  
 12  Exercise Frequency         1200000 non-null  float64
 13  Premium Amou

In [38]:
print(train_set.shape)
print(test_set.shape)

(1200000, 58)
(800000, 57)


# Prepare the needed sets for training

In [39]:
id_train = train_set['id']
y_train = train_set['Premium Amount']
X_train = train_set.drop(columns=['id', 'Premium Amount'])
y_train_log = np.log(y_train)

id_test = test_set['id']
X_test = test_set.drop(columns=['id'])

# Training all Base models

The parameters of the non-linear models were determined after tuning.

In [42]:
# Function to calculate RMSLE
def calculate_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Log-transform the target variable for training
y_train_log = np.log(y_train)

# Initialize models
xgb_model1 = XGBRegressor(learning_rate = 0.051, max_depth= 7, n_estimators = 133, n_jobs=-1)
xgb_model2 = XGBRegressor(learning_rate = 0.01, max_depth= 8, n_estimators = 300, n_jobs=-1)
xgb_model3 = XGBRegressor(learning_rate = 0.02, max_depth= 3, n_estimators = 600, n_jobs=-1)
xgb_model4 = XGBRegressor(learning_rate = 0.046, max_depth= 5, n_estimators = 70, n_jobs=-1)
xgb_model5 = XGBRegressor(learning_rate = 0.01, max_depth= 10, n_estimators = 489, n_jobs=-1)
xgb_model6 = XGBRegressor(learning_rate = 0.02, max_depth= 8, n_estimators = 172, n_jobs=-1)
xgb_model7 = XGBRegressor(learning_rate = 0.001, max_depth= 7, n_estimators = 100, n_jobs=-1)
xgb_model8 = XGBRegressor(learning_rate = 0.01, max_depth= 8, n_estimators = 330, n_jobs=-1)
xgb_model9 = XGBRegressor(learning_rate = 0.06, max_depth= 5, n_estimators = 700, n_jobs=-1)
xgb_model10 = XGBRegressor(learning_rate = 0.007, max_depth= 4, n_estimators = 250, n_jobs=-1)
xgb_model11 = XGBRegressor(learning_rate = 0.1, max_depth= 1, n_estimators = 150, n_jobs=-1)
xgb_model12 = XGBRegressor(learning_rate = 0.08, max_depth= 2, n_estimators = 550, n_jobs=-1)

# Prepare OOF prediction storage
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds_xgb1 = np.zeros(len(y_train))
oof_preds_xgb2 = np.zeros(len(y_train))
oof_preds_xgb3 = np.zeros(len(y_train))
oof_preds_xgb4 = np.zeros(len(y_train))
oof_preds_xgb5 = np.zeros(len(y_train))
oof_preds_xgb6 = np.zeros(len(y_train))
oof_preds_xgb7 = np.zeros(len(y_train))
oof_preds_xgb8 = np.zeros(len(y_train))
oof_preds_xgb9 = np.zeros(len(y_train))
oof_preds_xgb10 = np.zeros(len(y_train))
oof_preds_xgb11 = np.zeros(len(y_train))
oof_preds_xgb12 = np.zeros(len(y_train))

# Prepare RMSLE scores storage
rmsle_scores = {
    'fold': [],
    'model': [],
    'rmsle': []
}

print("Starting training with 5-fold cross-validation...\n")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    print(f"Fold {fold}:")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_log, y_val = y_train_log.iloc[train_idx], y_train.iloc[val_idx]  # y_val in original scale

    # Models 7 to 12
    for i in range(1, 13):
        model = globals()[f"xgb_model{i}"]  # Dynamically access xgb_model7, xgb_model8, etc.
        model.fit(X_tr, y_tr_log)
        preds = np.exp(model.predict(X_val))  # Transform back from log
        globals()[f"oof_preds_xgb{i}"][val_idx] = preds
        rmsle_scores['fold'].append(fold)
        rmsle_scores['model'].append(f'XGBoost{i}')
        rmsle_scores['rmsle'].append(calculate_rmsle(y_val, preds))
        print(f"  XGBoost{i} predictions done for Fold {fold}")

print("\nCross-validation completed.\n")

# Combine OOF predictions
oof_predictions = pd.DataFrame({
    'xgb1': oof_preds_xgb1,
    'xgb2': oof_preds_xgb2,
    'xgb3': oof_preds_xgb3,
    'xgb4': oof_preds_xgb4,
    'xgb5': oof_preds_xgb5,
    'xgb6': oof_preds_xgb6,
    'xgb7': oof_preds_xgb7,
    'xgb8': oof_preds_xgb8,
    'xgb9': oof_preds_xgb9,
    'xgb10': oof_preds_xgb10,
    'xgb11': oof_preds_xgb11,
    'xgb12': oof_preds_xgb12
})

# Print OOF Predictions
print("Out-of-Fold Predictions:\n")
print(oof_predictions.head())  # Inspect the combined OOF predictions
print(f"Shape of OOF Predictions: {oof_predictions.shape}")

# Create RMSLE Scores DataFrame
rmsle_df = pd.DataFrame(rmsle_scores)

# Pivot the RMSLE scores to create a table
rmsle_pivot = rmsle_df.pivot(index='fold', columns='model', values='rmsle')

# Print the pivoted table directly
print("\nRMSLE Scores: Models vs Folds\n")
print(rmsle_pivot)


Starting training with 5-fold cross-validation...

Fold 1:
  XGBoost1 predictions done for Fold 1
  XGBoost2 predictions done for Fold 1
  XGBoost3 predictions done for Fold 1
  XGBoost4 predictions done for Fold 1
  XGBoost5 predictions done for Fold 1
  XGBoost6 predictions done for Fold 1
  XGBoost7 predictions done for Fold 1
  XGBoost8 predictions done for Fold 1
  XGBoost9 predictions done for Fold 1
  XGBoost10 predictions done for Fold 1
  XGBoost11 predictions done for Fold 1
  XGBoost12 predictions done for Fold 1
Fold 2:
  XGBoost1 predictions done for Fold 2
  XGBoost2 predictions done for Fold 2
  XGBoost3 predictions done for Fold 2
  XGBoost4 predictions done for Fold 2
  XGBoost5 predictions done for Fold 2
  XGBoost6 predictions done for Fold 2
  XGBoost7 predictions done for Fold 2
  XGBoost8 predictions done for Fold 2
  XGBoost9 predictions done for Fold 2
  XGBoost10 predictions done for Fold 2
  XGBoost11 predictions done for Fold 2
  XGBoost12 predictions done fo

In [43]:
# Subset the data to the first 100 IDs for plotting
subset_ids = id_train[:100]
subset_y_train = y_train[:100]

# Create a plot
fig = go.Figure()

# Add traces for each model's OOF predictions
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb1'].iloc[:100], mode='lines', name='XGBoost1'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb2'].iloc[:100], mode='lines', name='XGBoost2'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb3'].iloc[:100], mode='lines', name='XGBoost3'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb4'].iloc[:100], mode='lines', name='XGBoost4'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb5'].iloc[:100], mode='lines', name='XGBoost5'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb6'].iloc[:100], mode='lines', name='XGBoost6'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb7'].iloc[:100], mode='lines', name='XGBoost7'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb8'].iloc[:100], mode='lines', name='XGBoost8'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb9'].iloc[:100], mode='lines', name='XGBoost9'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb10'].iloc[:100], mode='lines', name='XGBoost10'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb11'].iloc[:100], mode='lines', name='XGBoost11'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb12'].iloc[:100], mode='lines', name='XGBoost12'))

# Add the original y_train values
fig.add_trace(go.Scatter(x=subset_ids, y=subset_y_train, mode='lines', name='y_train (Actual)', line=dict(dash='dot')))

# Update layout
fig.update_layout(
    title="Out-of-Fold Predictions vs ID (First 100 IDs)",
    xaxis_title="ID",
    yaxis_title="Predictions",
    legend_title="Model",
    template="plotly_white"
)

# Show the plot
fig.show()


In [44]:
X_train_meta = oof_predictions.copy()
oof_predictions.to_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_V2.csv', index=False) # save the set to save time for next time

In [45]:
X_train_meta = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_V2.csv') # how to load the set

In [50]:
X_train_meta.head()

Unnamed: 0,xgb1,xgb2,xgb3,xgb4,xgb5,xgb6,xgb7,xgb8,xgb9,xgb10,xgb11,xgb12
0,965.9505,947.416443,951.459351,944.171997,936.640869,949.81427,749.208313,951.751953,1000.118835,885.523987,1084.231201,984.968384
1,749.623901,765.63385,754.256775,780.753723,729.19574,761.9646,733.855652,763.064819,720.121948,778.035828,730.398071,720.494995
2,809.297424,810.810791,782.908081,789.702026,819.099182,816.32843,735.304443,811.534851,814.628723,777.369812,729.572144,774.099548
3,741.004272,779.355835,731.927612,766.09375,759.428711,785.117554,731.322327,780.228577,707.883667,767.400757,663.971985,730.155762
4,778.198669,781.349609,751.665894,779.712708,793.538696,782.285645,734.062866,781.954895,775.925537,778.114929,730.341309,728.354492


# XGB Regression

In [57]:
# Define parameter search space
search_spaces = {
    'learning_rate': Real(0.001, 0.5, 'uniform'),
    'max_depth': Integer(1, 14),
    'n_estimators': Integer(50, 300),
}

# Initialize the XGBRegressor model with GPU support
model = XGBRegressor(
    objective='reg:squarederror',
    tree_method = 'hist',
    device = 'cuda',
    verbosity=2
)

kf = KFold(n_splits=5)  # 10-fold cross-validation

# Using negative RMSE as the scoring metric
optimizer = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=1,
    cv=kf,
    scoring='neg_root_mean_squared_error',  # RMSE scoring
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

# Fit the optimizer using X_train and log-transformed y_train
optimizer.fit(X_train_meta, y_train_log)

# Get the best model
best_model = optimizer.best_estimator_

# Save the best model
model_path = '/content/drive/MyDrive/Playground Series - Season 4, Episode 12/models/XGB_4.json'
best_model.save_model(model_path)

# Get cross-validation errors
cv_results = optimizer.cv_results_

# Calculate and print the mean RMSE for each fold (convert negative RMSE back to positive)
mean_rmse = -cv_results['mean_test_score']  # This is negative RMSE, so we negate it to get RMSE

print(f"Best parameters: {optimizer.best_params_}")
print(f"Mean Cross-validation RMSE: {mean_rmse.mean():.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits








Best parameters: OrderedDict([('learning_rate', 0.40429932616686953), ('max_depth', 14), ('n_estimators', 196)])
Mean Cross-validation RMSE: 1.1533


#

In [51]:
linear_meta_model = LinearRegression(copy_X=True, n_jobs=-1)
linear_meta_model.fit(X_train_meta, y_train_log)

In [53]:
# Train XGBoost
xgb_model1.fit(X_train, y_train_log)
xgb_preds1 = np.exp(xgb_model1.predict(X_test))  # Transform back from log

xgb_model2.fit(X_train, y_train_log)
xgb_preds2 = np.exp(xgb_model2.predict(X_test))  # Transform back from log

xgb_model3.fit(X_train, y_train_log)
xgb_preds3 = np.exp(xgb_model3.predict(X_test))  # Transform back from log

xgb_model4.fit(X_train, y_train_log)
xgb_preds4 = np.exp(xgb_model4.predict(X_test))  # Transform back from log

xgb_model5.fit(X_train, y_train_log)
xgb_preds5 = np.exp(xgb_model5.predict(X_test))  # Transform back from log

xgb_model6.fit(X_train, y_train_log)
xgb_preds6 = np.exp(xgb_model6.predict(X_test))  # Transform back from log

xgb_model7.fit(X_train, y_train_log)
xgb_preds7 = np.exp(xgb_model7.predict(X_test))  # Transform back from log

xgb_model8.fit(X_train, y_train_log)
xgb_preds8 = np.exp(xgb_model8.predict(X_test))  # Transform back from log

xgb_model9.fit(X_train, y_train_log)
xgb_preds9 = np.exp(xgb_model9.predict(X_test))  # Transform back from log

xgb_model10.fit(X_train, y_train_log)
xgb_preds10 = np.exp(xgb_model10.predict(X_test))  # Transform back from log

xgb_model11.fit(X_train, y_train_log)
xgb_preds11 = np.exp(xgb_model11.predict(X_test))  # Transform back from log

xgb_model12.fit(X_train, y_train_log)
xgb_preds12 = np.exp(xgb_model12.predict(X_test))  # Transform back from log



# Combine OOF predictions
oof_predictions_test = pd.DataFrame({
    'xgb1': xgb_preds1,
    'xgb2': xgb_preds2,
    'xgb3': xgb_preds3,
    'xgb4': xgb_preds4,
    'xgb5': xgb_preds5,
    'xgb6': xgb_preds6,
    'xgb7': xgb_preds7,
    'xgb8': xgb_preds8,
    'xgb9': xgb_preds9,
    'xgb10': xgb_preds10,
    'xgb11': xgb_preds11,
    'xgb12': xgb_preds12
})



In [54]:
oof_predictions_test.to_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_test_V2.csv', index=False)

In [55]:
oof_predictions_test = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_test_V2.csv')

# Generate predictions using the XGB as a Meta-Model

In [58]:
# Convert test data to DMatrix
dtest = xgb.DMatrix(oof_predictions_test)

# Load the trained model
model_path = f'/content/drive/MyDrive/Playground Series - Season 4, Episode 12/models/XGB_3.json'
model = xgb.Booster(model_file=model_path)

# Get predictions
preds_log = model.predict(dtest)

# Convert log-transformed predictions back to original scale
preds_xgb = np.exp(preds_log)

# Ensure the length of predictions matches the number of rows in the test data
assert len(preds_xgb) == len(id_test), "Mismatch between number of predictions and test data IDs"

# Create output DataFrame with original Id and predicted SalePrice
output = pd.DataFrame({'id': id_test, 'Premium Amount': preds_xgb.squeeze()})

# Remove any duplicate rows by 'Id'
output.drop_duplicates(subset='id', keep='first', inplace=True)

# Save predictions to a CSV file
output.to_csv('XGB_meta_predictions.csv', index=False)
files.download('XGB_meta_predictions.csv')

ValueError: feature_names mismatch: ['xgb', 'linear', 'lasso', 'catboost', 'lightgbm', 'random_forest'] ['xgb1', 'xgb2', 'xgb3', 'xgb4', 'xgb5', 'xgb6', 'xgb7', 'xgb8', 'xgb9', 'xgb10', 'xgb11', 'xgb12']
expected xgb, lasso, random_forest, lightgbm, catboost, linear in input data
training data did not have the following fields: xgb9, xgb11, xgb4, xgb12, xgb8, xgb1, xgb3, xgb5, xgb2, xgb10, xgb7, xgb6

# Generate predictions using the a Linear model as the Meta-model

In [56]:
# Get predictions
preds_log = linear_meta_model.predict(oof_predictions_test)

# Convert log-transformed predictions back to original scale
preds_linear = np.exp(preds_log)

# Ensure the length of predictions matches the number of rows in the test data
assert len(preds_linear) == len(id_test), "Mismatch between number of predictions and test data IDs"

# Create output DataFrame with original Id and predicted SalePrice
output = pd.DataFrame({'id': id_test, 'Premium Amount': preds_linear.squeeze()})

# Remove any duplicate rows by 'Id'
output.drop_duplicates(subset='id', keep='first', inplace=True)

# Save predictions to a CSV file
output.to_csv('Linear_meta_predictions.csv', index=False)
files.download('Linear_meta_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>