# Regression with an Insurance Dataset
## Playground Series - Season 4, Episode 12
Submissions are evaluated using the Root Mean Squared Logarithmic Error (RMSLE).

In [30]:
# Mount Google Drive
from google.colab import drive
import sys

drive.mount('/content/drive', force_remount=True)
sys.path.append('/content/drive/MyDrive')

Mounted at /content/drive


In [None]:
!pip install scikit-optimize
!pip install shap
!pip install xgboost
!pip install catboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
import shap
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from google.colab import files

pd.set_option('future.no_silent_downcasting', True)



## Load Train and Test data from google drive

In [None]:
train_set = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/train.csv')
test_set = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/test.csv')

train_set.head()

In [None]:
train_set.describe()

## Inspect the data further

In [None]:
for col in train_set.columns:
  if train_set[col].dtype == 'object':
    print(col, train_set[col].unique())

for col in train_set.columns:
    print(col, train_set[col].isna().sum())

for col in test_set.columns:
  if train_set[col].dtype == 'object':
    print(col, test_set[col].unique())

for col in test_set.columns:
    print(col, test_set[col].isna().sum())

# Feature Engineering

In [None]:
# Age
train_set['Age'] = train_set['Age']

def categorize_age(age):
    if age < 25:
        return '18-25'
    elif age < 40:
        return '25-40'
    elif age < 60:
        return '40-60'
    elif age >= 60:
        return '60+'
    else:
        return 'nan'


train_set['Age Cat'] = train_set['Age'].apply(categorize_age)
train_set = pd.get_dummies(train_set, columns=['Age Cat'], drop_first=False, dtype=float)
train_set['Age'] = train_set['Age'] / 100
train_set['Age'] = train_set['Age'].replace('nan', None).fillna(train_set['Age'].mode()[0])

# Gender
train_set['Gender'] = train_set['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)

# Annual Income
train_set['Annual Income isna'] = train_set['Annual Income'].isna().astype(int)
train_set['Annual Income'] = train_set['Annual Income'].fillna(train_set['Annual Income'].mean())
train_set['Monthly Income'] = train_set['Annual Income'] / 12
train_set['Annual Income SQRT'] = np.sqrt(train_set['Annual Income'])
train_set['Annual Income Log'] = np.log(train_set['Annual Income'])
train_set['Annual Income SQ'] = train_set['Annual Income'] ** 2

# Marital Status
train_set['Marital Status'] = train_set['Marital Status'].fillna('nan')
train_set['Marital Status Int'] = train_set['Marital Status'].replace({'Married': 1, 'Divorced': 0.3, 'Single': 0.2, 'nan': 0}).astype(float)
train_set = pd.get_dummies(train_set, columns=['Marital Status'], drop_first=False, dtype=float)

# Number of Dependents
train_set['Number of Dependents isna'] = train_set['Number of Dependents'].isna().astype(int)
train_set['Number of Dependents'] = train_set['Number of Dependents'].fillna(0)
train_set['Number of Dependents'] = train_set['Number of Dependents'] / train_set['Number of Dependents'].max()

# Education Level
train_set['Education Level'] = train_set['Education Level'].fillna('nan')
train_set['Education'] = train_set['Education Level'].copy()
train_set = pd.get_dummies(train_set, columns=['Education'], drop_first=False, dtype=float)
train_set['Education Level'] = train_set['Education Level'].replace({'nan': -1, 'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}).astype(int)

# Occupation
train_set['Occupation'] = train_set['Occupation'].fillna('nan')
train_set = pd.get_dummies(train_set, columns=['Occupation'], drop_first=False, dtype=float)

# Health Score
train_set['Health Score'] = train_set['Health Score'].fillna(-100)
train_set['Health Score'] = train_set['Health Score'] / 100

# Location
train_set = pd.get_dummies(train_set, columns=['Location'], drop_first=False, dtype=float)

# Policy Type
train_set = pd.get_dummies(train_set, columns=['Policy Type'], drop_first=False, dtype=float)

# Previous Claims
train_set['Previous Claims Isna'] = train_set['Previous Claims'].isna().astype(int)
train_set['Previous Claims'] = train_set['Previous Claims'].fillna(-100)

# Vehicle Age
train_set['Vehicle Age'] = train_set['Vehicle Age'].fillna(train_set['Vehicle Age'].mode()[0])

# Policy Start Date
train_set['Policy Start Date'] = pd.to_datetime(train_set['Policy Start Date'])
train_set['Policy Start Date Year'] = train_set['Policy Start Date'].dt.year
train_set['Policy Start Date Month'] = train_set['Policy Start Date'].dt.month
train_set['Policy Start Date Day'] = train_set['Policy Start Date'].dt.day
train_set['Policy Start Date Weekday'] = train_set['Policy Start Date'].dt.weekday

# Smocking Status
train_set['Smoking Status'] = train_set['Smoking Status'].replace({'Yes': 1, 'No': 0}).astype(int)

# Exercise Frequency
train_set['Exercise Frequency'] = train_set['Exercise Frequency'].replace({'Daily': 1, 'Weekly': 0.5, 'Monthly': 0.3, 'Rarely': 0}).astype(float)

# Property Type
train_set['Property Type Int'] = train_set['Property Type'].replace({'Condo': 1, 'House': 0.4, 'Apartment': 0.2}).astype(float)
train_set = pd.get_dummies(train_set, columns=['Property Type'], drop_first=False, dtype=float)

# Customer Feedback
train_set['Customer Feedback'] = train_set['Customer Feedback'].fillna('nan')
train_set = pd.get_dummies(train_set, columns=['Customer Feedback'], drop_first=False, dtype=float)

# Credit Score
train_set['Credit Score isna'] = train_set['Credit Score'].isna().astype(int)
train_set['Credit Score'] = train_set['Credit Score'].fillna(train_set['Credit Score'].mean())

# Insurance Duration
train_set['Insurance Duration'] = train_set['Insurance Duration'].fillna(train_set['Insurance Duration'].mode()[0])

train_set.drop(columns=['Policy Start Date'], inplace=True)

# Repeat the process for the Test set

In [None]:
# Age
test_set['Age'] = test_set['Age']

def categorize_age(age):
    if age < 25:
        return '18-25'
    elif age < 40:
        return '25-40'
    elif age < 60:
        return '40-60'
    elif age >= 60:
        return '60+'
    else:
        return 'nan'

test_set['Age Cat'] = test_set['Age'].apply(categorize_age)
test_set = pd.get_dummies(test_set, columns=['Age Cat'], drop_first=False, dtype=float)
test_set['Age'] = test_set['Age'] / 100
test_set['Age'] = test_set['Age'].replace('nan', None).fillna(train_set['Age'].mode()[0])

# Gender
test_set['Gender'] = test_set['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)

# Annual Income
test_set['Annual Income isna'] = test_set['Annual Income'].isna().astype(int)
test_set['Annual Income'] = test_set['Annual Income'].fillna(train_set['Annual Income'].mean())
test_set['Monthly Income'] = test_set['Annual Income'] / 12
test_set['Annual Income SQRT'] = np.sqrt(test_set['Annual Income'])
test_set['Annual Income Log'] = np.log(test_set['Annual Income'])
test_set['Annual Income SQ'] = test_set['Annual Income'] ** 2

# Marital Status
test_set['Marital Status'] = test_set['Marital Status'].fillna('nan')
test_set['Marital Status Int'] = test_set['Marital Status'].replace({'Married': 1, 'Divorced': 0.3, 'Single': 0.2, 'nan': 0}).astype(float)
test_set = pd.get_dummies(test_set, columns=['Marital Status'], drop_first=False, dtype=float)

# Number of Dependents
test_set['Number of Dependents isna'] = test_set['Number of Dependents'].isna().astype(int)
test_set['Number of Dependents'] = test_set['Number of Dependents'].fillna(0)
test_set['Number of Dependents'] = test_set['Number of Dependents'] / test_set['Number of Dependents'].max()

# Education Level
test_set['Education Level'] = test_set['Education Level'].fillna('nan')
test_set['Education'] = test_set['Education Level'].copy()
test_set = pd.get_dummies(test_set, columns=['Education'], drop_first=False, dtype=float)
test_set['Education Level'] = test_set['Education Level'].replace({'nan': -1, 'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}).astype(int)

# Occupation
test_set['Occupation'] = test_set['Occupation'].fillna('nan')
test_set = pd.get_dummies(test_set, columns=['Occupation'], drop_first=False, dtype=float)

# Health Score
test_set['Health Score'] = test_set['Health Score'].fillna(-100)
test_set['Health Score'] = test_set['Health Score'] / 100

# Location
test_set = pd.get_dummies(test_set, columns=['Location'], drop_first=False, dtype=float)

# Policy Type
test_set = pd.get_dummies(test_set, columns=['Policy Type'], drop_first=False, dtype=float)

# Previous Claims
test_set['Previous Claims Isna'] = test_set['Previous Claims'].isna().astype(int)
test_set['Previous Claims'] = test_set['Previous Claims'].fillna(-100)

# Vehicle Age
test_set['Vehicle Age'] = test_set['Vehicle Age'].fillna(train_set['Vehicle Age'].mode()[0])

# Policy Start Date
test_set['Policy Start Date'] = pd.to_datetime(test_set['Policy Start Date'])
test_set['Policy Start Date Year'] = test_set['Policy Start Date'].dt.year
test_set['Policy Start Date Month'] = test_set['Policy Start Date'].dt.month
test_set['Policy Start Date Day'] = test_set['Policy Start Date'].dt.day
test_set['Policy Start Date Weekday'] = test_set['Policy Start Date'].dt.weekday

# Smocking Status
test_set['Smoking Status'] = test_set['Smoking Status'].replace({'Yes': 1, 'No': 0}).astype(int)

# Exercise Frequency
test_set['Exercise Frequency'] = test_set['Exercise Frequency'].replace({'Daily': 1, 'Weekly': 0.5, 'Monthly': 0.3, 'Rarely': 0}).astype(float)

# Property Type
test_set['Property Type Int'] = test_set['Property Type'].replace({'Condo': 1, 'House': 0.4, 'Apartment': 0.2}).astype(float)
test_set = pd.get_dummies(test_set, columns=['Property Type'], drop_first=False, dtype=float)

# Customer Feedback
test_set['Customer Feedback'] = test_set['Customer Feedback'].fillna('nan')
test_set = pd.get_dummies(test_set, columns=['Customer Feedback'], drop_first=False, dtype=float)

# Credit Score
test_set['Credit Score isna'] = test_set['Credit Score'].isna().astype(int)
test_set['Credit Score'] = test_set['Credit Score'].fillna(train_set['Credit Score'].mean())

# Insurance Duration
test_set['Insurance Duration'] = test_set['Insurance Duration'].fillna(train_set['Insurance Duration'].mode()[0])

test_set.drop(columns=['Policy Start Date'], inplace=True)

In [None]:
train_set.info()
test_set.info()

In [None]:
print(train_set.shape)
print(test_set.shape)

# Prepare the needed sets for training

In [None]:
id_train = train_set['id']
y_train = train_set['Premium Amount']
X_train = train_set.drop(columns=['id', 'Premium Amount'])
y_train_log = np.log(y_train)

id_test = test_set['id']
X_test = test_set.drop(columns=['id'])

# Training all Base models

The parameters of the non-linear models were determined after tuning.

In [12]:
# Function to calculate RMSLE
def calculate_rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# Log-transform the target variable for training
y_train_log = np.log(y_train)

# Initialize models
xgb_model = XGBRegressor(learning_rate = 0.05104644623518532, max_depth= 7, n_estimators = 133)
linear_model = LinearRegression(copy_X=True, n_jobs=-1)
lasso_model = Lasso(alpha=0.1, max_iter=5000, tol=1e-4)
catboost_model = CatBoostRegressor(iterations=182, learning_rate=0.13328757298571275, depth=8, silent=True, random_state=42)
lightgbm_model = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)

# Prepare OOF prediction storage
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds_xgb = np.zeros(len(y_train))
oof_preds_linear = np.zeros(len(y_train))
oof_preds_lasso = np.zeros(len(y_train))
oof_preds_catboost = np.zeros(len(y_train))
oof_preds_lightgbm = np.zeros(len(y_train))
oof_preds_rf = np.zeros(len(y_train))

# Prepare RMSLE scores storage
rmsle_scores = {
    'fold': [],
    'model': [],
    'rmsle': []
}

print("Starting training with 5-fold cross-validation...\n")

# Train models and generate OOF predictions
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), start=1):
    print(f"Fold {fold}:")

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr_log, y_val = y_train_log.iloc[train_idx], y_train.iloc[val_idx]  # y_val in original scale

    # Train XGBoost
    xgb_model.fit(X_tr, y_tr_log)
    xgb_preds = np.exp(xgb_model.predict(X_val))  # Transform back from log
    oof_preds_xgb[val_idx] = xgb_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('XGBoost')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, xgb_preds))
    print(f"  XGBoost predictions done for Fold {fold}")

    # Train Linear Regression
    linear_model.fit(X_tr, y_tr_log)
    linear_preds = np.exp(linear_model.predict(X_val))  # Transform back from log
    oof_preds_linear[val_idx] = linear_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('Linear')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, linear_preds))
    print(f"  Linear Regression predictions done for Fold {fold}")

    # Train Lasso Regression
    lasso_model.fit(X_tr, y_tr_log)
    lasso_preds = np.exp(lasso_model.predict(X_val))  # Transform back from log
    oof_preds_lasso[val_idx] = lasso_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('Lasso')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, lasso_preds))
    print(f"  Lasso Regression predictions done for Fold {fold}")

    # Train CatBoost
    catboost_model.fit(X_tr, y_tr_log)
    catboost_preds = np.exp(catboost_model.predict(X_val))
    oof_preds_catboost[val_idx] = catboost_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('CatBoost')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, catboost_preds))
    print(f"  CatBoost predictions done for Fold {fold}")

    # Train LightGBM
    lightgbm_model.fit(X_tr, y_tr_log)
    lightgbm_preds = np.exp(lightgbm_model.predict(X_val))
    oof_preds_lightgbm[val_idx] = lightgbm_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('LightGBM')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, lightgbm_preds))
    print(f"  LightGBM predictions done for Fold {fold}")

    # Train Random Forest
    random_forest_model.fit(X_tr, y_tr_log)
    rf_preds = np.exp(random_forest_model.predict(X_val))
    oof_preds_rf[val_idx] = rf_preds
    rmsle_scores['fold'].append(fold)
    rmsle_scores['model'].append('RandomForest')
    rmsle_scores['rmsle'].append(calculate_rmsle(y_val, rf_preds))
    print(f"  Random Forest predictions done for Fold {fold}")

print("\nCross-validation completed.\n")

# Combine OOF predictions
oof_predictions = pd.DataFrame({
    'xgb': oof_preds_xgb,
    'linear': oof_preds_linear,
    'lasso': oof_preds_lasso,
    'catboost': oof_preds_catboost,
    'lightgbm': oof_preds_lightgbm,
    'random_forest': oof_preds_rf
})

# Print OOF Predictions
print("Out-of-Fold Predictions:\n")
print(oof_predictions.head())  # Inspect the combined OOF predictions
print(f"Shape of OOF Predictions: {oof_predictions.shape}")

# Create RMSLE Scores DataFrame
rmsle_df = pd.DataFrame(rmsle_scores)

# Pivot the RMSLE scores to create a table
rmsle_pivot = rmsle_df.pivot(index='fold', columns='model', values='rmsle')

# Print the pivoted table directly
print("\nRMSLE Scores: Models vs Folds\n")
print(rmsle_pivot)


Starting training with 5-fold cross-validation...

Fold 1:
  XGBoost predictions done for Fold 1
  Linear Regression predictions done for Fold 1
  Lasso Regression predictions done for Fold 1
  CatBoost predictions done for Fold 1
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2020
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 56
[LightGBM] [Info] Start training from score 6.590488
  LightGBM predictions done for Fold 1
  Random Forest predictions done for Fold 1
Fold 2:
  XGBoost predictions done for Fold 2
  Linear Regression predictions done for Fold 2
  Lasso Regression predictions done for Fold 2
  CatBoost predictions done for Fold 2
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049457 seco

In [13]:
# Subset the data to the first 100 IDs for plotting
subset_ids = id_train[:100]
subset_y_train = y_train[:100]

# Create a plot
fig = go.Figure()

# Add traces for each model's OOF predictions
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['xgb'].iloc[:100], mode='lines', name='XGBoost'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['linear'].iloc[:100], mode='lines', name='Linear'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['lasso'].iloc[:100], mode='lines', name='Lasso'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['catboost'].iloc[:100], mode='lines', name='CatBoost'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['lightgbm'].iloc[:100], mode='lines', name='LightGBM'))
fig.add_trace(go.Scatter(x=subset_ids, y=oof_predictions['random_forest'].iloc[:100], mode='lines', name='Random Forest'))

# Add the original y_train values
fig.add_trace(go.Scatter(x=subset_ids, y=subset_y_train, mode='lines', name='y_train (Actual)', line=dict(dash='dot')))

# Update layout
fig.update_layout(
    title="Out-of-Fold Predictions vs ID (First 100 IDs)",
    xaxis_title="ID",
    yaxis_title="Predictions",
    legend_title="Model",
    template="plotly_white"
)

# Show the plot
fig.show()


In [16]:
X_train_meta = oof_predictions.copy()
oof_predictions.to_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions.csv', index=False) # save the set to save time for next time

In [17]:
X_train_meta = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions.csv') # how to load the set

# XGB Regression

In [18]:
# Define parameter search space
search_spaces = {
    'learning_rate': Real(0.001, 0.5, 'uniform'),
    'max_depth': Integer(1, 14),
    'n_estimators': Integer(50, 300),
}

# Initialize the XGBRegressor model with GPU support
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method = 'hist',
    device = 'cuda',
    verbosity=2
)

kf = KFold(n_splits=5)  # 10-fold cross-validation

# Using negative RMSE as the scoring metric
optimizer = BayesSearchCV(
    estimator=model,
    search_spaces=search_spaces,
    n_iter=64,
    cv=kf,
    scoring='neg_root_mean_squared_error',  # RMSE scoring
    verbose=1,
    n_jobs=-1,
    return_train_score=True
)

# Fit the optimizer using X_train and log-transformed y_train
optimizer.fit(X_train_meta, y_train_log)

# Get the best model
best_model = optimizer.best_estimator_

# Save the best model
model_path = '/content/drive/MyDrive/Playground Series - Season 4, Episode 12/models/XGB_3.json'
best_model.save_model(model_path)

# Get cross-validation errors
cv_results = optimizer.cv_results_

# Calculate and print the mean RMSE for each fold (convert negative RMSE back to positive)
mean_rmse = -cv_results['mean_test_score']  # This is negative RMSE, so we negate it to get RMSE

print(f"Best parameters: {optimizer.best_params_}")
print(f"Mean Cross-validation RMSE: {mean_rmse.mean():.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi







Best parameters: OrderedDict([('learning_rate', 0.19694499004662677), ('max_depth', 4), ('n_estimators', 50)])
Mean Cross-validation RMSE: 1.0600


#

In [27]:
linear_meta_model = LinearRegression(copy_X=True, n_jobs=-1)
linear_meta_model.fit(X_train_meta, y_train_log)

In [19]:
# Initialize models
xgb_model = XGBRegressor(learning_rate = 0.05104644623518532, max_depth= 7, n_estimators = 133)
linear_model = LinearRegression(copy_X=True, n_jobs=-1)
lasso_model = Lasso(alpha=0.1, max_iter=5000, tol=1e-4)
catboost_model = CatBoostRegressor(iterations=182, learning_rate=0.13328757298571275, depth=8, silent=True, random_state=42)
lightgbm_model = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)

# Train XGBoost
xgb_model.fit(X_train, y_train_log)
xgb_preds = np.exp(xgb_model.predict(X_test))  # Transform back from log

# Train Linear Regression
linear_model.fit(X_train, y_train_log)
linear_preds = np.exp(linear_model.predict(X_test))  # Transform back from log

# Train Lasso Regression
lasso_model.fit(X_train, y_train_log)
lasso_preds = np.exp(lasso_model.predict(X_test))  # Transform back from log

# Train CatBoost
catboost_model.fit(X_train, y_train_log)
catboost_preds = np.exp(catboost_model.predict(X_test))

# Train LightGBM
lightgbm_model.fit(X_train, y_train_log)
lightgbm_preds = np.exp(lightgbm_model.predict(X_test))

# Train Random Forest
random_forest_model.fit(X_train, y_train_log)
rf_preds = np.exp(random_forest_model.predict(X_test))

# Combine OOF predictions
oof_predictions_test = pd.DataFrame({
    'xgb': xgb_preds,
    'linear': linear_preds,
    'lasso': lasso_preds,
    'catboost': catboost_preds,
    'lightgbm': lightgbm_preds,
    'random_forest': rf_preds
})


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2022
[LightGBM] [Info] Number of data points in the train set: 1200000, number of used features: 56
[LightGBM] [Info] Start training from score 6.590527


In [20]:
oof_predictions_test.to_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_test.csv', index=False)

In [21]:
oof_predictions_test = pd.read_csv('/content/drive/MyDrive/Playground Series - Season 4, Episode 12/data/oof_predictions_test.csv')

# Generate predictions using the XGB as a Meta-Model

In [22]:
# Convert test data to DMatrix
dtest = xgb.DMatrix(oof_predictions_test)

# Load the trained model
model_path = f'/content/drive/MyDrive/Playground Series - Season 4, Episode 12/models/XGB_3.json'
model = xgb.Booster(model_file=model_path)

# Get predictions
preds_log = model.predict(dtest)

# Convert log-transformed predictions back to original scale
preds_xgb = np.exp(preds_log)

# Ensure the length of predictions matches the number of rows in the test data
assert len(preds_xgb) == len(id_test), "Mismatch between number of predictions and test data IDs"

# Create output DataFrame with original Id and predicted SalePrice
output = pd.DataFrame({'id': id_test, 'Premium Amount': preds_xgb.squeeze()})

# Remove any duplicate rows by 'Id'
output.drop_duplicates(subset='id', keep='first', inplace=True)

# Save predictions to a CSV file
output.to_csv('XGB_meta_predictions.csv', index=False)
files.download('XGB_meta_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Generate predictions using the a Linear model as the Meta-model

In [26]:
# Get predictions
preds_log = linear_meta_model.predict(oof_predictions_test)

# Convert log-transformed predictions back to original scale
preds_linear = np.exp(preds_log)

# Ensure the length of predictions matches the number of rows in the test data
assert len(preds_linear) == len(id_test), "Mismatch between number of predictions and test data IDs"

# Create output DataFrame with original Id and predicted SalePrice
output = pd.DataFrame({'id': id_test, 'Premium Amount': preds_linear.squeeze()})

# Remove any duplicate rows by 'Id'
output.drop_duplicates(subset='id', keep='first', inplace=True)

# Save predictions to a CSV file
output.to_csv('Linear_meta_predictions.csv', index=False)
files.download('Linear_meta_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>