## Import necessary libraries

In [1]:
# Clear memory
%reset -f

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [38]:
# System imports
import sys
import importlib
from pathlib import Path
import pandas as pd

# Visualization imports
%matplotlib inline
import matplotlib.pyplot as plt

plt.style.use('ggplot')

# Project setup
project_root = Path().resolve().parent
sys.path.append(str(project_root))

# Project modules
MODULES = [
    'iowa_dream.utils',
    'iowa_dream.data', 
    'iowa_dream.feature_engineering',
    'iowa_dream.feature_engineering.lot_frontage_imputer',
    'iowa_dream.evaluation',
    'iowa_dream.evaluation.metrics_plot',
    'iowa_dream.models.optuna_objective'
]

# Reload modules
for module in MODULES:
    if module in sys.modules:
        importlib.reload(sys.modules[module])
    else:
        __import__(module)

# Project imports
from iowa_dream.data.importer import load_config
from iowa_dream.utils.sample_split import create_sample_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from lightgbm import LGBMRegressor
from glum import GeneralizedLinearRegressor, GeneralizedLinearRegressorCV
from iowa_dream.feature_engineering.lot_frontage_imputer import LotFrontageGroupMedianImputer
from iowa_dream.feature_engineering.add_drop_features import Add_Drop_Attributes
from iowa_dream.feature_engineering.categotical_transformer import OrdinalMerger, NominalGrouper
from iowa_dream.feature_engineering.numerical_transformer import WinsorizedRobustScaler
from iowa_dream.evaluation.metrics_plot import reevaluate_models, analyze_glm_coefficients
from iowa_dream.models.optuna_objective import optuna_objective
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

import optuna
from optuna.integration import LightGBMPruningCallback
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

## Import data and config

In [3]:
# Example usage
data_file = project_root / load_config()['kaggle']['cleaned_path'] / 'cleaned_AmesHousing.parquet'
df = pd.read_parquet(data_file)

In [4]:
# Get data dictionary from config
config = load_config()
cleaned_data_dict = config['cleaned_data_dict']

# Extract feature groups
ordinal_features = cleaned_data_dict['ordinal']['columns']
nominal_features = cleaned_data_dict['nominal']['columns'] 
discrete_features = cleaned_data_dict['discrete']['columns']
continuous_features = cleaned_data_dict['continuous']['columns']
numeric_features = continuous_features + discrete_features
proximity_data = {
    neighborhood: group['category'] 
    for group in config['university_proximity']
    for neighborhood in group['neighborhoods']
}
glm_data_dict = config['glm_data_dict']
glm_ordinal_features = glm_data_dict['categorical']['ordinal']['columns']
glm_nominal_features = glm_data_dict['categorical']['nominal']['columns']
glm_numerical_features = glm_data_dict['numerical']['columns']

## Split data

In [5]:
df = create_sample_split(df, 'pid')
train_df = df[df['sample'] == 'train']
test_df = df[df['sample'] == 'test']
y = df['saleprice']

# Separate features (X) and target (y)
X_train = train_df.drop(['saleprice', 'sample', 'pid'], axis=1)
y_train = train_df['saleprice']
X_test = test_df.drop(['saleprice', 'sample', 'pid'], axis=1)
y_test = test_df['saleprice']

# 0. Baseline median predictor

In [6]:
# Create and fit a baseline model that predicts the mean
baseline_mean = DummyRegressor(strategy='mean')
baseline_mean.fit(X_train, y_train)

# Evaluate the baseline model
reevaluate_models([baseline_mean], X_train, y_train, model_names=['Mean Predictor'])


Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mean Predictor,79450.24,43.83%,38.00%,46270.42,0.0


#1. Baseline GLM

In [7]:
baseline_preprocessor = ColumnTransformer(
    transformers=[
        (
            "group_impute",
            LotFrontageGroupMedianImputer(
                group_cols=['neighborhood', 'lot_config'],
                target_col='lot_frontage'
            ),
            ['neighborhood', 'lot_config', 'lot_frontage'],
        ),
        ("cat", OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'), nominal_features),
    ]
)
baseline_preprocessor.set_output(transform="pandas")
baseline_GLM_model_pipeline = Pipeline(
    [
        ("preprocess", baseline_preprocessor),
        (
            "estimate",
            GeneralizedLinearRegressor(
                family='gamma', l1_ratio=1, fit_intercept=True
            ),
        ),
    ]
)
baseline_GLM_model_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75


This is already quite good :), indicating that our model is capturing quite well the pattern in the data. 

# 2. GLM with combined (added features) and processed data

In [8]:
# Preprocessing pipeline for numerical features

numerical_pipeline = Pipeline(steps=[
    ('winsorized_scaler', WinsorizedRobustScaler(range_min=10, range_max=99))
])

# Preprocessing pipeline for ordinal features
ordinal_pipeline = Pipeline(steps=[
    ('ordinal_merger', OrdinalMerger(min_obs=10))
])

# Preprocessing pipeline for nominal features
nominal_pipeline = Pipeline(steps=[
    ('nominal_grouper', NominalGrouper(min_obs=10)),
    ('onehot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'))
])

# Combine preprocessing pipelines
glm_preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, glm_numerical_features),
    ('ord', ordinal_pipeline, glm_ordinal_features),
    ('nom', nominal_pipeline, glm_nominal_features)
])

# Full pipeline
glm_1_pipeline = Pipeline(steps=[
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', glm_preprocessor),
    ('glm', GeneralizedLinearRegressor(family='gamma', link='log', l1_ratio=1, fit_intercept=True))  # GeneralizedLinearRegressor equivalent for gamma family
])

glm_1_pipeline

In [9]:
# Fit and evaluate
glm_1_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75
Model 2,21330.47,11.77%,9.21%,10413.38,0.93


In [10]:
reevaluate_models([baseline_GLM_model_pipeline,  glm_1_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,44376.3,25.23%,16.92%,18936.76,0.64
Model 2,22821.78,12.98%,9.37%,10450.47,0.9


In [11]:
# Define interaction terms
interaction_features = [
    ('age', 'exter_qu'),
    ('gr_liv_area', 'overall_score'),
    ('gr_liv_area', 'neighborhood_score'), 
    ('gr_liv_area', 'age'),
]

# Preprocessing pipeline for interaction terms
interaction_pipeline = Pipeline(steps=[
    ('interaction', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])

# Combine preprocessing pipelines
glm_with_interaction_preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, glm_numerical_features),
    ('ord', ordinal_pipeline, glm_ordinal_features),
    ('nom', nominal_pipeline, glm_nominal_features),
    ('interaction', interaction_pipeline, [f[0] for f in interaction_features] + [f[1] for f in interaction_features])
])

# Full pipeline
glm_2_pipeline = Pipeline(steps=[
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', glm_with_interaction_preprocessor),
    ('glm', GeneralizedLinearRegressor(family='gamma', link='log', l1_ratio=1, fit_intercept=True))
])

glm_2_pipeline

In [12]:
# Fit and evaluate
glm_2_pipeline.fit(X_train, y_train)
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline, glm_2_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,39739.53,21.92%,16.47%,19772.31,0.75
Model 2,21330.47,11.77%,9.21%,10413.38,0.93
Model 3,20235.85,11.16%,8.57%,9724.03,0.94


In [13]:
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline, glm_2_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,44376.3,25.23%,16.92%,18936.76,0.64
Model 2,22821.78,12.98%,9.37%,10450.47,0.9
Model 3,21895.71,12.45%,8.93%,10480.27,0.91


### Finally let's try and find the appropriate degress of regularization for the GLM model by CV

In [14]:
# Full pipeline
glm_3_pipeline = Pipeline(steps=[
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', glm_with_interaction_preprocessor),
    ('glm', GeneralizedLinearRegressorCV(family='gamma', 
                                         link='log',
                                         fit_intercept=True,
                                         alphas=None,  # default
                                         min_alpha=None,  # default
                                         min_alpha_ratio=None,  # default
                                         l1_ratio=[0, 0.25, 0.5, 0.75, 1.0],
                                         max_iter=150, 
                                         cv=5))
])
glm_3_pipeline.fit(X_train, y_train)

In [15]:
print(f"Chosen alpha:    {glm_3_pipeline.named_steps['glm'].alpha_}")
print(f"Chosen l1 ratio: {glm_3_pipeline.named_steps['glm'].l1_ratio_}")

Chosen alpha:    0.0013219411484660308
Chosen l1 ratio: 0.0


In [16]:
reevaluate_models([baseline_GLM_model_pipeline, glm_1_pipeline, glm_2_pipeline, glm_3_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,44376.3,25.23%,16.92%,18936.76,0.64
Model 2,22821.78,12.98%,9.37%,10450.47,0.9
Model 3,21895.71,12.45%,8.93%,10480.27,0.91
Model 4,21778.87,12.38%,8.88%,10109.59,0.91


In [18]:
# Call the function with the fitted pipeline
coefficients_df = analyze_glm_coefficients(
    pipeline=glm_3_pipeline,
    numerical_features=glm_numerical_features,
    ordinal_features=glm_ordinal_features,
    nominal_features=glm_nominal_features,
    top_n=10
)
display(coefficients_df)


Top 10 Most Important Features

Features by Type:
Feature_Type
Numerical    6
Nominal      4

Detailed Feature Importance Ranking:


Unnamed: 0_level_0,Feature,Feature_Type,Coefficient
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,neighborhood_score,Numerical,-0.1792
2,gr_liv_area,Numerical,-0.1713
3,mas_vnr_type_Other,Nominal,-0.1115
4,neighborhood_score,Numerical,0.1099
5,foundation_Slab,Nominal,-0.0985
6,pct_unf_sf,Numerical,-0.0785
7,exterior_1st_BrkFace,Nominal,0.0779
8,total_bsmt_sf,Numerical,0.0617
9,lot_config_FR3,Nominal,-0.0537
10,age,Numerical,0.0458


#3. LGBM

### Baseline LGBM

In [19]:
# Create a preprocessor that handles both numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), nominal_features)
    ],
    remainder='passthrough'
)

# Create the full pipeline with imputer, preprocessing and model
lgbm_baseline_pipeline = Pipeline([
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('preprocessor', preprocessor),
    ('estimator', LGBMRegressor(objective='gamma'))
])

lgbm_baseline_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2178
[LightGBM] [Info] Number of data points in the train set: 2443, number of used features: 129
[LightGBM] [Info] Start training from score 12.107745


In [20]:
reevaluate_models([lgbm_baseline_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,11172.95,6.16%,4.63%,5507.76,0.98


In [21]:
reevaluate_models([lgbm_baseline_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,21641.58,12.30%,8.55%,9408.22,0.91


Without much pre-processing, the LGBM model is already quite good. Now, let's try to improve the model by using the combined features (that captures meaningful information and reduce redundancy). Though, likely we have have overfitting issue. Very difference in performance between train and test set. Also for comparison, let's then use the same set of features for the GLM model (after some feature engineering).

### LGBM with combined features

In [22]:
# Create a preprocessor that handles both numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), glm_nominal_features)
    ],
    remainder='passthrough'
)

# Create the full pipeline with imputer, preprocessing and model
lgbm_1_pipeline = Pipeline([
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', preprocessor),
    ('estimator', LGBMRegressor(objective='gamma'))
])

lgbm_1_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1637
[LightGBM] [Info] Number of data points in the train set: 2443, number of used features: 38
[LightGBM] [Info] Start training from score 12.107745


In [23]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,11172.95,6.16%,4.63%,5507.76,0.98
Model 2,11413.23,6.30%,4.65%,5314.04,0.98


In [24]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,21641.58,12.30%,8.55%,9408.22,0.91
Model 2,21491.23,12.22%,9.08%,10499.04,0.91


Here we can see that the original LGBM can be overfitting on the data, in the sense that all its RMSE, RMSED, MAPE are low on the train set but higher on the test set compared to the LGBM models where we combine features into useful, meaningful ones to reduce redundancy.

`LightGBMPruningCallback` helps detect unpromising hyperparameter sets before training them on the data and then reduce the search time.

In [27]:
def objective(trial, X, y):
    # Define the hyperparameter search space
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 30, 300, step=30),
        "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 5.0),
    }

    # Update the pipeline with dynamic LGBMRegressor parameters
    pipeline = Pipeline([
        ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
        ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
        ('preprocessor', preprocessor),
        ('estimator', LGBMRegressor(objective="regression", **param_grid))
    ])

    # Cross-validation for the pipeline
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the pipeline
        pipeline.fit(X_train, y_train)

        # Predict and calculate RMSE
        preds = pipeline.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        cv_scores.append(rmse)

    # Return the mean RMSE across folds
    return np.mean(cv_scores)

In [28]:
# Best Parameters: {'n_estimators': 350, 'learning_rate': 0.03258641681250722, 'num_leaves': 30, 'min_child_weight': 1.3675858461417085}
# Create an Optuna study
study = optuna.create_study(direction="minimize", study_name="Pipeline Optimization")

# Optimize using the defined objective function
study.optimize(
    lambda trial: optuna_objective(trial, X_train, y_train, lgbm_1_pipeline),
    n_trials=20,  # Number of trials (adjust based on time/needs)
    show_progress_bar=True
)

# Retrieve the best parameters
best_params = study.best_params
print("Best Parameters:", best_params)

[I 2024-12-18 15:47:29,137] A new study created in memory with name: Pipeline Optimization


  0%|          | 0/20 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1628
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Start training from score 12.102676


Exception ignored on calling ctypes callback function: <function _log_callback at 0x30c63d260>
Traceback (most recent call last):
  File "/Users/congminhnguyen/miniconda/envs/iowa_dream/lib/python3.12/site-packages/lightgbm/basic.py", line 257, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1626
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Start training from score 12.109766
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1625
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Start training from score 12.109467
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_row_wise=tr

Exception ignored on calling ctypes callback function: <function _log_callback at 0x30c63d260>
Traceback (most recent call last):
  File "/Users/congminhnguyen/miniconda/envs/iowa_dream/lib/python3.12/site-packages/lightgbm/basic.py", line 257, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000507 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1625
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Start training from score 12.109467
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1626
[LightGBM] [Info] Number of data points in the train set: 1955, number of used features: 38
[LightGBM] [Info] Start training from score 12.111759


In [25]:
lgbm_tuned_pipeline = Pipeline([
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', preprocessor),
    ('estimator', LGBMRegressor(objective="regression", n_estimators=350, learning_rate=0.03258641681250722, num_leaves=30, min_child_weight=1.3675858461417085))
])

# Fit the pipeline with the best parameters
lgbm_tuned_pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1637
[LightGBM] [Info] Number of data points in the train set: 2443, number of used features: 38
[LightGBM] [Info] Start training from score 181270.423659


In [26]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline, lgbm_tuned_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,11172.95,6.16%,4.63%,5507.76,0.98
Model 2,11413.23,6.30%,4.65%,5314.04,0.98
Model 3,10237.52,5.65%,4.82%,5312.93,0.98


In [27]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline, lgbm_tuned_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,21641.58,12.30%,8.55%,9408.22,0.91
Model 2,21491.23,12.22%,9.08%,10499.04,0.91
Model 3,22038.3,12.53%,8.89%,10130.22,0.91


# One Step further :) let's try a custom loss function for the LGBM model

In [45]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

# Define the custom regression loss function
def custom_regression_loss(y_true, y_pred):
    import numpy as np
    # Calculate the residual (error)
    residual = y_pred - y_true
    # Calculate the gradient (first-order derivative)
    gradient = 2 * residual
    # Calculate the hessian (second-order derivative)
    hessian = 2 * np.ones_like(y_true)
    # Define a penalty term (for demonstration, we'll use a simple linear penalty)
    penalty = np.abs(residual) * 0.1
    # Combine the gradient and penalty
    gradient += penalty
    return gradient, hessian

# Modify the pipeline to use the custom objective directly
lgbm_custom_loss_pipeline = Pipeline([
    ('imputer', LotFrontageGroupMedianImputer(group_cols=['neighborhood', 'lot_config'], target_col='lot_frontage')),
    ('feature_add_drop', Add_Drop_Attributes(proximity_data=proximity_data)),
    ('preprocessor', preprocessor),
    ('estimator', LGBMRegressor(objective=custom_regression_loss, n_estimators=1000))  # Set a large n_estimators for early stopping
])

# Perform GridSearchCV without passing early stopping parameters directly
param_grid = {
    'estimator__num_leaves': [20, 30, 40, 50],
    'estimator__learning_rate': [0.01, 0.05, 0.1],
    'estimator__min_child_weight': [0.5, 1, 1.5],
}

grid_search = GridSearchCV(
    estimator=lgbm_custom_loss_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

# Fit the grid search without early stopping parameters
grid_search.fit(X_train, y_train)

# Retrieve the best pipeline
lgbm_custom_loss_pipeline = grid_search.best_estimator_

# Manually fit the best pipeline with early stopping
lgbm_custom_loss_pipeline.named_steps['estimator'].fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    early_stopping_rounds=10,
    verbose=False
)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1623
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1621
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Using self-defined objective



[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1621
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000889 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1623
[LightGBM] [Info] Number of data points in the train set: 1954, number of used features: 38
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info



In [36]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline, lgbm_tuned_pipeline, lgbm_custom_loss_pipeline], X_train, y_train)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,11172.95,6.16%,4.63%,5507.76,0.98
Model 2,11413.23,6.30%,4.65%,5314.04,0.98
Model 3,10237.52,5.65%,4.82%,5312.93,0.98
Model 4,12364.52,6.82%,5.57%,6183.25,0.98


In [37]:
reevaluate_models([lgbm_baseline_pipeline, lgbm_1_pipeline, lgbm_tuned_pipeline, lgbm_custom_loss_pipeline], X_test, y_test)

Unnamed: 0_level_0,RMSE,RMSED,MAPE,MedAE,R-squared
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,21641.58,12.30%,8.55%,9408.22,0.91
Model 2,21491.23,12.22%,9.08%,10499.04,0.91
Model 3,22038.3,12.53%,8.89%,10130.22,0.91
Model 4,22167.74,12.60%,8.98%,10319.17,0.91
