# House Prices 7: Gradient Boosting
Gradient boosting machines have had a long history of success in machine learning competitions. They are a powerful tool for regression and classification problems. In this notebook, I will try:
1. `XGBoost`
1. `LightGBM`
1. `CatBoost`

I'll then use `Optuna` to optimize the hyperparameters of the best-performing model (CatBoost).

<!-- I'll also use `ELI5` to understand and compare the importance of features between the models. Additionally, I'll use `SHAP` to understand the importance of features in the models and to understand the predictions of the models. -->

In [None]:
import optuna
import xgboost as xgb

from house_price_utils import *

# Helper that is not defined in this notebook; presumably provided by the
# star import from house_price_utils above.
# NOTE(review): likely configures notebook-wide display/plot defaults — confirm in that module.
setup_notebook()

# Data

In [None]:
# Load the training and test frames through the project helper.
data, data_test = load_data()

# Reorder the training columns into sorted label order.
data = data.loc[:, sorted(data.columns)]

# Separate the target from the predictors.
y = data["SalePrice"]
X_raw = data.drop(columns="SalePrice")

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    OrdinalEncoder,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Feature Grouping

In [None]:
# Partition the predictor columns into named groups with the project helper.
# Later cells read the "continuous", "nominal" and "ordinal" groups from the result.
features = group_features(X_raw)

In [None]:
# Number of columns in each feature group; the bare name on the last line
# displays the mapping as the cell output.
feature_counts = {
    group_name: len(group_columns)
    for group_name, group_columns in features.items()
}
feature_counts

# Preprocessor

In [None]:
def make_preprocessor(features, continuous_strategy="mean"):
    """Build the column transformer that feeds the boosting models.

    Args:
        features: Mapping with "continuous", "nominal" and "ordinal" entries,
            each holding the column labels of that group (the nominal and
            ordinal entries are concatenated with ``+``).
        continuous_strategy: ``strategy`` passed to the ``SimpleImputer`` used
            for the continuous columns.

    Returns:
        A column transformer that imputes the continuous columns and
        ordinal-encodes the nominal and ordinal columns, mapping categories
        unseen during fit to -1.
    """
    categorical_columns = features["nominal"] + features["ordinal"]

    imputer = SimpleImputer(strategy=continuous_strategy)
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    return make_column_transformer(
        (make_pipeline(imputer), features["continuous"]),
        (make_pipeline(encoder), categorical_columns),
    )

In [None]:
# Build the column transformer with the default continuous imputation strategy ("mean").
preprocessor = make_preprocessor(features)

# CV Split
* Model comparison must be done using CV. I'll use `StratifiedKFold`, stratified on quintile bins of the log-transformed target, so that each fold covers the full range of sale prices.

In [None]:
# Fit the preprocessor on the full training frame and transform it in one step.
X_t = preprocessor.fit_transform(X_raw)
# Model the target on the log1p scale; predictions must be inverted with expm1.
y_t = np.log1p(y).values
# StratifiedKFold needs discrete labels, so bin the continuous log-target into
# five quantile bins and stratify on the bin index.
y_quantiles = pd.qcut(y_t, q=5, labels=False)
# Five shuffled folds with a fixed seed, so the split is the same on every run.
skfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Take the first stratified fold as a fixed train/validation split for the
# baseline comparisons below.
cv = skfolds.split(X_t, y_quantiles)
train_idx, val_idx = next(cv)

X_train, y_train = X_t[train_idx], y_t[train_idx]
X_val, y_val = X_t[val_idx], y_t[val_idx]

# XGBoost

In [None]:
# Baseline XGBoost regressor: squared-error objective, gain-based importances,
# 300 trees at a learning rate of 0.05.
xgb_params = {
    "objective": "reg:squarederror",
    "importance_type": "gain",
    "n_estimators": 300,
    "learning_rate": 0.05,
}
xgb_regressor = xgb.XGBRegressor(**xgb_params)

In [None]:
# Fit on the training fold while evaluating on both the training and the
# validation fold; assigning to `_` keeps the estimator repr out of the output.
eval_sets = [(X_train, y_train), (X_val, y_val)]
_ = xgb_regressor.fit(X_train, y_train, eval_set=eval_sets, verbose=25)

In [None]:
def plot_importance(regressor, preprocessor, n=20):
    """Plot the ``n`` largest feature importances as a horizontal bar chart.

    Args:
        regressor: Fitted model exposing ``feature_importances_``.
        preprocessor: Fitted transformer exposing ``get_feature_names_out()``;
            its names label the importances position by position.
        n: Number of top-ranked features to show.

    Returns:
        The object returned by ``DataFrame.plot`` for the bar chart.
    """
    # Strip only the leading "<transformer>__" prefix. Splitting once keeps
    # column names that themselves contain "__" intact, and taking the last
    # piece also works when a name carries no prefix at all.
    feature_names = [
        name.split("__", 1)[-1] for name in preprocessor.get_feature_names_out()
    ]
    importances = pd.DataFrame(
        data=regressor.feature_importances_,
        index=feature_names,
        columns=["importance"],
    )
    # Ascending sort + tail puts the most important feature at the top of a barh plot.
    top_n = importances.sort_values(by="importance").iloc[-n:]
    return top_n.plot(kind="barh", figsize=(10, 7), title="Feature Importances")

In [None]:
# Top 20 (the default n) gain-based importances of the XGBoost baseline.
ax = plot_importance(xgb_regressor, preprocessor)

In [None]:
# Pass the validation targets and the XGBoost predictions (both on the log1p
# scale) to the project's residual_plots helper.
ax = residual_plots(y_val, xgb_regressor.predict(X_val))

In [None]:
# Hand the model, both splits and the MSE metric to the project's report helper.
# NOTE(review): presumably shows train vs. validation scores — confirm in house_price_utils.
report(xgb_regressor, X_train, y_train, X_val, y_val, mean_squared_error)

# LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Baseline LightGBM regressor with the same tree count and learning rate as
# the XGBoost baseline, and gain-based importances.
lgbm_params = {
    "objective": "regression",
    "importance_type": "gain",
    "n_estimators": 300,
    "learning_rate": 0.05,
}
lgbm_regressor = lgb.LGBMRegressor(**lgbm_params)

In [None]:
# Fit on the training fold only; no eval_set is passed for this model.
# Assigning to `_` keeps the estimator repr out of the cell output.
_ = lgbm_regressor.fit(X_train, y_train)

In [None]:
# Top 20 (the default n) gain-based importances of the LightGBM baseline.
ax = plot_importance(lgbm_regressor, preprocessor)

In [None]:
# Pass the validation targets and the LightGBM predictions (both on the log1p
# scale) to the project's residual_plots helper.
ax = residual_plots(y_val, lgbm_regressor.predict(X_val))

In [None]:
# Hand the model, both splits and the MSE metric to the project's report helper.
# NOTE(review): presumably shows train vs. validation scores — confirm in house_price_utils.
report(lgbm_regressor, X_train, y_train, X_val, y_val, mean_squared_error)

# CatBoost

In [None]:
import catboost as cb

In [None]:
# Baseline CatBoost regressor with the same tree count and learning rate as
# the other two baselines, trained with the RMSE loss.
cb_params = {
    "loss_function": "RMSE",
    "n_estimators": 300,
    "learning_rate": 0.05,
}
cb_regressor = cb.CatBoostRegressor(**cb_params)

In [None]:
# Fit on the training fold with the validation fold supplied as eval_set;
# assigning to `_` keeps the estimator repr out of the cell output.
validation_set = (X_val, y_val)
_ = cb_regressor.fit(X_train, y_train, eval_set=validation_set, verbose=30)

In [None]:
# Top 20 (the default n) feature importances of the CatBoost baseline.
ax = plot_importance(cb_regressor, preprocessor)

In [None]:
# Pass the validation targets and the CatBoost predictions (both on the log1p
# scale) to the project's residual_plots helper.
ax = residual_plots(y_val, cb_regressor.predict(X_val))

In [None]:
# Hand the model, both splits and the MSE metric to the project's report helper.
# NOTE(review): presumably shows train vs. validation scores — confirm in house_price_utils.
report(cb_regressor, X_train, y_train, X_val, y_val, mean_squared_error)

# Observations
Ranked from best to worst on the validation fold, the models are:
1. CatBoost
1. LightGBM
1. XGBoost

# Hyperparameter Optimization with Optuna

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated MSE of a CatBoost regressor.

    Args:
        trial: Optuna trial used to sample the CatBoost hyperparameters.

    Returns:
        Mean of the per-fold MSE over the stratified folds, computed on the
        log1p-scaled target (lower is better).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "depth": trial.suggest_int("depth", 2, 7),
        # Both ranges below span two or more orders of magnitude, so they are
        # sampled log-uniformly; a linear range would put roughly 90% of its
        # width above a learning rate of 0.1.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 1, log=True),
        "verbose": 0,
    }
    # A fresh split generator is created on every call because a generator can
    # only be consumed once.
    neg_mse_per_fold = cross_val_score(
        cb.CatBoostRegressor(**params),
        X_t,
        y_t,
        cv=skfolds.split(X_t, y_quantiles),
        scoring="neg_mean_squared_error",
    )
    # The scorer returns negated MSE; flip the sign so the study can minimise it.
    return -neg_mse_per_fold.mean()

In [None]:
# Minimise the cross-validated MSE returned by `objective` over 200 trials,
# with n_jobs=-1 requesting parallel trial execution.
# NOTE(review): no sampler seed is set here, so best_params should not be
# expected to be identical from run to run.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=200, n_jobs=-1)
print(study.best_params)

In [None]:
# Refit CatBoost on the full training set with the best hyperparameters found
# by the study.
model = cb.CatBoostRegressor(**study.best_params)
model.fit(X_t, y_t, verbose=0)
# Score with mean_squared_error so the printed value matches its "MSE" label
# and the metric the study optimised. This is an in-sample score on the log1p
# target, so it is optimistic compared with the cross-validated value.
print(f"Full data MSE = {evaluate(model, X_t, y_t, mean_squared_error):.4f}")

In [None]:
# Pass the full-data targets and the tuned model's predictions (both on the
# log1p scale) to the project's residual_plots helper. These are in-sample
# predictions, since the model was fitted on the same data.
fig = residual_plots(y_t, model.predict(X_t))

# Submit

In [None]:
# Predict on the test set with the already-fitted preprocessor (transform only,
# no refit) and write the submission file.
test_predictions_log = model.predict(preprocessor.transform(data_test))
output = pd.DataFrame(
    {
        "Id": data_test.index,
        # The target was trained as log1p(SalePrice), so the exact inverse is
        # expm1; a plain exp would leave every price too high by 1.
        "SalePrice": np.expm1(test_predictions_log),
    }
)
output.to_csv("submission.csv", index=False)

In [None]:
# Explicit switch for the upload step; set to False to run the notebook
# end-to-end without submitting to Kaggle.
SUBMIT_TO_KAGGLE = True

if SUBMIT_TO_KAGGLE:
    # Imported only when submitting, so the kaggle package is not needed otherwise.
    import kaggle

    result = kaggle.api.competition_submit(
        "submission.csv",
        "CatBoostRegressor optimized with Optuna",
        "home-data-for-ml-course",
    )
    print(result)