In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, ensemble, impute, metrics
from sklearn import model_selection, pipeline, preprocessing, tree

# Boosting methods


## 1. Framing the Problem

In this tutorial you will learn how to apply boosting methods to predict the sale price of a house. House price prediction is a supervised-learning regression problem. The performance measure that you will use is the mean absolute percentage error (MAPE).

In [None]:
metrics.mean_absolute_percentage_error?

In [None]:
# Name of the scikit-learn scorer used for every cross-validation run below.
# The "neg_" scorers report the error with its sign flipped, which is why the
# scores are negated again before they are averaged.
SCORING = "neg_mean_absolute_percentage_error"

## 2. Get the data

In [None]:
# Load the pre-split training and test sets from CSV.
# NOTE(review): the relative "sample_data/" path looks like the Colab default
# folder — confirm, or adjust the paths when running elsewhere.
train_df = pd.read_csv("sample_data/california_housing_train.csv")
test_df = pd.read_csv("sample_data/california_housing_test.csv")

In [None]:
# Column names, dtypes and non-null counts of the training data.
train_df.info()

In [None]:
# Summary statistics for the numeric columns of the training data.
train_df.describe()

In [None]:
# Separate the column to predict from the predictors.
train_target = train_df["median_house_value"]
train_features_df = train_df.drop(columns=["median_house_value"])

In [None]:
# Peek at the first few rows of the feature table.
train_features_df.head()

In [None]:
# Peek at the first few target values.
train_target.head()

## 3. Exploratory Data Analysis

In [None]:
# Histogram of the target. Title and axis labels are set explicitly so the
# figure can be read on its own when the notebook is skimmed.
ax = train_target.hist()
# Assign to `_` so the list returned by `set` is not echoed as cell output.
_ = ax.set(
    title="Distribution of median_house_value",
    xlabel="median_house_value",
    ylabel="Count",
)

## 4. Prepare Data for ML

In [None]:
# Preprocessing used by every model below: impute missing values with
# SimpleImputer's default settings, then standardize the features.
_preprocessing_steps = [
    impute.SimpleImputer(),
    preprocessing.StandardScaler(),
]

# set_output(transform="pandas") makes the steps emit DataFrames; it returns
# the pipeline itself, so the call can be chained onto the constructor.
preprocessing_pipeline = pipeline.make_pipeline(
    *_preprocessing_steps, verbose=True
).set_output(transform="pandas")


## 5. Short-list Promising Models

### AdaBoost

In [None]:
ensemble.AdaBoostRegressor?

In [None]:
# Base learner that AdaBoost re-fits at every boosting round.
_estimator = tree.DecisionTreeRegressor()

# AdaBoost resamples the training data at random on each round, so pin the
# seed: without it the cross-validation scores change from run to run.
_regressor = ensemble.AdaBoostRegressor(
    estimator=_estimator,
    random_state=42,
)

# Fit on log(target) and map predictions back with exp, so the model learns
# on the log scale while scores are computed on the original scale.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    _regressor,
    func=np.log,
    inverse_func=np.exp,
)

# Full model: shared preprocessing followed by the boosted regressor.
adaboost_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    _transformed_target_regressor,
    verbose=True,
)

In [None]:
# Display the assembled AdaBoost pipeline.
adaboost_pipeline

In [None]:
# 5-fold cross-validation of the AdaBoost pipeline on the training data,
# scored with SCORING and run on all available cores.
cv_scores = model_selection.cross_val_score(
    estimator=adaboost_pipeline,
    X=train_features_df,
    y=train_target,
    scoring=SCORING,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the "neg_" scores and scale by 100 to report the mean
# cross-validated MAPE in percent.
np.mean(-cv_scores * 100)

### Gradient Boosting

In [None]:
ensemble.GradientBoostingRegressor?

In [None]:
# Pin the seed so repeated runs give the same cross-validation scores: the
# underlying trees permute features at each split, so ties between equally
# good splits are otherwise broken at random.
_regressor = ensemble.GradientBoostingRegressor(random_state=42)

# Fit on log(target) and map predictions back with exp, so the model learns
# on the log scale while scores are computed on the original scale.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    _regressor,
    func=np.log,
    inverse_func=np.exp,
)

# Full model: shared preprocessing followed by the boosted regressor.
gradient_boosting_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    _transformed_target_regressor,
    verbose=True,
)

In [None]:
# Display the assembled gradient boosting pipeline.
gradient_boosting_pipeline

In [None]:
# 5-fold cross-validation of the gradient boosting pipeline on the training
# data, scored with SCORING and run on all available cores.
cv_scores = model_selection.cross_val_score(
    estimator=gradient_boosting_pipeline,
    X=train_features_df,
    y=train_target,
    scoring=SCORING,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the "neg_" scores and scale by 100 to report the mean
# cross-validated MAPE in percent.
np.mean(-cv_scores * 100)

### Exercise:

Use early stopping to control the number of estimators (i.e., the number of boosting rounds) for your GradientBoostingRegressor.

#### Solution:

### Histogram Gradient Boosting

In [None]:
# Pin the seed so repeated runs give the same cross-validation scores.
_regressor = ensemble.HistGradientBoostingRegressor(random_state=42)

# Fit on log(target) and map predictions back with exp, so the model learns
# on the log scale while scores are computed on the original scale.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    _regressor,
    func=np.log,
    inverse_func=np.exp,
)

# Full model: shared preprocessing followed by the boosted regressor.
hist_gradient_boosting_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    _transformed_target_regressor,
    verbose=True,
)

In [None]:
# Display the assembled histogram gradient boosting pipeline.
hist_gradient_boosting_pipeline

In [None]:
# 5-fold cross-validation of the histogram gradient boosting pipeline on the
# training data, scored with SCORING and run on all available cores.
cv_scores = model_selection.cross_val_score(
    estimator=hist_gradient_boosting_pipeline,
    X=train_features_df,
    y=train_target,
    scoring=SCORING,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign of the "neg_" scores and scale by 100 so the mean MAPE is in
# percent, the same unit used for the AdaBoost and gradient boosting results.
np.mean(-cv_scores * 100)

## 6. Fine-tune the most promising models

### Exercise

Fine-tune the best AdaBoostRegressor pipeline.

In [None]:
ensemble.AdaBoostRegressor?

### Exercise

Fine-tune the GradientBoostingRegressor pipeline.

In [None]:
ensemble.GradientBoostingRegressor?

#### Solution

### Exercise

Fine-tune the HistGradientBoostingRegressor pipeline.

In [None]:
ensemble.HistGradientBoostingRegressor?

#### Solution

### Exercise

Compare the performance of the fine-tuned gradient boosting models. Which is the best model?

#### Solution

## 7. Summarize your results

### Exercise:

Re-fit your best model on the entire training data set.

#### Solution:

### Exercise:

Generate predictions for the test data using your best pipeline.

#### Solution: