In [None]:
%%bash

pip install catboost dask[dataframe]

In [None]:
import pathlib
import requests

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import compose, datasets, ensemble, impute, metrics
from sklearn import model_selection, pipeline, preprocessing, tree

import catboost as cb
import lightgbm as lgb
import xgboost as xgb


# Additional Gradient Boosted Tree Implementations


## 2. Get the data

In [None]:
# Load the train and test splits of the California housing data from CSV.
train_df = pd.read_csv(filepath_or_buffer="sample_data/california_housing_train.csv")
test_df = pd.read_csv(filepath_or_buffer="sample_data/california_housing_test.csv")

In [None]:
# Print a concise summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Separate the regression target from the predictors.
# The target is the "median_house_value" column; every other column is a feature.
train_target = train_df["median_house_value"]
train_features_df = train_df.drop(columns="median_house_value")

In [None]:
# Peek at the first rows of the feature frame.
train_features_df.head()

In [None]:
# Summarise the target rather than echoing the whole Series.
# The reported minimum also shows whether every value is strictly positive,
# which the `np.log` target transform used by the model pipelines relies on.
train_target.describe()

## 3. Exploratory Data Analysis

## 4. Prepare Data for ML

In [None]:
# Shared preprocessing for all models: impute missing values, then standardise.
# Step names are spelled out explicitly and match the lowercase class names that
# `make_pipeline` would generate. `set_output(transform="pandas")` makes the
# transformers emit DataFrames instead of NumPy arrays.
preprocessing_pipeline = pipeline.Pipeline(
    steps=[
        ("simpleimputer", impute.SimpleImputer()),
        ("standardscaler", preprocessing.StandardScaler()),
    ],
    verbose=True,
).set_output(transform="pandas")


## 5. Short-list Promising Models

### CatBoost

In [None]:
# IPython help lookup: show the signature and docstring of CatBoostRegressor.
cb.CatBoostRegressor?

In [None]:
# CatBoost regressor with default hyper-parameters, plus two housekeeping flags:
#   * verbose=False silences the per-iteration training log, which would
#     otherwise flood the output on every fit.
#   * allow_writing_files=False stops CatBoost from writing its analytical and
#     snapshot files into the working directory. The cross-validation below fits
#     several copies in parallel, and they would all share that directory.
_regressor = cb.CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
)

# Train on log(target) and map predictions back with exp, so that predictions
# (and therefore any scores) are expressed in the original target units.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    regressor=_regressor,
    func=np.log,
    inverse_func=np.exp,
)

# Full model: shared preprocessing followed by the target-transformed regressor.
catboost_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline,
    _transformed_target_regressor,
    verbose=True,
)

In [None]:
# Display the assembled CatBoost pipeline.
catboost_pipeline

In [None]:
# 5-fold cross-validation of the CatBoost pipeline on the training data.
# The scorer returns the *negated* RMSE, one value per fold; folds run in
# parallel on all available cores (n_jobs=-1).
cv_scores = model_selection.cross_val_score(
    estimator=catboost_pipeline,
    X=train_features_df,
    y=train_target,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Scores are negated RMSE values; flip the sign and average over the folds.
(-cv_scores).mean()

### LightGBM

In [None]:
# IPython help lookup: show the signature and docstring of LGBMRegressor.
lgb.LGBMRegressor?

In [None]:
# LightGBM regressor with default hyper-parameters.
_regressor = lgb.LGBMRegressor()

# Train on log(target); predictions are mapped back to the original scale with exp.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    regressor=_regressor, func=np.log, inverse_func=np.exp
)

# Full model: shared preprocessing followed by the target-transformed regressor.
lgb_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline, _transformed_target_regressor, verbose=True
)

In [None]:
# Display the assembled LightGBM pipeline.
lgb_pipeline

In [None]:
# 5-fold cross-validation of the LightGBM pipeline on the training data.
# The scorer returns the *negated* RMSE, one value per fold; folds run in
# parallel on all available cores (n_jobs=-1).
cv_scores = model_selection.cross_val_score(
    estimator=lgb_pipeline,
    X=train_features_df,
    y=train_target,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Scores are negated RMSE values; flip the sign and average over the folds.
(-cv_scores).mean()

### XGBoost

In [None]:
# XGBoost regressor with default hyper-parameters.
_regressor = xgb.XGBRegressor()

# Train on log(target); predictions are mapped back to the original scale with exp.
_transformed_target_regressor = compose.TransformedTargetRegressor(
    regressor=_regressor, func=np.log, inverse_func=np.exp
)

# Full model: shared preprocessing followed by the target-transformed regressor.
xgb_pipeline = pipeline.make_pipeline(
    preprocessing_pipeline, _transformed_target_regressor, verbose=True
)

In [None]:
# Display the assembled XGBoost pipeline.
xgb_pipeline

In [None]:
# 5-fold cross-validation of the XGBoost pipeline on the training data.
# The scorer returns the *negated* RMSE, one value per fold; folds run in
# parallel on all available cores (n_jobs=-1).
cv_scores = model_selection.cross_val_score(
    estimator=xgb_pipeline,
    X=train_features_df,
    y=train_target,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

In [None]:
# Scores are negated RMSE values; flip the sign and average over the folds.
(-cv_scores).mean()

## 6. Fine-tune the most promising models

### Exercise

Fine-tune the `CatBoostRegressor` pipeline.

### Exercise

Fine-tune the `LGBMRegressor` pipeline.

#### Solution

### Exercise

Fine-tune the `XGBRegressor` pipeline.

#### Solution

### Exercise

Compare the performance of the fine-tuned gradient boosting models. Which is the best model?

#### Solution

## 7. Summarize your results

### Exercise:

Re-fit your best model on the entire training data set.

#### Solution:

### Exercise:

Generate predictions for the test data using your best pipeline.