# XGBoost for Regression
Common regression metrics: RMSE, MAE

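Common xgboost objectives (loss functions): `reg:squarederror` (regression with squared loss), `reg:logistic` (logistic regression), `binary:logistic` (logistic regression for binary classification; outputs probabilities)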


In [1]:
# load packages
import xgboost
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

# Load the Boston housing data and assemble features plus target in one frame.
# NOTE(review): `load_boston` was removed in scikit-learn 1.2, so this cell
# needs an older scikit-learn release to run.
boston_data = load_boston()
feature_frame = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
# `assign` appends `target` as the last column, which the split cell relies on.
data = feature_frame.assign(target=boston_data.target)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [2]:
# Features are every column except the last; the last column is the target.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Hold out 20% of the rows for evaluation; fixed random_state for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=123,
)

In [3]:
# Fit a small (10-round) boosted regressor through the scikit-learn style API
# using squared-error loss, then predict on the held-out rows.
xg_reg = xgboost.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=10,
    seed=123,
)
xg_reg.fit(X_train, y_train)
preds = xg_reg.predict(X_test)

In [4]:
# RMSE is the square root of the mean squared error on the held-out rows.
test_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(test_mse)
print(f'RMSE: {rmse}')

RMSE: 3.7824431053497274


## Linear base learner example (native learning API only)


In [5]:
# Wrap the train/test splits in DMatrix objects, the container that
# `xgboost.train` and `Booster.predict` consume in the cells below.
DM_train = xgboost.DMatrix(X_train, label=y_train)
DM_test = xgboost.DMatrix(X_test, label=y_test)
DM_train

<xgboost.core.DMatrix at 0x236333445e0>

In [6]:
# Train a boosted *linear* model (booster='gblinear') with the native learning API.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear'; it is the same
# squared-error loss and matches the objective used with XGBRegressor above.
params = {'booster': 'gblinear', 'objective': 'reg:squarederror'}

# `xgboost.train` returns a Booster, so prediction needs a DMatrix rather than
# the raw DataFrame.
xg_reg = xgboost.train(params=params, dtrain=DM_train, num_boost_round=10)
preds = xg_reg.predict(DM_test)
preds



array([ 8.056912 , 25.237148 , 27.845907 , 17.607294 , 33.50383  ,
       29.472982 , 25.423195 ,  6.8166304, 21.959911 , 31.544586 ,
       30.615196 , 23.466148 , 17.579546 , 26.651802 , 20.534979 ,
       21.706196 , 22.460348 , 26.79021  , 20.88407  , 17.493168 ,
       18.081396 , 25.668032 , 26.79259  , 31.561064 , 28.929924 ,
       21.558855 , 17.342001 , 25.53703  , 25.540302 , 18.509    ,
       25.721325 , 30.146194 ,  8.918329 , 23.071997 , 22.880598 ,
       25.240671 , 26.947212 , 18.580225 , 20.16434  , 27.239523 ,
       27.93982  , 19.9379   , 19.41831  , 29.750648 , 14.577329 ,
       27.748499 , 24.374016 , 24.47593  , 21.578907 , 19.97296  ,
       33.87521  , 22.89961  , 24.494663 , 16.406393 , 19.800056 ,
       18.163189 , 13.965478 ,  9.481693 , 33.36616  , 17.639278 ,
       22.794327 , 25.060879 , 18.362679 , 19.354233 , 25.158009 ,
       27.863115 , 27.0172   , 16.037558 , 24.261993 , 28.219898 ,
       23.031898 , 26.724415 , 21.658415 , 25.916233 , 17.7500

In [7]:
# RMSE of the linear-booster predictions on the held-out rows.
linear_rmse = np.sqrt(mean_squared_error(y_test, preds))
linear_rmse


6.131859199031218

In [8]:
# Create the DMatrix over the full dataset; cross-validation does its own splitting.
housing_dmatrix = xgboost.DMatrix(data=X, label=y)

# Create the parameter dictionary.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same squared-error loss).
params = {"objective": "reg:squarederror", "max_depth": 4}

# Perform 4-fold cross-validation for 5 boosting rounds, tracking RMSE per round.
cv_results = xgboost.cv(
    dtrain=housing_dmatrix,
    params=params,
    nfold=4,
    num_boost_round=5,
    metrics='rmse',
    as_pandas=True,
    seed=123,
)

# Print the per-round train/test RMSE table
print(cv_results)

# Extract and print the test RMSE of the final boosting round
print((cv_results["test-rmse-mean"]).tail(1))


   train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0        17.120438        0.057828       17.151868       0.295723
1        12.353698        0.034428       12.510376       0.372386
2         9.017976        0.038795        9.245966       0.314345
3         6.690101        0.047236        7.060159       0.317659
4         5.069410        0.048645        5.571861       0.252100
4    5.571861
Name: test-rmse-mean, dtype: float64


## Regularization -> control on model complexity
gamma - minimum loss reduction required for a split to occur (tree-based learners only); larger values mean fewer splits

alpha - L1 regularization on leaf weights; larger values mean more regularization

lambda - L2 regularization on leaf weights; a smoother penalty than L1

In [10]:
# Sweep the L1 penalty (alpha) and record the cross-validated test RMSE for each value.
# 'reg:squarederror' replaces the deprecated alias 'reg:linear' (same squared-error loss).
# Relies on `housing_dmatrix` built in the cross-validation cell above.
params = {'objective': 'reg:squarederror', 'max_depth': 4}
l1_params = [1, 10, 100]
rmse_l1 = []

for reg in l1_params:
    # Only alpha changes between runs; the other parameters stay fixed.
    params['alpha'] = reg
    cv_results = xgboost.cv(dtrain=housing_dmatrix, params=params, nfold=4, num_boost_round=10,
                            metrics='rmse', as_pandas=True, seed=123)
    # Keep the mean test RMSE of the last boosting round.
    rmse_l1.append(cv_results['test-rmse-mean'].iloc[-1])

print(pd.DataFrame({'l1': l1_params, 'rmse': rmse_l1}))

    l1      rmse
0    1  3.461474
1   10  3.821152
2  100  4.645518
