# Generalized Huber Regression
This notebook illustrates how to use LightGBM to optimize the Generalized Huber loss (GHL) function with a log link function.  

In [31]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score
from scipy.stats import skewnorm

As an example we consider a one-dimensional linear problem with N data points, a skew-normal error distribution and non-constant variance.  

In [32]:
N = 50000
sigma0 = 1
SEED = 0  # fixed seed so that Restart & Run All regenerates the same sample

xvec = np.linspace(-500, 500, N)

# Heteroscedastic noise: the skew-normal scale grows quadratically with |x|.
noise_scale = sigma0 + np.abs(xvec)**(2)

# A single vectorised draw (one sample per entry of `noise_scale`) replaces
# N separate Python-level calls to `skewnorm.rvs`.
yvec = xvec + skewnorm.rvs(
    a=100, loc=0, scale=noise_scale, size=N, random_state=SEED)

Next we define the log link function and its inverse.

In [33]:
def log_link_trans(x):
    """Apply the symmetric log link ``sgn(x) * log(1 + |x|)`` element-wise.

    Parameters
    ----------
    x : 1-D NumPy array or pandas Series
        Values on the original scale. Must support ``len`` and boolean-mask
        indexing.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)``. Entries that are neither ``< 0``
        nor ``>= 0`` (e.g. NaN) are left at 0.
    """
    transformed = np.zeros(len(x))
    negative = x < 0
    non_negative = x >= 0
    transformed[non_negative] = np.log1p(x[non_negative])
    # Mirror the logarithm around the origin so negative inputs stay negative.
    transformed[negative] = -np.log1p(-x[negative])
    return transformed


def log_link_back_trans(x):
    """Invert the symmetric log link: ``sgn(x) * (exp(|x|) - 1)`` element-wise.

    ``np.expm1`` is used instead of ``np.exp(...) - 1`` so that the result is
    accurate for inputs close to zero (where ``exp(x) - 1`` cancels to 0) and
    so that this function is the numerical inverse of the ``np.log1p``-based
    forward transform.

    Parameters
    ----------
    x : 1-D NumPy array or pandas Series
        Values in link (log) space. Must support ``len`` and boolean-mask
        indexing.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)`` on the original scale. Entries that
        are neither ``< 0`` nor ``>= 0`` (e.g. NaN) are left at 0.
    """
    negative = x < 0
    non_negative = x >= 0
    restored = np.zeros(len(x))
    # For x < 0: 1 - exp(-x) == -(exp(|x|) - 1) == -expm1(-x).
    restored[negative] = -np.expm1(-x[negative])
    restored[non_negative] = np.expm1(x[non_negative])
    return restored

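We define two metric functions which both compute the $r^{2}$ score on the original scale of the target: `metric_func_1` expects labels in link (log-transformed) space and back-transforms both labels and predictions, whereas `metric_func_2` expects untransformed labels and back-transforms only the predictions.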

In [34]:
def metric_func_1(yhat, dtrain):
    """Custom evaluation metric: r2 score after back-transforming both inputs.

    Both the labels returned by ``dtrain.get_label()`` and the predictions
    ``yhat`` are passed through ``log_link_back_trans`` before scoring.

    Returns
    -------
    tuple
        ``('r2_score', value, True)``.
    """
    labels_link = dtrain.get_label().flatten()
    preds_link = yhat.flatten()
    score = r2_score(
        log_link_back_trans(labels_link),
        log_link_back_trans(preds_link),
    )
    return ('r2_score', score, True)


def metric_func_2(yhat, dtrain):
    """Custom evaluation metric: r2 score with only predictions back-transformed.

    The labels from ``dtrain.get_label()`` are used as they are; the
    predictions ``yhat`` are passed through ``log_link_back_trans`` first.

    Returns
    -------
    tuple
        ``('r2_score', value, True)``.
    """
    preds_original_scale = log_link_back_trans(yhat.flatten())
    score = r2_score(dtrain.get_label(), preds_original_scale)
    return ('r2_score', score, True)

The next cell defines the objective function for LightGBM by computing the 1st and 2nd derivative of the GHL function.

In [35]:
def generalized_huber_obj(yhat, dtrain, alpha=0.75):
    """Custom objective: gradient and Hessian of the Generalized Huber loss.

    The residual is measured in link space, ``diff = g(y) - yhat`` with
    ``g(x) = sgn(x) * log(1 + |x|)``, and ``alpha`` is the threshold on
    ``|diff|`` that selects which of two sets of derivative formulas is used.

    Parameters
    ----------
    yhat : numpy.ndarray
        Current scores; flattened to 1-D and compared against ``g(y)``,
        i.e. treated as values in link space.
    dtrain : object with ``get_label()``
        Supplies the labels ``y``. The link ``g`` is applied to them here, so
        presumably they are on the original (untransformed) scale; verify
        against the caller.
    alpha : float, default 0.75
        Threshold on ``|g(y) - yhat|`` separating the two branches.

    Returns
    -------
    (grad, hess) : tuple of numpy.ndarray
        1-D arrays of length ``len(yhat)``. Both start as zeros and are only
        written where one of the four masks below is True.
    """
    y = dtrain.get_label()
    yhat = yhat.flatten()

    def sgn(x):
        # Sign function that maps 0 to +1 (np.sign alone would return 0).
        sig = np.sign(x)
        sig[sig == 0] = 1
        return sig

    # Link g, its inverse ginv, and the first / second derivative of ginv.
    g = lambda x: sgn(x) * np.log(1 + np.abs(x))
    ginv = lambda x: sgn(x) * (np.exp(np.abs(x)) - 1)
    ginvp = lambda x: np.exp(np.abs(x))
    ginvpp = lambda x: sgn(x) * np.exp(np.abs(x))

    # Residual in link space.
    diff = g(y) - yhat
    absdiff = np.abs(diff)

    # Four disjoint masks: bool1 = |diff| <= alpha, bool2 = |diff| > alpha;
    # suffix _l = negative residual, _r = non-negative residual.
    bool1_l = ((absdiff <= alpha) & (diff < 0))
    bool1_r = ((absdiff <= alpha) & (diff >= 0))
    bool2_l = ((absdiff > alpha) & (diff < 0))
    bool2_r = ((absdiff > alpha) & (diff >= 0))

    grad = np.zeros([1, len(yhat)]).flatten()
    hess = np.zeros([1, len(yhat)]).flatten()

    # A depends on yhat; Ap and App are built from ginvp / ginvpp in the same
    # way, i.e. they are the first and second derivative of A w.r.t. yhat.
    # B depends only on y and alpha.
    A = np.zeros([1, len(yhat)]).flatten()
    Ap = np.zeros([1, len(yhat)]).flatten()
    App = np.zeros([1, len(yhat)]).flatten()
    B = np.zeros([1, len(yhat)]).flatten()

    # A = ginv(yhat -/+ alpha) - ginv(yhat): shift of -alpha on the _l masks,
    # +alpha on the _r masks.
    A[bool1_l] = ginv(yhat[bool1_l] - alpha) - ginv(yhat[bool1_l])
    A[bool1_r] = ginv(yhat[bool1_r] + alpha) - ginv(yhat[bool1_r])
    Ap[bool1_l] = ginvp(yhat[bool1_l] - alpha) - ginvp(yhat[bool1_l])
    Ap[bool1_r] = ginvp(yhat[bool1_r] + alpha) - ginvp(yhat[bool1_r])
    App[bool1_l] = ginvpp(yhat[bool1_l] - alpha) - ginvpp(yhat[bool1_l])
    App[bool1_r] = ginvpp(yhat[bool1_r] + alpha) - ginvpp(yhat[bool1_r])

    # Same expressions as above, evaluated on the |diff| > alpha masks.
    A[bool2_l] = ginv(yhat[bool2_l] - alpha) - ginv(yhat[bool2_l])
    A[bool2_r] = ginv(yhat[bool2_r] + alpha) - ginv(yhat[bool2_r])
    Ap[bool2_l] = ginvp(yhat[bool2_l] - alpha) - ginvp(yhat[bool2_l])
    Ap[bool2_r] = ginvp(yhat[bool2_r] + alpha) - ginvp(yhat[bool2_r])
    App[bool2_l] = ginvpp(yhat[bool2_l] - alpha) - ginvpp(yhat[bool2_l])
    App[bool2_r] = ginvpp(yhat[bool2_r] + alpha) - ginvpp(yhat[bool2_r])

    # B is only filled (and only read) on the |diff| <= alpha masks; the sign
    # of the alpha shift is opposite to the one used for A.
    B[bool1_l] = y[bool1_l] - ginv(g(y[bool1_l]) + alpha)
    B[bool1_r] = y[bool1_r] - ginv(g(y[bool1_r]) - alpha)

    # |diff| <= alpha: gradient consistent with differentiating
    # (y - ginv(yhat))**2 * (1/|A| + 1/|B|) with respect to yhat.
    grad[bool1_l] = -2*(y[bool1_l]-ginv(yhat[bool1_l]))*ginvp(yhat[bool1_l])*(1/np.abs(A[bool1_l]) + 1/np.abs(B[bool1_l])) \
                    -(y[bool1_l]-ginv(yhat[bool1_l]))**2*(1/(np.abs(A[bool1_l])**2))*sgn(A[bool1_l])*Ap[bool1_l]

    grad[bool1_r] = -2*(y[bool1_r]-ginv(yhat[bool1_r]))*ginvp(yhat[bool1_r])*(1/np.abs(A[bool1_r]) + 1/np.abs(B[bool1_r])) \
                    -(y[bool1_r]-ginv(yhat[bool1_r]))**2*(1/(np.abs(A[bool1_r])**2))*sgn(A[bool1_r])*Ap[bool1_r]

    # |diff| <= alpha: derivative of the gradient above with respect to yhat.
    hess[bool1_l] = 2*(ginvp(yhat[bool1_l])**2 - (y[bool1_l]-ginv(yhat[bool1_l]))*ginvpp(yhat[bool1_l]))*(1/np.abs(A[bool1_l]) + 1/np.abs(B[bool1_l])) \
                    +4*(y[bool1_l]-ginv(yhat[bool1_l]))*ginvp(yhat[bool1_l])*(1/(np.abs(A[bool1_l])**2))*sgn(A[bool1_l])*Ap[bool1_l] \
                    +2*(y[bool1_l]-ginv(yhat[bool1_l]))**2*(1/(np.abs(A[bool1_l])**3))*Ap[bool1_l]**2 \
                    -(y[bool1_l]-ginv(yhat[bool1_l]))**2*(1/(np.abs(A[bool1_l])**2))*sgn(A[bool1_l])*App[bool1_l]

    hess[bool1_r] = 2*(ginvp(yhat[bool1_r])**2 - (y[bool1_r]-ginv(yhat[bool1_r]))*ginvpp(yhat[bool1_r]))*(1/np.abs(A[bool1_r]) + 1/np.abs(B[bool1_r])) \
                   +4*(y[bool1_r]-ginv(yhat[bool1_r]))*ginvp(yhat[bool1_r])*(1/(np.abs(A[bool1_r])**2))*sgn(A[bool1_r])*Ap[bool1_r] \
                   +2*(y[bool1_r]-ginv(yhat[bool1_r]))**2*(1/(np.abs(A[bool1_r])**3))*Ap[bool1_r]**2 \
                    -(y[bool1_r]-ginv(yhat[bool1_r]))**2*(1/(np.abs(A[bool1_r])**2))*sgn(A[bool1_r])*App[bool1_r]

    # |diff| > alpha: gradient consistent with differentiating
    # 4*|y - ginv(yhat)| - |A| with respect to yhat (terms that do not depend
    # on yhat drop out). Note that sgn() maps a zero residual to +1.
    grad[bool2_l] = -4 * sgn(y[bool2_l] - ginv(yhat[bool2_l])) * ginvp(
        yhat[bool2_l]) - sgn(A[bool2_l]) * Ap[bool2_l]

    grad[bool2_r] = -4 * sgn(y[bool2_r] - ginv(yhat[bool2_r])) * ginvp(
        yhat[bool2_r]) - sgn(A[bool2_r]) * Ap[bool2_r]

    # |diff| > alpha: same form as the gradient, with ginvp -> ginvpp and
    # Ap -> App (the sign factors are treated as constants).
    hess[bool2_l] = -4 * sgn(y[bool2_l] - ginv(yhat[bool2_l])) * ginvpp(
        yhat[bool2_l]) - sgn(A[bool2_l]) * App[bool2_l]

    hess[bool2_r] = -4 * sgn(y[bool2_r] - ginv(yhat[bool2_r])) * ginvpp(
        yhat[bool2_r]) - sgn(A[bool2_r]) * App[bool2_r]

    return grad, hess

Next we put our data into a DataFrame, define the train and test sets and compute 3 cross-validation folds.

In [36]:
df = pd.DataFrame(np.column_stack((xvec, yvec)), columns=['x', 'y'])

# A fixed random_state makes the 70/30 split, and therefore every model fitted
# on it, reproducible across kernel restarts.
# Note: X_train / X_test still contain the target column 'y'; it is dropped
# wherever the frames are handed to LightGBM.
X_train, X_test = train_test_split(df, test_size=0.3, random_state=0)
y_train = X_train['y']
y_test = X_test['y']

# Each row forms its own group (the index inherited from `df` is unique), so
# GroupKFold yields 3 deterministic row-wise folds. The indices in `folds` are
# positional with respect to X_train.
groups = X_train.index
group_kfold = GroupKFold(n_splits=3)
folds = list(group_kfold.split(X_train, y_train, groups=groups))

We start by training a shallow LightGBM model whose output will be the starting vector of the GHL model.

In [37]:
# Parameters of the warm-start model: built-in Huber objective, no built-in metric.
lgb_param_start_vector = dict(
    learning_rate=0.1,
    num_leaves=2**5 - 1,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=1,
    alpha=2,
    objective='huber',
    metric='none',
    tree_learner='data',
    verbosity=3,
)

# Feature matrix without the target column.
train_features = X_train.drop('y', axis=1)

# The warm-start model is fitted on the log-link-transformed target.
lgbtrain_trans = lgb.Dataset(train_features, label=log_link_trans(y_train))
lgbmodel0 = lgb.train(
    lgb_param_start_vector,
    lgbtrain_trans,
    num_boost_round=20,
)

# Its predictions become the initial scores of the GHL dataset, whose labels
# stay on the original scale.
start_vec = lgbmodel0.predict(train_features)
lgbtrain = lgb.Dataset(train_features, init_score=start_vec, label=y_train)

Next we perform a hyperparameter search to find the optimal $\alpha$ parameter in the GHL function.

In [38]:
# Parameters shared by every GHL fit; the objective itself is passed via `fobj`.
lgb_param_ghl = dict(
    learning_rate=0.1,
    num_leaves=2**5 - 1,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=1,
    metric='none',
    max_delta_step=3,
    verbosity=1,
)

# Grid of candidate alpha values and the cross-validated r2 reached by each.
kwargsvec = np.linspace(0.1, 1, 20)
chvec = np.zeros(len(kwargsvec))
for k, alpha_candidate in enumerate(kwargsvec):
    kwargs = {'alpha': alpha_candidate}
    cvmodel = lgb.cv(
        lgb_param_ghl,
        lgbtrain,
        num_boost_round=10000,
        folds=folds,
        fobj=(lambda a, b: generalized_huber_obj(a, b, **kwargs)),
        metrics='none',
        feval=metric_func_2,
        early_stopping_rounds=20,
        verbose_eval=False)
    # Last recorded mean r2 of this run.
    chvec[k] = cvmodel['r2_score-mean'][-1]
    print(alpha_candidate)

# Index of the best-scoring alpha and the alpha itself.
best_iter = chvec.argmax()
alpha_ch = kwargsvec[best_iter]

0.1
0.147368421053
0.194736842105
0.242105263158
0.289473684211
0.336842105263
0.384210526316
0.431578947368
0.478947368421
0.526315789474
0.573684210526
0.621052631579
0.668421052632
0.715789473684
0.763157894737
0.810526315789
0.857894736842
0.905263157895
0.952631578947
1.0


Having found the optimal $\alpha$ we can now train the GHL model.

In [39]:
kwargs = {'alpha': alpha_ch}


def ghl_objective(scores, dataset):
    """GHL objective bound to the current module-level ``kwargs``."""
    return generalized_huber_obj(scores, dataset, **kwargs)


# Cross-validate once more with the selected alpha to find the number of rounds.
cvmodel = lgb.cv(
    lgb_param_ghl,
    lgbtrain,
    num_boost_round=10000,
    folds=folds,
    fobj=ghl_objective,
    metrics='none',
    feval=metric_func_2,
    early_stopping_rounds=20,
    verbose_eval=True,
)
n_rounds_ghl = len(cvmodel['r2_score-mean'])

print('generalized huber best alpha: ', alpha_ch)
print('generalized huber best iteration: ', n_rounds_ghl)

# Final GHL model on the full training set with that many boosting rounds.
lgbmodel = lgb.train(
    lgb_param_ghl,
    lgbtrain,
    num_boost_round=n_rounds_ghl,
    fobj=ghl_objective,
    feval=metric_func_2,
    verbose_eval=True,
)

[1]	cv_agg's r2_score: 0.315434 + 0.00452089
[2]	cv_agg's r2_score: 0.342636 + 0.00441996
[3]	cv_agg's r2_score: 0.361075 + 0.00422776
[4]	cv_agg's r2_score: 0.374538 + 0.00399017
[5]	cv_agg's r2_score: 0.384616 + 0.00376263
[6]	cv_agg's r2_score: 0.392602 + 0.00359278
[7]	cv_agg's r2_score: 0.398893 + 0.00343957
[8]	cv_agg's r2_score: 0.404034 + 0.0034064
[9]	cv_agg's r2_score: 0.408239 + 0.00334803
[10]	cv_agg's r2_score: 0.411665 + 0.00334058
[11]	cv_agg's r2_score: 0.414488 + 0.00332364
[12]	cv_agg's r2_score: 0.416813 + 0.00332702
[13]	cv_agg's r2_score: 0.418806 + 0.00338979
[14]	cv_agg's r2_score: 0.420461 + 0.00341538
[15]	cv_agg's r2_score: 0.42186 + 0.00344469
[16]	cv_agg's r2_score: 0.423037 + 0.00345049
[17]	cv_agg's r2_score: 0.424073 + 0.00352018
[18]	cv_agg's r2_score: 0.424928 + 0.00356757
[19]	cv_agg's r2_score: 0.425669 + 0.00359542
[20]	cv_agg's r2_score: 0.426287 + 0.003615
[21]	cv_agg's r2_score: 0.426843 + 0.00366473
[22]	cv_agg's r2_score: 0.42726 + 0.00368291
[2

Next, an MAE model is trained on the log-link-transformed target ...

In [40]:
# Baseline 1: built-in MAE objective, fitted on the log-link-transformed target.
lgb_param_mae = dict(
    learning_rate=0.1,
    num_leaves=2**5 - 1,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=1,
    objective='mae',
    metric='none',
    tree_learner='data',
    verbosity=1,
)

# Cross-validation with early stopping determines the number of rounds.
cvmodel_mae = lgb.cv(
    lgb_param_mae,
    lgbtrain_trans,
    num_boost_round=10000,
    folds=folds,
    feval=metric_func_1,
    metrics='none',
    early_stopping_rounds=20,
    verbose_eval=True,
)
n_rounds_mae = len(cvmodel_mae['r2_score-mean'])

lgbmodel_mae = lgb.train(
    lgb_param_mae,
    lgbtrain_trans,
    num_boost_round=n_rounds_mae,
    verbose_eval=True,
)

[1]	cv_agg's r2_score: -0.111879 + 0.00170896
[2]	cv_agg's r2_score: -0.0633219 + 0.0020626
[3]	cv_agg's r2_score: -0.0184442 + 0.00261469
[4]	cv_agg's r2_score: 0.023344 + 0.00322538
[5]	cv_agg's r2_score: 0.0622578 + 0.00392642
[6]	cv_agg's r2_score: 0.0987785 + 0.00430932
[7]	cv_agg's r2_score: 0.132495 + 0.00469293
[8]	cv_agg's r2_score: 0.163179 + 0.00506662
[9]	cv_agg's r2_score: 0.191234 + 0.00532241
[10]	cv_agg's r2_score: 0.216405 + 0.00559689
[11]	cv_agg's r2_score: 0.238911 + 0.00587978
[12]	cv_agg's r2_score: 0.259147 + 0.00620783
[13]	cv_agg's r2_score: 0.277605 + 0.00608483
[14]	cv_agg's r2_score: 0.293773 + 0.00615044
[15]	cv_agg's r2_score: 0.308201 + 0.00597854
[16]	cv_agg's r2_score: 0.320864 + 0.00591604
[17]	cv_agg's r2_score: 0.331967 + 0.00574992
[18]	cv_agg's r2_score: 0.341958 + 0.00545441
[19]	cv_agg's r2_score: 0.350699 + 0.00520376
[20]	cv_agg's r2_score: 0.358451 + 0.00495837
[21]	cv_agg's r2_score: 0.365124 + 0.00473855
[22]	cv_agg's r2_score: 0.370948 + 0.

... as well as a RMSE model. 

In [41]:
# Baseline 2: built-in RMSE objective, fitted on the log-link-transformed target.
lgb_param_rmse = dict(
    learning_rate=0.1,
    num_leaves=2**5 - 1,
    bagging_fraction=1,
    bagging_freq=1,
    feature_fraction=1,
    objective='rmse',
    metric='none',
    tree_learner='data',
    verbosity=1,
)

# Cross-validation with early stopping determines the number of rounds.
cvmodel_rmse = lgb.cv(
    lgb_param_rmse,
    lgbtrain_trans,
    num_boost_round=10000,
    folds=folds,
    feval=metric_func_1,
    metrics='none',
    early_stopping_rounds=20,
    verbose_eval=True,
)
n_rounds_rmse = len(cvmodel_rmse['r2_score-mean'])

lgbmodel_rmse = lgb.train(
    lgb_param_rmse,
    lgbtrain_trans,
    num_boost_round=n_rounds_rmse,
    verbose_eval=True,
)

[1]	cv_agg's r2_score: -0.2598 + 0.00272392
[2]	cv_agg's r2_score: -0.221886 + 0.00162379
[3]	cv_agg's r2_score: -0.184382 + 0.00060316
[4]	cv_agg's r2_score: -0.147394 + 0.000372647
[5]	cv_agg's r2_score: -0.111402 + 0.00144645
[6]	cv_agg's r2_score: -0.0765694 + 0.00243073
[7]	cv_agg's r2_score: -0.0432549 + 0.00340817
[8]	cv_agg's r2_score: -0.0115667 + 0.00420541
[9]	cv_agg's r2_score: 0.0180644 + 0.00497593
[10]	cv_agg's r2_score: 0.0457333 + 0.0055736
[11]	cv_agg's r2_score: 0.0714348 + 0.00606819
[12]	cv_agg's r2_score: 0.0950789 + 0.00650422
[13]	cv_agg's r2_score: 0.11669 + 0.00693169
[14]	cv_agg's r2_score: 0.136469 + 0.00726533
[15]	cv_agg's r2_score: 0.154487 + 0.00751
[16]	cv_agg's r2_score: 0.170871 + 0.00766347
[17]	cv_agg's r2_score: 0.185628 + 0.00775919
[18]	cv_agg's r2_score: 0.198968 + 0.00782995
[19]	cv_agg's r2_score: 0.210918 + 0.00791676
[20]	cv_agg's r2_score: 0.221699 + 0.00794811
[21]	cv_agg's r2_score: 0.231464 + 0.00802169
[22]	cv_agg's r2_score: 0.240198 +

We can now compute the predictions of all 3 models on the train and test set.

In [42]:
# Feature matrices without the target column.
X_test_features = X_test.drop('y', axis=1)
X_train_features = X_train.drop('y', axis=1)

# GHL: the warm-start scores are added to the model's raw prediction before
# mapping back to the original scale.
start_vec_test = lgbmodel0.predict(X_test_features)
scores_ghl_test = log_link_back_trans(
    lgbmodel.predict(X_test_features) + start_vec_test)
scores_ghl_train = log_link_back_trans(
    lgbmodel.predict(X_train_features) + start_vec)

# RMSE and MAE baselines: predictions are in link space, so map them back.
scores_rmse_train = log_link_back_trans(lgbmodel_rmse.predict(X_train_features))
scores_rmse_test = log_link_back_trans(lgbmodel_rmse.predict(X_test_features))

scores_mae_train = log_link_back_trans(lgbmodel_mae.predict(X_train_features))
scores_mae_test = log_link_back_trans(lgbmodel_mae.predict(X_test_features))

With the predictions at hand we finally compute the $r^{2}$ score and the relative error of the global mean.

In [43]:
def _relative_mean_error(actual, predicted):
    """Signed difference of the global means, relative to the mean of `actual`."""
    return (actual.mean() - predicted.mean()) / (actual.mean())


# Relative error of the predicted global mean, per model and data split.
for label, actual, predicted in [
    ("mean error train ghl = ", y_train, scores_ghl_train),
    ("mean error test ghl  = ", y_test, scores_ghl_test),
    ("mean error train mae = ", y_train, scores_mae_train),
    ("mean error test mae = ", y_test, scores_mae_test),
    ("mean error train rmse = ", y_train, scores_rmse_train),
    ("mean error test rmse = ", y_test, scores_rmse_test),
]:
    print(label, _relative_mean_error(actual, predicted))

print("             ..........          ")

# r2 score on the original scale, per model and data split.
for label, actual, predicted in [
    ("r2 score train ghl = ", y_train, scores_ghl_train),
    ("r2 scores test ghl  = ", y_test, scores_ghl_test),
    ("r2 score train mae = ", y_train, scores_mae_train),
    ("r2 score test mae = ", y_test, scores_mae_test),
    ("r2 score train rmse = ", y_train, scores_rmse_train),
    ("r2 score test rmse = ", y_test, scores_rmse_test),
]:
    print(label, r2_score(actual, predicted))

mean error train ghl =  0.0791477914076
mean error test ghl  =  0.0697342566865
mean error train mae =  0.153254702295
mean error test mae =  0.143838062861
mean error train rmse =  0.353968098928
mean error test rmse =  0.347008685339
             ..........          
r2 score train ghl =  0.439297282841
r2 scores test ghl  =  0.423486917981
r2 score train mae =  0.421378611685
r2 score test mae =  0.410388710145
r2 score train rmse =  0.321584801783
r2 score test rmse =  0.314374615368
