In [1]:
from src.models.xgb import XGBBERTResNet152

In [2]:
# Instantiate the project model wrapper. The log printed right after this cell
# shows that construction loads the dbmdz/bert-base-italian-xxl-uncased BERT
# checkpoint. Later cells read the X_train / X_dev / X_test and
# y_train / y_dev / y_test attributes of this object.
model = XGBBERTResNet152()

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
from functools import partial

# Silence every Python warning for the rest of the session. This was added
# because of skopt's verbosity, but the filter is global: warnings raised by
# any other library (deprecations included) are hidden as well.
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBRegressor, DMatrix

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

In [4]:
def report_perf(optimizer, X, y, title="Model", callbacks=None):
    """Fit a hyperparameter-search object and print a short summary.

    Parameters
    ----------
    optimizer
        Search object exposing ``fit`` and, once fitted, ``cv_results_``,
        ``best_score_``, ``best_index_`` and ``best_params_``.
    X, y
        Training data, passed unchanged to ``optimizer.fit``.
    title : str
        Label printed at the start of the summary line. It is inserted as a
        value, so it may safely contain ``%`` characters.
    callbacks
        Optional callback(s). They are forwarded as ``callback=`` only when
        not ``None``, so search objects whose ``fit`` has no such argument
        still work.

    Returns
    -------
    The same ``optimizer`` object, now fitted.
    """
    start = time()

    if callbacks is not None:
        optimizer.fit(X, y, callback=callbacks)
    else:
        optimizer.fit(X, y)

    # Take the timestamp right after fitting so the reported duration does
    # not include building the results frame below.
    elapsed = time() - start

    results = pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = results.iloc[optimizer.best_index_].std_test_score
    best_params = optimizer.best_params_

    # The title is passed as a %s argument instead of being concatenated into
    # the format string; otherwise a '%' inside the title breaks formatting.
    print("%s took %.2f seconds,  candidates checked: %d, best CV score: %.3f \u00B1 %.3f"
          % (title,
             elapsed,
             len(optimizer.cv_results_['params']),
             best_score,
             best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return optimizer

In [5]:
from pytorch_lightning.utilities.parsing import AttributeDict

# Fixed (non-tuned) settings shared by the cells below.
# NOTE(review): "n_splits" is not read by any cell visible in this notebook
# (the CV cell hard-codes its own fold count) — confirm it is still needed.
hparams = AttributeDict(
    dict(
        random_state=42,
        booster="gbtree",
        objective="reg:squarederror",
        tree_method="hist",
        n_splits=5,
    )
)


In [6]:
# Base regressor handed to the hyperparameter search. Only the fixed settings
# stored in `hparams` are passed; no other constructor argument is set here.
reg = XGBRegressor(
    **{
        key: hparams[key]
        for key in ("random_state", "booster", "objective", "tree_method")
    }
)

In [7]:
def _rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error`` rather than through
    its ``squared=False`` flag, which recent scikit-learn releases deprecate
    and then remove; this form works on old and new versions alike.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes the scorer return the negated RMSE, so a
# search that maximises the score ends up minimising the error.
scoring = make_scorer(_rmse, greater_is_better=False)

In [8]:
# Pool the train and dev splits; the search below cross-validates on both.
X = np.vstack((model.X_train, model.X_dev))
y = np.hstack((model.y_train, model.y_dev))

# Stratification labels for the regression target: rank the values (ties
# broken by order of appearance) and cut the ranks into 10 equal-width bins.
y_stratified = (
    pd.Series(y)
    .rank(method="first")
    .pipe(pd.cut, bins=10, labels=False)
)

skf = StratifiedKFold(n_splits=7, random_state=0, shuffle=True)

# Materialise the folds once so every candidate is scored on the same splits.
cv_strategy = [fold for fold in skf.split(X, y_stratified)]

# Alternative kept for reference: a single hold-out split that fits on the
# train rows and validates on the dev rows.
# cv_strategy = [(np.arange(len(model.y_train)), np.arange(len(model.y_train), len(y)))]

In [9]:
# Hyperparameter ranges explored by the Bayesian search.
search_spaces = dict(
    learning_rate=Real(0.01, 1.0, prior="uniform"),
    max_depth=Integer(2, 12),
    subsample=Real(0.1, 1.0, prior="uniform"),
    colsample_bytree=Real(0.1, 1.0, prior="uniform"),  # subsample ratio of columns by tree
    reg_lambda=Real(1e-9, 100.0, prior="uniform"),     # L2 regularization
    reg_alpha=Real(1e-9, 100.0, prior="uniform"),      # L1 regularization
    n_estimators=Integer(50, 5000),
)

In [10]:
# Wrapping everything up into the Bayesian optimizer
# Wrap the base regressor, the search space and the CV folds into the
# Bayesian optimizer.
# The former `iid=False` argument was dropped: it is deprecated and ignored
# by current scikit-optimize / scikit-learn releases.
opt = BayesSearchCV(
    estimator=reg,
    search_spaces=search_spaces,
    scoring=scoring,
    cv=cv_strategy,
    n_iter=120,                                 # max number of trials
    n_points=1,                                 # hyperparameter sets evaluated at the same time
    n_jobs=1,                                   # number of parallel jobs
    return_train_score=False,
    verbose=2,
    refit=False,                                # the final model is fitted manually in a later cell
    optimizer_kwargs={"base_estimator": "GP"},  # Gaussian Process surrogate
    random_state=hparams.random_state,          # random state for replicability
)

In [11]:
# Running the optimizer
overdone_control = DeltaYStopper(delta=1)                    # We stop if the gain of the optimization becomes too small
time_limit_control = DeadlineStopper(total_time=60*20)

optimizer = report_perf(opt, X, y,"XGBoost Regression", 
                          callbacks=[overdone_control, time_limit_control])

Fitting 7 folds for each of 1 candidates, totalling 7 fits
[CV] END colsample_bytree=0.46909356296798244, learning_rate=0.7304484857455519, max_depth=11, n_estimators=1613, reg_alpha=67.01479482722331, reg_lambda=41.41186324913973, subsample=0.41583820140922967; total time=12.2min
[CV] END colsample_bytree=0.46909356296798244, learning_rate=0.7304484857455519, max_depth=11, n_estimators=1613, reg_alpha=67.01479482722331, reg_lambda=41.41186324913973, subsample=0.41583820140922967; total time= 8.6min
[CV] END colsample_bytree=0.46909356296798244, learning_rate=0.7304484857455519, max_depth=11, n_estimators=1613, reg_alpha=67.01479482722331, reg_lambda=41.41186324913973, subsample=0.41583820140922967; total time= 8.5min
[CV] END colsample_bytree=0.46909356296798244, learning_rate=0.7304484857455519, max_depth=11, n_estimators=1613, reg_alpha=67.01479482722331, reg_lambda=41.41186324913973, subsample=0.41583820140922967; total time= 8.7min
[CV] END colsample_bytree=0.46909356296798244, le

In [12]:
# Rebuild the regressor with the fixed settings plus the best hyperparameters
# found by the search. The fitted search object exposes them as
# `best_params_` (trailing underscore); `best_params` does not exist and
# raised AttributeError.
best_reg = XGBRegressor(
    random_state=hparams.random_state,
    booster=hparams.booster,
    objective=hparams.objective,
    eval_metric=mean_squared_error,
    tree_method=hparams.tree_method,
    **optimizer.best_params_
)

In [13]:
# Fit the tuned regressor on the training split only.
# NOTE(review): the hyperparameter search above ran on train + dev (X, y),
# while this fit uses model.X_train / model.y_train alone — confirm intended.
best_reg.fit(model.X_train, model.y_train)

In [14]:
# Evaluate on the held-out test split. Arguments follow scikit-learn's
# (y_true, y_pred) order; the value is unchanged because MSE is symmetric.
# Note this is the plain MSE, whereas the CV score above is a (negated) RMSE.
pred = best_reg.predict(model.X_test)
mean_squared_error(model.y_test, pred)

378001.03