# About:
- this notebooks:
    - optimizes parameters with Optuna with a pruner that observes intermediate results and stop unpromising trials.
    - Approach:
        1. Fix at a high-ish learning rate and at a fixed iteration
        2. perform Hyperparameter Search
        3. Using the best parameters found, reduce learning rate and find the best iteration using early stopping
- insights:
    - **the above approach is effective** - gains are usually seen in step 3

In [1]:
import xgboost as xgb

In [2]:
import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

### prepare data
- for catboost, we specify nominal features as cat_features as it was found to improve performance
- note: catboost expects these cat_features to be a string!

In [3]:
label_colnames = ['h1n1_vaccine', 'seasonal_vaccine']
X = pd.read_csv(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\data\cleaned_train_set.csv",index_col = "respondent_id")
y = X[label_colnames[0]]
X = X.drop(label_colnames,axis=1)

<IPython.core.display.Javascript object>

In [4]:
# boosting libraries have Dataset class that has memory and speed optimizations
dtrain = xgb.DMatrix(X, label=y)

# Objective Function
- should take an input Trial instance and return a score

In [10]:
def objective(trial):

    # n_estimators and learning rate are closely tight and will be tuned afterwards
    param = {
        'learning_rate': 0.1,
        "verbosity": 0 ,
        'max_depth':trial.suggest_int('max_depth', 4, 10),
        'gamma':trial.suggest_int('gamma', 0, 5),
        'min_child_weight':trial.suggest_int('min_child_weight', 0, 20),
        'scale_pos_weight':trial.suggest_int('scale_pos_weight', 0, 20),
        'subsample':trial.suggest_float('subsample',0.4,1),
        'colsample_bytree':trial.suggest_float('colsample_bytree',0.4,1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
    }
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-auc")

    cv_scores = xgb.cv(param, dtrain, 1000, nfold=3, metrics='auc', early_stopping_rounds=20,
                       verbose_eval = False,
                       callbacks=[pruning_callback])
    return cv_scores['test-auc-mean'].max()    

# create_study

In [11]:
study = optuna.create_study(
    direction='maximize',                     
    sampler=TPESampler(),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)
)
study.optimize(objective, n_trials=300)

[32m[I 2021-07-29 21:17:58,390][0m A new study created in memory with name: no-name-02849f27-1fa9-4bbb-a926-dc3f04ba8835[0m
[32m[I 2021-07-29 21:18:05,538][0m Trial 0 finished with value: 0.8652896666666666 and parameters: {'max_depth': 5, 'gamma': 2, 'min_child_weight': 19, 'scale_pos_weight': 8, 'subsample': 0.4626498014129161, 'colsample_bytree': 0.7892089449724482, 'reg_alpha': 7.473499132895059, 'reg_lambda': 4.211914104441639e-06}. Best is trial 0 with value: 0.8652896666666666.[0m
[32m[I 2021-07-29 21:18:09,358][0m Trial 1 finished with value: 0.8530046666666666 and parameters: {'max_depth': 9, 'gamma': 2, 'min_child_weight': 3, 'scale_pos_weight': 6, 'subsample': 0.4883281885174338, 'colsample_bytree': 0.47547730779043135, 'reg_alpha': 6.177182809314553e-05, 'reg_lambda': 1.5898576878020497e-05}. Best is trial 0 with value: 0.8652896666666666.[0m
[32m[I 2021-07-29 21:18:12,882][0m Trial 2 finished with value: 0.8593543333333334 and parameters: {'max_depth': 6, 'gamma

[32m[I 2021-07-29 21:19:48,751][0m Trial 49 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:49,354][0m Trial 50 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:51,143][0m Trial 51 pruned. Trial was pruned at iteration 29.[0m
[32m[I 2021-07-29 21:19:52,347][0m Trial 52 pruned. Trial was pruned at iteration 18.[0m
[32m[I 2021-07-29 21:19:53,114][0m Trial 53 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:53,880][0m Trial 54 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:54,583][0m Trial 55 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:55,238][0m Trial 56 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:55,747][0m Trial 57 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:56,375][0m Trial 58 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:19:59,944][0m Trial 59 finished with value: 0.86449966666

[32m[I 2021-07-29 21:21:27,601][0m Trial 129 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:28,214][0m Trial 130 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:28,842][0m Trial 131 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:29,475][0m Trial 132 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:30,120][0m Trial 133 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:30,800][0m Trial 134 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:31,419][0m Trial 135 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:32,039][0m Trial 136 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:32,746][0m Trial 137 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:33,546][0m Trial 138 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:21:33,911][0m Trial 139 pruned. Trial was prune

[32m[I 2021-07-29 21:22:59,287][0m Trial 192 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:23:00,463][0m Trial 193 pruned. Trial was pruned at iteration 21.[0m
[32m[I 2021-07-29 21:23:03,691][0m Trial 194 pruned. Trial was pruned at iteration 66.[0m
[32m[I 2021-07-29 21:23:07,943][0m Trial 195 finished with value: 0.8657593333333332 and parameters: {'max_depth': 6, 'gamma': 1, 'min_child_weight': 19, 'scale_pos_weight': 2, 'subsample': 0.9427148882065371, 'colsample_bytree': 0.7586044000397282, 'reg_alpha': 0.000478574061196293, 'reg_lambda': 0.04294159628036904}. Best is trial 17 with value: 0.8663683333333333.[0m
[32m[I 2021-07-29 21:23:12,563][0m Trial 196 pruned. Trial was pruned at iteration 81.[0m
[32m[I 2021-07-29 21:23:13,260][0m Trial 197 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:23:14,065][0m Trial 198 pruned. Trial was pruned at iteration 12.[0m
[32m[I 2021-07-29 21:23:15,274][0m Trial 199 pruned. Trial was p

[32m[I 2021-07-29 21:24:41,881][0m Trial 268 finished with value: 0.8657846666666668 and parameters: {'max_depth': 6, 'gamma': 1, 'min_child_weight': 13, 'scale_pos_weight': 2, 'subsample': 0.8712487711970953, 'colsample_bytree': 0.8298759313297249, 'reg_alpha': 0.00011514955249022182, 'reg_lambda': 0.007556071524598899}. Best is trial 17 with value: 0.8663683333333333.[0m
[32m[I 2021-07-29 21:24:42,495][0m Trial 269 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:24:43,156][0m Trial 270 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:24:43,909][0m Trial 271 pruned. Trial was pruned at iteration 12.[0m
[32m[I 2021-07-29 21:24:44,548][0m Trial 272 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:24:45,204][0m Trial 273 pruned. Trial was pruned at iteration 10.[0m
[32m[I 2021-07-29 21:24:47,326][0m Trial 274 pruned. Trial was pruned at iteration 41.[0m
[32m[I 2021-07-29 21:24:47,997][0m Trial 275 pruned. Trial wa

In [12]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 300
Best trial:
  Value: 0.8663683333333333
  Params: 
    max_depth: 6
    gamma: 1
    min_child_weight: 18
    scale_pos_weight: 4
    subsample: 0.6439846353512728
    colsample_bytree: 0.8774177233229632
    reg_alpha: 7.110378342968055
    reg_lambda: 0.0005005189968821851


## Optimize n_estimators and learning rate
- set to high iterations and low learning rate

In [13]:
param = trial.params
param.update({"learning_rate":0.01,
              "eval_metric" : "auc"})
param

{'max_depth': 6,
 'gamma': 1,
 'min_child_weight': 18,
 'scale_pos_weight': 4,
 'subsample': 0.6439846353512728,
 'colsample_bytree': 0.8774177233229632,
 'reg_alpha': 7.110378342968055,
 'reg_lambda': 0.0005005189968821851,
 'learning_rate': 0.01,
 'eval_metric': 'auc'}

In [14]:
cv_scores = xgb.cv(param, dtrain, 10000, nfold  = 5, early_stopping_rounds = 50,verbose_eval=False)['test-auc-mean']

In [15]:
print(f"Best Iteration {len(cv_scores)} at {cv_scores.max()}")

Best Iteration 1073 at 0.8692816000000001


# Train with best param and export

In [18]:
best_param = {
    'max_depth': 6,
    'gamma': 1,
    'min_child_weight': 18,
    'scale_pos_weight': 4,
    'subsample': 0.6439846353512728,
    'colsample_bytree': 0.8774177233229632,
    'reg_alpha': 7.110378342968055,
    'reg_lambda': 0.0005005189968821851,
    'learning_rate': 0.01,
    'eval_metric': 'auc',
    "use_label_encoder":False,
    "n_estimators": 1073 
}

In [19]:
model = xgb.XGBClassifier()
model.set_params(**best_param)
model.fit(X,y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8774177233229632,
              eval_metric='auc', gamma=1, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.01, max_delta_step=0,
              max_depth=6, min_child_weight=18, missing=nan,
              monotone_constraints='()', n_estimators=1073, n_jobs=8,
              num_parallel_tree=1, objective='binary:logistic', random_state=0,
              reg_alpha=7.110378342968055, reg_lambda=0.0005005189968821851,
              scale_pos_weight=4, subsample=0.6439846353512728,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [20]:
import joblib
joblib.dump(model, r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\xgbboost seasonal_vaccine 0.pkl")
# gbm_pickle = joblib.load(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\lightgbm 0.pkl")

['C:\\Users\\tanch\\Documents\\Coding Competitions\\DataDriven\\Flu Shot Learning\\local\\model\\xgbboost seasonal_vaccine 0.pkl']