# About:
- this notebooks:
    - optimizes parameters with Optuna with a pruner that observes intermediate results and stop unpromising trials.
    - Approach:
        1. Fix at a high-ish learning rate and at a fixed iteration
        2. perform Hyperparameter Search
        3. Using the best parameters found, reduce learning rate and find the best iteration using early stopping
- insights:
    - **the above approach is effective** - gains are usually seen in step 3

In [21]:
import lightgbm as lgb
import optuna 
from lightgbm import Dataset
from optuna import Trial, visualization
from optuna.samplers import TPESampler

### prepare data

In [3]:
label_colnames = ['h1n1_vaccine', 'seasonal_vaccine']
X = pd.read_csv(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\data\cleaned_train_set.csv",index_col = "respondent_id")
y = X[label_colnames[0]]
X = X.drop(label_colnames,axis=1)

<IPython.core.display.Javascript object>

In [4]:
# boosting libraries have Dataset class that has memory and speed optimizations
dtrain = Dataset(X, label=y)

# Objective Function
- should take an input Trial instance and return a score

In [1]:
def objective(trial):

    # n_estimators and learning rate are closely tight and will be tuned afterwards
    param = {
        "objective": "binary",
        "learning_rate":0.1,
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "max_depth" : trial.suggest_int("max_depth", 4, 10),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
    }

    # Add a callback for pruning.
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "auc")
    
    cv_scores = lgb.cv(param, dtrain, 1000, nfold  = 3, early_stopping_rounds = 20,verbose_eval=False, 
                       callbacks=[pruning_callback])
    
    return cv_scores['auc-mean'][-1]      # last recorded cv score is the best score from early stopping


# create_study

In [None]:
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10), direction="maximize"
)
study.optimize(objective, n_trials=300)

In [16]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 300
Best trial:
  Value: 0.8687676567096521
  Params: 
    max_depth: 4
    colsample_bytree: 0.42754023012515663
    subsample: 0.5520857036662926
    min_child_samples: 10
    lambda_l1: 1.1370937261781668
    lambda_l2: 0.07055165843421532


## Optimize n_estimators and learning rate
- set to high iterations and low learning rate

In [33]:
param = trial.params
param.update({"learning_rate":0.01,
              "objective": "binary",
              "metric": "auc"})
param

{'max_depth': 4,
 'colsample_bytree': 0.42754023012515663,
 'subsample': 0.5520857036662926,
 'min_child_samples': 10,
 'lambda_l1': 1.1370937261781668,
 'lambda_l2': 0.07055165843421532,
 'learning_rate': 0.01,
 'objective': 'binary',
 'metric': 'auc'}

In [34]:
cv_scores = lgb.cv(param, dtrain, 10000, nfold  = 5, early_stopping_rounds = 50,verbose_eval=False)['auc-mean']

In [35]:
print(f"Best Iteration {len(cv_scores)} at {cv_scores[-1]}")

Best Iteration 1681 at 0.8694553001772934


# Train with best param and export

In [5]:
best_param = {
    'max_depth': 4,
    'colsample_bytree': 0.42754023012515663,
    'subsample': 0.5520857036662926,
    'min_child_samples': 10,
    'lambda_l1': 1.1370937261781668,
    'lambda_l2': 0.07055165843421532,
    'learning_rate': 0.01,
    'objective': 'binary',
    'metric': 'auc',
    "num_iterations":1681 
}

In [6]:
model = lgb.LGBMClassifier()
model.set_params(**best_param)
model.fit(X,y)

<IPython.core.display.Javascript object>

LGBMClassifier(boosting_type='gbdt', class_weight=None,
               colsample_bytree=0.42754023012515663, importance_type='split',
               lambda_l1=1.1370937261781668, lambda_l2=0.07055165843421532,
               learning_rate=0.01, max_depth=4, metric='auc',
               min_child_samples=10, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_iterations=1681, num_leaves=31,
               objective='binary', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=0.5520857036662926,
               subsample_for_bin=200000, subsample_freq=0)

In [8]:
import joblib
joblib.dump(model, r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\lightgbm h1n1_vaccine 0.pkl")
# gbm_pickle = joblib.load(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\lightgbm 0.pkl")

['C:\\Users\\tanch\\Documents\\Coding Competitions\\DataDriven\\Flu Shot Learning\\local\\model\\lightgbm h1n1_vaccine 0.pkl']