# Load dependencies
---

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

# imbalanced
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [2]:
train = pd.read_csv('../input/porto-seguro-data-challenge/train.csv', index_col='id')
test = pd.read_csv('../input/porto-seguro-data-challenge/test.csv', index_col='id')
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')

cat_nom = [x for x in meta.iloc[1:-1, :].loc[(meta.iloc[:,1]=="Qualitativo nominal")].iloc[:,0]] 
cat_ord = [x for x in meta.iloc[1:-1, :].loc[(meta.iloc[:,1]=="Qualitativo ordinal")].iloc[:,0]] 
num_dis = [x for x in meta.iloc[1:-1, :].loc[(meta.iloc[:,1]=="Quantitativo discreto")].iloc[:,0]] 
num_con = [x for x in meta.iloc[1:-1, :].loc[(meta.iloc[:,1]=="Quantitativo continua")].iloc[:,0]] 

In [3]:
X_test = test[cat_nom+cat_ord+num_dis+num_con]
X = train[cat_nom+cat_ord+num_dis+num_con]
y = train.y

K=5
N_REPEAT = 2
SEED=314
kf = RepeatedStratifiedKFold(n_splits=K, random_state=SEED, n_repeats=N_REPEAT)

In [4]:
def get_threshold(y_true, y_pred):
    # Moving threshold
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1_scores = []
    for thresh in thresholds:
        f1_scores.append(
            f1_score(y_true, [1 if m>thresh else 0 for m in y_pred]))
    f1s = np.array(f1_scores)
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
     
    max_f1_threshold =  get_threshold(y_true, y_pred)

    y_pred = np.where(y_pred>max_f1_threshold, 1, 0)
    f1_after = f1_score(y_true, y_pred) 
    
    return f1_after

# Baseline
---

In [5]:
%%time

xgb_oof = np.zeros(X.shape[0])
xgb_pred = np.zeros(X_test.shape[0])
trs = []
f1_trs = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]
    
    #print("N rows train:", X_train.shape[0])

    start = time.time()
    
    model = XGBClassifier(random_state=SEED,
                          objective="binary:logistic",
                          use_label_encoder=False,
                         n_estimators=10000)
    
    model.fit(X_train, y_train, 
              eval_set=[(X_val, y_val)],
              eval_metric='logloss',
              early_stopping_rounds=100,
              verbose=False)
    
    #print("Best Score:", model.best_score_['learn'])
    #print("Best Interation:", model.best_iteration_)
    
    calib = CalibratedClassifierCV(base_estimator=model, cv='prefit')
    
    calib.fit(X_val, y_val)
    
    #print(f"N trees: {calib.best_iteration_}")
    
    xgb_oof[val_idx] += calib.predict_proba(X_val)[:,1] / N_REPEAT
    
    f1_after = custom_f1(y_val, xgb_oof[val_idx])
    
    f1_trs = f1_trs + [f1_after]
    
    xgb_pred += calib.predict_proba(X_test)[:, 1] / K / N_REPEAT
    
    print(f"score: {f1_after:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")
    
    del calib
    
xgb_f1 = custom_f1(y, xgb_oof)
print(f"Final f1 score: {xgb_f1} ✔️ ")

➜ FOLD :0
score: 0.639585 
elapsed: 3.75 sec

➜ FOLD :1
score: 0.675229 
elapsed: 3.54 sec

➜ FOLD :2
score: 0.689537 
elapsed: 3.91 sec

➜ FOLD :3
score: 0.665477 
elapsed: 3.62 sec

➜ FOLD :4
score: 0.660567 
elapsed: 4.70 sec

➜ FOLD :5
score: 0.674503 
elapsed: 3.68 sec

➜ FOLD :6
score: 0.675325 
elapsed: 3.69 sec

➜ FOLD :7
score: 0.688869 
elapsed: 3.82 sec

➜ FOLD :8
score: 0.668321 
elapsed: 3.84 sec

➜ FOLD :9
score: 0.659396 
elapsed: 3.57 sec

Final f1 score: 0.6682733689103115 ✔️ 
CPU times: user 2min 19s, sys: 225 ms, total: 2min 19s
Wall time: 39.6 s


# Optuna
---

In [6]:
fixed_params = {
    'random_state': SEED,
    "objective": "binary:logistic",
    "eval_metric": 'logloss',
    'use_label_encoder':False,
    'n_estimators':10000,
}

def objective(trial):
    
    #max_depth = trial.suggest_int('max_depth', 3, 12)
    #max_num_leaves = (2 ** max_depth) - 1

    hyperparams = {
        'clf':{
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 5.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 5.0, log=True),
        #'n_estimators': trial.suggest_int('n_estimators', 500, 2000, 100)
    },
        'pipe':{
            'resample': trial.suggest_categorical('resample',[None, 'adasyn', 'smote', 'tomek', 'ncr', 'oss'])
        },
        'fit':{
            'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 200, 50)
        }

    }
    
    if hyperparams['clf']["booster"] == "gbtree" or hyperparams['clf']["booster"] == "dart":
        hyperparams['clf']["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        hyperparams['clf']["eta"] = trial.suggest_float("eta", 0.01, 0.1, log=True)
        hyperparams['clf']["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        hyperparams['clf']["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        hyperparams['clf']['min_child_weight'] = trial.suggest_int('min_child_weight', 5, 20)
        hyperparams['clf']["subsample"] = trial.suggest_float("subsample", 0.03, 1)
        hyperparams['clf']["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.03, 1)
        hyperparams['clf']['max_delta_step'] = trial.suggest_int('max_delta_step', 0, 10)
        
    if hyperparams['clf']["booster"] == "dart":
        hyperparams['clf']["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        hyperparams['clf']["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        hyperparams['clf']["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        hyperparams['clf']["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    
    if hyperparams['pipe']['resample'] == 'adasyn':
        res = ADASYN(random_state=42)
    elif hyperparams['pipe']['resample'] == 'smote':
        res = SMOTE()
    elif hyperparams['pipe']['resample'] == 'tomek':
        res = TomekLinks()
    elif hyperparams['pipe']['resample'] == 'ncr':
        res = NeighbourhoodCleaningRule(n_neighbors=3,
                                        threshold_cleaning=0.5)
    elif hyperparams['pipe']['resample'] == 'oss':
        res = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
    else:
        res = None
        
    params = dict(**fixed_params, **hyperparams['clf'])
    
    xgb_oof = np.zeros(X.shape[0])
    xgb_pred = pd.DataFrame()
    trs = []
    f1_trs = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        start = time.time()
        
        model = XGBClassifier(**params)
        
        pipe = Pipeline([('resample', res),
                          ('clf', model) ])

        pipe.fit(X_train, y_train,
                 clf__eval_set=[(X_val, y_val)],
                 clf__early_stopping_rounds=hyperparams['fit']['early_stopping_rounds'],
                 clf__verbose=False)
    
        xgb_oof[val_idx] += pipe.predict_proba(X_val)[:,1] / N_REPEAT

        del model

    return roc_auc_score(y_val, xgb_oof[val_idx])

In [7]:
study_xgb = optuna.create_study(direction='maximize')

study_xgb.optimize(objective, 
               timeout=60*5, 
               #n_trials=20, 
               gc_after_trial=True)

[32m[I 2021-08-13 12:43:31,036][0m A new study created in memory with name: no-name-7db4671f-1bab-4313-89e0-1ef62fbae0f4[0m
[32m[I 2021-08-13 12:44:58,273][0m Trial 0 finished with value: 0.8841179034542879 and parameters: {'booster': 'gbtree', 'lambda': 0.00010784293169012414, 'alpha': 1.4757524682608682e-07, 'resample': 'tomek', 'early_stopping_rounds': 50, 'max_depth': 8, 'eta': 0.029304793125164676, 'gamma': 0.0002742422766267376, 'grow_policy': 'depthwise', 'min_child_weight': 20, 'subsample': 0.9634918462856418, 'colsample_bytree': 0.6407865653648729, 'max_delta_step': 10}. Best is trial 0 with value: 0.8841179034542879.[0m
[32m[I 2021-08-13 12:46:31,966][0m Trial 1 finished with value: 0.8805227354099535 and parameters: {'booster': 'gbtree', 'lambda': 0.44753296826554306, 'alpha': 4.5761388965052125, 'resample': 'tomek', 'early_stopping_rounds': 50, 'max_depth': 2, 'eta': 0.02908982166495431, 'gamma': 0.31638728142275546, 'grow_policy': 'depthwise', 'min_child_weight': 1

In [8]:
print('-> Number of finished trials: ', len(study_xgb.trials))
print('-> Best trial:')
trial = study_xgb.best_trial
print('\tValue: {}'.format(trial.value))
print('-> Params: ')
trial.params

-> Number of finished trials:  4
-> Best trial:
	Value: 0.8841179034542879
-> Params: 


{'booster': 'gbtree',
 'lambda': 0.00010784293169012414,
 'alpha': 1.4757524682608682e-07,
 'resample': 'tomek',
 'early_stopping_rounds': 50,
 'max_depth': 8,
 'eta': 0.029304793125164676,
 'gamma': 0.0002742422766267376,
 'grow_policy': 'depthwise',
 'min_child_weight': 20,
 'subsample': 0.9634918462856418,
 'colsample_bytree': 0.6407865653648729,
 'max_delta_step': 10}

In [9]:
plot_optimization_history(study_xgb)

In [10]:
optuna.visualization.plot_parallel_coordinate(study_xgb)

In [11]:
plot_param_importances(study_xgb)

In [12]:
final_params_xgb = dict()
final_params_xgb['clf']=dict(**fixed_params, **study_xgb.best_params)
del final_params_xgb['clf']['resample']
del final_params_xgb['clf']['early_stopping_rounds']

final_params_xgb['pipe'] = dict()
final_params_xgb['pipe']['resample'] = study_xgb.best_params['resample']

final_params_xgb['fit'] = dict()
final_params_xgb['fit']['early_stopping_rounds'] = study_xgb.best_params['early_stopping_rounds']

# After long train.....
final_params_xgb = {'clf': {'random_state': 314,
  'objective': 'binary:logistic',
  'eval_metric': 'logloss',
  'use_label_encoder': False,
  'n_estimators': 10000,
  'booster': 'gbtree',
  'lambda': 1.9245129630343058e-06,
  'alpha': 0.17771161058308743,
  'max_depth': 5,
  'eta': 0.01122764320311446,
  'gamma': 7.397134352580097e-06,
  'grow_policy': 'lossguide',
  'min_child_weight': 5,
  'subsample': 0.8123753830625202,
  'colsample_bytree': 0.19010362713896298,
  'max_delta_step': 2},
 'pipe': {'resample': 'tomek'},
 'fit': {'early_stopping_rounds': 150}}

# Final Kfold prediction

In [13]:
if final_params_xgb['pipe']['resample'] == 'adasyn':
    res = ADASYN(random_state=42)
elif final_params_xgb['pipe']['resample'] == 'smote':
    res = SMOTE()
elif final_params_xgb['pipe']['resample'] == 'tomek':
    res = TomekLinks()
elif final_params_xgb['pipe']['resample'] == 'ncr':
    res = NeighbourhoodCleaningRule(n_neighbors=3,
                                    threshold_cleaning=0.5)
elif final_params_xgb['pipe']['resample'] == 'oss':
    res = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
else:
    res = None

In [14]:
xgb_oof = np.zeros(X.shape[0])
xgb_pred = np.zeros(X_test.shape[0])
trs = []
f1_trs = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]
    
    #print("N rows train:", X_train.shape[0])

    start = time.time()
    
    model = XGBClassifier(**final_params_xgb['clf'])
    
    pipe = Pipeline([('resample', res),
                      ('clf', model) ])
    
    pipe.fit(X_train, y_train,
             clf__eval_set=[(X_val, y_val)],
             clf__early_stopping_rounds=final_params_xgb['fit']['early_stopping_rounds'],
             clf__verbose=False)
    
    #print("Best Score:", pipe[ 'clf'].best_score_['learn'])
    #print("Best Interation:", pipe[ 'clf'].best_iteration_)
    
    calib = CalibratedClassifierCV(base_estimator=pipe, cv='prefit')
    
    calib.fit(X_val, y_val)
    
    #print(f"N trees: {calib.best_iteration_}")
    
    xgb_oof[val_idx] += calib.predict_proba(X_val)[:,1] / N_REPEAT
    
    f1_after = custom_f1(y_val, xgb_oof[val_idx])
    
    f1_trs = f1_trs + [f1_after]
    
    xgb_pred += calib.predict_proba(X_test)[:, 1] / K / N_REPEAT
    
    print(f"score: {f1_after:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")
    
    del calib
    
xgb_f1 = custom_f1(y, xgb_oof)
print(f"Final f1 score: {xgb_f1} ✔️ ")

➜ FOLD :0
score: 0.653266 
elapsed: 14.66 sec

➜ FOLD :1
score: 0.686222 
elapsed: 18.53 sec

➜ FOLD :2
score: 0.697164 
elapsed: 18.82 sec

➜ FOLD :3
score: 0.676198 
elapsed: 15.60 sec

➜ FOLD :4
score: 0.680448 
elapsed: 14.95 sec

➜ FOLD :5
score: 0.688333 
elapsed: 19.69 sec

➜ FOLD :6
score: 0.692727 
elapsed: 17.08 sec

➜ FOLD :7
score: 0.698630 
elapsed: 20.15 sec

➜ FOLD :8
score: 0.675236 
elapsed: 15.24 sec

➜ FOLD :9
score: 0.666667 
elapsed: 13.17 sec

Final f1 score: 0.6804621486463183 ✔️ 


In [15]:
final_threshold = get_threshold(train.y, xgb_oof)
final_threshold

0.25

In [16]:
custom_f1(train.y, np.where(xgb_oof>final_threshold, 1, 0))

0.6804621486463183

# Sub
---

In [17]:
# Write predictions to sub
sample_submission['predicted'] = np.where(xgb_pred>final_threshold, 1, 0).astype('int64')
sample_submission.to_csv('xgb_sub.csv',index=False)

In [18]:
# Write predictions to stack
sample_submission['predicted'] = xgb_pred
sample_submission.to_csv('xgb_sub_probs.csv',index=False)
pd.DataFrame({'id':train.index, 'xgb_oof':xgb_oof}).to_csv('xgb_oof.csv',index=False)

# Reference

https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_cv_integration.py