# Load dependencies
---

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier

# imbalanced
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [2]:
# Raw competition data. `id` is read as the index and then discarded, so both
# frames end up with a fresh positional index.
train = pd.read_csv('../input/porto-seguro-data-challenge/train.csv', index_col='id').reset_index(drop=True)
test = pd.read_csv('../input/porto-seguro-data-challenge/test.csv', index_col='id').reset_index(drop=True)
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')


def _columns_of_type(metadata, variable_type):
    """Return the variable names whose metadata type label equals `variable_type`.

    Column 0 of `metadata` is read as the variable name and column 1 as its
    type label. The first and last metadata rows are skipped (presumably the
    id and target rows -- confirm against metadata.csv).
    """
    body = metadata.iloc[1:-1, :]
    return list(body.loc[body.iloc[:, 1] == variable_type].iloc[:, 0])


# Feature names grouped by the measurement level declared in the metadata.
cat_nom = _columns_of_type(meta, "Qualitativo nominal")
cat_ord = _columns_of_type(meta, "Qualitativo ordinal")
num_dis = _columns_of_type(meta, "Quantitativo discreto")
num_con = _columns_of_type(meta, "Quantitativo continua")

In [3]:
# Cross-validation settings shared by the baseline and the Optuna search.
K = 10
SEED = 314
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

# Feature columns in a fixed order: nominal, ordinal, discrete, continuous.
feature_cols = cat_nom + cat_ord + num_dis + num_con

# Training features / target and the matching test features.
X = train[feature_cols]
y = train.y
X_test = test[feature_cols]

In [4]:
def get_threshold(y_true, y_pred):
    """Return the decision threshold that maximises F1 for the given scores.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled
    positive when its score is strictly greater than the threshold. When
    several thresholds tie for the best F1, the lowest one is returned
    (first arg-max).

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of scores, one per label.

    Returns
    -------
    The winning threshold, taken from the candidate grid.
    """
    scores = np.asarray(y_pred)
    thresholds = np.arange(0.0, 1.0, 0.01)
    # Binarise with one vectorised comparison per threshold instead of a
    # Python-level loop over every sample.
    f1s = np.array([f1_score(y_true, (scores > thresh).astype(int))
                    for thresh in thresholds])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """F1 score of `y_pred` after binarising it at its own best threshold.

    The threshold comes from `get_threshold` evaluated on the very data that
    is then scored, so the value is the best F1 reachable over that
    threshold grid for these inputs.
    """
    best_cut = get_threshold(y_true, y_pred)
    hard_labels = np.where(y_pred > best_cut, 1, 0)
    return f1_score(y_true, hard_labels)

# Baseline
---

In [5]:
%%time

# Baseline: XGBoost with library-default hyperparameters, K-fold CV,
# early stopping on validation log-loss.
xgb_oof = np.zeros(X.shape[0])        # out-of-fold positive-class probabilities
xgb_pred = np.zeros(X_test.shape[0])  # test probabilities averaged over the K fold models

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # n_estimators is only an upper bound; early stopping picks the actual size.
    model = XGBClassifier(random_state=SEED,
                          objective="binary:logistic",
                          use_label_encoder=False,
                          n_estimators=10000)

    # NOTE(review): the validation fold is also the early-stopping set, so the
    # scores printed below are measured on data that helped choose the number
    # of trees.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='logloss',
              early_stopping_rounds=150,
              verbose=False)

    xgb_oof[val_idx] = model.predict_proba(X_val)[:, 1]

    print("Score:", custom_f1(y_val, xgb_oof[val_idx]))

    # Each fold model contributes 1/K of the final test prediction.
    xgb_pred += model.predict_proba(X_test)[:, 1] / K

    del model

print("Final F1     :", custom_f1(y, xgb_oof))
print("Final AUC    :", roc_auc_score(y, xgb_oof))
print("Final LogLoss:", log_loss(y, xgb_oof))

➜ FOLD :0
Score: 0.6896551724137931
➜ FOLD :1
Score: 0.6613672496025437
➜ FOLD :2
Score: 0.6887417218543047
➜ FOLD :3
Score: 0.6558558558558558
➜ FOLD :4
Score: 0.6388308977035491
➜ FOLD :5
Score: 0.6731141199226306
➜ FOLD :6
Score: 0.6315789473684211
➜ FOLD :7
Score: 0.6579925650557622
➜ FOLD :8
Score: 0.6843033509700176
➜ FOLD :9
Score: 0.7110389610389611
Final F1     : 0.6657367044463819
Final AUC    : 0.8804950502753824
Final LogLoss: 0.31791299894403696
CPU times: user 3min 33s, sys: 307 ms, total: 3min 33s
Wall time: 59.4 s


# Optuna
---

In [6]:
# Settings held constant for every Optuna trial and reused for the final models.
# n_estimators is an upper bound only: every fit in this notebook uses early stopping.
fixed_params = dict(
    random_state=SEED,
    objective="binary:logistic",
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=10000,
)

def objective(trial):
    """Optuna objective: out-of-fold log-loss of one XGBoost configuration.

    Samples a set of hyperparameters from `trial`, merges them with the
    module-level `fixed_params`, trains one model per fold of the module-level
    `kf` splitter (early stopping on the validation fold) and returns the
    log-loss of the assembled out-of-fold probabilities against `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Out-of-fold log-loss (lower is better).
    """
    # The order of the suggest_* calls is kept fixed; only "gbtree" is
    # currently offered as booster, the "dart" branches below stay dormant.
    clf_params = {
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 5.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 5.0, log=True),
    }

    if clf_params["booster"] in ("gbtree", "dart"):
        clf_params["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        clf_params["eta"] = trial.suggest_float("eta", 0.01, 0.1, log=True)
        clf_params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        clf_params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
        clf_params["min_child_weight"] = trial.suggest_int('min_child_weight', 5, 20)
        clf_params["subsample"] = trial.suggest_float("subsample", 0.03, 1)
        clf_params["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.03, 1)
        clf_params["max_delta_step"] = trial.suggest_float('max_delta_step', 0, 10)

    if clf_params["booster"] == "dart":
        clf_params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        clf_params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        clf_params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        clf_params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # dict(**a, **b) raises on a duplicated key, which guards against a tuned
    # parameter silently overriding a fixed one.
    params = dict(**fixed_params, **clf_params)

    xgb_oof = np.zeros(X.shape[0])

    for train_idx, val_idx in kf.split(X=X, y=y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBClassifier(**params)

        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=150,
                  verbose=False)

        xgb_oof[val_idx] = model.predict_proba(X_val)[:, 1]

        del model

    return log_loss(y, xgb_oof)

In [7]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The search is bounded by wall-clock time, so the number of completed trials
# can still differ from run to run.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

study_xgb.optimize(objective,
                   timeout=60*60*7.5,  # 7.5 hours, expressed in seconds
                   gc_after_trial=True)

[32m[I 2021-09-03 13:24:29,375][0m A new study created in memory with name: no-name-453d63ae-7874-4bc7-902b-13b963f41a01[0m
[32m[I 2021-09-03 13:26:33,167][0m Trial 0 finished with value: 0.30639619074861996 and parameters: {'booster': 'gbtree', 'lambda': 1.4915964227623835, 'alpha': 1.7117391103777633, 'max_depth': 4, 'eta': 0.02639361428514467, 'gamma': 0.001380013726938189, 'grow_policy': 'lossguide', 'min_child_weight': 11, 'subsample': 0.7482524094975311, 'colsample_bytree': 0.5785893036478263, 'max_delta_step': 5.201244493734658}. Best is trial 0 with value: 0.30639619074861996.[0m
[32m[I 2021-09-03 13:28:09,415][0m Trial 1 finished with value: 0.30726920277296815 and parameters: {'booster': 'gbtree', 'lambda': 6.084292031657082e-08, 'alpha': 4.350850696990789, 'max_depth': 6, 'eta': 0.049908386772628925, 'gamma': 0.05888542508034778, 'grow_policy': 'depthwise', 'min_child_weight': 15, 'subsample': 0.7831652794826226, 'colsample_bytree': 0.6749724342215936, 'max_delta_ste

In [8]:
# Summary of the search: how many trials ran and what the best one looked like.
trial = study_xgb.best_trial
print('-> Number of finished trials: ', len(study_xgb.trials))
print('-> Best trial:')
print(f'\tValue: {trial.value}')
print('-> Params: ')
# Last expression of the cell: rendered as the cell output.
trial.params

-> Number of finished trials:  197
-> Best trial:
	Value: 0.3028443879614926
-> Params: 


{'booster': 'gbtree',
 'lambda': 9.012384508756378e-07,
 'alpha': 0.7472040331088792,
 'max_depth': 5,
 'eta': 0.01507605562231303,
 'gamma': 1.0214961302342215e-08,
 'grow_policy': 'lossguide',
 'min_child_weight': 5,
 'subsample': 0.9331005225916879,
 'colsample_bytree': 0.25392142363325004,
 'max_delta_step': 5.685109389498008}

In [9]:
# Objective value (out-of-fold log-loss) of each trial over the course of the search.
plot_optimization_history(study=study_xgb)

In [10]:
# Parallel-coordinate view of the sampled hyperparameters against the objective value.
optuna.visualization.plot_parallel_coordinate(study=study_xgb)

In [11]:
# Estimated importance of each hyperparameter for the objective value.
plot_param_importances(study=study_xgb)

In [12]:
# Parameters for the final models: the fixed settings plus the best trial's
# hyperparameters, stored under the 'clf' key.
final_params_xgb = {'clf': dict(**fixed_params, **study_xgb.best_params)}

# Final K-fold prediction

In [13]:
%%time
# Final models: the tuned configuration is trained with K-fold CV once per
# seed; out-of-fold and test probabilities are then averaged across seeds.
n_seeds = 10
xgb_oof = np.zeros((X.shape[0], n_seeds))        # one out-of-fold column per seed
xgb_pred = np.zeros((X_test.shape[0], n_seeds))  # one test column per seed

for seed in range(n_seeds):

    print("\nRANDOM SEED:", seed)
    print("Fold", end=": ")

    # Per-seed splitter kept under its own name so the module-level `kf`
    # that `objective` relies on is not overwritten by this cell.
    seed_kf = KFold(n_splits=K, random_state=seed, shuffle=True)

    # Per-seed copy of the tuned parameters. `random_state` is overridden
    # instead of adding a separate `seed` key next to it (two names for the
    # same XGBoost setting), and `final_params_xgb` itself stays untouched.
    seed_params = {**final_params_xgb['clf'], 'random_state': seed}

    for fold, (train_idx, val_idx) in enumerate(seed_kf.split(X=X, y=y)):

        print(fold, end=",")
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBClassifier(**seed_params)

        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  early_stopping_rounds=150,
                  verbose=False)

        # NOTE(review): the calibrator is fitted on the same validation fold
        # it then predicts, so the out-of-fold metrics (and the threshold
        # later derived from them) are likely optimistic -- consider
        # calibrating on a separate split.
        calib = CalibratedClassifierCV(base_estimator=model, cv='prefit')
        calib.fit(X_val, y_val)

        xgb_oof[val_idx, seed] = calib.predict_proba(X_val)[:, 1]

        # Each fold model contributes 1/K of this seed's test prediction.
        xgb_pred[:, seed] += calib.predict_proba(X_test)[:, 1] / K

        del calib

    seed_oof = xgb_oof[:, seed]
    print("\nF1     :", custom_f1(y, seed_oof))
    print("AUC    :", roc_auc_score(y, seed_oof))
    print("LogLoss:", log_loss(y, seed_oof))

# Collapse the per-seed columns into a single averaged prediction.
xgb_oof = np.mean(xgb_oof, axis=1)
xgb_pred = np.mean(xgb_pred, axis=1)
print("-------------------------------------")
print("Final F1     :", custom_f1(y, xgb_oof))
print("Final AUC    :", roc_auc_score(y, xgb_oof))
print("Final LogLoss:", log_loss(y, xgb_oof))


RANDOM SEED: 0
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6791424418604651
AUC    : 0.8934517345434909
LogLoss: 0.31244577591123257

RANDOM SEED: 1
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6826516220028208
AUC    : 0.8937895701731041
LogLoss: 0.31226714693577784

RANDOM SEED: 2
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6788827610745419
AUC    : 0.8942898339612894
LogLoss: 0.3124978285540415

RANDOM SEED: 3
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6809318377911994
AUC    : 0.8931149571791628
LogLoss: 0.3134120350746124

RANDOM SEED: 4
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6814658210007047
AUC    : 0.8925104231349241
LogLoss: 0.3123779798955537

RANDOM SEED: 5
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6817940081545826
AUC    : 0.8922764998186943
LogLoss: 0.31241188156098887

RANDOM SEED: 6
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6798623063683304
AUC    : 0.8935272293214186
LogLoss: 0.3125453218874819

RANDOM SEED: 7
Fold: 0,1,2,3,4,5,6,7,8,9,
F1     : 0.6798914886402171
AUC    : 0.89303586741181
LogLoss: 0.

In [14]:
# Decision threshold that maximises F1 on the seed-averaged out-of-fold probabilities.
final_threshold = get_threshold(y_true=y, y_pred=xgb_oof)
final_threshold

0.22

# Submission
---

In [15]:
# Hard-label submission: a row is positive when its averaged test probability
# is strictly above the threshold chosen on the out-of-fold predictions.
hard_labels = (xgb_pred > final_threshold).astype('int64')
sample_submission['predicted'] = hard_labels
sample_submission.to_csv('xgb_sub.csv', index=False)

In [16]:
# Probability outputs for a later stacking stage: averaged test probabilities
# in the submission layout, plus the averaged out-of-fold probabilities.
sample_submission['predicted'] = xgb_pred
sample_submission.to_csv('xgb_sub_probs.csv', index=False)

# NOTE(review): `train` was loaded with reset_index(drop=True), so `train.index`
# is the row position, not the original `id` column -- confirm this is what the
# stacking step expects.
oof_frame = pd.DataFrame({'id': train.index, 'xgb_oof': xgb_oof})
oof_frame.to_csv('xgb_oof.csv', index=False)