# Load dependencies
---

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import shap

# imbalanced
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [2]:
# Raw competition files. The `id` column is consumed as the index and then
# discarded, so every frame below carries a plain positional index.
train = pd.read_csv('../input/porto-seguro-data-challenge/train.csv', index_col='id').reset_index(drop=True)
test = pd.read_csv('../input/porto-seguro-data-challenge/test.csv', index_col='id').reset_index(drop=True)
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')

# Metadata layout as used here: column 0 holds the variable name, column 1 its
# type. The first and last metadata rows are skipped
# (presumably the id and target rows - TODO confirm against metadata.csv).
meta_body = meta.iloc[1:-1, :]
var_type = meta.iloc[:, 1]

# One list of column names per variable type.
cat_nom = meta_body.loc[var_type == "Qualitativo nominal"].iloc[:, 0].tolist()
cat_ord = meta_body.loc[var_type == "Qualitativo ordinal"].iloc[:, 0].tolist()
num_dis = meta_body.loc[var_type == "Quantitativo discreto"].iloc[:, 0].tolist()
num_con = meta_body.loc[var_type == "Quantitativo continua"].iloc[:, 0].tolist()

In [3]:
# Model inputs: every variable listed in the metadata, grouped by type
# (nominal, ordinal, discrete, continuous), in that order.
feature_cols = cat_nom + cat_ord + num_dis + num_con
X_test = test[feature_cols]
X = train[feature_cols]
y = train["y"]

# Cross-validation configuration shared by the stages below.
K = 10
SEED = 5
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

In [4]:
# Pre-computed KNN features (produced by a separate notebook), joined
# column-wise onto the base features by positional index.
knn_feat_train = pd.read_csv('../input/porto-seguro-knn-feature-extraction-k-1/knn_feat_train.csv')
knn_feat_test = pd.read_csv('../input/porto-seguro-knn-feature-extraction-k-1/knn_feat_test.csv')

# Drop any same-named columns before appending so the cell is idempotent:
# re-running it replaces the KNN columns instead of stacking duplicates
# onto X / X_test. On a first run nothing is dropped.
X = pd.concat([X.drop(columns=knn_feat_train.columns, errors='ignore'), knn_feat_train], axis=1)
X_test = pd.concat([X_test.drop(columns=knn_feat_test.columns, errors='ignore'), knn_feat_test], axis=1)

In [5]:
def get_threshold(y_true, y_pred):
    """Return the decision threshold that maximises F1 on the given data.

    Candidate thresholds are 0.00, 0.01, ..., 0.99. A sample is labelled
    positive when its score is strictly greater than the candidate.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    numpy.float64
        The candidate with the highest F1. On ties the smallest such
        candidate is returned, because ``argmax`` picks the first maximum.
    """
    # Convert once so each threshold is applied as a single vectorised
    # comparison instead of a Python-level loop over every prediction.
    scores = np.asarray(y_pred)
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1s = np.array([
        f1_score(y_true, (scores > thresh).astype(int))
        for thresh in thresholds
    ])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """Best achievable F1 for ``y_pred`` over the ``get_threshold`` grid.

    The cut-off is tuned on the same ``y_true`` / ``y_pred`` it is scored on,
    so the returned value is an in-sample optimum for that data.
    """
    best_cut = get_threshold(y_true, y_pred)
    hard_labels = np.where(y_pred > best_cut, 1, 0)
    return f1_score(y_true, hard_labels)

# Stage 1: Compute XGBoost SHAP values
---

Source: https://www.kaggle.com/gomes555/porto-seguro-xgboost

In [6]:
#X = X.replace(-999, np.nan)
#X_test = X_test.replace(-999, np.nan)

In [7]:
# XGBoost configuration for Stage 1, split by where each group is consumed:
#   'clf'  -> keyword arguments for the XGBClassifier constructor
#   'pipe' -> resampling choice (not read by the Stage 1 loop in this notebook)
#   'fit'  -> keyword arguments for .fit()
final_params_xgb = {
    'clf': {
        'random_state': 314,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        # Upper bound only; the Stage 1 fit passes early_stopping_rounds.
        'n_estimators': 10000,
        'booster': 'gbtree',
        'lambda': 1.9245129630343058e-06,
        'alpha': 0.17771161058308743,
        'max_depth': 5,
        'eta': 0.01122764320311446,
        'gamma': 7.397134352580097e-06,
        'grow_policy': 'lossguide',
        'min_child_weight': 5,
        'subsample': 0.8123753830625202,
        'colsample_bytree': 0.19010362713896298,
        'max_delta_step': 2,
    },
    'pipe': {
        'resample': 'tomek',
    },
    'fit': {
        'early_stopping_rounds': 150,
    },
}

In [8]:
%%time

shap_oof = np.zeros((X.shape[0], X.shape[1]))
shap_test = np.zeros((X_test.shape[0], X_test.shape[1]))
model_shap_oof = np.zeros(X.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]
    
    start = time.time()
    
    model = XGBClassifier(**final_params_xgb['clf'])
    
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=final_params_xgb['fit']['early_stopping_rounds'],
              verbose=False)
    
    model_shap_oof[val_idx] += model.predict_proba(X_val)[:,1]
    
    f1_after = custom_f1(y_val, model_shap_oof[val_idx])
    
    print(f"F1 custom score: {f1_after:.6f} ")

    explainer = shap.TreeExplainer(model)
    
    shap_oof[val_idx] = explainer.shap_values(X_val)

    shap_test += explainer.shap_values(X_test) / K

    print(f"elapsed: {time.time()-start:.2f} sec\n")
    
shap_oof = pd.DataFrame(shap_oof, columns = [x+"_shap" for x in X.columns])
shap_test = pd.DataFrame(shap_test, columns = [x+"_shap" for x in X_test.columns])

print("Final F1     :", custom_f1(y, model_shap_oof))
print("Final AUC    :", roc_auc_score(y, model_shap_oof))
print("Final LogLoss:", log_loss(y, model_shap_oof))

➜ FOLD :0
F1 custom score: 0.713568 


ntree_limit is deprecated, use `iteration_range` or model slicing instead.


elapsed: 110.90 sec

➜ FOLD :1
F1 custom score: 0.679376 
elapsed: 92.17 sec

➜ FOLD :2
F1 custom score: 0.703375 
elapsed: 74.52 sec

➜ FOLD :3
F1 custom score: 0.677029 
elapsed: 84.88 sec

➜ FOLD :4
F1 custom score: 0.713333 
elapsed: 125.46 sec

➜ FOLD :5
F1 custom score: 0.673993 
elapsed: 83.55 sec

➜ FOLD :6
F1 custom score: 0.665615 
elapsed: 94.13 sec

➜ FOLD :7
F1 custom score: 0.684105 
elapsed: 99.64 sec

➜ FOLD :8
F1 custom score: 0.700525 
elapsed: 99.70 sec

➜ FOLD :9
F1 custom score: 0.681081 
elapsed: 107.62 sec

Final F1     : 0.6845084920226453
Final AUC    : 0.8935149036433895
Final LogLoss: 0.3048030363242558
CPU times: user 1h 1min 45s, sys: 2.84 s, total: 1h 1min 48s
Wall time: 16min 15s


In [9]:
# Append the Stage 1 SHAP values as extra features.
# Same-named columns are dropped first so the cell is idempotent: re-running
# it replaces the "_shap" columns instead of stacking duplicates onto
# X / X_test. On a first run nothing is dropped.
X = pd.concat([X.drop(columns=shap_oof.columns, errors='ignore'), shap_oof], axis=1)
X_test = pd.concat([X_test.drop(columns=shap_test.columns, errors='ignore'), shap_test], axis=1)

# Stage 2: CatBoost

In [10]:
# Stage 2 (single seed): CatBoost on the original + KNN + SHAP features,
# with Tomek-link under-sampling of each training fold and a probability
# calibrator wrapped around the fitted pipeline.
cat_oof = np.zeros(X.shape[0])        # out-of-fold calibrated probabilities
cat_pred = np.zeros(X_test.shape[0])  # fold-averaged test probabilities
trs = []
f1_trs = []                           # per-fold F1 scores

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    start = time.time()

    res = TomekLinks()

    model = CatBoostClassifier(
        iterations=10000,
        learning_rate=0.01,
        eval_metric="AUC",
        use_best_model=True,
        od_wait=200,
        random_seed=SEED,
        verbose=0,
    )

    # The resampler only acts during fit; the eval set is routed straight
    # to the classifier step.
    pipe = Pipeline(steps=[('resample', res), ('clf', model)])
    pipe.fit(X_train, y_train, clf__eval_set=[(X_val, y_val)])

    # NOTE(review): the calibrator is fitted on (X_val, y_val) and then used
    # to predict X_val, and the same fold is CatBoost's eval set, so the
    # out-of-fold scores below are likely optimistic.
    calib = CalibratedClassifierCV(base_estimator=pipe, cv='prefit')
    calib.fit(X_val, y_val)

    cat_oof[val_idx] += calib.predict_proba(X_val)[:,1]
    cat_pred += calib.predict_proba(X_test)[:, 1] / K

    f1_after = custom_f1(y_val, cat_oof[val_idx])
    f1_trs.append(f1_after)

    print(f"score: {f1_after:.6f} ")
    print(f"elapsed: {time.time()-start:.2f} sec\n")

    del model

print("Final F1     :", custom_f1(y, cat_oof))
print("Final AUC    :", roc_auc_score(y, cat_oof))
print("Final LogLoss:", log_loss(y, cat_oof))

➜ FOLD :0
score: 0.710018 
elapsed: 24.39 sec

➜ FOLD :1
score: 0.679443 
elapsed: 45.38 sec

➜ FOLD :2
score: 0.691312 
elapsed: 33.49 sec

➜ FOLD :3
score: 0.676282 
elapsed: 53.68 sec

➜ FOLD :4
score: 0.704944 
elapsed: 40.96 sec

➜ FOLD :5
score: 0.677536 
elapsed: 73.84 sec

➜ FOLD :6
score: 0.677852 
elapsed: 18.44 sec

➜ FOLD :7
score: 0.685921 
elapsed: 47.87 sec

➜ FOLD :8
score: 0.700917 
elapsed: 33.04 sec

➜ FOLD :9
score: 0.685185 
elapsed: 34.59 sec

Final F1     : 0.682759863087732
Final AUC    : 0.8888546301440641
Final LogLoss: 0.3148028523691273


In [11]:
#Final f1 score: 0.682759863087732 ✔️ 

# Optuna
---

In [12]:
#fixed_params = {
#    'random_state':SEED,
#    'verbose': 0
#}
#
#def objective(trial):
#    
#    #max_depth = trial.suggest_int('max_depth', 3, 12)
#    #max_num_leaves = (2 ** max_depth) - 1
#
#    hyperparams = {
#        'clf':{
#            'l2_leaf_reg': 3.0
#        #    'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1), # 0.1
#        #    #'max_iter': trial.suggest_int('max_iter', 100, 1000)
#        #    'max_depth': max_depth, # None
#        #    'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 7, max_num_leaves), # 31
#        #    'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 5, 50, 5), # 20
#        #    'l2_regularization': trial.suggest_float("l2_regularization", 1e-8, 10.0, log=True), # 0.0
#        #    'warm_start': trial.suggest_categorical('warm_start',[True, False]), # False
#        #    'n_iter_no_change': trial.suggest_int("n_iter_no_change", 10, 100, 10), # 10
#        },
#        'pipe':{
#            'resample': trial.suggest_categorical('resample',[None, 'adasyn', 'smote', 'tomek', 'ncr', 'oss'])
#        }
#
#    }
#    
#    if hyperparams['pipe']['resample'] == 'adasyn':
#        res = ADASYN(random_state=42)
#    elif hyperparams['pipe']['resample'] == 'smote':
#        res = SMOTE()
#    elif hyperparams['pipe']['resample'] == 'tomek':
#        res = TomekLinks()
#    elif hyperparams['pipe']['resample'] == 'ncr':
#        res = NeighbourhoodCleaningRule(n_neighbors=3,
#                                        threshold_cleaning=0.5)
#    elif hyperparams['pipe']['resample'] == 'oss':
#        res = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
#    else:
#        res = None
#        
#    params = dict(**fixed_params, **hyperparams['clf'])
#    
#    cat_oof = np.zeros(X.shape[0])
#    cat_pred = pd.DataFrame()
#    trs = []
#    f1_trs = []
#
#    for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
#        X_train = X.iloc[train_idx]
#        y_train = y.iloc[train_idx]
#        X_val = X.iloc[val_idx]
#        y_val = y.iloc[val_idx]
#
#        start = time.time()
#        
#        model = CatBoostClassifier(**params)
#        
#        pipe = Pipeline([('resample', res),
#                          ('clf', model) ])
#
#        pipe.fit(X_train, y_train)
#    
#        cat_oof[val_idx] += pipe.predict_proba(X_val)[:,1] / N_REPEAT
#
#        del model
#
#    return roc_auc_score(y_val, cat_oof[val_idx])

In [13]:
#study_cat = optuna.create_study(direction='maximize')
#
#study_cat.optimize(objective, 
#               #timeout=60*60*7.5, 
#               n_trials=20, 
#               gc_after_trial=True)

In [14]:
#print('-> Number of finished trials: ', len(study_cat.trials))
#print('-> Best trial:')
#trial = study_cat.best_trial
#print('\tValue: {}'.format(trial.value))
#print('-> Params: ')
#trial.params

In [15]:
#plot_optimization_history(study_cat)

In [16]:
#optuna.visualization.plot_parallel_coordinate(study_cat)

In [17]:
#plot_param_importances(study_cat)

In [18]:
#final_params_cat = dict()
#final_params_cat['clf']=dict(**fixed_params, **study_cat.best_params)
#del final_params_cat['clf']['resample']
#final_params_cat['pipe'] = dict()
#final_params_cat['pipe']['resample'] = study_cat.best_params['resample']
#
#final_params_cat

# Final K-fold prediction

In [19]:
#if final_params_cat['pipe']['resample'] == 'adasyn':
#    res = ADASYN(random_state=42)
#elif final_params_cat['pipe']['resample'] == 'smote':
#    res = SMOTE()
#elif final_params_cat['pipe']['resample'] == 'tomek':
#    res = TomekLinks()
#elif final_params_cat['pipe']['resample'] == 'ncr':
#    res = NeighbourhoodCleaningRule(n_neighbors=3,
#                                    threshold_cleaning=0.5)
#elif final_params_cat['pipe']['resample'] == 'oss':
#    res = OneSidedSelection(n_neighbors=1, n_seeds_S=200)
#else:
#    res = None

In [20]:
# Final model: repeat the Stage 2 K-fold CatBoost run for several seeds and
# average the out-of-fold and test probabilities across seeds.
n_seeds = 10

# One column per seed; kept under separate names so that `cat_oof` and
# `cat_pred` are only ever bound to the final 1-D averages.
cat_oof_by_seed = np.zeros((X.shape[0], n_seeds))
cat_pred_by_seed = np.zeros((X_test.shape[0], n_seeds))

for seed in range(n_seeds):

    print("\nRANDOM SEED:", seed)

    # Use a cell-local splitter. Rebinding the notebook-level `kf` here would
    # leave it on the last seed, so re-running the Stage 1 / Stage 2 cells
    # afterwards would silently use different folds.
    seed_kf = KFold(n_splits=K, random_state=seed, shuffle=True)

    for train_idx, val_idx in seed_kf.split(X=X, y=y):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        res = TomekLinks()

        model = CatBoostClassifier(random_seed=seed,
                                   verbose=0,
                                   use_best_model=True,
                                   eval_metric="AUC",
                                   iterations=10000,
                                   learning_rate=0.01,
                                   od_wait=200)

        pipe = Pipeline([('resample', res),
                         ('clf', model)])

        pipe.fit(X_train, y_train,
                 clf__eval_set=[(X_val, y_val)])

        # NOTE(review): the calibrator is fitted on (X_val, y_val) and then
        # used to predict X_val, and the same fold is CatBoost's eval set, so
        # the out-of-fold metrics below are likely optimistic.
        calib = CalibratedClassifierCV(base_estimator=pipe, cv='prefit')
        calib.fit(X_val, y_val)

        # Each row is in exactly one validation fold per seed.
        cat_oof_by_seed[val_idx, seed] = calib.predict_proba(X_val)[:, 1]

        # Test predictions are averaged over the K fold models of this seed.
        cat_pred_by_seed[:, seed] += calib.predict_proba(X_test)[:, 1] / K

    seed_oof = cat_oof_by_seed[:, seed]
    print("F1     :", custom_f1(y, seed_oof))
    print("AUC    :", roc_auc_score(y, seed_oof))
    print("LogLoss:", log_loss(y, seed_oof))

# Seed-averaged probabilities, consumed by the threshold and submission cells.
cat_oof = cat_oof_by_seed.mean(axis=1)
cat_pred = cat_pred_by_seed.mean(axis=1)
print("-------------------------------------")
print("Final F1     :", custom_f1(y, cat_oof))
print("Final AUC    :", roc_auc_score(y, cat_oof))
print("Final LogLoss:", log_loss(y, cat_oof))


RANDOM SEED: 0
F1     : 0.6804638715432649
AUC    : 0.8909232275223675
LogLoss: 0.31335246615070295

RANDOM SEED: 1
F1     : 0.6825341267063354
AUC    : 0.8906714537608101
LogLoss: 0.314625477289244

RANDOM SEED: 2
F1     : 0.6825927163381224
AUC    : 0.8915799433828073
LogLoss: 0.3131188123499769

RANDOM SEED: 3
F1     : 0.6821732338135741
AUC    : 0.8919287351706687
LogLoss: 0.31355713757804177

RANDOM SEED: 4
F1     : 0.6820244328097731
AUC    : 0.8900223013846157
LogLoss: 0.31375306698595823

RANDOM SEED: 5
F1     : 0.682759863087732
AUC    : 0.8888546301440641
LogLoss: 0.3148028523691273

RANDOM SEED: 6
F1     : 0.6824324324324325
AUC    : 0.8912646737041308
LogLoss: 0.31321208232997305

RANDOM SEED: 7
F1     : 0.6821024167652527
AUC    : 0.8912469944487761
LogLoss: 0.31394152109765755

RANDOM SEED: 8
F1     : 0.6812960740613748
AUC    : 0.8918468752383041
LogLoss: 0.3141090733996057

RANDOM SEED: 9
F1     : 0.6836036036036035
AUC    : 0.8919887761628859
LogLoss: 0.31382419308326

In [21]:
# F1-maximising cut-off on the seed-averaged out-of-fold probabilities;
# the submission cell applies the same cut-off to the test predictions.
final_threshold = get_threshold(y_true=train["y"], y_pred=cat_oof)
final_threshold

0.29

In [22]:
# Sanity check: F1 of the averaged out-of-fold predictions at the chosen cut-off.
# Score the hard labels directly. custom_f1 would re-run the 100-step threshold
# search on labels that are already 0/1, which cannot change the result.
f1_score(train.y, np.where(cat_oof > final_threshold, 1, 0))

0.6824324324324325

# Submission
---

In [23]:
# Hard-label submission: a test row is positive when its averaged
# probability is strictly above the cut-off tuned on the out-of-fold data.
sample_submission['predicted'] = (cat_pred > final_threshold).astype('int64')
sample_submission.to_csv('cat_shap_sub.csv', index=False)

In [24]:
# Probability outputs for a downstream stacking step: averaged test
# probabilities in the submission layout, plus the averaged out-of-fold
# probabilities for the training rows.
sample_submission['predicted'] = cat_pred
sample_submission.to_csv('cat_shap_sub_probs.csv', index=False)

# NOTE(review): `train` was loaded with reset_index(drop=True), so this `id`
# is the positional row number, not the original `id` from train.csv -
# confirm that is what the stacking notebook expects.
oof_frame = pd.DataFrame({'id': train.index, 'cat_shap_oof': cat_oof})
oof_frame.to_csv('cat_oof.csv', index=False)