# Load dependencies
---

In [1]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm

from sklearn.metrics import f1_score, roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from sklearn.calibration import CalibratedClassifierCV

from catboost import CatBoostClassifier
import shap

# imbalanced
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [2]:
# Competition data. The 'id' column is parsed as the index and then discarded,
# leaving a plain positional RangeIndex on both frames.
train = (
    pd.read_csv('../input/porto-seguro-data-challenge/train.csv', index_col='id')
    .reset_index(drop=True)
)
test = (
    pd.read_csv('../input/porto-seguro-data-challenge/test.csv', index_col='id')
    .reset_index(drop=True)
)
sample_submission = pd.read_csv('../input/porto-seguro-data-challenge/submission_sample.csv')
meta = pd.read_csv('../input/porto-seguro-data-challenge/metadata.csv')

# Group feature names by the variable kind declared in the metadata file.
# The first and last metadata rows are skipped (presumably the id and target
# rows - TODO confirm against metadata.csv). The boolean mask is computed on
# the full frame and aligned to the slice by index inside `.loc`.
meta_body = meta.iloc[1:-1, :]
meta_kind = meta.iloc[:, 1]

cat_nom = list(meta_body.loc[meta_kind == "Qualitativo nominal"].iloc[:, 0])
cat_ord = list(meta_body.loc[meta_kind == "Qualitativo ordinal"].iloc[:, 0])
num_dis = list(meta_body.loc[meta_kind == "Quantitativo discreto"].iloc[:, 0])
num_con = list(meta_body.loc[meta_kind == "Quantitativo continua"].iloc[:, 0])

In [3]:
# Predictions exported by an earlier AutoGluon notebook: out-of-fold values for
# the training rows and test-set probabilities. They are used below as an extra
# stacked feature.
pseudo_oof = pd.read_csv(
    '../input/porto-seguro-autogluon-shap/autogluon_oof.csv'
)
pseudo_sub = pd.read_csv(
    '../input/porto-seguro-autogluon-shap/autogluon_shap_sub_probs.csv'
)

In [4]:
# The stacked feature is attached row-by-row, so both prediction files must
# have exactly one row per train/test row. Fail loudly instead of letting an
# index-aligned concat introduce NaNs or extra rows.
assert len(pseudo_oof) == len(train), "autogluon OOF rows do not match train rows"
assert len(pseudo_sub) == len(test), "autogluon submission rows do not match test rows"

# Features = original columns (target removed) + AutoGluon prediction as 'y_oof'.
# `.to_numpy()` attaches the values positionally, independent of either index.
X = train.drop('y', axis=1).assign(y_oof=pseudo_oof['autogluon_shap_oof'].to_numpy())
X_test = test.assign(y_oof=pseudo_sub['predicted'].to_numpy())
y = train.y

assert list(X.columns) == list(X_test.columns), "train/test feature columns differ"

In [7]:
# Cross-validation configuration: number of folds and the default shuffle seed.
K = 10
SEED = 25

# Default shuffled K-fold splitter built from the constants above.
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

In [8]:
def get_threshold(y_true, y_pred):
    """Return the probability cut-off that maximises the F1 score.

    Candidate cut-offs are 0.00, 0.01, ..., 0.99. A sample is labelled
    positive when its score is strictly greater than the cut-off.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels.
    y_pred : array-like of shape (n_samples,)
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    numpy.float64
        The candidate cut-off with the highest F1; on ties the lowest
        cut-off wins (first maximum returned by ``argmax``).
    """
    # Convert once so each comparison is a vectorised NumPy operation rather
    # than a Python-level loop over every sample for every threshold.
    scores = np.asarray(y_pred)
    thresholds = np.arange(0.0, 1.0, 0.01)
    f1s = np.array([
        f1_score(y_true, (scores > thresh).astype(int))
        for thresh in thresholds
    ])
    return thresholds[f1s.argmax()]
    
    
def custom_f1(y_true, y_pred):
    """Best achievable F1 for `y_pred` after tuning the decision threshold.

    The threshold is chosen by `get_threshold` on the same (y_true, y_pred)
    pair, the scores are binarised with a strict ``>`` comparison against it,
    and the F1 of the resulting labels is returned.
    """
    best_cut = get_threshold(y_true, y_pred)
    hard_labels = np.where(y_pred > best_cut, 1, 0)
    return f1_score(y_true, hard_labels)

# Final KFold
---

In [9]:
# Seeds for the repeated K-fold runs. Each seed drives both the fold shuffle
# and the CatBoost random seed.
random_seeds = [2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029]

# One column per seed: out-of-fold predictions for the training rows and
# fold-averaged predictions for the test rows.
cat_oof = np.zeros((X.shape[0], len(random_seeds)))
cat_pred = np.zeros((X_test.shape[0], len(random_seeds)))

# Columns are addressed by the seed's position in `random_seeds`, not by its
# value, so the seed list can be changed freely without index errors.
for seed_idx, seed in enumerate(random_seeds):

    print("\nRANDOM SEED:", seed)

    kf = KFold(n_splits=K, random_state=seed, shuffle=True)

    for train_idx, val_idx in kf.split(X=X, y=y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Tomek-link under-sampling is applied to the training fold only
        # (imblearn pipelines resample during fit, not during predict).
        res = TomekLinks()

        model = CatBoostClassifier(random_seed=seed,
                                   verbose=0,
                                   use_best_model=True,
                                   eval_metric="AUC",
                                   iterations=10000,
                                   learning_rate=0.01,
                                   od_wait=200)

        pipe = Pipeline([('resample', res),
                         ('clf', model)])

        # The validation fold is the eval set used for early stopping /
        # best-iteration selection.
        pipe.fit(X_train, y_train,
                 clf__eval_set=[(X_val, y_val)])

        # The estimator is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in newer scikit-learn releases, and
        # the positional form works with both.
        # NOTE(review): the calibrator is fitted on the same validation fold
        # it is scored on below (and that fold also drove early stopping), so
        # the OOF metrics are likely optimistic - consider calibrating on a
        # separate split.
        calib = CalibratedClassifierCV(pipe, cv='prefit')
        calib.fit(X_val, y_val)

        cat_oof[val_idx, seed_idx] += calib.predict_proba(X_val)[:, 1]

        # Test predictions are averaged over the K folds of this seed.
        cat_pred[:, seed_idx] += calib.predict_proba(X_test)[:, 1] / K

    seed_oof = pd.Series(cat_oof[:, seed_idx])
    print("F1     :", custom_f1(y, seed_oof))
    print("AUC    :", roc_auc_score(y, seed_oof))
    print("LogLoss:", log_loss(y, seed_oof))

# Collapse the per-seed columns into a single seed-averaged prediction.
cat_oof = np.mean(cat_oof, axis=1)
cat_pred = np.mean(cat_pred, axis=1)
print("-------------------------------------")
print("Final F1     :", custom_f1(y, pd.Series(cat_oof)))
print("Final AUC    :", roc_auc_score(y, pd.Series(cat_oof)))
print("Final LogLoss:", log_loss(y, pd.Series(cat_oof)))


RANDOM SEED: 2020
F1     : 0.6830639494026705
AUC    : 0.8925055675647915
LogLoss: 0.31127566943195656

RANDOM SEED: 2021
F1     : 0.6837823178448743
AUC    : 0.8938576415313099
LogLoss: 0.312275233533308

RANDOM SEED: 2022
F1     : 0.6823408271837996
AUC    : 0.8934715614548656
LogLoss: 0.3117907766312128

RANDOM SEED: 2023
F1     : 0.6833447566826594
AUC    : 0.893336165749244
LogLoss: 0.31154484430901297

RANDOM SEED: 2024
F1     : 0.6840804800564773
AUC    : 0.8913918522910665
LogLoss: 0.3124673077554081

RANDOM SEED: 2025
F1     : 0.6831023933777218
AUC    : 0.891856991009414
LogLoss: 0.31161421875538076

RANDOM SEED: 2026
F1     : 0.6829353320550811
AUC    : 0.8943588857711563
LogLoss: 0.31246178149785886

RANDOM SEED: 2027
F1     : 0.6837637071100107
AUC    : 0.8937534646516051
LogLoss: 0.3112537197213606

RANDOM SEED: 2028
F1     : 0.6822345967307346
AUC    : 0.8925827586797206
LogLoss: 0.3119063297003377

RANDOM SEED: 2029
F1     : 0.6823830409356725
AUC    : 0.89447680453684

In [10]:
# Decision threshold that maximises F1 on the seed-averaged OOF predictions;
# it is reused below to binarise the test predictions.
final_threshold = get_threshold(y_true=y, y_pred=cat_oof)
final_threshold

0.24

In [11]:
# OOF F1 at the chosen threshold. The labels are already binarised, so score
# them directly with f1_score; custom_f1 would re-run its threshold search on
# 0/1 values, where every candidate cut-off yields the same labels.
f1_score(y, np.where(cat_oof > final_threshold, 1, 0))

0.6834607366226545

# Submission
---

In [12]:
# Write the hard 0/1 submission: test probabilities binarised with the
# threshold tuned on the out-of-fold predictions.
hard_labels = (cat_pred > final_threshold).astype('int64')
sample_submission['predicted'] = hard_labels
sample_submission.to_csv('cat_pseudo_sub.csv', index=False)

In [13]:
# Write the raw probabilities for later stacking: test-set probabilities in
# the submission layout, plus the out-of-fold probabilities for the train rows.
sample_submission['predicted'] = cat_pred
sample_submission.to_csv('cat_pseudo_sub_probs.csv', index=False)

# NOTE(review): `train.index` is the positional RangeIndex (the original 'id'
# column was dropped when train.csv was loaded), so 'id' here is a row number -
# confirm downstream stacking joins on position rather than the original ids.
oof_frame = pd.DataFrame({'id': train.index, 'cat_pseudo_oof': cat_oof})
oof_frame.to_csv('cat_pseudo_oof.csv', index=False)