In [1]:
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the running kernel. shap is installed here as well because it
# is imported by the import cell that follows.
%pip install gokinjo scikit-learn shap
%pip install autogluon
%pip install --upgrade ipykernel

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
# general packages
import pandas as pd
import numpy as np
import time

# knn features
import gokinjo
from gokinjo import knn_kfold_extract
from gokinjo import knn_extract

# ml tools
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, log_loss, roc_auc_score

# models
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# optimization
import optuna

# interpretable ml
import shap

# automl
from autogluon.tabular import TabularPredictor

# ignore specific warnings
import warnings
warnings.filterwarnings("ignore", message="ntree_limit is deprecated, use `iteration_range` or model slicing instead.")

In [3]:
# aux functions

def get_threshold(y_true, y_pred):
    """Return the cut-off in [0.00, 0.99] that gives the highest F1 score.

    Candidate cut-offs are generated in steps of 0.01 starting at 0.0 and
    stopping before 1.0. For each candidate, a score is labelled 1 when it is
    strictly greater than the cut-off and 0 otherwise. When several candidates
    tie for the best F1, the smallest of them is returned (argmax picks the
    first maximum).

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : iterable of float
        Predicted scores for the positive class.

    Returns
    -------
    numpy.float64
        The best-scoring candidate cut-off.
    """
    candidates = np.arange(0.0, 1.0, 0.01)
    scores = np.array([
        f1_score(y_true, [1 if score > cut else 0 for score in y_pred])
        for cut in candidates
    ])
    return candidates[scores.argmax()]
    
def custom_f1(y_true, y_pred):
    """F1 score of `y_pred` after binarising it at its own best cut-off.

    The cut-off comes from `get_threshold`, evaluated on the same
    (`y_true`, `y_pred`) pair that is then scored, so the result is the best
    F1 reachable on this data over the candidate cut-offs.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_pred : numpy.ndarray or pandas.Series
        Predicted scores; must support element-wise `>` against a scalar.

    Returns
    -------
    float
        F1 score of the thresholded predictions.
    """
    best_cut = get_threshold(y_true, y_pred)
    hard_labels = np.where(y_pred > best_cut, 1, 0)
    return f1_score(y_true, hard_labels)

In [4]:
# Load the raw files; the 'id' column is removed from both train and test.
train = pd.read_csv('../datasets/train.csv').drop(columns='id')
test = pd.read_csv('../datasets/test.csv').drop(columns='id')
sample_submission = pd.read_csv('../datasets/submission_sample.csv')
meta = pd.read_csv('../datasets/metadata.csv')


def _columns_of_type(type_label):
    """List the first-column entries of `meta` whose second column equals `type_label`.

    The first and last rows of `meta` are excluded from the lookup
    (presumably the id and target rows -- confirm against metadata.csv).
    """
    meta_rows = meta.iloc[1:-1, :]
    is_requested_type = meta.iloc[:, 1] == type_label
    return list(meta_rows.loc[is_requested_type].iloc[:, 0])


# Column names grouped by the variable type declared in the metadata file.
cat_nom = _columns_of_type("Qualitativo nominal")
cat_ord = _columns_of_type("Qualitativo ordinal")
num_dis = _columns_of_type("Quantitativo discreto")
num_con = _columns_of_type("Quantitativo continua")

In [5]:
# gokinjo works on plain NumPy arrays, so convert features and target first
feature_cols = cat_nom + cat_ord + num_dis + num_con
X, y = train[feature_cols].to_numpy(), train.y.to_numpy()
X_test = test[feature_cols].to_numpy()

# both extractors are called with the same neighbour count and normalisation
knn_settings = dict(k=1, normalize='standard')

# training rows: features from gokinjo's k-fold extractor
KNN_feat_train = knn_kfold_extract(X, y, **knn_settings)
print("KNN features for training set, shape: ", np.shape(KNN_feat_train))

# test rows: features extracted against the full training set
KNN_feat_test = knn_extract(X, y, X_test, **knn_settings)
print("KNN features for test set, shape: ", np.shape(KNN_feat_test))


def _as_knn_frame(features):
    """Wrap a 2-D feature array in a DataFrame with columns knn0, knn1, ..."""
    names = [f"knn{i}" for i in range(features.shape[1])]
    return pd.DataFrame(features, columns=names)


KNN_feat_train = _as_knn_frame(KNN_feat_train)
KNN_feat_test = _as_knn_frame(KNN_feat_test)

# persist the features to disk: they are computationally expensive to rebuild
KNN_feat_train.to_csv('../datasets/knn_feat_train.csv', index=False)
KNN_feat_test.to_csv('../datasets/knn_feat_test.csv', index=False)

KNN features for training set, shape:  (14123, 2)
KNN features for test set, shape:  (21183, 2)


In [6]:
# Read back the KNN feature tables that were written to disk as CSV.
knn_feat_train, knn_feat_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/knn_feat_train.csv',
                     '../datasets/knn_feat_test.csv')
)

# Stage 1: Tune XGBoost

In [7]:
# Feature matrices and target as pandas objects for the tuning stage
feature_cols = cat_nom + cat_ord + num_dis + num_con
X, X_test = train[feature_cols], test[feature_cols]
y = train.y

# Shuffled 10-fold split with a fixed seed, reused by every tuning trial
K, SEED = 10, 314
kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

In [8]:
# XGBoost settings that stay constant across all tuning trials.
fixed_params = dict(
    random_state=9,
    objective="binary:logistic",
    eval_metric='logloss',
    use_label_encoder=False,
    n_estimators=10000,  # upper bound on trees; fits are called with early stopping
)

def objective(trial):
    """Optuna objective: out-of-fold log loss of an XGBoost classifier.

    Hyperparameters are drawn from `trial`, merged with the module-level
    `fixed_params`, and one model is fitted per fold of the module-level
    splitter `kf` over `X` / `y`. Each fit uses the held-out fold as the
    early-stopping evaluation set (150 rounds), and the positive-class
    probabilities for that fold are stored as out-of-fold predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Log loss of the out-of-fold probabilities against `y`.
    """
    # Parameters sampled regardless of booster type. Only "gbtree" is offered,
    # so the "dart"-specific branch further down is currently never taken.
    sampled = {
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 5.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 5.0, log=True),
    }
    booster = sampled["booster"]

    # Tree-booster parameters.
    if booster in ("gbtree", "dart"):
        sampled.update({
            "max_depth": trial.suggest_int("max_depth", 1, 9),
            "eta": trial.suggest_float("eta", 0.01, 0.1, log=True),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
            "min_child_weight": trial.suggest_int('min_child_weight', 5, 20),
            "subsample": trial.suggest_float("subsample", 0.03, 1),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.03, 1),
            "max_delta_step": trial.suggest_float('max_delta_step', 0, 10),
        })

    # Dropout parameters, only meaningful for the DART booster.
    if booster == "dart":
        sampled.update({
            "sample_type": trial.suggest_categorical("sample_type", ["uniform", "weighted"]),
            "normalize_type": trial.suggest_categorical("normalize_type", ["tree", "forest"]),
            "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        })

    params = dict(**fixed_params, **sampled)

    # One probability per training row, filled in fold by fold.
    oof_proba = np.zeros(X.shape[0])

    for train_idx, val_idx in kf.split(X=X, y=y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=150,
            verbose=False,
        )

        oof_proba[val_idx] = model.predict_proba(X_val)[:, 1]

        # release the fitted booster before the next fold
        del model

    return log_loss(y, oof_proba)

In [9]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# across runs. TPESampler is the sampler Optuna uses by default for
# single-objective studies, so only the seeding changes. The number of trials
# completed still depends on the wall-clock budget below.
study_xgb = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

study_xgb.optimize(objective,
               timeout=60*5, # original time: 60*60*7.5
               gc_after_trial=True)

[I 2024-04-23 20:57:20,138] A new study created in memory with name: no-name-f0630edb-2a06-441e-8a70-2ce4df01f357
[I 2024-04-23 20:57:52,175] Trial 0 finished with value: 0.3084513667228177 and parameters: {'booster': 'gbtree', 'lambda': 0.12143389113963737, 'alpha': 5.623595141779514e-06, 'max_depth': 8, 'eta': 0.04301348897337207, 'gamma': 1.8649623788840534e-06, 'grow_policy': 'lossguide', 'min_child_weight': 8, 'subsample': 0.7928008286439018, 'colsample_bytree': 0.6206135011258861, 'max_delta_step': 9.087090875582511}. Best is trial 0 with value: 0.3084513667228177.
[I 2024-04-23 20:58:16,066] Trial 1 finished with value: 0.35715860974439667 and parameters: {'booster': 'gbtree', 'lambda': 2.339804285246967e-07, 'alpha': 0.707721263993262, 'max_depth': 7, 'eta': 0.05712099129176861, 'gamma': 1.5737631645070655e-05, 'grow_policy': 'depthwise', 'min_child_weight': 18, 'subsample': 0.03162718782916472, 'colsample_bytree': 0.6827389104364315, 'max_delta_step': 6.190624190143659}. Best 

In [10]:
# After 7.5 hours...
# Hard-coded XGBoost hyperparameters, presumably the best trial of the
# full-length (7.5 h) search -- the optimisation cell in this notebook only
# runs for 5 minutes.
# NOTE(review): this rebinds `study_xgb`, which previously held the object
# returned by optuna.create_study(); from this point on it is a plain dict.
study_xgb = {'booster': 'gbtree',
 'lambda': 9.012384508756378e-07,
 'alpha': 0.7472040331088792,
 'max_depth': 5,
 'eta': 0.01507605562231303,
 'gamma': 1.0214961302342215e-08,
 'grow_policy': 'lossguide',
 'min_child_weight': 5,
 'subsample': 0.9331005225916879,
 'colsample_bytree': 0.25392142363325004,
 'max_delta_step': 5.685109389498008}

In [11]:
# Final classifier settings: the constant parameters merged with the tuned
# ones, stored under the 'clf' key. dict(**a, **b) raises TypeError if the two
# dicts share a key, so an accidental overlap cannot go unnoticed.
final_params_xgb = {'clf': dict(**fixed_params, **study_xgb)}

# Stage 2: Compute OOF SHAP values

In [12]:
# Feature matrices and target as pandas objects for the SHAP stage
feature_cols = cat_nom + cat_ord + num_dis + num_con
X, X_test = train[feature_cols], test[feature_cols]
y = train.y

# Number of folds picked with Sturges' rule (a heuristic normally used to
# choose a histogram bin count); the split is stratified on y and seeded.
K, SEED = 15, 123
kf = StratifiedKFold(n_splits=K, shuffle=True, random_state=SEED)

## XGBoost

In [13]:
# Explicit %pip magic: does not rely on IPython automagic being enabled and
# installs into the environment of the running kernel.
%pip install shap

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [14]:
def _print_scores(y_true, y_proba):
    """Print threshold-optimised F1, ROC AUC and log loss for one set of probabilities."""
    print("Final F1     :", custom_f1(y_true, y_proba))
    print("Final AUC    :", roc_auc_score(y_true, y_proba))
    print("Final LogLoss:", log_loss(y_true, y_proba))


# Accumulators:
#   shap1_oof       - SHAP values of each training row, taken from the fold
#                     in which that row was held out
#   shap1_test      - SHAP values of the test rows, averaged over the K models
#   model_shap1_oof - out-of-fold positive-class probabilities
shap1_oof = np.zeros(X.shape)
shap1_test = np.zeros(X_test.shape)
model_shap1_oof = np.zeros(len(X))

for fold, (train_idx, val_idx) in enumerate(kf.split(X=X, y=y)):
    print(f"➜ FOLD :{fold}")
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    start = time.time()

    # Fit with the tuned parameters; the held-out fold drives early stopping.
    model = XGBClassifier(**final_params_xgb['clf'])
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=150,
        verbose=False,
    )

    model_shap1_oof[val_idx] += model.predict_proba(X_val)[:, 1]
    _print_scores(y_val, model_shap1_oof[val_idx])

    explainer = shap.TreeExplainer(model)

    # Held-out rows get this fold's SHAP values; test rows get 1/K of them.
    shap1_oof[val_idx] = explainer.shap_values(X_val)
    shap1_test += explainer.shap_values(X_test) / K

    print(f"elapsed: {time.time()-start:.2f} sec\n")

# Wrap the arrays in DataFrames, suffixing each feature name with "_shap1".
shap1_oof = pd.DataFrame(shap1_oof, columns=[col + "_shap1" for col in X.columns])
shap1_test = pd.DataFrame(shap1_test, columns=[col + "_shap1" for col in X_test.columns])

# Scores over the complete set of out-of-fold predictions.
_print_scores(y, model_shap1_oof)

➜ FOLD :0




Final F1     : 0.6965699208443271
Final AUC    : 0.899230123180291
Final LogLoss: 0.2978969732779375


NameError: name 'shap' is not defined