In [8]:
import os
import gc
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
warnings.filterwarnings('ignore')
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
from catboost import CatBoostClassifier
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
import warnings
warnings.filterwarnings('ignore')
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
train = pd.read_pickle("train_addlag.pkl", compression="gzip")
test = pd.read_pickle("test_addlag.pkl", compression="gzip")

In [11]:
# Get feature list
features = [col for col in train.columns if col not in ['customer_ID', 'target']]
print(len(features))

1365


In [12]:
def xgb_amex(y_pred, y_true):
    return 'amex', amex_metric_np(y_pred,y_true.get_label())

# Created by https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)

    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    g = gini / gini_max
    return 0.5 * (g + d)

# we still need the official metric since the faster version above is slightly off
def amex_metric(y_true, y_pred):
    labels = np.transpose(np.array([y_true, y_pred]))
    labels = labels[labels[:, 1].argsort()[::-1]]
    weights = np.where(labels[:,0]==0, 20, 1)
    cut_vals = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])
    gini = [0,0]
    for i in [1,0]:
        labels = np.transpose(np.array([y_true, y_pred]))
        labels = labels[labels[:, i].argsort()[::-1]]
        weight = np.where(labels[:,0]==0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(labels[:, 0] *  weight)
        cum_pos_found = np.cumsum(labels[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        gini[i] = np.sum((lorentz - weight_random) * weight)
    return 0.5 * (gini[1]/gini[0] + top_four)

In [14]:
# Create a numpy array to store test predictions
test_predictions = np.zeros(len(test))
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(train))
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train['target'])):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold} with {len(features)} features...')
    x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
    y_train, y_val = train['target'].iloc[trn_ind], train['target'].iloc[val_ind]

    dtrain = xgb.DMatrix(data = x_train, label = y_train)
    dvalid = xgb.DMatrix(data = x_val, label = y_val)
    params = {'objective': 'binary:logistic',
            "eval_metric": "logloss",
            'booster': 'dart',
            'seed': 42,
            'min_child_weight':3.529045008103897,
            'eta': 0.03,
            'subsample': 0.8639467537456698,
            'colsample_bytree':0.6059132895850072,
            'lambda': 70,
            'max_depth': 7,
            'gamma':2.5764715666738445,
            'tree_method': 'hist',
            'n_jobs':-1
              }
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    model = xgb.train(params, dtrain = dtrain, num_boost_round = 5000, evals = watchlist, \
                      early_stopping_rounds = 150, feval = xgb_amex, maximize = True, verbose_eval = 1)
    print('best ntree_limit:', model.best_ntree_limit)
    print('best score:', model.best_score)
    
    plt.figure(figsize=(20,15))
    xgb.plot_importance(model, max_num_features=20)
    print(plt.show())
    

    # Save best model
    model.save_model(f'xgboost_fold{fold}_addlag_tune.json')
    # Predict validation
    val_pred = model.predict(xgb.DMatrix(x_val), iteration_range=(0, model.best_ntree_limit))
    
    
    # Add to out of folds array
    oof_predictions[val_ind] = val_pred
    # Predict the test set
    test_pred = model.predict(xgb.DMatrix(test[features]), iteration_range = (0, model.best_ntree_limit))

    test_predictions += test_pred / 5
    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f'Our fold {fold} CV score is {score}')
    del x_train, x_val, y_train, y_val
    gc.collect()

 
--------------------------------------------------
Training fold 0 with 1365 features...
[0]	train-logloss:0.67363	train-amex:0.71136	eval-logloss:0.67372	eval-amex:0.70999
[1]	train-logloss:0.65555	train-amex:0.72523	eval-logloss:0.65558	eval-amex:0.72134
[2]	train-logloss:0.63801	train-amex:0.73153	eval-logloss:0.63825	eval-amex:0.72853
[3]	train-logloss:0.62174	train-amex:0.73286	eval-logloss:0.62185	eval-amex:0.72965
[4]	train-logloss:0.60603	train-amex:0.73520	eval-logloss:0.60634	eval-amex:0.73209
[5]	train-logloss:0.59135	train-amex:0.73505	eval-logloss:0.59162	eval-amex:0.73288
[6]	train-logloss:0.57745	train-amex:0.73886	eval-logloss:0.57772	eval-amex:0.73524
[7]	train-logloss:0.56421	train-amex:0.74164	eval-logloss:0.56453	eval-amex:0.73761
[8]	train-logloss:0.55163	train-amex:0.74304	eval-logloss:0.55200	eval-amex:0.74018
[9]	train-logloss:0.53941	train-amex:0.74450	eval-logloss:0.53994	eval-amex:0.74226
[10]	train-logloss:0.52813	train-amex:0.74598	eval-logloss:0.52855	ev

[96]	train-logloss:0.24417	train-amex:0.77894	eval-logloss:0.24849	eval-amex:0.77144
[97]	train-logloss:0.24353	train-amex:0.77920	eval-logloss:0.24789	eval-amex:0.77127
[98]	train-logloss:0.24291	train-amex:0.77945	eval-logloss:0.24732	eval-amex:0.77184
[99]	train-logloss:0.24234	train-amex:0.77970	eval-logloss:0.24677	eval-amex:0.77247
[100]	train-logloss:0.24173	train-amex:0.78004	eval-logloss:0.24621	eval-amex:0.77231
[101]	train-logloss:0.24113	train-amex:0.78008	eval-logloss:0.24567	eval-amex:0.77228
[102]	train-logloss:0.24057	train-amex:0.78043	eval-logloss:0.24515	eval-amex:0.77296
[103]	train-logloss:0.24001	train-amex:0.78060	eval-logloss:0.24464	eval-amex:0.77304
[104]	train-logloss:0.23950	train-amex:0.78079	eval-logloss:0.24415	eval-amex:0.77291
[105]	train-logloss:0.23894	train-amex:0.78091	eval-logloss:0.24365	eval-amex:0.77290
[106]	train-logloss:0.23844	train-amex:0.78108	eval-logloss:0.24317	eval-amex:0.77342
[107]	train-logloss:0.23795	train-amex:0.78124	eval-loglos

[192]	train-logloss:0.21722	train-amex:0.79691	eval-logloss:0.22555	eval-amex:0.78462
[193]	train-logloss:0.21709	train-amex:0.79699	eval-logloss:0.22547	eval-amex:0.78449
[194]	train-logloss:0.21695	train-amex:0.79722	eval-logloss:0.22540	eval-amex:0.78467
[195]	train-logloss:0.21683	train-amex:0.79742	eval-logloss:0.22532	eval-amex:0.78492
[196]	train-logloss:0.21669	train-amex:0.79751	eval-logloss:0.22523	eval-amex:0.78468
[197]	train-logloss:0.21658	train-amex:0.79748	eval-logloss:0.22516	eval-amex:0.78495
[198]	train-logloss:0.21647	train-amex:0.79764	eval-logloss:0.22509	eval-amex:0.78478
[199]	train-logloss:0.21635	train-amex:0.79787	eval-logloss:0.22501	eval-amex:0.78486
[200]	train-logloss:0.21624	train-amex:0.79817	eval-logloss:0.22494	eval-amex:0.78502
[201]	train-logloss:0.21612	train-amex:0.79834	eval-logloss:0.22487	eval-amex:0.78527
[202]	train-logloss:0.21599	train-amex:0.79851	eval-logloss:0.22478	eval-amex:0.78524
[203]	train-logloss:0.21586	train-amex:0.79862	eval-lo

[288]	train-logloss:0.20794	train-amex:0.80990	eval-logloss:0.22031	eval-amex:0.79029
[289]	train-logloss:0.20784	train-amex:0.81007	eval-logloss:0.22027	eval-amex:0.79055
[290]	train-logloss:0.20775	train-amex:0.81012	eval-logloss:0.22023	eval-amex:0.79071
[291]	train-logloss:0.20768	train-amex:0.81023	eval-logloss:0.22020	eval-amex:0.79091
[292]	train-logloss:0.20761	train-amex:0.81037	eval-logloss:0.22018	eval-amex:0.79106
[293]	train-logloss:0.20753	train-amex:0.81048	eval-logloss:0.22016	eval-amex:0.79086
[294]	train-logloss:0.20748	train-amex:0.81052	eval-logloss:0.22013	eval-amex:0.79099
[295]	train-logloss:0.20742	train-amex:0.81055	eval-logloss:0.22010	eval-amex:0.79088
[296]	train-logloss:0.20733	train-amex:0.81088	eval-logloss:0.22007	eval-amex:0.79114
[297]	train-logloss:0.20726	train-amex:0.81094	eval-logloss:0.22004	eval-amex:0.79132
[298]	train-logloss:0.20718	train-amex:0.81115	eval-logloss:0.22001	eval-amex:0.79131
[299]	train-logloss:0.20711	train-amex:0.81131	eval-lo

KeyboardInterrupt: 

In [None]:
# Compute out of folds metric
score = amex_metric(train['target'], oof_predictions)
print(f'Our out of folds CV score is {score}')

In [13]:
# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train['target'], 'prediction': oof_predictions})
oof_df.to_csv('oof_xgboost_addlag_tune.csv', index = False)
# Create a dataframe to store test prediction
test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
test_df.to_csv('sub_xgboost_addlag_tune.csv', index = False) #0.796