<a href="https://www.kaggle.com/code/eduus710/amex-extremely-simple-xgboost-baseline?scriptVersionId=97842951" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# AMEX - simple XGBoost baseline model

I wanted to get an initial baseline model / score. This extremely simple XGBoost model is built using:
- train/test dataframes that have numerical features converted to 16 bits (for compression - customer_ID replaced by integer as well)
- the only encoding is a labelencoder on the categorical fields (just to make XGBoost happy)
- no imputation of missing values
- no feature engineering
- uses *only* the most recent statement for each customer
- no hyperparameter tuning

I created test and train datasets that compress features to 16 bit numerics and drop all but the most recent statement for each customer to make things simpler.

NOTE: I trained with a GPU. To train on a CPU instead, remove `tree_method='gpu_hist'` from the XGBClassifier calls (I don't know how long CPU training will take).

In [1]:
import gc
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from xgboost import XGBClassifier

# Show every column when a (very wide) feature frame is displayed.
pd.set_option('display.max_columns', None)
# Render floats with three decimal places in displayed frames.
pd.set_option('display.float_format', '{:.3f}'.format)

# Seed handed to the XGBoost classifiers as `random_state`.
RANDOM_STATE = 42
# Directory holding the premade feather files read by the loading cells.
INPUT_PATH = Path("../input/amex-eda")

### Load premade train dataset; 16 bit numerics, most recent statements only

Note: the customer_ID has been replaced with integer c_ID field

In [2]:
# Feature frame (most recent statement per customer), keyed and ordered by c_ID.
train = (
    pd.read_feather(INPUT_PATH / "train_16_recent_data.feather")
    .set_index(['c_ID'])
    .sort_index()
)

# Target frame, keyed and ordered by c_ID as well.
labels = (
    pd.read_feather(INPUT_PATH / "train_labels.feather")
    .set_index('c_ID')
    .sort_index()
)

display(train.head(2))
display(labels.head(2))

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
c_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1
1,0.935,0.009,0.009,1.008,0.006,0.135,0.002,0.007,,,0.003,0.071,0.74,0.232,0.008,0.42,0.54,0.192,,0.15,0.058,0.003,0.153,0.673,0.01,0.0,0.203,0.629,0.326,,0.035,0.01,0.002,1.008,0.009,0.106,0.112,0.488,0.187,0.167,0.1,0.009,0.007,0.007,0.01,0.063,0.259,0.228,0.015,0.402,0.447,CR,O,0.009,0.006,,1.008,0.005,,0.008,6.0,0.184,0.004,0.687,0.005,0.005,0.008,0.008,0.378,0.007,0.305,0.04,,0.007,0.001,0.001,,0.002,0.009,0.421,0.006,0.001,0.001,0.002,0.006,,0.0,0.208,0.001,0.009,0.007,0.009,0.507,0.007,1.002,0.085,0.006,0.0,0.003,0.001,0.002,0.003,,0.0,0.001,0.01,,0.002,0.003,,1,0.004,0.008,0.008,0.009,0.006,0.008,1.008,0.005,0.001,0.003,1.006,0.004,0.005,0.004,0.006,0.005,0.001,0.918,0.132,0.936,0.972,0.001,0.943,1.003,1.015,1.074,,0.672,0.007,0.009,,1.006,2.0,,0.0,,,,1.01,0.1,0.929,0.004,1.0,0.256,0.0,4.0,0.26,0.257,0.0,0.72,0.434,0.004,0.685,0.008,1.0,1.009,1.0,1.009,0.004,,0.004,0.006,,0.006,0.003,,,,,,0.007,0.004,0.005,,0.006,0.003,0.009
2,0.88,0.178,0.035,1.004,0.007,0.166,0.006,0.005,,0.061,0.009,0.021,0.266,0.027,0.005,0.439,0.402,0.015,,0.168,0.028,0.001,,0.34,0.013,0.102,0.242,0.571,0.297,,0.044,0.015,0.003,1.003,0.003,0.208,0.019,0.406,0.036,0.749,0.018,0.006,0.01,0.128,0.019,0.213,0.412,0.049,0.01,0.364,0.234,CO,O,0.009,0.003,,1.004,0.008,,0.004,6.0,0.192,0.008,0.287,0.006,0.008,0.006,0.001,0.008,0.001,0.305,0.015,,0.005,0.008,0.01,,0.002,0.008,0.228,0.021,0.0,0.002,0.009,0.007,,0.007,0.002,0.002,0.01,0.005,0.005,,0.005,1.009,0.02,0.006,0.005,0.006,0.01,0.001,0.007,,0.0,0.009,0.006,,0.007,0.007,,1,0.005,0.002,0.009,0.009,0.003,0.0,1.001,0.005,0.001,0.004,0.009,0.005,0.0,0.001,0.006,0.005,0.002,0.921,0.133,0.931,0.978,0.003,0.002,0.009,0.004,,,0.008,0.007,0.033,,1.008,2.0,,0.005,,,,1.007,0.02,0.292,0.009,1.0,0.454,0.0,-1.0,0.446,0.437,0.0,0.551,0.287,0.009,0.137,0.009,1.0,0.0,0.999,0.002,0.003,,0.002,0.002,,0.003,0.002,,,,,,0.003,0.007,0.008,,0.003,0.003,0.009


Unnamed: 0_level_0,target
c_ID,Unnamed: 1_level_1
1,0
2,0


### Preprocessing

Here we encode the categoricals with a LabelEncoder.

(I tried to use a sklearn ColumnTransformer, but it blows out all the numerics back to float64, exhausting memory - I didn't want to fight with it anymore).

In [3]:
# Fitted LabelEncoder for each categorical column. An entry is created the first
# time make_x_y sees the column and is reused by every later call, so whichever
# frame is processed first (the training frame in this notebook) defines the codes.
encoders = {}
categoricals = ['D_63', 'D_64', 'D_66', 'D_68', 'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126']
def make_x_y(df, labels=None):
    """Label-encode the categorical columns and return ``(X, y)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame containing every column listed in ``categoricals``.
        The caller's frame is left untouched; the work happens on the sorted
        frame returned by ``sort_index()``.
    labels : pandas.DataFrame, optional
        Frame with a ``target`` column whose index holds the same keys as
        ``df``. Omit it for unlabelled (test) data.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``df`` sorted by index with each categorical
        column replaced by an integer ``<col>_enc`` column appended at the
        end (in ``categoricals`` order); ``y`` is the ``target`` series sorted
        by index, or ``None`` when ``labels`` is not given.

    Raises
    ------
    ValueError
        If ``labels`` is given but its sorted index differs from the sorted
        index of ``df``. X and y are paired purely by position downstream, so
        a mismatch would silently attach targets to the wrong rows.
    """
    df = df.sort_index()
    for col in categoricals:
        le = encoders.get(col)
        if le is None:
            # First time this column is seen: fit and remember the encoder.
            le = LabelEncoder()
            df[f'{col}_enc'] = le.fit_transform(df[col])
            encoders[col] = le
        else:
            # Reuse the stored encoder so the codes match the first frame.
            df[f'{col}_enc'] = le.transform(df[col])
    # Drop all raw categorical columns in one pass instead of one by one.
    df = df.drop(columns=categoricals)

    if labels is not None:
        labels = labels.sort_index()
        if not labels.index.equals(df.index):
            raise ValueError("labels index does not match the feature frame index")
        labels = labels.target
    return df, labels


### AMEX metric

(thanks to https://www.kaggle.com/code/rohanrao/amex-competition-metric-implementations)

In [4]:
def amex_metric_numpy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AMEX competition metric, ``0.5 * (G + D)``.

    ``D`` is the share of all positives found in the top-ranked rows whose
    cumulative normalised weight is at most 4%; ``G`` is the weighted Gini
    coefficient divided by its maximum. Rows with target 0 carry weight 20,
    rows with target 1 carry weight 1.

    Parameters
    ----------
    y_true : array-like of 0/1 targets.
    y_pred : array-like of scores, higher meaning more likely positive;
        must have the same length as ``y_true``.

    Returns
    -------
    float
        The metric value.

    Notes
    -----
    Both ``D`` and the Lorentz curve divide by the number of positives, which
    is not guarded here: an input without any positive target divides by zero.
    """
    # Work on plain arrays so the indexing below is positional. A pandas Series
    # with a non-default index would otherwise be indexed by label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


# Train a first model

I use 3-fold cross-validation just to sanity-check the consistency of the results. The loop is hand-rolled so that I can use the AMEX scorer.

In [5]:
X, y = make_x_y(train, labels)
display(X.head(2))
display(X.dtypes)

# One estimator object, refit on each fold's training split below.
xgb = XGBClassifier(
    tree_method='gpu_hist',
    objective='binary:logistic',
    random_state=RANDOM_STATE,
)

# Hand-written CV loop so every fold can be scored with the AMEX metric.
skf = StratifiedKFold(n_splits=3)
scores = []
for train_idx, test_idx in skf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    xgb.fit(X_train, y_train)
    # Column 1 of predict_proba holds the positive-class probability.
    probs = xgb.predict_proba(X_test)[:, 1]
    scores.append(amex_metric_numpy(y_test.to_numpy(), probs))

print("Scores: ")
display(scores)

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_65,B_16,B_17,B_18,B_19,B_20,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_115,D_118,D_119,D_121,D_122,D_123,D_124,D_125,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,D_63_enc,D_64_enc,D_66_enc,D_68_enc,B_30_enc,B_38_enc,D_114_enc,D_116_enc,D_117_enc,D_120_enc,D_126_enc
c_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1
1,0.935,0.009,0.009,1.008,0.006,0.135,0.002,0.007,,,0.003,0.071,0.74,0.232,0.008,0.42,0.54,0.192,,0.15,0.058,0.003,0.153,0.673,0.01,0.0,0.203,0.629,0.326,,0.035,0.01,0.002,1.008,0.009,0.106,0.112,0.488,0.187,0.167,0.1,0.009,0.007,0.007,0.01,0.063,0.259,0.228,0.015,0.402,0.447,0.009,0.006,,1.008,0.005,0.008,0.184,0.004,0.687,0.005,0.005,0.008,0.008,0.378,0.007,0.305,0.04,,0.007,0.001,0.001,,0.002,0.009,0.421,0.006,0.001,0.001,0.002,0.006,,0.0,0.208,0.001,0.009,0.007,0.009,0.507,0.007,1.002,0.085,0.006,0.0,0.003,0.001,0.002,0.003,,0.001,0.01,,0.002,0.003,,1,0.004,0.008,0.008,0.009,0.006,0.008,1.008,0.005,0.001,0.003,1.006,0.004,0.005,0.004,0.006,0.005,0.001,0.918,0.132,0.936,0.972,0.001,0.943,1.003,1.015,1.074,,0.672,0.007,0.009,,1.006,,0.0,,,,1.01,0.1,0.929,0.004,0.256,0.26,0.257,0.72,0.434,0.004,0.685,0.008,1.009,1.0,1.009,0.004,,0.004,0.006,,0.006,0.003,,,,,,0.007,0.004,0.005,,0.006,0.003,0.009,2,0,1,5,0,1,1,0,4,0,1
2,0.88,0.178,0.035,1.004,0.007,0.166,0.006,0.005,,0.061,0.009,0.021,0.266,0.027,0.005,0.439,0.402,0.015,,0.168,0.028,0.001,,0.34,0.013,0.102,0.242,0.571,0.297,,0.044,0.015,0.003,1.003,0.003,0.208,0.019,0.406,0.036,0.749,0.018,0.006,0.01,0.128,0.019,0.213,0.412,0.049,0.01,0.364,0.234,0.009,0.003,,1.004,0.008,0.004,0.192,0.008,0.287,0.006,0.008,0.006,0.001,0.008,0.001,0.305,0.015,,0.005,0.008,0.01,,0.002,0.008,0.228,0.021,0.0,0.002,0.009,0.007,,0.007,0.002,0.002,0.01,0.005,0.005,,0.005,1.009,0.02,0.006,0.005,0.006,0.01,0.001,0.007,,0.009,0.006,,0.007,0.007,,1,0.005,0.002,0.009,0.009,0.003,0.0,1.001,0.005,0.001,0.004,0.009,0.005,0.0,0.001,0.006,0.005,0.002,0.921,0.133,0.931,0.978,0.003,0.002,0.009,0.004,,,0.008,0.007,0.033,,1.008,,0.005,,,,1.007,0.02,0.292,0.009,0.454,0.446,0.437,0.551,0.287,0.009,0.137,0.009,0.0,0.999,0.002,0.003,,0.002,0.002,,0.003,0.002,,,,,,0.003,0.007,0.008,,0.003,0.003,0.009,1,0,1,5,0,1,1,0,0,0,1


P_2          float16
D_39         float16
B_1          float16
B_2          float16
R_1          float16
              ...   
D_114_enc      int64
D_116_enc      int64
D_117_enc      int64
D_120_enc      int64
D_126_enc      int64
Length: 188, dtype: object

Scores: 


[0.7740322727582949, 0.7730386632877226, 0.7756133245631428]

# Fit full model

In [6]:
# Fresh classifier with the same settings as the cross-validation run,
# this time fitted on all of X / y.
xgb = XGBClassifier(
    tree_method='gpu_hist',
    random_state=RANDOM_STATE,
    objective='binary:logistic',
)
xgb.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [7]:
# try to free up RAM: drop the names bound to the two large training frames
del X, train

# Make Predictions

Again, the test file has been converted to 16-bit numerics and contains only the most recent statement for each customer.

In [8]:
# try to free up RAM (gc is imported with the other imports at the top of the notebook)
gc.collect()

# Test features, keyed and ordered by c_ID like the training frame.
test = (
    pd.read_feather(INPUT_PATH / "test_16_recent_data.feather")
    .set_index(['c_ID'])
    .sort_index()
)

# original customer keys (for submission file)
cust = (
    pd.read_feather(INPUT_PATH / "test_cust.feather")
    .set_index('c_ID')
    .sort_index()
)

display(test.head(2))
display(cust.head(2))


Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
c_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1
1,0.569,0.121,0.011,1.01,0.007,0.149,0.0,0.004,0.104,0.007,0.007,0.125,0.025,0.05,0.008,0.446,0.49,0.517,,0.025,0.163,1.009,,0.333,0.005,0.109,0.116,0.561,0.034,,0.012,0.005,0.003,1.008,0.006,0.16,0.059,0.464,0.458,,0.063,0.005,0.381,0.017,0.009,0.425,0.014,0.589,0.014,1.768,0.049,CR,U,0.001,0.002,,0.59,0.004,,0.005,6.0,0.188,0.003,0.556,0.007,0.008,0.007,0.007,0.281,0.009,0.301,0.147,0.179,0.965,0.217,0.202,,0.007,0.006,,0.005,0.009,0.002,0.009,0.009,,0.009,0.006,0.007,0.004,0.001,0.0,0.503,0.007,1.007,0.14,0.008,0.007,0.007,0.002,0.004,0.001,0.003,0.0,0.009,0.009,,0.006,0.005,,1,0.003,0.004,0.007,0.005,0.004,0.007,1.008,0.009,0.001,0.003,0.003,1.006,0.006,0.001,0.008,0.006,0.005,0.98,0.136,0.958,0.971,0.081,0.004,0.004,0.006,,,0.0,0.001,0.008,,1.002,2.0,,0.0,,,,1.007,0.375,0.346,0.01,0.0,0.251,0.0,-1.0,0.247,0.254,1.0,0.199,0.144,0.004,0.01,0.0,0.0,0.01,0.002,0.01,0.007,,0.005,0.001,,0.006,0.009,,,,,,0.006,0.001,0.007,,0.009,0.004,0.003
2,0.841,0.126,0.017,1.009,0.01,0.112,0.006,0.011,,,0.004,0.015,0.1,0.136,0.002,0.446,0.362,0.042,,0.183,0.014,1.009,,0.002,0.017,0.008,0.144,0.526,0.299,,0.009,0.002,0.005,1.003,0.008,0.082,0.054,0.769,0.053,0.064,0.059,0.004,0.006,0.019,0.011,0.609,0.686,0.047,0.005,0.128,0.331,CO,O,0.0,0.091,,1.005,0.003,,0.001,6.0,0.191,0.01,0.284,0.004,0.001,0.002,0.004,0.01,0.007,0.107,0.005,,0.005,0.002,0.005,,0.008,0.007,0.302,0.027,0.006,0.005,0.004,0.001,,0.005,0.002,0.006,0.0,0.0,0.005,,0.002,1.004,0.009,0.006,0.008,0.006,0.004,0.005,0.003,,0.0,0.009,0.004,,0.004,0.007,,1,0.008,0.005,0.007,0.002,0.002,0.002,1.005,0.007,0.001,0.006,0.005,0.007,0.004,0.002,0.005,0.001,0.01,0.929,0.136,0.939,0.975,0.021,0.007,0.006,0.005,,,0.003,0.007,0.013,,1.001,2.0,,0.006,,,,1.004,0.005,0.591,0.001,1.0,0.317,0.0,3.0,0.317,0.318,0.0,0.855,0.579,0.0,0.646,0.005,1.0,0.01,1.0,0.001,0.007,,0.001,0.008,,0.003,0.009,,,,,,0.004,0.001,0.009,,0.002,0.0,0.008


Unnamed: 0_level_0,customer_ID
c_ID,Unnamed: 1_level_1
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...
2,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...


In [9]:
# No labels are passed for the test frame, so only the feature part of the
# (X, y) pair is kept. Indexing the result instead of unpacking into `_`
# avoids rebinding IPython's `_` last-output variable.
X_test = make_x_y(test)[0]
# One row per test customer, columns = [P(class 0), P(class 1)].
probs = xgb.predict_proba(X_test)
probs

array([[0.9771619 , 0.02283811],
       [0.99837387, 0.00162616],
       [0.960901  , 0.03909899],
       ...,
       [0.3696879 , 0.6303121 ],
       [0.7539091 , 0.2460909 ],
       [0.91369754, 0.08630247]], dtype=float32)

# Make Submission

NOTE: join with the saved customer keys from the raw dataset

In [10]:
# Positive-class probability per c_ID, joined to the original customer keys
# and re-keyed by customer_ID for the submission file.
submit = (
    pd.DataFrame(probs[:, 1], columns=['prediction'], index=test.index)
    .join(cust)
    .reset_index()
    .set_index('customer_ID')
)
display(submit.head(2))

# Only the prediction column (with its customer_ID index) goes into the file.
submit['prediction'].to_csv('./submission.csv')

Unnamed: 0_level_0,c_ID,prediction
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,1,0.023
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,2,0.002
