In [1]:
import os

In [2]:
import cudf
import cupy
from tqdm import tqdm
import numpy as np
import gc
import xgboost as xgb
from utils import amex_metric_np

cudf.__version__, xgb.__version__

('22.12.01', '1.7.3')

### Please register kaggle and install kaggle API by: 
- `pip install kaggle`
- complete [authentication](https://www.kaggle.com/docs/api)

In [3]:
PATH = '/raid/data/ml/kaggle/amex'

In [4]:
cmd = f'kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p {PATH}'

In [5]:
os.system(cmd)

amex-data-integer-dtypes-parquet-format.zip: Skipping, found more recently modified local copy (use --force to force download)


0

In [6]:
os.listdir(PATH)

['test.parquet',
 'train_labels.csv.zip',
 'train_labels.csv',
 'amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet']

In [7]:
cmd = f'cd {PATH} && unzip amex-data-integer-dtypes-parquet-format.zip'
os.system(cmd)

Archive:  amex-data-integer-dtypes-parquet-format.zip


replace test.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


256

In [8]:
os.listdir(PATH)

['test.parquet',
 'train_labels.csv.zip',
 'train_labels.csv',
 'amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet']

### Basic EDA

In [9]:
%%time
train = cudf.read_parquet(f'{PATH}/train.parquet')
print(train.shape)
train.head()

(5531451, 190)
CPU times: user 1.11 s, sys: 827 ms, total: 1.93 s
Wall time: 1.93 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [10]:
%%time
count_df = train.groupby('customer_ID').size().to_frame('num_profiles')
count_df.head()

CPU times: user 3.83 ms, sys: 7.89 ms, total: 11.7 ms
Wall time: 10.1 ms


Unnamed: 0_level_0,num_profiles
customer_ID,Unnamed: 1_level_1
c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd11dd808d3aaa82c46,13
e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f67a53f7cc780c120b,13
8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3f592ec3d72f12cfdc,13
463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21ac257a07364fa8dd9,13
92bbe3e2a159bcc838b86241471eb14153c8d712b6647feffbe49d5266cdfd3f,13


In [11]:
count_df.num_profiles.max()

13

In [12]:
%%time
train['S_2'] = cudf.to_datetime(train['S_2'])
train.head()

CPU times: user 13.4 ms, sys: 970 µs, total: 14.4 ms
Wall time: 13 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [13]:
train.S_2.min(), train.S_2.max()

(numpy.datetime64('2017-03-01T00:00:00.000000000'),
 numpy.datetime64('2018-03-31T00:00:00.000000000'))

#### Download the training data labels

In [14]:
cmd = f'kaggle competitions download -c amex-default-prediction -f train_labels.csv -p {PATH}/'
os.system(cmd)

train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


0

In [15]:
cmd = f'cd {PATH} && unzip train_labels.csv.zip'
os.system(cmd)

Archive:  train_labels.csv.zip


replace train_labels.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


256

In [16]:
%%time
trainl = cudf.read_csv(f'{PATH}/train_labels.csv')
print(trainl.shape)
trainl.head()

(458913, 2)
CPU times: user 8.54 ms, sys: 12.4 ms, total: 20.9 ms
Wall time: 19.4 ms


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [17]:
trainl['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int32

In [18]:
%%time
train = train.merge(trainl, on='customer_ID', how='left')
print(train.shape)
train.head()

(5531451, 191)
CPU times: user 59.7 ms, sys: 47.8 ms, total: 108 ms
Wall time: 106 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0013d20434fd2e0953a135662fe2ff53583153b6bdf673...,2017-04-26,0.665826,0,0.00693,1.00397,0.001937,0.344979,0.0,0.008888,...,-1,-1,0,0,0.0,,0,0.008449,0,0
1,0013d20434fd2e0953a135662fe2ff53583153b6bdf673...,2017-05-27,0.669217,0,0.005883,0.811781,0.000755,0.348051,0.0,0.008335,...,-1,-1,0,0,0.0,,0,0.000778,0,0
2,0013d20434fd2e0953a135662fe2ff53583153b6bdf673...,2017-06-26,0.635634,0,0.00296,0.811709,0.002077,0.265256,0.0,0.004931,...,-1,-1,0,0,0.0,,0,3.2e-05,0,0
3,0013d20434fd2e0953a135662fe2ff53583153b6bdf673...,2017-07-27,0.667708,0,0.009547,0.816745,0.006937,0.261403,0.0,0.006312,...,-1,-1,0,0,0.0,,0,0.001958,0,0
4,0013d20434fd2e0953a135662fe2ff53583153b6bdf673...,2017-08-26,0.662239,0,0.002087,0.813605,0.008821,0.259084,0.0,0.001808,...,-1,-1,0,0,0.0,,0,0.006299,0,0


In [19]:
train['cid'], _ = train.customer_ID.factorize()

In [20]:
mask = train['cid']%4 == 0
tr,va = train.loc[~mask],train.loc[mask]
print("Verify target distribution is consistent across tr and va")
print(tr['target'].mean(), va['target'].mean())

Verify target distribution is consistent across tr and va
0.2493462806763533 0.24835027876095958


### Utility Functions

In [21]:
def get_cat_cols():
    return ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

def preprocess(df,last_only):
    df = df.reset_index(drop=True)
    df = df.drop_duplicates('cid',keep='last')
    df = df.sort_values('cid')
    df = df.reset_index(drop=True)
    return df

In [22]:
%%time

tr = preprocess(tr, last_only=True)
print(tr.shape)
tr.head()

(344184, 192)
CPU times: user 70.9 ms, sys: 172 ms, total: 242 ms
Wall time: 241 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target,cid
0,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2017-05-07,0.922563,19,0.033713,1.007497,0.005594,0.09465535,0.0,0.010466,...,-1,0,0,0.0,,0,0.003254,0,0,1
1,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-12,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,...,-1,0,0,0.0,,0,0.000834,0,0,2
2,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2017-11-25,0.622756,0,0.014539,1.006193,0.000199,0.353720278,0.0,0.008099,...,-1,0,0,0.0,,0,0.009919,0,0,3
3,000084e5023181993c2e1b665ac88dbb1ce9ef621ec537...,2018-03-19,0.824061,0,0.007853,1.001713,0.006885,0.395739734,0.0,0.006134,...,-1,0,0,0.0,,0,0.006943,0,0,5
4,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2017-04-22,0.414269,0,0.059667,0.123964,0.004374,0.434147984,0.0,0.05213,...,-1,0,0,0.0,,0,0.002364,0,0,6


In [23]:
%%time

va = preprocess(va, last_only=True)
print(va.shape)
va.head()

(114729, 192)
CPU times: user 46.3 ms, sys: 84.2 ms, total: 130 ms
Wall time: 129 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target,cid
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-13,0.934745,0,0.009382,1.007647,0.006104,0.135021254,0.0,0.007174,...,-1,0,0,0.0,,0,0.00297,0,0,0
1,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2018-03-30,0.8719,0,0.007679,0.815746,0.001247,,0.0,0.005528,...,-1,0,0,0.0,,0,0.006944,0,0,4
2,0000f99513770170a1aba690daeeb8a96da4a39f11fc27...,2018-03-01,0.424624,18,0.979303,0.029291,0.0085,0.152607679,0.0,1.155846,...,-1,1,0,0.876028,0.184614226,1,0.00335,8,1,8
3,0001812036f1558332e5c0880ecbad70b13a6f28ab04a8...,2018-03-10,0.424076,8,0.917384,0.029441,0.257114,0.153415367,0.0,0.972654,...,-1,0,0,0.0,,0,0.009148,0,1,12
4,0002d381bdd8048d76719042cf1eb63caf53b636f8aacd...,2017-12-19,1.002991,1,0.008605,0.81356,0.00064,0.164536849,0.0,0.005331,...,-1,0,0,0.0,,0,0.002535,0,0,16


In [24]:
not_used = [i for i in tr.columns if i in ['cid','target','S_2'] or tr[i].dtype=='O']
not_used += get_cat_cols()

X_train = tr.drop(not_used,axis=1)
y_train = tr['target']

X_test = va.drop(not_used,axis=1)
y_test = va['target']

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((344184, 177), (344184,), (114729, 177), (114729,))

In [25]:
X_train.columns

Index(['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42',
       'D_43',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=177)

In [26]:
del train,tr,va
gc.collect()

1600

In [27]:
def get_xgb_model():
    max_depth = 7
    num_trees = 1000
    early_stop = xgb.callback.EarlyStopping(rounds=10,
                                            maximize=True,
                                            metric_name='amex_metric_np',
                                            data_name='validation_0')
    model = xgb.XGBClassifier(
            tree_method='gpu_hist',
            enable_categorical=False,
            use_label_encoder=False,
            predictor='gpu_predictor',
            #eval_metric='auc',
            objective='binary:logistic',
            max_depth=max_depth,
            n_estimators=num_trees,
            #colsample_bytree=0.5,
            min_child_weight=50,
            eval_metric=amex_metric_np,
            callbacks=[early_stop]
            #gamma=10,
    )
    return model

In [28]:
model = get_xgb_model()
model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=100
    )
model.best_score



[0]	validation_0-logloss:0.53107	validation_0-amex_metric_np:0.67210
[81]	validation_0-logloss:0.24804	validation_0-amex_metric_np:0.74357


0.74415

The evaluation metric for this competition is the mean of two measures of rank ordering: Normalized Gini Coefficient, and default rate captured at 4%. The larger the metric, the more accurate the model is to predict the default in future. Please find the [description](https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation) and [analysis](https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation) to understand more about this metric. For now all we care is **larger metric is better!**

In [29]:
del model
gc.collect()

136

### Add RNN features

In [30]:
%%time
rnn_feas = np.load('rnn_feas.npy')
rnn_feas.shape

CPU times: user 2.47 ms, sys: 1.88 s, total: 1.88 s
Wall time: 1.88 s


(458913, 13, 177)

In [31]:
mask = np.arange(rnn_feas.shape[0])%4==0

In [32]:
tr_rnn = rnn_feas[~mask]
va_rnn = rnn_feas[mask]
tr_rnn.shape, va_rnn.shape

((344184, 13, 177), (114729, 13, 177))

For simplicity, we only use the last profile generated as new features

In [33]:
tr_rnn = tr_rnn[:,-1,:]
va_rnn = va_rnn[:,-1,:]
tr_rnn.shape, va_rnn.shape

((344184, 177), (114729, 177))

In [34]:
tr_rnn_df = cudf.DataFrame(tr_rnn,columns=[f'rnn_{i}' for i in range(tr_rnn.shape[1])])
va_rnn_df = cudf.DataFrame(va_rnn,columns=[f'rnn_{i}' for i in range(tr_rnn.shape[1])])
tr_rnn_df.shape, va_rnn_df.shape

((344184, 177), (114729, 177))

In [35]:
X_train = cudf.concat([X_train,tr_rnn_df],axis=1)
X_test = cudf.concat([X_test,va_rnn_df],axis=1)

In [36]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((344184, 354), (344184,), (114729, 354), (114729,))

In [37]:
model = get_xgb_model()
model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=1
    )
model.best_score



[0]	validation_0-logloss:0.51678	validation_0-amex_metric_np:0.74460
[1]	validation_0-logloss:0.41970	validation_0-amex_metric_np:0.75203
[2]	validation_0-logloss:0.35982	validation_0-amex_metric_np:0.75594
[3]	validation_0-logloss:0.32076	validation_0-amex_metric_np:0.75611
[4]	validation_0-logloss:0.29456	validation_0-amex_metric_np:0.75859
[5]	validation_0-logloss:0.27653	validation_0-amex_metric_np:0.75966
[6]	validation_0-logloss:0.26378	validation_0-amex_metric_np:0.76090
[7]	validation_0-logloss:0.25475	validation_0-amex_metric_np:0.76246
[8]	validation_0-logloss:0.24841	validation_0-amex_metric_np:0.76450
[9]	validation_0-logloss:0.24398	validation_0-amex_metric_np:0.76537
[10]	validation_0-logloss:0.24080	validation_0-amex_metric_np:0.76611
[11]	validation_0-logloss:0.23844	validation_0-amex_metric_np:0.76679
[12]	validation_0-logloss:0.23660	validation_0-amex_metric_np:0.76780
[13]	validation_0-logloss:0.23537	validation_0-amex_metric_np:0.76758
[14]	validation_0-logloss:0.23

0.776171

In [38]:
model.best_ntree_limit

63

### We got 0.03 improvement by adding the future profile features! That's huge improvements for default detection!

In [39]:
from pathlib import Path

model_dir = 'amex_xgb'
Path(f'{model_dir}/1').mkdir(parents=True, exist_ok=True)
features = X_test.shape[1]
MAX_MEMORY_BYTES = 60_000_000
num_classes = y_test.unique().shape[0]
bytes_per_sample = (features + num_classes) * 4
max_batch_size = MAX_MEMORY_BYTES // bytes_per_sample

In [40]:
def generate_config(model_dir, max_batch_size, features, deployment_type='gpu', storage_type='AUTO'):
    if deployment_type.lower() == 'cpu':
        instance_kind = 'KIND_CPU'
    else:
        instance_kind = 'KIND_GPU'

    config_text = f"""backend: "fil"
max_batch_size: {max_batch_size}
input [                                 
 {{  
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ {features} ]                    
  }} 
]
output [
 {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ {num_classes} ]
  }}
]
instance_group [{{ kind: {instance_kind} }}]
parameters [
  {{
    key: "model_type"
    value: {{ string_value: "xgboost" }}
  }},
  {{
    key: "predict_proba"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "output_class"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "threshold"
    value: {{ string_value: "0.5" }}
  }},
  {{
    key: "storage_type"
    value: {{ string_value: "{storage_type}" }}
  }}
]

dynamic_batching {{
  max_queue_delay_microseconds: 100
}}"""
    config_path = os.path.join(model_dir, 'config.pbtxt')
    with open(config_path, 'w') as file_:
        file_.write(config_text)

    return config_path

In [41]:
generate_config(model_dir, max_batch_size, features)

'amex_xgb/config.pbtxt'

In [42]:
model.save_model(f'{model_dir}/1/xgboost.model')

