### In this notebook, we train two XGBoost models to predict whether or not a customer will default in the future.
- The first XGBoost model is trained on the given numerical features as is.
- The second XGBoost model is trained on the given numerical features plus the features generated by the autoregressive RNN.
- Finally, we save the second XGBoost model and write the configuration file for Triton inference.

In [1]:
import os
# Expose only GPU 0 to this process. This runs in the first cell, ahead of the
# cudf / cupy / xgboost imports in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import cudf
import cupy
from tqdm import tqdm
import numpy as np
import gc
import xgboost as xgb
from utils import amex_metric_np

cudf.__version__, xgb.__version__

('23.04.00', '1.7.1')

In [3]:
# Root directory of the competition data (train.parquet, train_labels.csv).
# Export AMEX_DATA_DIR before starting the kernel to point at another location;
# without it the original hard-coded path is used.
PATH = os.environ.get('AMEX_DATA_DIR', '/raid/data/ml/kaggle/amex')

# Data preprocessing

In [4]:
%%time
# `train` holds the feature rows read from parquet; `trainl` holds the labels
# (columns customer_ID and target, as the output below shows).
train = cudf.read_parquet(f'{PATH}/train.parquet')
trainl = cudf.read_csv(f'{PATH}/train_labels.csv')
print(trainl.shape)
trainl.head()

(458913, 2)
CPU times: user 1.23 s, sys: 1.35 s, total: 2.59 s
Wall time: 2.58 s


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [5]:
trainl['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int32

In [6]:
%%time
# Attach each customer's target to every one of that customer's rows.
# The head displayed below starts at a different customer than the label file
# does, so do not rely on the row order after this merge.
train = train.merge(trainl, on='customer_ID', how='left')
print(train.shape)
train.head()

(5531451, 191)
CPU times: user 38.5 ms, sys: 294 ms, total: 332 ms
Wall time: 482 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0044e52546003bf8dc7e0234a971910ea411d932148813...,2018-02-05,0.162061,91,0.104386,0.00224,1.00334,0.20222196,1.599497,0.153965,...,-1,-1,0,0,0.0,,0,0.007211,0,1
1,0044e52546003bf8dc7e0234a971910ea411d932148813...,2018-03-08,0.209617,31,0.109365,0.166125,1.256646,0.198744327,1.608763,0.156176,...,-1,-1,0,0,0.0,,0,0.001472,0,1
2,0044f11b5431d326feefaf34642a86e8a38d8b215522b2...,2017-03-25,0.913065,0,0.000341,0.818835,0.002579,,0.0,0.00556,...,-1,-1,0,0,0.0,,0,0.002861,0,0
3,0044f11b5431d326feefaf34642a86e8a38d8b215522b2...,2017-04-25,0.915826,0,0.000356,0.813991,0.007165,,0.0,0.00421,...,-1,-1,0,0,0.0,,0,0.000775,0,0
4,0044f11b5431d326feefaf34642a86e8a38d8b215522b2...,2017-05-26,0.914364,0,0.00156,1.00719,0.00082,,0.0,0.00857,...,-1,-1,0,0,0.0,,0,0.005072,0,0


In [7]:
%%time

# `cid` is an integer code per customer_ID; it drives the train/validation
# split below and is the grouping key inside preprocess().
train['cid'], _ = train.customer_ID.factorize()
# Parse S_2 as datetime; preprocess() sorts on it.
train['S_2'] = cudf.to_datetime(train['S_2'])

CPU times: user 354 ms, sys: 178 ms, total: 533 ms
Wall time: 525 ms


In [8]:
# Hold out every customer whose cid is a multiple of 4 for validation. All rows
# of a customer share one cid, so a customer never lands in both splits.
mask = train['cid']%4 == 0
tr,va = train.loc[~mask],train.loc[mask]
print("Verify target distribution is consistent across tr and va")
print(tr['target'].mean(), va['target'].mean())

Verify target distribution is consistent across tr and va
0.2493462806763533 0.24835027876095958


### Utility Functions

In [9]:
def get_cat_cols():
    """Return the names of the columns treated as categorical, in a fixed order.

    A new list is built on every call, so callers may mutate the result.
    """
    cat_cols = 'B_30 B_38 D_114 D_116 D_117 D_120 D_126 D_63 D_64 D_66 D_68'
    return cat_cols.split()

def preprocess(df):
    """Collapse a multi-row-per-customer frame to one row per `cid`.

    For every `cid` the row with the greatest `S_2` is kept. The result is
    ordered by `cid` and carries a fresh 0..n-1 index. The input frame is not
    modified.
    """
    return (
        df.sort_values(['cid', 'S_2'])           # latest S_2 ends up last within each cid
          .drop_duplicates('cid', keep='last')   # keep that last row only
          .sort_values('cid')
          .reset_index(drop=True)
    )

In [10]:
%%time

# Reduce the training split to one row per customer (the row with the latest S_2).
tr = preprocess(tr)
print(tr.shape)
tr.head()

(344184, 192)
CPU times: user 207 ms, sys: 740 ms, total: 947 ms
Wall time: 1.01 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target,cid
0,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,2018-03-25,0.880519,6,0.034684,1.004028,0.006911,0.165509477,0.0,0.005068,...,-1,0,0,0.0,,0,0.003169,0,0,1
1,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2018-03-12,0.880875,0,0.004284,0.812649,0.00645,,0.0,0.007196,...,-1,0,0,0.0,,0,0.000834,0,0,2
2,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,2018-03-29,0.621776,0,0.012564,1.006183,0.007829,0.287765533,0.0,0.009937,...,-1,0,0,0.0,,0,0.00556,0,0,3
3,000084e5023181993c2e1b665ac88dbb1ce9ef621ec537...,2018-03-19,0.824061,0,0.007853,1.001713,0.006885,0.395739734,0.0,0.006134,...,-1,0,0,0.0,,0,0.006943,0,0,5
4,000098081fde4fd64bc4d503a5d6f86a0aedc425c96f52...,2018-03-12,0.477116,0,0.009413,1.009217,0.007775,0.267036825,0.0,0.125927,...,-1,0,0,0.0,,0,0.003703,0,0,6


In [11]:
%%time

# Same reduction for the validation split: one row per customer, latest S_2.
va = preprocess(va)
print(va.shape)
va.head()

(114729, 192)
CPU times: user 76.4 ms, sys: 453 ms, total: 529 ms
Wall time: 536 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target,cid
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2018-03-13,0.934745,0,0.009382,1.007647,0.006104,0.135021254,0.0,0.007174,...,-1,0,0,0.0,,0,0.00297,0,0,0
1,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,2018-03-30,0.8719,0,0.007679,0.815746,0.001247,,0.0,0.005528,...,-1,0,0,0.0,,0,0.006944,0,0,4
2,0000f99513770170a1aba690daeeb8a96da4a39f11fc27...,2018-03-01,0.424624,18,0.979303,0.029291,0.0085,0.152607679,0.0,1.155846,...,-1,1,0,0.876028,0.184614226,1,0.00335,8,1,8
3,0001812036f1558332e5c0880ecbad70b13a6f28ab04a8...,2018-03-10,0.424076,8,0.917384,0.029441,0.257114,0.153415367,0.0,0.972654,...,-1,0,0,0.0,,0,0.009148,0,1,12
4,0002d381bdd8048d76719042cf1eb63caf53b636f8aacd...,2018-03-19,1.004771,0,0.009469,0.810357,0.009299,0.16986692,0.0,0.008108,...,-1,0,0,0.0,,0,0.003928,0,0,16


In [12]:
# Columns kept out of the feature matrix: the id code, the label, the date,
# every string (object) column, and the categorical columns.
not_used = [i for i in tr.columns if i in ['cid','target','S_2'] or tr[i].dtype=='O']
not_used += get_cat_cols()

# Cast each frame in a single call instead of assigning column by column.
X_train = tr.drop(not_used,axis=1).astype('float32')
y_train = tr['target']

X_test = va.drop(not_used,axis=1).astype('float32')
y_test = va['target']

# Both matrices must expose the same features in the same order.
assert list(X_train.columns) == list(X_test.columns)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((344184, 177), (344184,), (114729, 177), (114729,))

In [13]:
X_train.columns

Index(['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42',
       'D_43',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=177)

In [14]:
# Release the large intermediate frames; only X_train/X_test/y_train/y_test
# are used from here on.
del train,tr,va
gc.collect()

1209

# Train the 1st xgboost model with given features only

In [15]:
def get_xgb_model(max_depth=7, num_trees=1000, early_stopping_rounds=10):
    """Build an unfitted GPU XGBoost classifier that early-stops on the AMEX metric.

    Args:
        max_depth: Maximum depth of each tree.
        num_trees: Upper bound on the number of boosting rounds
            (`n_estimators`); early stopping may end training sooner.
        early_stopping_rounds: Stop once `amex_metric_np` on the first
            `eval_set` entry has not improved for this many rounds.

    Returns:
        A new `xgb.XGBClassifier`. A fresh `EarlyStopping` callback is created
        on every call, so each returned model owns its own callback.
    """
    # 'validation_0' is the name under which the first eval_set entry is logged;
    # the metric is maximised because larger is better.
    early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
                                            maximize=True,
                                            metric_name='amex_metric_np',
                                            data_name='validation_0')
    return xgb.XGBClassifier(
        tree_method='gpu_hist',
        enable_categorical=False,
        use_label_encoder=False,
        predictor='gpu_predictor',
        objective='binary:logistic',
        max_depth=max_depth,
        n_estimators=num_trees,
        min_child_weight=50,
        eval_metric=amex_metric_np,
        callbacks=[early_stop],
    )

In [16]:
# Baseline: fit on the given numerical features only. The validation split is
# the single eval_set entry, so it is what early stopping monitors.
model = get_xgb_model()
model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=100
    )
# Displayed as the cell output: the score recorded by early stopping.
model.best_score



[0]	validation_0-logloss:0.52306	validation_0-amex_metric_np:0.71127
[53]	validation_0-logloss:0.22668	validation_0-amex_metric_np:0.77974


0.779758

### The evaluation metric for this competition is the mean of two measures of rank ordering: the Normalized Gini Coefficient and the default rate captured at 4%. The larger the metric, the better the model is at predicting future defaults. See the [description](https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation) and [analysis](https://www.kaggle.com/competitions/amex-default-prediction/overview/evaluation) to learn more about this metric. For now, all we care about is that a **larger metric is better!**

In [17]:
# Drop the baseline model before the second model is built and trained.
del model
gc.collect()

136

# Add RNN features and train the xgboost again

In [18]:
%%time
# Features produced outside this notebook and read from the working directory.
# The output below shows shape (458913, 13, 177); the first axis has the same
# length as the label file. Presumably the axes are (customer, step, feature);
# confirm against the code that wrote the file.
rnn_feas = np.load('rnn_feas.npy')
rnn_feas.shape

CPU times: user 4.09 ms, sys: 1.31 s, total: 1.31 s
Wall time: 1.31 s


(458913, 13, 177)

In [19]:
# Same "every 4th customer" rule as the tabular split, but applied by row position.
# NOTE(review): this lines up with X_train / X_test only if row i of rnn_feas
# belongs to the customer whose cid is i. Confirm against whatever wrote rnn_feas.npy.
mask = np.arange(rnn_feas.shape[0])%4==0
# Cheap guard: the row counts must at least match the tabular split. This cannot
# detect rows that are present but in a different order.
assert int(mask.sum()) == len(X_test) and int((~mask).sum()) == len(X_train), \
    'rnn_feas row count does not match the tabular train/validation split'

In [20]:
# Split the RNN features along the first axis with the positional mask.
tr_rnn = rnn_feas[~mask]
va_rnn = rnn_feas[mask]
tr_rnn.shape, va_rnn.shape

((344184, 13, 177), (114729, 13, 177))

For simplicity, we use only the last generated profile as the new features.

In [21]:
# Keep only the last slice along axis 1, turning (rows, 13, 177) into (rows, 177).
tr_rnn = tr_rnn[:,-1,:]
va_rnn = va_rnn[:,-1,:]
tr_rnn.shape, va_rnn.shape

((344184, 177), (114729, 177))

In [22]:
# Both frames share one list of column names; the names are rnn_0 .. rnn_{k-1}.
rnn_cols = [f'rnn_{i}' for i in range(tr_rnn.shape[1])]
# The validation array must have the same width, otherwise the shared names
# would be wrong for it.
assert va_rnn.shape[1] == tr_rnn.shape[1], 'train/validation RNN feature widths differ'
tr_rnn_df = cudf.DataFrame(tr_rnn,columns=rnn_cols)
va_rnn_df = cudf.DataFrame(va_rnn,columns=rnn_cols)
tr_rnn_df.shape, va_rnn_df.shape

((344184, 177), (114729, 177))

In [23]:
# Column-wise concat. It relies on both sides having a 0..n-1 index: preprocess()
# resets the index of the tabular frames and the RNN frames are built from arrays.
# NOTE: this cell rebinds X_train / X_test, so running it twice appends the rnn_*
# columns again. Re-run from the feature-construction cell instead.
X_train = cudf.concat([X_train,tr_rnn_df],axis=1)
X_test = cudf.concat([X_test,va_rnn_df],axis=1)

In [24]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((344184, 354), (344184,), (114729, 354), (114729,))

In [25]:
# Second model: same hyper-parameters, now fitted on the numerical features plus
# the rnn_* columns. This is the model saved for Triton at the end of the notebook.
model = get_xgb_model()
model.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test)],
        verbose=100
    )
# Displayed as the cell output: the score recorded by early stopping.
model.best_score



[0]	validation_0-logloss:0.51646	validation_0-amex_metric_np:0.74595
[49]	validation_0-logloss:0.22555	validation_0-amex_metric_np:0.78227


0.78302

### We got an improvement of about 0.003 (0.7798 → 0.7830) by adding the future profile features! That's a significant improvement for default detection! It could move the rank up by hundreds of places in the [competition](https://www.kaggle.com/competitions/amex-default-prediction/leaderboard)!!

# Save the model and write config.pbtxt for triton inference

In [31]:
from pathlib import Path

model_dir = 'amex_xgb'
# Sub-directory `1` is where the model file is saved at the end of the notebook.
Path(f'{model_dir}/1').mkdir(parents=True, exist_ok=True)
features = X_test.shape[1]
# Memory budget used to derive the largest batch size.
MAX_MEMORY_BYTES = 60_000_000
num_classes = y_test.unique().shape[0]
# One sample is `features` inputs plus `num_classes` outputs at 4 bytes each
# (the generated config declares both tensors as TYPE_FP32).
bytes_per_sample = (features + num_classes) * 4
max_batch_size = MAX_MEMORY_BYTES // bytes_per_sample

In [32]:
def generate_config(model_dir, max_batch_size, features, deployment_type='gpu',
                    storage_type='AUTO', num_classes=2):
    """Write a Triton `config.pbtxt` for an XGBoost model served by the FIL backend.

    Args:
        model_dir: Directory that receives `config.pbtxt`; created if missing.
        max_batch_size: Value written to `max_batch_size`.
        features: Number of input features (size of the `input__0` tensor).
        deployment_type: 'cpu' (case-insensitive) selects KIND_CPU; any other
            value selects KIND_GPU.
        storage_type: Value written to the `storage_type` parameter.
        num_classes: Size of the `output__0` tensor. It used to be read from a
            notebook global; it is now an explicit argument that defaults to 2
            (binary classification).

    Returns:
        Path of the written config file.
    """
    if deployment_type.lower() == 'cpu':
        instance_kind = 'KIND_CPU'
    else:
        instance_kind = 'KIND_GPU'

    config_text = f"""backend: "fil"
max_batch_size: {max_batch_size}
input [
 {{
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ {features} ]
  }}
]
output [
 {{
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ {num_classes} ]
  }}
]
instance_group [{{ kind: {instance_kind} }}]
parameters [
  {{
    key: "model_type"
    value: {{ string_value: "xgboost" }}
  }},
  {{
    key: "predict_proba"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "output_class"
    value: {{ string_value: "true" }}
  }},
  {{
    key: "threshold"
    value: {{ string_value: "0.5" }}
  }},
  {{
    key: "storage_type"
    value: {{ string_value: "{storage_type}" }}
  }}
]

dynamic_batching {{
  max_queue_delay_microseconds: 100
}}"""
    # Do not depend on an earlier cell having created the directory.
    os.makedirs(model_dir, exist_ok=True)
    config_path = os.path.join(model_dir, 'config.pbtxt')
    with open(config_path, 'w') as file_:
        file_.write(config_text)

    return config_path

In [33]:
# Uses the defaults: GPU instance kind and storage_type 'AUTO'.
generate_config(model_dir, max_batch_size, features)

'amex_xgb/config.pbtxt'

In [34]:
# Save into sub-directory `1` created earlier.
# NOTE(review): the file name and on-disk format must be what the Triton FIL
# backend expects for model_type "xgboost". Confirm against the FIL backend docs.
model.save_model(f'{model_dir}/1/xgboost.model')

