In [1]:
import os

In [2]:
import cudf
import cupy
from tqdm import tqdm
import numpy as np
import gc
import xgboost as xgb
from utils import amex_metric_np
from pathlib import Path

# Display the cuDF and XGBoost versions this notebook is running against.
cudf.__version__, xgb.__version__

('23.04.00', '1.7.1')

# Please register on Kaggle and install the Kaggle API:
- run `pip install kaggle`
- complete the [authentication](https://www.kaggle.com/docs/api) setup

In [3]:
# Directory that holds the downloaded competition files.
# Set the AMEX_DATA_DIR environment variable to use another location;
# when it is unset the original default directory is used.
PATH = os.environ.get('AMEX_DATA_DIR', '/raid/data/ml/kaggle/amex')

In [5]:
# Create the data directory, including missing parents; do nothing if it already exists.
os.makedirs(PATH, exist_ok=True)

In [6]:
# Kaggle CLI command that downloads the raddar/amex-data-integer-dtypes-parquet-format
# dataset into PATH. It is only built here; the next cell runs it.
cmd = f'kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p {PATH}'

In [7]:
# os.system only hands back the shell's status, so a failed download would
# otherwise go unnoticed until a later cell cannot find the file.
download_status = os.system(cmd)
if download_status != 0:
    raise RuntimeError(f'Command failed with status {download_status}: {cmd}')
# Keep the status as the cell's displayed value (0 on success).
download_status

Downloading amex-data-integer-dtypes-parquet-format.zip to /raid/data/ml/kaggle/amex


100%|█████████▉| 4.06G/4.07G [01:13<00:00, 63.0MB/s]




100%|██████████| 4.07G/4.07G [01:13<00:00, 59.3MB/s]


0

In [8]:
# Show the contents of the data directory to confirm the archive was downloaded.
os.listdir(PATH)

['amex-data-integer-dtypes-parquet-format.zip']

In [9]:
# Extract the dataset archive inside PATH.
# -o overwrites existing files without asking, so re-running this cell cannot
# stall or abort on unzip's interactive "replace?" question.
cmd = f'cd {PATH} && unzip -o amex-data-integer-dtypes-parquet-format.zip'
unzip_status = os.system(cmd)
# Fail here rather than in a later cell when the parquet files are missing.
if unzip_status != 0:
    raise RuntimeError(f'Command failed with status {unzip_status}: {cmd}')
# Keep the status as the cell's displayed value (0 on success).
unzip_status

Archive:  amex-data-integer-dtypes-parquet-format.zip
  inflating: test.parquet            
  inflating: train.parquet           


0

In [10]:
# Show the contents of the data directory again to confirm the parquet files were extracted.
os.listdir(PATH)

['amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet',
 'test.parquet']

# Basic EDA

In [11]:
%%time
# Read the training parquet file with cuDF, print its (rows, columns) shape
# and display the first rows.
train = cudf.read_parquet(f'{PATH}/train.parquet')
print(train.shape)
train.head()

(5531451, 190)
CPU times: user 1.26 s, sys: 1.31 s, total: 2.57 s
Wall time: 2.6 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [12]:
%%time
count_df = train.groupby('customer_ID').size().to_frame('num_profiles')
count_df.head()

CPU times: user 5.63 ms, sys: 9.56 ms, total: 15.2 ms
Wall time: 14.1 ms


Unnamed: 0_level_0,num_profiles
customer_ID,Unnamed: 1_level_1
c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd11dd808d3aaa82c46,13
e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f67a53f7cc780c120b,13
8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3f592ec3d72f12cfdc,13
463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21ac257a07364fa8dd9,13
92bbe3e2a159bcc838b86241471eb14153c8d712b6647feffbe49d5266cdfd3f,13


In [13]:
# Largest number of profiles any single customer has.
count_df['num_profiles'].max()

13

In [14]:
%%time
# Convert the S_2 column to datetime values, overwriting the column in place,
# then display the first rows.
train['S_2'] = cudf.to_datetime(train['S_2'])
train.head()

CPU times: user 12.6 ms, sys: 4.63 ms, total: 17.2 ms
Wall time: 15.7 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [15]:
# Earliest and latest S_2 dates present in the training data.
train['S_2'].min(), train['S_2'].max()

(numpy.datetime64('2017-03-01T00:00:00.000000000'),
 numpy.datetime64('2018-03-31T00:00:00.000000000'))

## Download the training data labels

In [16]:
# Download only train_labels.csv from the amex-default-prediction competition into PATH.
cmd = f'kaggle competitions download -c amex-default-prediction -f train_labels.csv -p {PATH}/'
labels_download_status = os.system(cmd)
# os.system only returns the status; raise so a failed download cannot pass silently.
if labels_download_status != 0:
    raise RuntimeError(f'Command failed with status {labels_download_status}: {cmd}')
# Keep the status as the cell's displayed value (0 on success).
labels_download_status

Downloading train_labels.csv.zip to /raid/data/ml/kaggle/amex


 31%|███       | 5.00M/16.2M [00:00<00:00, 29.6MB/s]




100%|██████████| 16.2M/16.2M [00:00<00:00, 21.8MB/s]


0

In [17]:
# Extract the labels archive inside PATH.
# -o overwrites an existing train_labels.csv without asking, so re-running
# this cell cannot stall or abort on unzip's interactive "replace?" question.
cmd = f'cd {PATH} && unzip -o train_labels.csv.zip'
labels_unzip_status = os.system(cmd)
# Fail here rather than in the next cell when the CSV is missing.
if labels_unzip_status != 0:
    raise RuntimeError(f'Command failed with status {labels_unzip_status}: {cmd}')
# Keep the status as the cell's displayed value (0 on success).
labels_unzip_status

Archive:  train_labels.csv.zip
  inflating: train_labels.csv        


0

In [18]:
%%time
# Read the labels CSV with cuDF, print its (rows, columns) shape and display
# the first rows (columns: customer_ID, target).
trainl = cudf.read_csv(f'{PATH}/train_labels.csv')
print(trainl.shape)
trainl.head()

(458913, 2)
CPU times: user 19.3 ms, sys: 11.2 ms, total: 30.5 ms
Wall time: 34.7 ms


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [19]:
# Class balance of the labels: how many customers fall under each target value.
trainl.target.value_counts()

0    340085
1    118828
Name: target, dtype: int32

In [20]:
%%time
train = train.merge(trainl, on='customer_ID', how='left')
print(train.shape)
train.head()

(5531451, 191)
CPU times: user 49 ms, sys: 157 ms, total: 206 ms
Wall time: 205 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0003b7891c49786441d138c01f55f1712df645989dcd2a...,2017-08-20,0.951668,0,0.021847,0.705988,0.001923,0.544475,0.0,0.04075,...,-1,-1,0,0,0.0,,0,0.002867,0,0
1,0003b7891c49786441d138c01f55f1712df645989dcd2a...,2017-09-27,0.949354,0,0.021467,0.519889,0.002993,0.505413,0.0,0.045911,...,-1,-1,0,0,0.0,,0,0.004061,0,0
2,0003b7891c49786441d138c01f55f1712df645989dcd2a...,2017-10-22,0.928322,0,0.017777,1.002783,0.007028,0.499947,0.0,0.027716,...,-1,-1,0,0,0.0,,0,0.005397,0,0
3,0003b7891c49786441d138c01f55f1712df645989dcd2a...,2017-11-01,0.929362,3,0.020861,0.811287,0.005151,0.50569,0.0,0.023649,...,-1,-1,0,0,0.0,,0,0.00278,0,0
4,0003b7891c49786441d138c01f55f1712df645989dcd2a...,2017-12-04,0.948366,6,0.03701,1.004685,0.006001,0.549692,0.0,0.019136,...,-1,-1,0,0,0.0,,0,0.005114,0,0


In [21]:
# Map every customer_ID string to an integer code and store it as `cid`.
# factorize() returns (codes, uniques); index the codes instead of unpacking
# into `_`, because assigning to `_` shadows IPython's last-output variable
# for the rest of the session.
train['cid'] = train.customer_ID.factorize()[0]

In [22]:
# Hold out every customer whose integer id is a multiple of 4 for validation.
# Splitting on cid keeps all rows of a customer on the same side.
mask = (train['cid'] % 4) == 0
va = train.loc[mask]
tr = train.loc[~mask]
# The two target means should be close if the split is balanced.
print("Verify target distribution is consistent across tr and va")
print(tr['target'].mean(), va['target'].mean())

Verify target distribution is consistent across tr and va
0.2493462806763533 0.24835027876095958
