In [1]:
import os

In [2]:
import cudf
import cupy
from tqdm import tqdm
import numpy as np
import gc
import xgboost as xgb
from utils import amex_metric_np

cudf.__version__, xgb.__version__

('22.12.01', '1.7.3')

### Please register kaggle and install kaggle API by: 
- `pip install kaggle`
- complete [authentication](https://www.kaggle.com/docs/api)

In [3]:
PATH = '/raid/data/ml/kaggle/amex'

In [4]:
cmd = f'kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p {PATH}'

In [5]:
os.system(cmd)

amex-data-integer-dtypes-parquet-format.zip: Skipping, found more recently modified local copy (use --force to force download)


0

In [6]:
os.listdir(PATH)

['test.parquet',
 'train_labels.csv.zip',
 'train_labels.csv',
 'amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet']

In [7]:
cmd = f'cd {PATH} && unzip amex-data-integer-dtypes-parquet-format.zip'
os.system(cmd)

Archive:  amex-data-integer-dtypes-parquet-format.zip


replace test.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


256

In [8]:
os.listdir(PATH)

['test.parquet',
 'train_labels.csv.zip',
 'train_labels.csv',
 'amex-data-integer-dtypes-parquet-format.zip',
 'train.parquet']

### Basic EDA

In [9]:
%%time
train = cudf.read_parquet(f'{PATH}/train.parquet')
print(train.shape)
train.head()

(5531451, 190)
CPU times: user 1.25 s, sys: 780 ms, total: 2.03 s
Wall time: 2.03 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [10]:
%%time
count_df = train.groupby('customer_ID').size().to_frame('num_profiles')
count_df.head()

CPU times: user 7.67 ms, sys: 5.01 ms, total: 12.7 ms
Wall time: 10.8 ms


Unnamed: 0_level_0,num_profiles
customer_ID,Unnamed: 1_level_1
c761f5f5b15e563daa67f0a41c3ec2a870d3c9daaadf0cd11dd808d3aaa82c46,13
e16b5594d9dce9ebd2f8e0d7074391736b2641afa9e349f67a53f7cc780c120b,13
8c846c26e1f1d4afa04977155c41bc3b6bb77c72efc5db3f592ec3d72f12cfdc,13
463e8a9b5b0161764bbbb0b5b58956bb8ebff6244219b21ac257a07364fa8dd9,13
92bbe3e2a159bcc838b86241471eb14153c8d712b6647feffbe49d5266cdfd3f,13


In [11]:
count_df.num_profiles.max()

13

In [12]:
%%time
train['S_2'] = cudf.to_datetime(train['S_2'])
train.head()

CPU times: user 11.3 ms, sys: 3.48 ms, total: 14.7 ms
Wall time: 13.9 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,,0,0.00061,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,,0,0.005492,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,,0,0.006986,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,,0,0.006527,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,,0,0.008126,0


In [13]:
train.S_2.min(), train.S_2.max()

(numpy.datetime64('2017-03-01T00:00:00.000000000'),
 numpy.datetime64('2018-03-31T00:00:00.000000000'))

#### Download the training data labels

In [14]:
cmd = f'kaggle competitions download -c amex-default-prediction -f train_labels.csv -p {PATH}/'
os.system(cmd)

train_labels.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


0

In [15]:
cmd = f'cd {PATH} && unzip train_labels.csv.zip'
os.system(cmd)

Archive:  train_labels.csv.zip


replace train_labels.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename:  NULL
(EOF or read error, treating as "[N]one" ...)


256

In [16]:
%%time
trainl = cudf.read_csv(f'{PATH}/train_labels.csv')
print(trainl.shape)
trainl.head()

(458913, 2)
CPU times: user 12.2 ms, sys: 12.1 ms, total: 24.3 ms
Wall time: 22 ms


Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [17]:
trainl['target'].value_counts()

0    340085
1    118828
Name: target, dtype: int32

In [18]:
%%time
train = train.merge(trainl, on='customer_ID', how='left')
print(train.shape)
train.head()

(5531451, 191)
CPU times: user 63 ms, sys: 56.6 ms, total: 120 ms
Wall time: 119 ms


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5...,2017-09-16,0.338559,0,0.786239,0.02018,0.257668,0.265720874,0.306263,1.132187,...,-1,-1,0,0,0.0,,0,0.006487,0,1
1,00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5...,2017-10-21,0.370168,1,0.753035,0.038276,0.257267,,0.381708,1.158877,...,-1,-1,0,0,0.0,,0,0.0089,0,1
2,00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5...,2017-11-20,0.378734,1,0.740267,0.028689,0.258875,,0.354888,1.17771,...,-1,-1,0,0,0.0,,0,0.009689,0,1
3,00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5...,2017-12-21,0.366894,1,0.742525,0.028862,0.252372,,0.328623,1.166416,...,-1,-1,0,0,0.0,,0,0.003046,0,1
4,00597bc3d552264d841bd1a52cfaf3ebe40755f96d85a5...,2018-01-25,0.363433,6,0.754621,0.026783,0.250846,,0.0,1.169429,...,-1,-1,0,0,0.0,,0,0.007394,0,1


In [19]:
train['cid'], _ = train.customer_ID.factorize()

In [20]:
mask = train['cid']%4 == 0
tr,va = train.loc[~mask],train.loc[mask]
print("Verify target distribution is consistent across tr and va")
print(tr['target'].mean(), va['target'].mean())

Verify target distribution is consistent across tr and va
0.2493462806763533 0.24835027876095958
