In [67]:
import h2o
from h2o.estimators.deepwater import H2ODeepWaterEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

import os.path

# Directory that holds the credit-card default dataset, with "~" expanded to
# the current user's home directory.
PATH=os.path.expanduser("~/CC_default/")

# Connect to the local H2O cluster; nthreads=-1 asks H2O to use all cores.
h2o.init(nthreads=-1)

# The rest of the notebook trains a Deep Water model, so fail fast with a clear
# message if that backend is missing. A bare `exit` (without parentheses) is
# only a name lookup and would silently let execution continue.
if not H2ODeepWaterEstimator.available():
    raise RuntimeError("H2O Deep Water is not available on this H2O cluster")

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,02 secs
H2O cluster version:,3.11.0.99999
H2O cluster version age:,7 hours and 6 minutes
H2O cluster name:,arno
H2O cluster total nodes:,1
H2O cluster free memory:,13.96 Gb
H2O cluster total cores:,12
H2O cluster allowed cores:,12
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://localhost:54321


In [68]:
# Parse the raw CSV into an H2OFrame, preview the first rows and report
# the frame dimensions as the cell result.
csv_path = os.path.join(PATH, "default of credit card clients.csv")
df = h2o.import_file(csv_path)
df.show()
df.dim

Parse progress: |█████████████████████████████████████████████████████████████████████████████| 100%


ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0
6,50000,1,1,2,37,0,0,0,0,0,0,64400,57069,57608,19394,19619,20024,2500,1815,657,1000,1000,800,0
7,500000,1,1,2,29,0,0,0,0,0,0,367965,412023,445007,542653,483003,473944,55000,40000,38000,20239,13750,13770,0
8,100000,2,2,2,23,0,-1,-1,0,0,-1,11876,380,601,221,-159,567,380,601,0,581,1687,1542,0
9,140000,2,3,1,28,0,0,2,0,0,0,11285,14096,12108,12211,11793,3719,3329,0,432,1000,1000,1000,0
10,20000,1,3,2,35,-2,-2,-2,-2,-1,-1,0,0,0,0,13007,13912,0,0,0,13007,1122,0,0


[30000, 25]

In [75]:
# Name of the target column (1 = default next month, per the dataset header).
response = "default payment next month"

# Convert the integer-coded categorical columns, and the response itself,
# to factors so they are handled as categories rather than numbers.
for col in ['SEX', 'EDUCATION', 'MARRIAGE', response]:
    df[col] = df[col].asfactor()

# Every column except the target and the row identifier is a predictor.
# Iterating over df.names keeps the original column order, so the predictor
# list is identical on every run (a set difference would yield an order that
# changes between kernel sessions because of string hash randomization).
predictors = [name for name in df.names if name not in (response, 'ID')]

In [189]:
# Draw a seeded uniform random column and use it to cut the frame into three
# disjoint partitions.
r = df.runif(seed=42)
in_train = r < 0.8
in_valid = (r >= 0.8) & (r < 0.9)
in_test = r >= 0.9

train = df[in_train]   ## 80% for training
valid = df[in_valid]   ## 10% for early stopping (only enabled by default for Deep Water)
test = df[in_test]     ## 10% for final testing

# Report [rows, columns] of each partition, in train / valid / test order.
for part in (train, valid, test):
    print(part.dim)

[24087, 25]
[2871, 25]
[3042, 25]


In [190]:
# Fit a Deep Water model with default hyper-parameters; `valid` is supplied as
# the validation frame. The cell result is the AUC on that validation frame.
deepwater = H2ODeepWaterEstimator()
deepwater.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)
deepwater_valid_perf = deepwater.model_performance(valid=True)
deepwater_valid_perf.auc()

deepwater Model Build progress: |█████████████████████████████████████████████████████████████| 100%


0.7046546156950931

In [191]:
# Fit a gradient boosting model with default hyper-parameters on the same
# train/validation frames. The cell result is the validation AUC.
gbm = H2OGradientBoostingEstimator()
gbm.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)
gbm_valid_perf = gbm.model_performance(valid=True)
gbm_valid_perf.auc()

gbm Model Build progress: |███████████████████████████████████████████████████████████████████| 100%


0.7792643285475408

In [192]:
# Fit a random forest with default hyper-parameters on the same
# train/validation frames. The cell result is the validation AUC.
drf = H2ORandomForestEstimator()
drf.train(
    x=predictors,
    y=response,
    training_frame=train,
    validation_frame=valid,
)
drf_valid_perf = drf.model_performance(valid=True)
drf_valid_perf.auc()

drf Model Build progress: |███████████████████████████████████████████████████████████████████| 100%


0.7693493051951796

In [193]:
# Score the held-out test frame with the Deep Water model and display the
# resulting prediction frame (predicted label plus per-class probabilities).
pdeepwater = deepwater.predict(test_data=test)
pdeepwater

deepwater prediction progress: |██████████████████████████████████████████████████████████████| 100%


predict,p0,p1
1,0.68773,0.31227
0,0.958576,0.0414235
1,0.0706593,0.929341
0,0.920109,0.0798908
0,0.978373,0.0216272
1,0.40178,0.59822
1,0.713629,0.286371
0,1.0,4.068e-09
0,0.852017,0.147982
1,0.194967,0.805033




In [194]:
# Score the held-out test frame with the GBM model and display the
# resulting prediction frame (predicted label plus per-class probabilities).
pgbm = gbm.predict(test_data=test)
pgbm

gbm prediction progress: |████████████████████████████████████████████████████████████████████| 100%


predict,p0,p1
0,0.804759,0.195241
0,0.939914,0.0600864
1,0.259953,0.740047
0,0.911463,0.0885374
0,0.77337,0.22663
1,0.347837,0.652163
1,0.629427,0.370573
0,0.950552,0.0494482
1,0.272054,0.727946
1,0.262104,0.737896




In [195]:
# Score the held-out test frame with the random forest and display the
# resulting prediction frame (predicted label plus per-class probabilities).
pdrf = drf.predict(test_data=test)
pdrf

drf prediction progress: |████████████████████████████████████████████████████████████████████| 100%


predict,p0,p1
0,0.797418,0.202582
0,0.974096,0.0259039
1,0.143146,0.856854
0,0.89147,0.10853
0,0.730526,0.269474
1,0.358218,0.641782
1,0.617384,0.382616
0,0.918169,0.0818311
1,0.208152,0.791848
1,0.226032,0.773968




In [196]:
# Test-set AUC for Deep Water, computed from the class-1 probability column
# (`p1`, the third column of the prediction frame) against the true labels.
deepwater_test_metrics = h2o.make_metrics(
    predicted=pdeepwater["p1"],
    actual=test[response],
)
deepwater_test_metrics.auc()

0.6933370886729568

In [197]:
# Test-set AUC for GBM, computed from the class-1 probability column
# (`p1`, the third column of the prediction frame) against the true labels.
gbm_test_metrics = h2o.make_metrics(
    predicted=pgbm["p1"],
    actual=test[response],
)
gbm_test_metrics.auc()

0.7701031153671627

In [198]:
# Test-set AUC for the random forest, computed from the class-1 probability
# column (`p1`, the third column of the prediction frame) against the true labels.
drf_test_metrics = h2o.make_metrics(
    predicted=pdrf["p1"],
    actual=test[response],
)
drf_test_metrics.auc()

0.7629366755855983

In [205]:
# Test-set AUC of an equal-weight blend of the GBM and DRF class-1
# probabilities. The `p1` column is selected *before* averaging so that no
# arithmetic is performed on the categorical `predict` column or on `p0`.
blend_gbm_drf_p1 = 0.5 * (pgbm["p1"] + pdrf["p1"])
h2o.make_metrics(actual=test[response], predicted=blend_gbm_drf_p1).auc()

0.7710629175859425

In [206]:
# Test-set AUC of a weighted blend of the three models' class-1 probabilities
# (GBM 0.5, DRF 0.4, Deep Water 0.1). The `p1` column is selected *before*
# blending so that no arithmetic is performed on the categorical `predict`
# column or on `p0`; the arithmetic form itself is kept unchanged.
# NOTE(review): the weights look hand-picked; if they were tuned against
# `test`, this AUC is optimistic. Consider choosing them on `valid` instead.
blend_all_p1 = 0.1 * (5 * pgbm["p1"] + 4 * pdrf["p1"] + 1 * pdeepwater["p1"])
h2o.make_metrics(actual=test[response], predicted=blend_all_p1).auc()

0.7724514544117417

[Inspect model convergence in Flow](http://localhost:54321/flow/index.html)