In [12]:
# import h2o package and specific estimator 
import h2o
from h2o.automl import H2OAutoML

In [2]:
h2o.init() # start h2o

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpnap_or86
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpnap_or86/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpnap_or86/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster version:,3.12.0.1
H2O cluster version age:,29 days
H2O cluster name:,H2O_from_python_phall_psfin7
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [3]:
# location of clean data file
path = 'https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv'

In [4]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [5]:
frame = h2o.import_file(path=path, col_types=col_types) # import from url

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
frame.describe() # summarize table

Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,enum,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,,0.0,
mean,13074.169141456332,,13.715904065566189,5.684352932995338,,71915.67051974905,,,15.881530121290167,0.22735700606252723,54.07917280242262,24.579733834274574,,14.854273655448333,
maxs,35000.0,,26.060000000000002,10.0,,7141778.0,,,39.99,29.0,150.70000000000002,118.0,,65.0,
sigma,7993.556188734672,,4.391939870545809,3.610663731100238,,59070.91565491818,,,7.5876682241925355,0.6941679229284191,25.285366766770498,11.685190365910666,,6.947732922546689,
zeros,0,,0,14248,,0,,,270,139459,1562,0,,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.650000000000002,0.0,83.7,9.0,0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified


In [7]:
# split into training and test for cross validation
train, test = frame.split_frame([0.7])

In [8]:
# assign target and inputs for logistic regression
y = 'bad_loan'
X = [name for name in frame.columns if name != y]
print(y)
print(X)

bad_loan
['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status']


In [9]:
# determine column types
reals, enums = [], []
for key, val in frame.types.items():
    if key in X:
        if val == 'enum':
            enums.append(key)
        else: 
            reals.append(key)

print(enums)
print(reals)

['purpose', 'home_ownership', 'term', 'addr_state', 'verification_status']
['dti', 'annual_inc', 'loan_amnt', 'int_rate', 'emp_length', 'total_acc', 'longest_credit_length', 'revol_util', 'delinq_2yrs']


In [10]:
# impute missing values
_ = frame[reals].impute(method='mean')

In [11]:
# set target to factor for logisitic regression
# just to be safe ...
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [13]:
# automl
# runs for 300 seconds then builds a stacked ensemble
auto = H2OAutoML(max_runtime_secs=300) # init automl, run for 300 seconds
auto.train(x=X,  
           y=y,
           training_frame=train,   # training data split into 70/30 train/valid
           leaderboard_frame=test) 

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
# view leaderboard
lb = auto.leaderboard
lb

model_id,auc,logloss
StackedEnsemble_0_AutoML_20170706_175725,0.70527,0.440551
GLM_grid_0_AutoML_20170706_175725_model_0,0.701672,0.440898
GLM_grid_0_AutoML_20170706_175725_model_1,0.701672,0.440898
GBM_grid_0_AutoML_20170706_175725_model_0,0.700509,0.451737
XRT_0_AutoML_20170706_175725,0.688488,0.445656
DRF_0_AutoML_20170706_175725,0.683476,0.450929




In [16]:
# view best model 
best = auto.leader
best # must use predict(), no POJO/MOJO available yet for Stacked Ensemble

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_0_AutoML_20170706_175725
No model summary for this model


ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.09609371624935503
RMSE: 0.3099898647526319
LogLoss: 0.32303666631845357
Null degrees of freedom: 80242
Residual degrees of freedom: 80237
Null deviance: 76072.75489639521
Residual deviance: 51842.86243078334
AIC: 51854.86243078334
AUC: 0.9357226014693921
Gini: 0.8714452029387842
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2557261072855434: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,60029.0,5631.0,0.0858,(5631.0/65660.0)
1,3949.0,10634.0,0.2708,(3949.0/14583.0)
Total,63978.0,16265.0,0.1194,(9580.0/80243.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2557261,0.6894450,214.0
max f2,0.1802497,0.7825040,271.0
max f0point5,0.3482897,0.7380417,159.0
max accuracy,0.3181533,0.8936854,175.0
max precision,0.8380285,1.0,0.0
max recall,0.0939220,1.0,361.0
max specificity,0.8380285,1.0,0.0
max absolute_mcc,0.2784861,0.6184632,198.0
max min_per_class_accuracy,0.2092455,0.8489339,246.0


Gains/Lift Table: Avg response rate: 18.17 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100071,0.6533419,5.5025029,5.5025029,1.0,1.0,0.0550641,0.0550641,450.2502914,450.2502914
,2,0.0200017,0.5932918,5.4544761,5.4785045,0.9912718,0.9956386,0.0545155,0.1095796,445.4476081,447.8504459
,3,0.0300088,0.5459036,5.2421105,5.3996737,0.9526775,0.9813123,0.0524583,0.1620380,424.2110497,439.9673749
,4,0.0400035,0.5094850,5.0908443,5.3225145,0.9251870,0.9672897,0.0508812,0.2129192,409.0844342,432.2514501
,5,0.0500106,0.4794765,4.9474559,5.2474654,0.8991283,0.9536506,0.0495097,0.2624289,394.7455921,424.7465401
,6,0.1000087,0.3696594,4.1639080,4.7057542,0.7567298,0.8552025,0.2081876,0.4706165,316.3907988,370.5754206
,7,0.1500069,0.3007516,2.8938886,4.1018492,0.5259222,0.7454515,0.1446890,0.6153055,189.3888621,310.1849186
,8,0.2000050,0.2567565,2.1697307,3.6188497,0.3943170,0.6576734,0.1084825,0.7237880,116.9730710,261.8849664
,9,0.3000012,0.1981403,1.5395213,2.9257690,0.2797856,0.5317160,0.1539464,0.8777343,53.9521316,192.5769007




ModelMetricsBinomialGLM: stackedensemble
** Reported on validation data. **

MSE: 0.1386155064494514
RMSE: 0.37231103455236375
LogLoss: 0.44063435028299724
Null degrees of freedom: 34532
Residual degrees of freedom: 34527
Null deviance: 32853.05243414602
Residual deviance: 30432.852036645483
AIC: 30444.852036645483
AUC: 0.6955444324815334
Gini: 0.39108886496306683
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.16962774846672032: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,18679.0,9540.0,0.3381,(9540.0/28219.0)
1,2394.0,3920.0,0.3792,(2394.0/6314.0)
Total,21073.0,13460.0,0.3456,(11934.0/34533.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.1696277,0.3964802,270.0
max f2,0.1088220,0.5595105,343.0
max f0point5,0.2992587,0.3615403,166.0
max accuracy,0.6328676,0.8175368,20.0
max precision,0.7392779,0.75,1.0
max recall,0.0630169,1.0,398.0
max specificity,0.7609547,0.9999646,0.0
max absolute_mcc,0.2040986,0.2266158,237.0
max min_per_class_accuracy,0.1639268,0.6406398,276.0


Gains/Lift Table: Avg response rate: 18.28 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100194,0.5740896,2.7504445,2.7504445,0.5028902,0.5028902,0.0275578,0.0275578,175.0444466,175.0444466
,2,0.0200098,0.5170237,2.5206222,2.6356996,0.4608696,0.4819103,0.0251821,0.0527399,152.0622220,163.5699640
,3,0.0300003,0.4769825,2.3779455,2.5498645,0.4347826,0.4662162,0.0237567,0.0764967,137.7945490,154.9864522
,4,0.0400197,0.4439816,2.6872159,2.5842520,0.4913295,0.4725036,0.0269243,0.1034210,168.7215858,158.4252049
,5,0.0500101,0.4190918,2.2511217,2.5177031,0.4115942,0.4603358,0.0224897,0.1259107,125.1121731,151.7703144
,6,0.1000203,0.3389184,1.9571579,2.2374305,0.3578460,0.4090909,0.0978777,0.2237884,95.7157915,123.7430530
,7,0.1500014,0.2877967,1.7016225,2.0588968,0.3111240,0.3764479,0.0850491,0.3088375,70.1622523,105.8896819
,8,0.2000116,0.2510326,1.5486250,1.9313104,0.2831500,0.3531200,0.0774469,0.3862844,54.8624953,93.1310383
,9,0.3000029,0.1996482,1.3035659,1.7220824,0.2383435,0.3148649,0.1303453,0.5166297,30.3565890,72.2082417







In [15]:
# shutdown h2o ... be careful this can erase your work
h2o.cluster().shutdown(prompt=True)

Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:54321 (Y/N)? y
H2O session _sid_955e closed.
