In [1]:
# load the H2O R package, then connect to (or start) a local H2O cluster;
# nthreads = -1 allows H2O to use the maximum number of threads
library(h2o)
h2o.init(nthreads = -1)


----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


Attaching package: ‘h2o’

The following objects are masked from ‘package:stats’:

    cor, sd, var

The following objects are masked from ‘package:base’:

    &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
    colnames<-, ifelse, is.character, is.factor, is.numeric, log,
    log10, log1p, log2, round, signif, trunc



 Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         7 minutes 21 seconds 
    H2O cluster version:        3.10.4.3 
    H2O cluster version age:    8 days  
    H2O cluster name:           H2O_started_from_R_phall_vtf996 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.30 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    R Version:                  R version 3.3.2 (2016-10-31) 



In [2]:
# location (URL) of the clean loan data file, a CSV, passed to h2o.importFile below
path <- "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv"

In [3]:
# import the file at `path` into the H2O cluster
frame <- h2o.importFile(path)

# strings are automatically parsed as enums (categorical)
# numbers are automatically parsed as numeric
# bad_loan is parsed as numeric but is really a categorical class label,
# so convert it to a factor before it is used as the model target
frame$bad_loan <- as.factor(frame$bad_loan)



In [4]:
# find numeric columns with missing values and impute them with the column median
#   - non-numeric (enum) columns are skipped: "median" is not a valid
#     imputation method for categorical data
#   - h2o.impute fills the values in place on `frame`, so its return value
#     is intentionally not assigned
for (name in names(frame)) {
  if (!is.numeric(frame[name])) {
    next
  }
  if (any(is.na(frame[name]))) {
    h2o.impute(frame, column = name, method = "median")
  }
}

In [5]:
h2o.describe(frame) # summarize each column; check the Missing count for remaining missing values

Label,Type,Missing,Zeros,PosInf,NegInf,Min,Max,Mean,Sigma,Cardinality
loan_amnt,int,0,0,0,0,500.0,35000.0,13074.1691414563,7993.55618873467,
term,enum,0,129950,0,0,0.0,1.0,0.207559135785154,0.405560530549521,2.0
int_rate,real,0,0,0,0,5.42,26.06,13.7159040655662,4.39193987054581,
emp_length,int,0,14248,0,0,0.0,10.0,5.69552464524627,3.54667110307881,
home_ownership,enum,0,1,0,0,0.0,5.0,,,6.0
annual_inc,real,0,0,0,0,1896.0,7141778.0,71915.404262777,59070.2198125624,
purpose,enum,0,2842,0,0,0.0,13.0,,,14.0
addr_state,enum,0,413,0,0,0.0,49.0,,,50.0
dti,real,0,270,0,0,0.0,39.99,15.8815301212902,7.58766822419254,
delinq_2yrs,int,0,139488,0,0,0.0,29.0,0.227316799502399,0.694113124115403,


In [6]:
# target column and input (predictor) columns:
# every column of the frame except the target is used as an input
y <- "bad_loan"
X <- setdiff(names(frame), y)
print(y)
print(X)

[1] "bad_loan"
 [1] "loan_amnt"             "term"                  "int_rate"             
 [4] "emp_length"            "home_ownership"        "annual_inc"           
 [7] "purpose"               "addr_state"            "dti"                  
[10] "delinq_2yrs"           "revol_util"            "total_acc"            
[13] "longest_credit_length" "verification_status"  


In [7]:
# split into training (~70%) and held-out test (~30%) frames for validation;
# a fixed seed makes the random split, and every metric computed from it,
# reproducible across kernel restarts
split <- h2o.splitFrame(frame, ratios = 0.7, seed = 12345)
train <- split[[1]]
test <- split[[2]]

In [8]:
# elastic net regularized regression
#   - binomial family for logistic regression
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - standardization very important for penalized regression variable selection
#   - with lambda parameter tuning (lambda_search) for variable selection and regularization

# train on the training frame; the test frame is passed as the validation frame
loan_glm <- h2o.glm(x = X, 
                    y = y,
                    training_frame = train,
                    validation_frame = test,
                    family = "binomial",
                    model_id = "loan_glm",
                    solver = "IRLSM",
                    standardize = TRUE, 
                    lambda_search = TRUE)

# print model
loan_glm

# view detailed results in H2O Flow at http://ip:port/flow/index.html



Model Details:

H2OBinomialModel: glm
Model ID:  loan_glm 
GLM Model: summary
    family  link                                regularization
1 binomial logit Elastic Net (alpha = 0.5, lambda = 5.608E-4 )
                                                                 lambda_search
1 nlambda = 100, lambda.max = 0.1794, lambda.min = 5.608E-4, lambda.1se = -1.0
  number_of_predictors_total number_of_active_predictors number_of_iterations
1                         83                          30                   75
   training_frame
1 RTMP_sid_9f0a_3

Coefficients: glm coefficients
          names coefficients standardized_coefficients
1     Intercept    -2.981285                 -1.504706
2 addr_state.AK     0.000000                  0.000000
3 addr_state.AL     0.000000                  0.000000
4 addr_state.AR     0.000000                  0.000000
5 addr_state.AZ    -0.001281                 -0.001281

---
                   names coefficients standardized_coefficients
79            a

In [10]:
# print the non-zero model coefficients, largest first
# (one-column data frame named "coef"; row names are the coefficient names)
coef <- setNames(as.data.frame(h2o.coef(loan_glm)), "coef")
coef <- coef[coef$coef != 0, , drop = FALSE]
coef <- coef[order(-coef$coef), , drop = FALSE]
coef

Unnamed: 0,coef
purpose.small_business,0.5957155
addr_state.FL,0.1772415
term.60 months,0.1436663
int_rate,0.101819
addr_state.NJ,0.09888196
purpose.other,0.09827134
addr_state.NV,0.08906357
home_ownership.RENT,0.08723267
addr_state.MI,0.04060527
addr_state.NY,0.0292088
