In [1]:
# Load the H2O R client and bring up a local H2O instance.
# nthreads = -1 allows H2O to use the maximum number of threads.
library(h2o)
h2o.init(nthreads = -1)


----------------------------------------------------------------------

Your next step is to start H2O:
    > h2o.init()

For H2O package documentation, ask for help:
    > ??h2o

After starting H2O, you can use the Web UI at http://localhost:54321
For more information visit http://docs.h2o.ai

----------------------------------------------------------------------


Attaching package: ‘h2o’

The following objects are masked from ‘package:stats’:

    cor, sd, var

The following objects are masked from ‘package:base’:

    &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
    colnames<-, ifelse, is.character, is.factor, is.numeric, log,
    log10, log1p, log2, round, signif, trunc




H2O is not running yet, starting it now...

Note:  In case of errors look at the following log files:
    /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T//RtmpT1fNJA/h2o_phall_started_from_r.out
    /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T//RtmpT1fNJA/h2o_phall_started_from_r.err


Starting H2O JVM and connecting: .. Connection successful!

R is connected to the H2O cluster: 
    H2O cluster uptime:         1 seconds 651 milliseconds 
    H2O cluster version:        3.12.0.1 
    H2O cluster version age:    29 days  
    H2O cluster name:           H2O_started_from_R_phall_mng351 
    H2O cluster total nodes:    1 
    H2O cluster total memory:   3.56 GB 
    H2O cluster total cores:    8 
    H2O cluster allowed cores:  8 
    H2O cluster healthy:        TRUE 
    H2O Connection ip:          localhost 
    H2O Connection port:        54321 
    H2O Connection proxy:       NA 
    H2O Internal Security:      FALSE 
    R Version:                  R version 3.3.2 (2016-10-31) 

In [2]:
# Location of the cleaned loan data file.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
path <- "/Users/phall/Documents/aetna/share/data/loan.csv"

In [3]:
# Import the file at `path` into an H2O frame.
frame <- h2o.importFile(path)

# Strings are automatically parsed as enums (categorical) and numbers as
# numeric. bad_loan is stored as a number but is really a categorical
# value, so convert it to a factor explicitly.
frame$bad_loan <- as.factor(frame$bad_loan)



In [4]:
# Median-impute every column that contains missing values.
# h2o.impute() is called only for its effect on `frame`; its return value is
# discarded.
# NOTE(review): the loop does not filter by column type, so any non-numeric
# column with NAs is also sent through the "median" imputer; confirm intended.
for (name in names(frame)) {
  column_has_na <- any(is.na(frame[name]))
  if (column_has_na) {
    h2o.impute(data = frame, column = name, method = "median")
  }
}

In [5]:
# Summarize every column of the frame and check that no missing values remain.
h2o.describe(frame)

Label,Type,Missing,Zeros,PosInf,NegInf,Min,Max,Mean,Sigma,Cardinality
loan_amnt,int,0,0,0,0,500.0,35000.0,13074.17,7993.556,
term,enum,0,129950,0,0,0.0,1.0,0.2075591,0.4055605,2.0
int_rate,real,0,0,0,0,5.42,26.06,13.7159,4.39194,
emp_length,int,0,14248,0,0,0.0,10.0,5.695525,3.546671,
home_ownership,enum,0,1,0,0,0.0,5.0,,,6.0
annual_inc,real,0,0,0,0,1896.0,7141778.0,71915.4,59070.22,
purpose,enum,0,2842,0,0,0.0,13.0,,,14.0
addr_state,enum,0,413,0,0,0.0,49.0,,,50.0
dti,real,0,270,0,0,0.0,39.99,15.88153,7.587668,
delinq_2yrs,int,0,139488,0,0,0.0,29.0,0.2273168,0.6941131,


In [6]:
# Target column, and the inputs: every other column in the frame.
y <- "bad_loan"
X <- setdiff(names(frame), y)
print(y)
print(X)

[1] "bad_loan"
 [1] "loan_amnt"             "term"                  "int_rate"             
 [4] "emp_length"            "home_ownership"        "annual_inc"           
 [7] "purpose"               "addr_state"            "dti"                  
[10] "delinq_2yrs"           "revol_util"            "total_acc"            
[13] "longest_credit_length" "verification_status"  


In [7]:
# Hold out part of the data for validation: ratios = 0.7 sends roughly 70%
# of the rows to the first frame (training) and the remainder to the second
# (test). This is a single train/test split, not k-fold cross-validation.
# A fixed seed makes the random split, and therefore everything fitted on
# it, repeatable from run to run.
split <- h2o.splitFrame(frame, ratios = 0.7, seed = 12345)
train <- split[[1]]
test <- split[[2]]

In [8]:
# Elastic net regularized logistic regression:
#   - family = "binomial" gives logistic regression
#   - the L1 part of the penalty performs variable selection
#   - the L2 part of the penalty copes with multicollinearity
#   - solver = "IRLSM" fits by iteratively reweighted least squares
#   - standardize = TRUE matters a great deal for variable selection with a
#     penalized regression
#   - lambda_search = TRUE tunes the lambda penalty strength
#   - alpha is not set here; the printed model summary reports the value used

# train, with the held-out test frame supplied as the validation frame
loan_glm <- h2o.glm(
  x = X,
  y = y,
  family = "binomial",
  solver = "IRLSM",
  standardize = TRUE,
  lambda_search = TRUE,
  training_frame = train,
  validation_frame = test,
  model_id = "loan_glm"
)

# print model
loan_glm

# detailed results can also be viewed at http://ip:port/flow/index.html



Model Details:

H2OBinomialModel: glm
Model ID:  loan_glm 
GLM Model: summary
    family  link                                regularization
1 binomial logit Elastic Net (alpha = 0.5, lambda = 1.257E-4 )
                                                                 lambda_search
1 nlambda = 100, lambda.max = 0.1782, lambda.min = 1.257E-4, lambda.1se = -1.0
  number_of_predictors_total number_of_active_predictors number_of_iterations
1                         83                          55                   91
   training_frame
1 RTMP_sid_8698_3

Coefficients: glm coefficients
          names coefficients standardized_coefficients
1     Intercept    -2.891055                 -1.399306
2 addr_state.AK    -0.142204                 -0.142204
3 addr_state.AL     0.108174                  0.108174
4 addr_state.AR     0.000000                  0.000000
5 addr_state.AZ    -0.023154                 -0.023154

---
                   names coefficients standardized_coefficients
79            a

In [9]:
# Show the model coefficients that are non-zero, largest first.
# The coefficient names are carried as the data frame's row names, so
# drop = FALSE is needed to keep the one-column data frame (and the names)
# when subsetting.
coef <- as.data.frame(h2o.coef(loan_glm))
names(coef) <- "coef"
is_nonzero <- coef[["coef"]] != 0
coef <- coef[is_nonzero, , drop = FALSE]
descending <- order(-coef[["coef"]])
coef <- coef[descending, , drop = FALSE]
coef

Unnamed: 0,coef
purpose.small_business,0.6629662
addr_state.NV,0.238895
addr_state.TN,0.2180882
purpose.educational,0.2054414
addr_state.FL,0.1881469
term.60 months,0.1533837
addr_state.MI,0.1432021
addr_state.NJ,0.1334593
addr_state.OK,0.1321407
addr_state.AL,0.1081745


In [10]:
# Shut down the H2O instance; prompt = FALSE skips the confirmation prompt.
h2o.shutdown(prompt = FALSE)