In [1]:
# import h2o package and specific estimator 
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
h2o.init() # start h2o

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpvq6szsev
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpvq6szsev/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmpvq6szsev/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,03 secs
H2O cluster version:,3.10.4.6
H2O cluster version age:,"7 days, 11 hours and 36 minutes"
H2O cluster name:,H2O_from_python_phall_5pl1en
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [5]:
# location of clean data file
path = 'https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv'

In [6]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [7]:
frame = h2o.import_file(path=path, col_types=col_types) # import from url

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
frame.describe() # summarize table

Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,enum,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,,0.0,
mean,13074.169141456332,,13.715904065566189,5.684352932995338,,71915.67051974905,,,15.881530121290167,0.22735700606252723,54.07917280242262,24.579733834274574,,14.854273655448333,
maxs,35000.0,,26.060000000000002,10.0,,7141778.0,,,39.99,29.0,150.70000000000002,118.0,,65.0,
sigma,7993.556188734672,,4.391939870545809,3.610663731100238,,59070.91565491818,,,7.5876682241925355,0.6941679229284191,25.285366766770498,11.685190365910666,,6.947732922546689,
zeros,0,,0,14248,,0,,,270,139459,1562,0,,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.650000000000002,0.0,83.7,9.0,0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified


In [10]:
# split into training and test for cross validation
train, test = frame.split_frame([0.7])

In [11]:
# assign target and inputs for logistic regression
y = 'bad_loan'
X = [name for name in frame.columns if name != y]
print(y)
print(X)

bad_loan
['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status']


In [12]:
# determine column types
reals, enums = [], []
for key, val in frame.types.items():
    if key in X:
        if val == 'enum':
            enums.append(key)
        else: 
            reals.append(key)

print(enums)
print(reals)

['home_ownership', 'verification_status', 'addr_state', 'purpose', 'term']
['emp_length', 'revol_util', 'dti', 'longest_credit_length', 'loan_amnt', 'delinq_2yrs', 'annual_inc', 'int_rate', 'total_acc']


In [13]:
# impute missing values
frame[reals].impute(method='mean')

[5.684352932995338,
 54.07917280242262,
 15.881530121290167,
 14.854273655448333,
 13074.169141456332,
 0.22735700606252723,
 71915.67051974905,
 13.715904065566189,
 24.579733834274574]

In [14]:
# set target to factor for logisitic regression
# just to be safe ...
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [16]:
# elastic net regularized regression
#   - binomial family for logistic regression
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection and regularization

# initialize
loan_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='loan_glm2',
                                         solver='IRLSM',
                                         standardize=True,
                                         lambda_search=True)

# train 
loan_glm.train(X, y, training_frame=train)

# view detailed results at http://ip:port/flow/index.html

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [17]:
# print model parameters
loan_glm.coef()

{'Intercept': -2.9382813476928247,
 'addr_state.AK': 0.0,
 'addr_state.AL': 0.09172791087524343,
 'addr_state.AR': 0.0,
 'addr_state.AZ': -0.000764243913588411,
 'addr_state.CA': -0.007349129280579972,
 'addr_state.CO': -0.19929818697896334,
 'addr_state.CT': 0.0,
 'addr_state.DC': -0.013311098229817027,
 'addr_state.DE': 0.0,
 'addr_state.FL': 0.15423831347619052,
 'addr_state.GA': 0.0,
 'addr_state.HI': 0.0,
 'addr_state.IA': 0.0,
 'addr_state.ID': 0.0,
 'addr_state.IL': 0.0,
 'addr_state.IN': 0.02094946353678339,
 'addr_state.KS': 0.0,
 'addr_state.KY': 0.0,
 'addr_state.LA': 0.0010133131692641809,
 'addr_state.MA': 0.0,
 'addr_state.MD': 0.0,
 'addr_state.ME': 0.0,
 'addr_state.MI': 0.09691922042118775,
 'addr_state.MN': -0.01838627872498695,
 'addr_state.MO': 0.039734809346476226,
 'addr_state.MS': 0.0,
 'addr_state.MT': -0.017731406591810724,
 'addr_state.NC': 0.0,
 'addr_state.NE': 0.0,
 'addr_state.NH': 0.0,
 'addr_state.NJ': 0.16740859779021572,
 'addr_state.NM': 0.0,
 'addr_s

In [18]:
# measure train and test AUC
print(loan_glm.auc(train=True))
print(loan_glm.model_performance(test_data=test).auc())

0.6985382460203134
0.6996015237248728


In [19]:
loan_glm

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  loan_glm2


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.13825361792522667
RMSE: 0.37182471397854483
LogLoss: 0.4390953656496221
Null degrees of freedom: 114847
Residual degrees of freedom: 114803
Null deviance: 109386.3541113036
Residual deviance: 100858.44910825562
AIC: 100948.44910825562
AUC: 0.6985382460203134
Gini: 0.3970764920406269
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20211828556381106: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,66502.0,27305.0,0.2911,(27305.0/93807.0)
1,8897.0,12144.0,0.4228,(8897.0/21041.0)
Total,75399.0,39449.0,0.3152,(36202.0/114848.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2021183,0.4015209,225.0
max f2,0.1169010,0.5612323,310.0
max f0point5,0.2695206,0.3641440,169.0
max accuracy,0.5314961,0.8171061,36.0
max precision,0.8023883,1.0,0.0
max recall,0.0013031,1.0,399.0
max specificity,0.8023883,1.0,0.0
max absolute_mcc,0.2130758,0.2334833,215.0
max min_per_class_accuracy,0.1832035,0.6435980,243.0


Gains/Lift Table: Avg response rate: 18.32 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100045,0.5190750,2.7837783,2.7837783,0.5100087,0.5100087,0.0278504,0.0278504,178.3778316,178.3778316
,2,0.0200003,0.4748911,2.5342087,2.6590478,0.4642857,0.4871572,0.0253315,0.0531819,153.4208722,165.9047844
,3,0.0300049,0.4448844,2.4084908,2.5755046,0.4412533,0.4718514,0.0240958,0.0772777,140.8490796,157.5504591
,4,0.0400007,0.4209040,2.2917235,2.5045902,0.4198606,0.4588594,0.0229077,0.1001854,129.1723459,150.4590194
,5,0.0500052,0.3996351,2.1947194,2.4425945,0.4020888,0.4475013,0.0219571,0.1221425,119.4719423,144.2594458
,6,0.1000017,0.3313651,1.9819830,2.2123088,0.3631139,0.4053113,0.0990922,0.2212347,98.1983029,121.2308796
,7,0.1500070,0.2888389,1.7915527,2.0720486,0.3282257,0.3796146,0.0895870,0.3108217,79.1552744,107.2048638
,8,0.2000035,0.2587155,1.5627722,1.9447406,0.2863114,0.3562908,0.0781332,0.3889549,56.2772230,94.4740622
,9,0.3000052,0.2162173,1.3601778,1.7498864,0.2491946,0.3205921,0.1360202,0.5249750,36.0177825,74.9886356



Scoring History: 


0,1,2,3,4,5,6,7
,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
,2017-05-04 11:13:58,0.000 sec,3,.16E0,2,0.9465855,
,2017-05-04 11:13:58,0.029 sec,5,.15E0,2,0.9411677,
,2017-05-04 11:13:58,0.052 sec,7,.13E0,2,0.9362429,
,2017-05-04 11:13:58,0.076 sec,9,.12E0,2,0.9318267,
,2017-05-04 11:13:58,0.102 sec,11,.11E0,2,0.9278581,
---,---,---,---,---,---,---,---
,2017-05-04 11:14:00,2.199 sec,78,.46E-3,33,0.8784427,
,2017-05-04 11:14:00,2.239 sec,79,.42E-3,34,0.8784032,
,2017-05-04 11:14:00,2.276 sec,80,.38E-3,37,0.8783377,



See the whole table with table.as_data_frame()




In [20]:
# shutdown h2o ... be careful this can erase your work
h2o.cluster().shutdown(prompt=True)

Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:54321 (Y/N)? y
H2O session _sid_87f3 closed.
