In [1]:
# import h2o package and specific estimator 
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
h2o.init() # start h2o

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/phall/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp7et25rt0
  JVM stdout: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp7et25rt0/h2o_phall_started_from_python.out
  JVM stderr: /var/folders/tc/0ss1l73113j3wdyjsxmy1j2r0000gn/T/tmp7et25rt0/h2o_phall_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster version:,3.12.0.1
H2O cluster version age:,29 days
H2O cluster name:,H2O_from_python_phall_gqrp80
H2O cluster total nodes:,1
H2O cluster free memory:,3.556 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [3]:
# location of clean data file
path = 'https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv'

In [4]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [5]:
frame = h2o.import_file(path=path, col_types=col_types) # import from url

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
frame.describe() # summarize table

Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,enum,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,,0.0,
mean,13074.169141456332,,13.715904065566189,5.684352932995338,,71915.67051974905,,,15.881530121290167,0.22735700606252723,54.07917280242262,24.579733834274574,,14.854273655448333,
maxs,35000.0,,26.060000000000002,10.0,,7141778.0,,,39.99,29.0,150.70000000000002,118.0,,65.0,
sigma,7993.556188734672,,4.391939870545809,3.610663731100238,,59070.91565491818,,,7.5876682241925355,0.6941679229284191,25.285366766770498,11.685190365910666,,6.947732922546689,
zeros,0,,0,14248,,0,,,270,139459,1562,0,,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.650000000000002,0.0,83.7,9.0,0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified


In [7]:
# split into training and test for cross validation
train, test = frame.split_frame([0.7])

In [8]:
# assign target and inputs for logistic regression
y = 'bad_loan'
X = [name for name in frame.columns if name != y]
print(y)
print(X)

bad_loan
['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status']


In [9]:
# determine column types
reals, enums = [], []
for key, val in frame.types.items():
    if key in X:
        if val == 'enum':
            enums.append(key)
        else: 
            reals.append(key)

print(enums)
print(reals)

['verification_status', 'term', 'home_ownership', 'addr_state', 'purpose']
['dti', 'annual_inc', 'delinq_2yrs', 'int_rate', 'longest_credit_length', 'revol_util', 'emp_length', 'loan_amnt', 'total_acc']


In [10]:
# impute missing values
_ = frame[reals].impute(method='mean')

In [11]:
# set target to factor for logisitic regression
# just to be safe ...
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [12]:
# elastic net regularized regression
#   - binomial family for logistic regression
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection and regularization

# initialize
loan_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='loan_glm2',
                                         solver='IRLSM',
                                         standardize=True,
                                         lambda_search=True)

# train 
loan_glm.train(X, y, training_frame=train)

# view detailed results at http://ip:port/flow/index.html

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [13]:
# print non-zero model parameters
for name, val in loan_glm.coef().items():
    if val != 0.0:
        print(name, ': ', val)

purpose.car :  -0.11190117054653674
emp_length :  -0.0017696551450257373
addr_state.TN :  0.012526186420074192
term.60 months :  0.15562963741151795
revol_util :  0.0037916526358893828
addr_state.CO :  -0.15983916847312812
addr_state.MI :  0.02091853569392306
loan_amnt :  1.2785877478098858e-05
home_ownership.RENT :  0.09227455982790071
addr_state.NJ :  0.11907705662989007
addr_state.NV :  0.11757735824973783
purpose.other :  0.07133767802649965
total_acc :  -0.010167858694788761
addr_state.OR :  -0.005529884439359159
verification_status.not verified :  -0.009727378241438905
longest_credit_length :  0.004224928762668751
purpose.major_purchase :  -0.04211397535003131
addr_state.WV :  -0.00318184897313048
purpose.small_business :  0.555619849621502
purpose.credit_card :  -0.19244800923871863
purpose.debt_consolidation :  -0.09928103992055347
delinq_2yrs :  0.02652109180190443
home_ownership.MORTGAGE :  -0.035726793917526437
int_rate :  0.10380632783073096
annual_inc :  -6.092965969212584

In [14]:
# measure train and test AUC
print(loan_glm.auc(train=True))
print(loan_glm.model_performance(test_data=test).auc())

0.6991436801020542
0.6957946293349209


In [15]:
# shutdown h2o ... be careful this can erase your work
h2o.cluster().shutdown(prompt=True)

Are you sure you want to shutdown the H2O instance running at http://127.0.0.1:54321 (Y/N)? y
H2O session _sid_955e closed.
