In [1]:
# import h2o package and specific estimator 
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
h2o.init() # start h2o

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,46 mins 13 secs
H2O cluster version:,3.10.4.1
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_phall_1zfxai
H2O cluster total nodes:,1
H2O cluster free memory:,3.224 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [3]:
# location of clean data file
path = 'https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv'

In [4]:
# define input variable measurement levels 
# strings automatically parsed as enums (nominal)
# numbers automatically parsed as numeric
col_types = {'bad_loan': 'enum'}

In [5]:
frame = h2o.import_file(path=path, col_types=col_types) # import from url

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [6]:
frame.describe() # summarize table

Rows:163987
Cols:15




Unnamed: 0,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,purpose,addr_state,dti,delinq_2yrs,revol_util,total_acc,bad_loan,longest_credit_length,verification_status
type,int,enum,real,int,enum,real,enum,enum,real,int,real,int,enum,int,enum
mins,500.0,,5.42,0.0,,1896.0,,,0.0,0.0,0.0,1.0,,0.0,
mean,13074.169141456332,,13.715904065566189,5.684352932995338,,71915.67051974905,,,15.881530121290167,0.22735700606252723,54.07917280242262,24.579733834274574,,14.854273655448333,
maxs,35000.0,,26.060000000000002,10.0,,7141778.0,,,39.99,29.0,150.70000000000002,118.0,,65.0,
sigma,7993.556188734672,,4.391939870545809,3.610663731100238,,59070.91565491818,,,7.5876682241925355,0.6941679229284191,25.285366766770498,11.685190365910666,,6.947732922546689,
zeros,0,,0,14248,,0,,,270,139459,1562,0,,11,
missing,0,0,0,5804,0,4,0,0,0,29,193,29,0,29,0
0,5000.0,36 months,10.65,10.0,RENT,24000.0,credit_card,AZ,27.650000000000002,0.0,83.7,9.0,0,26.0,verified
1,2500.0,60 months,15.27,0.0,RENT,30000.0,car,GA,1.0,0.0,9.4,4.0,1,12.0,verified
2,2400.0,36 months,15.96,10.0,RENT,12252.0,small_business,IL,8.72,0.0,98.5,10.0,0,10.0,not verified


In [7]:
# split into training and test for cross validation
train, test = frame.split_frame([0.7])

In [8]:
# assign target and inputs for logistic regression
y = 'bad_loan'
X = [name for name in frame.columns if name != y]
print(y)
print(X)

bad_loan
['loan_amnt', 'term', 'int_rate', 'emp_length', 'home_ownership', 'annual_inc', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'revol_util', 'total_acc', 'longest_credit_length', 'verification_status']


In [9]:
# determine column types
reals, enums = [], []
for key, val in frame.types.items():
    if key in X:
        if val == 'enum':
            enums.append(key)
        else: 
            reals.append(key)

print(enums)
print(reals)

['term', 'verification_status', 'addr_state', 'home_ownership', 'purpose']
['loan_amnt', 'emp_length', 'annual_inc', 'total_acc', 'delinq_2yrs', 'revol_util', 'int_rate', 'dti', 'longest_credit_length']


In [10]:
# impute missing values
frame[reals].impute(method='mean')

[13074.169141456332,
 5.684352932995338,
 71915.67051974905,
 24.579733834274574,
 0.22735700606252723,
 54.07917280242262,
 13.715904065566189,
 15.881530121290167,
 14.854273655448333]

In [11]:
# set target to factor for logisitic regression
# just to be safe ...
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

In [12]:
# elastic net regularized regression
#   - binomial family for logistic regression
#   - L1 for variable selection
#   - L2 for handling multicollinearity
#   - IRLS for handling outliers
#   - with lamba parameter tuning for variable selection and regularization

# initialize
loan_glm = H2OGeneralizedLinearEstimator(family='binomial',
                                         model_id='loan_glm2',
                                         solver='IRLSM',
                                         standardize=True,
                                         lambda_search=True)

# train 
loan_glm.train(X, y, training_frame=train)

# view detailed results at http://ip:port/flow/index.html

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [13]:
# print model parameters
loan_glm.coef()

{'Intercept': -2.997668670693661,
 'addr_state.AK': 0.0,
 'addr_state.AL': 0.024308849742242247,
 'addr_state.AR': 0.0,
 'addr_state.AZ': 0.0,
 'addr_state.CA': -0.010673081870049436,
 'addr_state.CO': -0.21361307120658576,
 'addr_state.CT': 0.0,
 'addr_state.DC': 0.0,
 'addr_state.DE': 0.0,
 'addr_state.FL': 0.17629439014357695,
 'addr_state.GA': 0.0,
 'addr_state.HI': 0.0,
 'addr_state.IA': 0.0,
 'addr_state.ID': 0.0,
 'addr_state.IL': 0.0,
 'addr_state.IN': 0.0,
 'addr_state.KS': 0.0,
 'addr_state.KY': 0.0,
 'addr_state.LA': 0.0,
 'addr_state.MA': 0.0,
 'addr_state.MD': 0.0,
 'addr_state.ME': 0.0,
 'addr_state.MI': 0.027448204730300796,
 'addr_state.MN': 0.0,
 'addr_state.MO': 0.0,
 'addr_state.MS': 0.0,
 'addr_state.MT': 0.0,
 'addr_state.NC': 0.0,
 'addr_state.NE': 0.0,
 'addr_state.NH': 0.0,
 'addr_state.NJ': 0.09088178916017763,
 'addr_state.NM': 0.0,
 'addr_state.NV': 0.13335524389548245,
 'addr_state.NY': 0.04227205559537367,
 'addr_state.OH': 0.0,
 'addr_state.OK': 0.0,
 'add

In [14]:
# measure train and test AUC
print(loan_glm.auc(train=True))
print(loan_glm.model_performance(test_data=test).auc())

0.6977824493052591
0.6993423042444329


In [15]:
# shutdown h2o ... be careful this can erase your work
# h2o.cluster().shutdown(prompt=True)