In [None]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# Initialize the H2O instance this notebook talks to.
h2o.init()
# Clear objects already held by that instance so the notebook starts from a clean slate.
# NOTE(review): this affects everything on the connected instance — avoid on a shared cluster.
h2o.remove_all()

In [None]:
# Location of the Lending Club loan data (CSV) in the public H2O test-data bucket.
# To work from a local copy instead, point `path` at the downloaded LoanStats3a.csv file.
path = "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/lending-club/LoanStats3a.csv"

In [None]:
# Column-type overrides handed to the parser.
# Most of these columns are read as "String" so they can be munged later in the notebook;
# last_credit_pull_d is the exception and is read as "Factor".
types = {
    "int_rate": "String",
    "revol_util": "String",
    "emp_length": "String",
    "earliest_cr_line": "String",
    "issue_d": "String",
    "last_credit_pull_d": "Factor",
    "verification_status": "String",
}

In [None]:
# Import the file into an H2O frame, applying the column-type overrides in `types`.
data = h2o.import_file(path=path, col_types= types)
# Look at the frame: per-column summary of what was parsed.
data.describe()

In [None]:
# Drop all loans that are still in progress and therefore cannot yet be deemed good or bad.
in_progress = data["loan_status"].isin(
    ["Current", "In Grace Period", "Late (16-30 days)", "Late (31-120 days)"]
)
data = data[~in_progress, :]

In [None]:
# Preview the frame after the in-progress loans have been dropped.
data.show()

In [None]:
# Build the response column: a loan is "bad" when its status is one of the labels below.
# The membership flag is converted to a factor so it is treated as a categorical target.
data["bad_loan"] = (
    data["loan_status"]
    .isin(["Charged Off",
           "Default",
           "Does not meet the credit policy.  Status:Charged Off"])
    .asfactor()
)

In [None]:
# Munge the int_rate column in place: string -> number.
data["int_rate"] = (
    data["int_rate"]
    .gsub(pattern="%", replacement="")  # strip the percent sign
    .trim()                             # trim surrounding whitespace
    .asnumeric()                        # convert to a numeric column
)
# Preview the converted column.
data["int_rate"].show()

In [None]:
# Munge the revol_util column in place: string -> number.
data["revol_util"] = (
    data["revol_util"]
    .gsub(pattern="%", replacement="")  # strip the percent sign
    .trim()                             # trim surrounding whitespace
    .asnumeric()                        # convert to a numeric column
)
# Preview the converted column.
data["revol_util"].show()

In [None]:
# Munge emp_length, step 1: strip the textual part of each value.
# The pattern removes everything from the first letter onward (plus any spaces just before it),
# which drops "year"/"years", and also blanks out the literal "n/a". Leftover whitespace is trimmed.
data["emp_length"] = (
    data["emp_length"]
    .gsub(pattern="([ ]*+[a-zA-Z].*)|(n/a)", replacement="")
    .trim()
)

In [None]:
# Munge emp_length, step 2: map the two non-numeric spellings to numbers, then convert.
data["emp_length"] = (
    data["emp_length"]
    .gsub(pattern="< 1", replacement="0.5")    # "< 1" becomes 0.5
    .gsub(pattern="10\\+", replacement="10")   # "10+" becomes 10
    .asnumeric()
)
# Preview the converted column.
data["emp_length"].show()

In [None]:
# Split earliest_cr_line on "-" once: the first piece is stored as the month,
# the second piece as the year.
cr_line_parts = data["earliest_cr_line"].strsplit(pattern="-")
data["earliest_cr_month"] = cr_line_parts[0]
# Convert the extracted year piece to numeric. The previous code converted the whole
# unsplit earliest_cr_line string instead, which overwrote the year that had just been extracted.
data["earliest_cr_year"] = cr_line_parts[1].asnumeric()

In [None]:
# Take the second "-"-separated piece of earliest_cr_line as the year and store it as a number.
year_piece = data["earliest_cr_line"].strsplit(pattern="-")[1]
data["earliest_cr_year"] = year_piece.asnumeric()
# Preview the resulting column.
data["earliest_cr_year"].show()

In [None]:
# Split issue_d on "-" once and reuse the pieces:
# the first piece is kept as the month, the second is stored as a numeric year.
issue_parts = data["issue_d"].strsplit(pattern="-")
data["issue_d_month"] = issue_parts[0]
data["issue_d_year"] = issue_parts[1].asnumeric()
# Preview the resulting year column.
data["issue_d_year"].show()

In [None]:
# Derived feature: difference between the loan's issue year and the year of the
# borrower's earliest credit line.
data["credit_length"] = data["issue_d_year"] - data["earliest_cr_year"]
# Preview the new column.
data["credit_length"].show()

In [None]:
# Collapse the two "VERIFIED - ..." spellings into a single "verified" label, then make the
# column a factor. The longer pattern is applied first so the shorter one cannot leave a
# trailing " source" behind.
data["verification_status"] = (
    data["verification_status"]
    .sub(pattern="VERIFIED - income source", replacement="verified")
    .sub(pattern="VERIFIED - income", replacement="verified")
    .asfactor()
)

In [None]:
# Split the rows into train/test using one uniform random draw per row:
# draws <= 0.80 go to train, the rest go to test.
# A fixed seed makes the split, and every metric computed from it, reproducible across runs.
s = data["int_rate"].runif(seed=1234)
train = data[s <= 0.80]
test  = data[s > 0.80]

In [None]:
# Response column for the model.
y = "bad_loan"

# Predictor columns for the model.
x = [
    "loan_amnt",
    "credit_length",
    "revol_util",
    "home_ownership",
    "annual_inc",
    "purpose",
    "addr_state",
    "dti",
    "delinq_2yrs",
    "total_acc",
    "verification_status",
    "term",
]

In [None]:
# H2OGradientBoostingEstimator is already imported in the notebook's first cell,
# so it is not imported a second time here.
# Configure a small GBM: 10 trees, learning rate 0.05, scored on every training iteration.
model = H2OGradientBoostingEstimator(model_id="BadLoanModel",
                                     score_each_iteration=True,
                                     ntrees=10,
                                     learn_rate=0.05)

In [None]:
# Fit the model on the training split; the held-out split is supplied as the validation frame.
model.train(x=x, y=y, training_frame=train, validation_frame=test)

In [None]:
# Bare expression: the cell output is the trained model's displayed representation.
model