In [1]:
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings("ignore")

from IPython.display import display

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pandas_profiling
import seaborn as sns
import numpy as np
from collections import Counter
import pickle

import pprint
import operator

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate

In [3]:
# File locations used by this notebook.
# FILE is the CSV loaded into `data` below; TEST_DF_FILE receives the pickled
# test rows; MODEL_FILE is the path handed to h2o.save_model.
FILE = 'marketing_training_cleaned.csv'
TEST_DF_FILE = './marketing_test.pkl'
MODEL_FILE = 'marketing_model'
# Read the CSV into a pandas DataFrame.
data = pd.read_csv(FILE)

In [4]:
# Turn off H2O progress bars, then connect to an H2O instance on
# localhost:54323 (a local server is started when none is running there).
import h2o #https://www.h2o.ai/
h2o.__PROGRESS_BAR__ = False
h2o.no_progress()
# NOTE(review): these two estimator imports repeat the ones in the imports cell at the top.
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
h2o.init(ip="localhost", port=54323)

Checking whether there is an H2O instance running at http://localhost:54323..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) Client VM (build 25.101-b13, mixed mode)
  Starting server from C:\ProgramData\Anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\gsaez\AppData\Local\Temp\tmpj3l7si18
  JVM stdout: C:\Users\gsaez\AppData\Local\Temp\tmpj3l7si18\h2o_gsaez_started_from_python.out
  JVM stderr: C:\Users\gsaez\AppData\Local\Temp\tmpj3l7si18\h2o_gsaez_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.4
H2O cluster version age:,3 months and 29 days !!!
H2O cluster name:,H2O_from_python_gsaez_mlo48j
H2O cluster total nodes:,1
H2O cluster free memory:,247.5 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [5]:
# Convert the pandas DataFrame into an H2OFrame so H2O estimators can use it.
mrk = h2o.H2OFrame(data)
# Show per-column type, summary statistics and the first rows.
mrk.describe()

Rows:7380
Cols:21




Unnamed: 0,cons.conf.idx,cons.price.idx,contact,day_of_week,default,emp.var.rate,euribor3m,housing,loan,marital,month,nr.employed,poutcome,profession,responded,schooling,campaign1,custAge1,pastEmail1,pcontacted_pdays,pContacted_previous
type,real,real,enum,enum,enum,real,real,enum,enum,enum,enum,real,enum,enum,enum,enum,int,int,enum,enum,enum
mins,-50.8,92.201,,,,-3.4,0.634,,,,,4963.6,,,,,1.0,18.0,,,
mean,-40.56775067750686,93.57061829268248,,,,0.048265582655826506,3.5789429539295465,,,,,5165.0473712737175,,,,,2.0264026402640316,39.80514905149058,,,
maxs,-26.9,94.767,,,,1.4,5.045,,,,,5228.1,,,,,6.0,64.0,,,
sigma,4.653732975650036,0.5791390659374276,,,,1.5697319890162231,1.7462251198691165,,,,,73.14368800240456,,,,,1.2618634459509646,8.93268712954785,,,
zeros,0,0,,,,0,0,,,,,0,,,,,0,0,,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,411,0,0,0,0
0,-42.0,93.2,cellular,mon,unknown,-0.1,4.191,no,no,single,nov,5195.8,nonexistent,admin.,no,university.degree,1.0,55.0,no,True,no
1,-42.7,93.918,cellular,mon,no,1.4,4.96,no,no,married,jul,5228.1,nonexistent,blue-collar,no,other,1.0,42.0,no,True,no
2,-36.4,93.994,telephone,mon,no,1.1,4.857,no,no,married,may,5191.0,nonexistent,technician,no,high.school,1.0,42.0,no,True,no


In [12]:
# Names of every column that H2O parsed with type 'enum'.
categorical_variables = [name for name, kind in mrk.types.items() if kind == 'enum']

# Convert each of those columns to a factor, one column at a time.
for column in categorical_variables:
    mrk[column] = mrk[column].asfactor()

# Target column, and every remaining column as a model input.
response = 'responded'
predictors = mrk.columns
predictors.remove('responded')

# Three-way split using ratios 0.8 and 0.1 (the remainder goes to `oot`);
# the fixed seed keeps the partition repeatable.
train, valid, oot = mrk.split_frame(ratios=[.8,.1], seed=25)

# Persist the selection oot[1,:] as a pandas pickle for later testing.
# NOTE(review): oot[1,:] looks like a single row rather than the whole
# hold-out frame -- confirm this is intended.
oot[1,:].as_data_frame().to_pickle(TEST_DF_FILE)

In [None]:
def check_model(model, show_all = False):
    """Report AUC figures for a cross-validated model and each of its fold models.

    Args:
        model: Object exposing ``auc()``, ``auc(xval=True)`` and
            ``get_xval_models()``; every fold model must expose ``model_id``,
            ``auc()`` and ``auc(valid=True)``.
        show_all: When true, render the model with ``display`` before
            printing the AUC lines.

    Returns:
        None. All results are written to standard output.
    """
    if show_all:
        display(model)

    # Full-model metrics: training first, then cross-validation.
    training_auc = model.auc()
    print("full model training auc:", training_auc)
    cv_auc = model.auc(xval=True)
    print("full model cv auc:", cv_auc)

    # One line per cross-validation fold model.
    for fold_model in model.get_xval_models():
        fold_report = (
            fold_model.model_id,
            " training auc:", fold_model.auc(),
            " validation auc: ", fold_model.auc(valid=True),
        )
        print(*fold_report)

## Binary classification model (binomial GLM)

In [None]:
# Binomial GLM evaluated with 10-fold cross-validation.
# The fixed seed makes the cross-validation fold assignment -- and therefore
# the reported CV AUC -- reproducible across runs; it matches the seed already
# used for the frame split and the GBM.
MRK_binomial = H2OGeneralizedLinearEstimator(family = "binomial", nfolds = 10, seed = 25)
MRK_binomial.train(y = response, x = predictors, training_frame = train)

# Display the model and print training / CV / per-fold AUC values.
check_model(MRK_binomial, True)

In [None]:
# Score the `oot` split with the GLM.
MRK_binomial_pred = MRK_binomial.predict(oot)

# Show the predictions side by side with the actual `responded` column.
# (Previously this referenced an undefined name `pred`, which raises
# NameError on a fresh kernel.)
MRK_binomial_pred.cbind(oot['responded'])

## Gradient boosting machine model (GBM)

In [None]:
# Gradient boosting model trained on `train` and scored against `valid`.
mrk_gbm = H2OGradientBoostingEstimator(nbins_cats = 10, seed = 25)
mrk_gbm.train(x = predictors, y= response, training_frame = train, validation_frame = valid)

# This model is trained with a validation frame and without nfolds, so there
# are no cross-validation metrics to report; print the validation-frame AUC
# instead of auc(xval=True).
print ("full model training auc:", mrk_gbm.auc())
print ("full model validation auc:", mrk_gbm.auc(valid=True))

# Display the full model summary as the cell output.
mrk_gbm

In [None]:
# Score the `oot` split with the GBM.
mrk_gbm_pred = mrk_gbm.predict(oot)

# Show the predictions side by side with the actual `responded` column.
mrk_gbm_pred.cbind(oot['responded'])


In [None]:
# Save the trained GBM under MODEL_FILE (force = True is passed so an existing
# destination is overwritten); the value returned by save_model is kept in
# model_path.
model_path = h2o.save_model(model = mrk_gbm, path = MODEL_FILE, force = True)

In [None]:
# Shut down the H2O cluster this notebook is connected to.
h2o.cluster().shutdown()