In [1]:
import h2o4gpu
import h2o4gpu.util.import_data as io
import h2o4gpu.util.metrics as metrics
from tabulate import tabulate
import pandas as pd

In [2]:
"""
Import Data for H2O GPU Edition

This function will read in data and prepare it for H2O4GPU's GLM solver

Parameters
----------
data_path : str
             A path to a dataset (The dataset needs to be all numeric)
use_pandas : bool
              Indicate if Pandas should be used to parse
intercept : bool
              Indicate if intercept term is needed
valid_fraction : float
                  Fraction of dataset reserved for a validation set
                  (e.g. 0.2 holds out 20% of the rows)
classification : bool
                  Classification problem?
Returns
-------
If valid_fraction > 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    valid_x: numpy array of valid input variables
    valid_y: numpy array of valid y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
If valid_fraction == 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
"""
import os
from urllib.request import urlretrieve

# Fetch the example dataset only when it is not already in the working
# directory, so re-running the cell does not download it again.
data_url = "https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/creditcard.csv"
data_file = os.path.join(os.getcwd(), "creditcard.csv")
if not os.path.exists(data_file):
    # Pure-Python download: does not require a `wget` binary on the PATH.
    urlretrieve(data_url, data_file)

# valid_fraction > 0, so the call returns both a train and a validation split
# plus the problem `family` string that is passed to the solver later on.
train_x, train_y, valid_x, valid_y, family = io.import_data(
    data_path=data_file,
    use_pandas=True,
    intercept=True,
    valid_fraction=0.2,
    classification=True,
)

Reading Data with Pandas
(23999, 25)
Original m=23999 n=24
Size of Train rows=19200 & valid rows=4799
Size of Train cols=24 valid cols=24
Size of Train cols=25 & valid cols=25 after adding intercept column


In [3]:
"""
Set up instance of H2O4GPU's GLM solver with default parameters

Need to pass in `family` to indicate problem type to solve
"""
print("Setting up solver")
# `family` is the string returned by io.import_data when the data was loaded;
# every other solver option is left at its default.
model = h2o4gpu.ElasticNetH2O(family=family)

Setting up solver


In [4]:
"""
Fit GLM Solver

Fits the solver on the training split, timing the call, and then shows the
solver's per-alpha summary.
"""
print("Solving")
# %time is an IPython line magic: it reports CPU and wall time for this one call.
%time model.fit(train_x, train_y)
print("Done Solving")
# NOTE(review): the recorded summary shows -1.00 ("missing") in the Valid
# column -- presumably because only the training split is passed to fit();
# confirm against the solver's documentation.
model.summary()

Solving
CPU times: user 17.5 s, sys: 2.84 s, total: 20.3 s
Wall time: 11 s
Done Solving
Logloss per alpha value (-1.00 = missing)

|   Alphas |   Train |   CV |   Valid |
|---------:|--------:|-----:|--------:|
|     0.00 |    0.48 | 0.48 |   -1.00 |
|     0.25 |    0.48 | 0.48 |   -1.00 |
|     0.50 |    0.48 | 0.48 |   -1.00 |
|     0.75 |    0.48 | 0.48 |   -1.00 |
|     1.00 |    0.48 | 0.48 |   -1.00 |


In [5]:
"""
Make predictions on validation set

The result is indexed by alpha: the scoring loop later reads preds[i] for
each i in range(model.n_alphas).
"""
print("Predictions per alpha")
# NOTE(review): valid_y is passed alongside valid_x -- confirm whether
# predict_proba actually needs the labels to produce predictions.
preds = model.predict_proba(valid_x, valid_y)
print(preds)

Predictions per alpha
[[ 0.76320577  0.14472587  0.14279237 ...,  0.27377224  0.2479739
   0.17788069]
 [ 0.76512319  0.1458478   0.14631642 ...,  0.2623947   0.24248317
   0.17817149]
 [ 0.76369005  0.14476402  0.149213   ...,  0.25479093  0.24055323
   0.17880464]
 [ 0.76870668  0.14425999  0.15220751 ...,  0.255465    0.2437907
   0.18548389]
 [ 0.7688567   0.14338347  0.14736323 ...,  0.25683942  0.24017039
   0.17752922]]


In [6]:
"""
Score the validation predictions for every alpha

Reports logloss when the problem family is "logistic" and RMSE otherwise.
"""
# The problem family does not change between alphas, so pick the label and
# the metric once instead of re-testing it on every iteration.
if family == "logistic":
    header, score = "Logloss for alpha = ", metrics.log_loss
else:
    header, score = "RMSE for alpha = ", metrics.rmse

for idx in range(model.n_alphas):
    print(header, model.alphas_best[idx])
    print(score(valid_y, preds[idx]))

Logloss for alpha =  [ 0.]
0.439005
Logloss for alpha =  [ 0.25]
0.439057
Logloss for alpha =  [ 0.5]
0.439167
Logloss for alpha =  [ 0.75]
0.439402
Logloss for alpha =  [ 1.]
0.439157
