In [11]:
import h2o4gpu
import h2o4gpu.util.import_data as io
import h2o4gpu.util.metrics as metrics
from tabulate import tabulate
import pandas as pd

In [12]:
"""
Import Data for H2O GPU Edition

This function will read in data and prepare it for H2O4GPU's GLM solver

Parameters
----------
data_path : str
             A path to a dataset (The dataset needs to be all numeric)
use_pandas : bool
              Indicate if Pandas should be used to parse
intercept : bool
              Indicate if intercept term is needed
valid_fraction : float
                  Percentage of dataset reserved for a validation set
classification : bool
                  Classification problem?
Returns
-------
If valid_fraction > 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    valid_x: numpy array of valid input variables
    valid_y: numpy array of valid y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
If valid_fraction == 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
"""
import os
if not os.path.exists("ipums_1k.csv"):
    !wget https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/ipums_1k.csv
train_x,train_y,valid_x,valid_y,family=io.import_data(data_path="ipums_1k.csv", 
                                                        use_pandas=True, 
                                                        intercept=True,
                                                        valid_fraction=0.2,
                                                        classification=True)

Reading Data with Pandas
(999, 9733)
Original m=999 n=9732
Size of Train rows=800 & valid rows=199
Size of Train cols=9732 valid cols=9732
Size of Train cols=9733 & valid cols=9733 after adding intercept column


In [13]:
"""
Set up instance of H2O4GPU's Ridge solver with default parameters

Need to pass in `family` to indicate problem type to solve
"""
print("Setting up solver")
model = h2o4gpu.Ridge()

Setting up solver
Running h2o4gpu Ridge Regression


In [14]:
"""
Fit Ridge Solver
"""
print("Solving")
%time model.fit(train_x, train_y)
print("Done Solving")

Solving
CPU times: user 2.84 s, sys: 620 ms, total: 3.46 s
Wall time: 3.42 s
Done Solving


In [15]:
"""
Make predictions on validation set
"""
print("Predictions per alpha")
preds = model.predict(valid_x)
print(preds)

Predictions per alpha
[[  1.44140804e+00   3.95907745e+02   5.85950859e+04   1.35538736e-02
    2.24011857e-03  -9.94564220e-03   2.08838610e-03   1.52874296e-03
    8.47863592e-03   8.15556982e+03  -1.24143763e-03   9.60603240e-04
    6.25534402e-03   1.57668184e+04  -1.02974465e-02   1.85184082e+04
    4.43184264e-02  -2.37068860e-04   2.32363149e-04   2.67238274e+01
   -7.34141785e+02  -1.36826131e-02  -1.16951093e-02   1.73626280e+00
   -2.37373356e-03   1.56881176e-02   4.45240736e+00   1.97632122e+01
    7.47358240e-03   3.67370062e-03  -2.05536117e-03   6.71235046e+02
    1.45517930e-03   1.65146042e+02   1.65146515e+02   1.84123810e+02
   -3.70902475e-03  -5.97808510e-03  -6.31310628e-04  -1.80768259e-02
   -5.15626045e-03  -5.77098923e-03   2.64548877e+03   2.64548047e+03
    4.90870886e-03   4.42881574e-04   1.39958085e-03   3.42940601e+03
   -7.33041016e+02   1.61363633e+04  -3.41391144e-03  -4.12627310e-03
    1.37159340e-02   1.27209687e+00  -3.47321550e-03   1.39156298e-0