In [1]:
import os
import shutil
import urllib.request

import h2o4gpu
import h2o4gpu.util.import_data as io
import h2o4gpu.util.metrics as metrics
from tabulate import tabulate
import pandas as pd

In [2]:
"""
Import Data for H2O GPU Edition

This function will read in data and prepare it for H2O4GPU's GLM solver

Parameters
----------
data_path : str
             A path to a dataset (The dataset needs to be all numeric)
use_pandas : bool
              Indicate if Pandas should be used to parse
intercept : bool
              Indicate if intercept term is needed
valid_fraction : float
                  Percentage of dataset reserved for a validation set
classification : bool
                  Classification problem?
Returns
-------
If valid_fraction > 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    valid_x: numpy array of valid input variables
    valid_y: numpy array of valid y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
If valid_fraction == 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
"""
import os
if not os.path.exists("ipums_1k.csv"):
    !wget https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/ipums_1k.csv
train_x,train_y,valid_x,valid_y,family=io.import_data(data_path="ipums_1k.csv", 
                                                        use_pandas=True, 
                                                        intercept=True,
                                                        valid_fraction=0.2,
                                                        classification=True)

Reading Data with Pandas
(999, 9733)
Original m=999 n=9732
Size of Train rows=800 & valid rows=199
Size of Train cols=9732 valid cols=9732
Size of Train cols=9733 & valid cols=9733 after adding intercept column


In [3]:
"""
Set up instance of H2O4GPU's Lasso solver with default parameters

Need to pass in `family` to indicate problem type to solve
"""
print("Setting up solver")
model = h2o4gpu.Lasso()

Setting up solver
Running h2o4gpu Lasso Regression


In [4]:
"""
Fit Lasso Solver
"""
print("Solving")
%time model.fit(train_x, train_y)
print("Done Solving")

Solving
CPU times: user 17.6 s, sys: 4.13 s, total: 21.8 s
Wall time: 21.8 s
Done Solving


In [5]:
"""
Make predictions on validation set
"""
preds = model.predict(valid_x)
print(preds)

[[  1.09889756e+04   3.75698984e+04   5.33906016e+04   3.94701289e+04
    1.99471836e+04   3.26172285e+04   1.46580322e+04   3.38587031e+04
    2.57400723e+04   5.38813477e+04  -4.55054321e+02   6.01770312e+03
    1.17396279e+04   1.83504414e+04   5.06654219e+04   4.08623359e+04
    1.75025195e+04   1.76822148e+04   1.50483564e+04   1.10771023e+05
    4.53170215e+03   2.32612793e+04   1.49927441e+04   2.93769385e+03
    7.86341357e+03   3.90922188e+04   4.21069492e+04   7.68087578e+04
    2.41864727e+04   8.20444031e+01   2.97903848e+04   2.61350742e+04
    1.56964771e+03   1.34936582e+04  -5.14499939e+02  -3.15045837e+02
    1.40303164e+04   1.81600508e+04   1.19846484e+04   7.00190156e+04
   -3.97029370e+03  -1.00706309e+04   4.59641484e+04   2.59598672e+04
   -1.22267793e+04   1.15692803e+04   3.72865547e+04   3.06964609e+04
    2.13391680e+04   3.15113086e+04   1.19520117e+04   2.00309551e+04
    2.74033320e+04   2.17361055e+04  -2.11075391e+03   3.58349072e+03
   -6.49273071e+02  