In [1]:
import h2o4gpu
import h2o4gpu.util.import_data as io
import h2o4gpu.util.metrics as metrics
from tabulate import tabulate
from h2o4gpu.solvers.linear_regression import LinearRegression
import pandas as pd
import os, sys

In [8]:
"""
Import Data for H2O GPU Edition

This function will read in data and prepare it for H2O4GPU's GLM solver

Parameters
----------
data_path : str
             A path to a dataset (The dataset needs to be all numeric)
use_pandas : bool
              Indicate if Pandas should be used to parse
intercept : bool
              Indicate if intercept term is needed
valid_fraction : float
                  Percentage of dataset reserved for a validation set
classification : bool
                  Classification problem?
Returns
-------
If valid_fraction > 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    valid_x: numpy array of valid input variables
    valid_y: numpy array of valid y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
If valid_fraction == 0 it will return the following:
    train_x: numpy array of train input variables
    train_y: numpy array of y variable
    family : string that would either be "logistic" if classification is set to True, otherwise "elasticnet"
"""
import os
if not os.path.exists("ipums_1k.csv"):
    !wget https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/ipums_1k.csv
train_x,train_y,valid_x,valid_y,family=io.import_data(data_path="ipums_1k.csv", 
                                                        use_pandas=True, 
                                                        intercept=True,
                                                        valid_fraction=0.2,
                                                        classification=False)

Reading Data with Pandas
(999, 9733)
Original m=999 n=9732
Size of Train rows=800 & valid rows=199
Size of Train cols=9732 valid cols=9732
Size of Train cols=9733 & valid cols=9733 after adding intercept column


In [9]:
"""
Set up instance of H2O4GPU's GLM solver with default parameters

Need to pass in `family` to indicate problem type to solve
"""
print("Setting up solver")
model = LinearRegression()

Setting up solver


In [12]:
"""
Fit Linear Regression Solver
"""
print("Solving")
%time model.fit(train_x, train_y)
print("Done Solving")

Solving
CPU times: user 3.57 s, sys: 4.25 s, total: 7.82 s
Wall time: 9.43 s
Done Solving


In [5]:
"""
Make predictions on validation set
"""
print("Predictions per alpha")
preds = model.predict(valid_x)
print(preds)

Predictions per alpha
[[  5.89144754e+00   1.13267566e+03   6.29999883e+04   6.59062411e-04
    1.84079254e-04   1.24079944e-03   5.32921113e-04   7.96878594e-05
    1.60439056e-03   1.09333740e+04  -2.98141211e-04  -1.17179035e-04
   -7.60835654e-04   1.70000098e+04  -1.99286174e-03   2.09998965e+04
    4.80395215e+03   2.33241566e-03  -8.13281105e-04   1.05562675e+02
   -1.16573608e+03  -8.37087282e-05   1.42230908e-03   5.89264679e+00
   -8.03512870e-04   7.82845775e-04   1.78153267e+01   3.51782056e+03
   -7.56430149e-04   2.17146997e-04   3.01949005e-03   1.68121606e+03
    1.24978181e-03   6.65558044e+02   6.65558472e+02   7.39584412e+02
   -1.86518743e-03  -1.08923845e-03  -9.00257321e-04  -3.39461910e-03
   -9.70140973e-04  -2.09905044e-03   2.99995117e+03   2.99994995e+03
    3.31795163e-04   1.76473870e-04  -9.76434851e-04   3.27940781e+04
   -1.16139160e+03   1.70000020e+04  -1.00740243e-03  -5.56127401e-04
    2.79090251e-03   9.76173340e+03  -8.03297968e-04  -8.05274758e-0

In [6]:
# Drop the notebook's reference to the fitted solver. Presumably this lets its
# resources (e.g. GPU memory) be reclaimed — TODO confirm.
del model