In [1]:
import h2o4gpu
import time
import sys
import os
import feather
import numpy as np
import pandas as pd

In [2]:
# Problem type: True runs the classification (logistic) example,
# False runs the regression example.
classification = True

# Reader selection: True loads the data with pandas, False with feather.
use_pandas = True

In [3]:
# Data location and GLM family, both decided by the problem type.
if classification:
    # Logistic regression example: credit card data hosted on S3.
    family = "logistic"
    data_file = (
        "https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/creditcard.csv"
        if use_pandas
        else "https://s3.amazonaws.com/h2o-public-test-data/h2o4gpu/open_data/credit.feather"
    )
else:
    # Regression example: ipums data, referenced through local relative paths.
    family = "elasticnet"
    data_file = (
        "../../../h2oai-prototypes/glm-bench/ipums.csv"
        if use_pandas
        else "../../data/ipums.feather"
    )

# Share of the rows held out for validation.
valid_fraction = 0.2

# Whether the model should include an intercept term.
fit_intercept = True

# Parameters handed to the GPU GLM solver.
n_gpus = 1
n_folds = 5
n_alphas = 3
n_lambdas = 20
lambda_min_ratio = 1e-9
store_full_path = 0
verbose = 0

In [4]:
#Util to calculate logloss & rmse

def ll(actual, predicted):
    """
    Computes the element-wise log loss (negative log likelihood).
    This function computes the log loss between two numbers,
    or for each element between a pair of lists or numpy arrays.
    Predictions are clipped to [1e-15, 1 - 1e-15] before taking logs.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             The ground truth value (expected to be 0 or 1)
    predicted : same type as actual
                The predicted probability
    Returns
    -------
    score : double or numpy array of doubles
            The log loss between actual and predicted; any NaN entry
            (e.g. from NaN inputs) is reported as 0
    """
    eps = 1e-15
    # Compute in float64: integer inputs would otherwise truncate the clipped
    # values back to 0, and 1 - 1e-15 is not representable in float32 (it
    # rounds to 1.0, which makes the upper clip a no-op).
    actual = np.array(actual, dtype=np.float64)
    predicted = np.clip(np.array(predicted, dtype=np.float64), eps, 1 - eps)
    # errstate restores the previous floating point error settings even if
    # the computation raises.
    with np.errstate(all='ignore'):
        score = -(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted))
    # Keep the historical contract: NaN scores are replaced by 0.
    if isinstance(score, np.ndarray):
        score[np.isnan(score)] = 0
    elif np.isnan(score):
        score = 0
    return score

def log_loss(actual, predicted):
    """
    Mean log loss over a set of predictions.
    Averages the element-wise values produced by ``ll``.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Mean of the element-wise log loss
    """
    per_element = ll(actual, predicted)
    return np.mean(per_element)

def se(actual, predicted):
    """
    Element-wise squared error.
    Works on two numbers, or element by element on a pair of
    lists or numpy arrays.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             Ground truth value(s)
    predicted : same type as actual
                Predicted value(s)
    Returns
    -------
    score : double or array of doubles
            Square of (actual - predicted)
    """
    truth = np.array(actual)
    guess = np.array(predicted)
    residual = truth - guess
    return np.power(residual, 2)

def mse(actual, predicted):
    """
    Mean squared error.
    Averages the element-wise values produced by ``se``.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Mean of the squared errors
    """
    squared_errors = se(actual, predicted)
    return np.mean(squared_errors)

def rmse(actual, predicted):
    """
    Root mean squared error.
    Takes the square root of the value produced by ``mse``.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Square root of the mean squared error
    """
    mean_squared = mse(actual, predicted)
    return np.sqrt(mean_squared)

In [5]:
# Load the data set with the reader selected in the configuration cell.
if not use_pandas:
    print("Reading Data with Feather")
    data = feather.read_dataframe(data_file)
else:
    print("Reading Data with Pandas")
    data = pd.read_csv(data_file)
print(data.shape)

# Every column except the last goes into the feature matrix; the last column
# becomes the target vector. Both are C-ordered float32 arrays.
data_x = np.array(data.iloc[:, :-1], dtype='float32', order='C')
data_y = np.array(data.iloc[:, -1], dtype='float32', order='C')

Reading Data with Pandas
(23999, 25)


In [6]:
# Train/validation split. The data is treated as an m x n matrix
# (m = row count, n = column count).
morig, norig = data_x.shape[0], data_x.shape[1]
print("Original m=%d n=%d" % (morig, norig), flush=True)

# The last HO rows are held out for validation; the first H rows are training.
HO = int(valid_fraction * morig)
H = morig - HO
print("Size of Train rows=%d & valid rows=%d" % (H, HO), flush=True)

# Copies, so the splits do not alias the full data arrays.
train_x, valid_x = np.copy(data_x[:H]), np.copy(data_x[H:])
train_y, valid_y = np.copy(data_y[:H]), np.copy(data_y[H:])
print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1], valid_x.shape[1]))

# Append a constant column of ones to both splits when an intercept is used.
# NOTE(review): the solver is also constructed with fit_intercept=fit_intercept,
# and the recorded coefficient output has one more entry than train_x has
# columns; if GLM adds its own intercept this column is redundant -- confirm.
if fit_intercept:
    train_x = np.hstack(
        [train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)]
    )
    valid_x = np.hstack(
        [valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)]
    )
    n = train_x.shape[1]
    print("Size of Train cols=%d & valid cols=%d after adding intercept column" % (train_x.shape[1], valid_x.shape[1]))

Original m=23999 n=24
Size of Train rows=19200 & valid rows=4799
Size of Train cols=24 valid cols=24
Size of Train cols=25 & valid cols=25 after adding intercept column


In [7]:
#Choose solver
# Solver is an alias for the h2o4gpu GLM class; the next cell instantiates it
# through this name.
Solver = h2o4gpu.GLM

In [8]:
# The memory layout of the training matrix selects the solver's order flag:
# 'c' when the array is Fortran (column major) contiguous, 'r' otherwise.
fortran = train_x.flags['F_CONTIGUOUS']
print("Setting up solver", flush=True)

# Instantiate the GLM with the parameters from the configuration cell.
solver = Solver(
    family=family,
    fit_intercept=fit_intercept,
    order='c' if fortran else 'r',
    n_gpus=n_gpus,
    n_folds=n_folds,
    n_alphas=n_alphas,
    n_lambdas=n_lambdas,
    lambda_min_ratio=lambda_min_ratio,
    store_full_path=store_full_path,
    verbose=verbose,
)

Setting up solver


In [9]:
# Fit the GLM on the training split. `fit` holds whatever solver.fit returns;
# the following cells read results from it and call predict on it.
print("Solving")
fit = solver.fit(train_x, train_y)
print("Done Solving")

Solving
Done Solving


In [10]:
# Inspect the fitted model: the solution stored in x_vs_alphapure and its shape.
print("Xvsalpha")
print(fit.x_vs_alphapure)

print("np.shape(Xvsalpha)")
print(np.shape(fit.x_vs_alphapure))

# Training error per alpha, labelled by the metric of the chosen problem type.
error_train = fit.error_vs_alpha
print("logloss_train" if classification else "rmse_train")
print(error_train)

# Best lambda, alpha and tolerance values reported by the solver.
lambdas = fit.lambdas_best
print("Best lambdas", lambdas, sep="\n")

alphas = fit.alphas_best
print("Best alphas", alphas, sep="\n")

tols = fit.tols_best
print("Best tols", tols, sep="\n")

Xvsalpha
[[  2.22630342e-06  -4.85086105e-07  -8.17872882e-02  -1.10862039e-01
   -1.54789597e-01   5.13954367e-03   5.72748542e-01   6.61257505e-02
    8.27719048e-02  -7.26411492e-03   9.42052305e-02  -1.44003155e-02
   -6.42589930e-06   2.91982656e-06   4.15507202e-07  -4.52019606e-07
    9.12767348e-07   1.53325118e-06  -1.64839712e-05  -6.88376031e-06
   -3.08774770e-06  -4.16595958e-06  -1.68937606e-06  -1.87233877e-06
    1.36942859e-03  -6.61261082e-01]
 [  2.11968040e-06  -4.50975733e-07  -2.37388834e-02  -7.63612837e-02
   -9.00145248e-02   5.85425133e-03   5.60724556e-01   6.25579506e-02
    7.60950744e-02   0.00000000e+00   7.48864710e-02   0.00000000e+00
   -6.67043605e-06   3.19906439e-06   4.69731475e-07  -7.39129746e-07
    1.96886117e-06   7.15584065e-07  -1.70692419e-05  -7.29354133e-06
   -3.12508791e-06  -5.21898755e-06  -9.81220637e-07  -2.07797939e-06
    0.00000000e+00  -9.35642660e-01]
 [  2.31712875e-06  -4.52045981e-07  -3.96523178e-02  -8.53509903e-02
   -1.0

In [11]:
#Make predictions on validation
preds = fit.predict(valid_x, valid_y)
print(preds)

[[ 0.7718392   0.14170469  0.14292425 ...,  0.26786035  0.24226156
   0.1750378 ]
 [ 0.76714426  0.14424451  0.1507562  ...,  0.25439146  0.24270132
   0.18371987]
 [ 0.7737245   0.14222543  0.14817348 ...,  0.25907847  0.24351341
   0.18025337]]


In [12]:
# Report the validation metric for each alpha:
# RMSE for regression, logloss for classification.
for i in range(n_alphas):
    if not classification:
        print("RMSE for alpha = ", alphas[i])
        print(rmse(valid_y, preds[i]))
        continue
    print("Logloss for alpha = ", alphas[i])
    print(log_loss(valid_y, preds[i]))

Logloss for alpha =  [ 0.]
0.439094
Logloss for alpha =  [ 0.5]
0.439265
Logloss for alpha =  [ 1.]
0.439326
