In [1]:
import h2o4gpu
import time
import sys
import os
import feather
import numpy as np
import pandas as pd
import sklearn

In [2]:
#Choose the reader: True -> pandas (CSV file), False -> feather
use_pandas = False

#Path to data (CSV when reading with pandas, feather file otherwise)
data_file = "../../data/creditcard.csv" if use_pandas else "../../data/credit.feather"

#Fraction of rows held out as the validation set
valid_fraction = 0.2

#Parameters for the GPU GLM solver
intercept = True          #Whether an intercept should be used
lambda_min_ratio = 1e-9   #Minimum lambda used in lambda search. Default is 1e-7.
n_folds = 5               #Number of CV folds
n_lambdas = 20            #Number of lambdas to search from min lambda to lambda max
n_alphas = 3              #Number of alphas to search from min alpha to alpha max
give_full_path = 0        #Give full path of search?
n_gpus = 1                #Number of gpus to use
verbose = 0               #Show logging of solver processes?

In [3]:
#Util to calculate logloss & rmse

def ll(actual, predicted):
    """
    Computes the element-wise log loss (negative log likelihood).
    This function computes the log loss between two numbers,
    or for each element between a pair of lists or numpy arrays.
    Predictions are clipped to [1e-15, 1 - 1e-15] before taking logs so
    that predictions of exactly 0 or 1 give a large finite loss instead
    of an infinite one.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value (a probability)
    Returns
    -------
    score : double or numpy array of doubles
            The log loss between actual and predicted; NaN entries
            are replaced by 0
    """
    eps = 1e-15
    actual = np.asarray(actual)
    # Work in float64: in an integer array the clipped value would be
    # truncated back to 0, and in float32 the upper bound 1 - 1e-15 rounds
    # to exactly 1.0, so the clipping would silently have no effect.
    predicted = np.asarray(predicted, dtype=np.float64)
    # fmax/fmin are element-wise, work for scalars (0-d input) as well as
    # arrays, and ignore NaN: a NaN prediction is mapped to eps, which is
    # what the previous per-element min(max(...)) loop did.
    predicted = np.fmin(np.fmax(predicted, eps), 1 - eps)
    # errstate restores the floating point error settings even if the
    # computation raises.
    with np.errstate(all='ignore'):
        score = -(actual*np.log(predicted)+(1-actual)*np.log(1-predicted))
    if isinstance(score, np.ndarray):
        score[np.isnan(score)] = 0
    elif np.isnan(score):
        score = 0
    return score

def log_loss(actual, predicted):
    """
    Computes the mean log loss.
    Averages the per-element values returned by ll(actual, predicted)
    into a single number.
    Parameters
    ----------
    actual : list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value
    Returns
    -------
    score : double
            The mean of the element-wise log loss
    """
    per_element = ll(actual, predicted)
    return np.mean(per_element)

def se(actual, predicted):
    """
    Computes the element-wise squared error.
    Both inputs are converted to numpy arrays, subtracted, and the
    difference is squared.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value
    Returns
    -------
    score : double or numpy array of doubles
            (actual - predicted) ** 2 for each element
    """
    residual = np.array(actual) - np.array(predicted)
    return np.power(residual, 2)

def mse(actual, predicted):
    """
    Computes the mean squared error.
    Averages the element-wise squared errors returned by
    se(actual, predicted).
    Parameters
    ----------
    actual : list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value
    Returns
    -------
    score : double
            The mean of the squared errors
    """
    squared_errors = se(actual, predicted)
    return np.mean(squared_errors)

def rmse(actual, predicted):
    """
    Computes the root mean squared error.
    Takes the square root of mse(actual, predicted).
    Parameters
    ----------
    actual : list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value
    Returns
    -------
    score : double
            The square root of the mean squared error
    """
    mean_squared = mse(actual, predicted)
    return np.sqrt(mean_squared)

In [4]:
#Read the dataset with the reader selected in the configuration cell
if not use_pandas:
    print("Reading Data with Feather")
    data = feather.read_dataframe(data_file)
else:
    print("Reading Data with Pandas")
    data = pd.read_csv(data_file)
print(data.shape)

#Every column except the last becomes the feature matrix; the last column is the target
last_col = data.shape[1] - 1
data_x = np.array(data.iloc[:, :last_col], dtype='float32', order='C')
data_y = np.array(data.iloc[:, last_col], dtype='float32', order='C')

Reading Data with Feather
(23999, 25)


In [5]:
#Split the rows into train/validation sets (data is m x n: m = row count, n = col count)
morig, norig = data_x.shape
print("Original m=%d n=%d" % (morig, norig))
sys.stdout.flush()

#The first H rows are used for training, the remaining HO rows are held out for validation
HO = int(valid_fraction * morig)
H = morig - HO
print("Size of Train rows=%d & valid rows=%d" % (H, HO))
sys.stdout.flush()
train_x, train_y = np.copy(data_x[:H]), np.copy(data_y[:H])
valid_x, valid_y = np.copy(data_x[H:]), np.copy(data_y[H:])
print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1], valid_x.shape[1]))

#When an intercept is requested, append a constant column of ones to both sets
if intercept:
    train_x = np.concatenate(
        [train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)], axis=1)
    valid_x = np.concatenate(
        [valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)], axis=1)
    n = train_x.shape[1]
    print("Size of Train cols=%d & valid cols=%d after adding intercept column" % (train_x.shape[1], valid_x.shape[1]))

Original m=23999 n=24
Size of Train rows=19200 & valid rows=4799
Size of Train cols=24 valid cols=24
Size of Train cols=25 & valid cols=25 after adding intercept column


In [6]:
#Choose solver
#Alias the estimator class; the solver setup cell instantiates it through
#this name, so another class can be substituted here in one place.
Solver = h2o4gpu.LogisticRegression

In [7]:
#Record whether the training matrix is column major (Fortran order) or row major
fortran = train_x.flags.f_contiguous
print("Setting up solver")
sys.stdout.flush()

#Build the solver from the values chosen in the configuration cell
solver = Solver(
    fit_intercept=intercept,
    lambda_min_ratio=lambda_min_ratio,
    n_lambdas=n_lambdas,
    n_alphas=n_alphas,
    n_folds=n_folds,
    n_gpus=n_gpus,
    give_full_path=give_full_path,
    verbose=verbose,
)

Setting up solver


In [8]:
#Fit the solver on the training set and report how long the fit took
print("Solving")
#perf_counter() is a monotonic, high-resolution clock intended for measuring
#elapsed time; time.time() follows the wall clock, which can be adjusted
#while the fit is running.
start = time.perf_counter()
fit = solver.fit(train_x, train_y)
end = time.perf_counter() - start #Elapsed seconds (a duration, not an end timestamp)
print("Done Solving")
print("Took %s seconds to solve." % end)

Solving
Done Solving
Took 6.482304573059082 seconds to solve.


In [9]:
# Inspect the fitted model: the x_vs_alphapure array, the error per alpha, and the
# best lambda / alpha / tolerance values reported by the solver
error_train = fit.error_vs_alpha
lambdas = fit.lambdas_best
alphas = fit.alphas_best
tols = fit.tols_best

# Each call prints a label line followed by the value on the next line
print("Xvsalpha", fit.x_vs_alphapure, sep="\n")
print("np.shape(Xvsalpha)", np.shape(fit.x_vs_alphapure), sep="\n")
print("logloss_train", error_train, sep="\n")
print("Best lambdas", lambdas, sep="\n")
print("Best alphas", alphas, sep="\n")
print("Best tols", tols, sep="\n")

Xvsalpha
[[  1.82659369e-06  -4.62984360e-07  -7.50495866e-02  -1.06810451e-01
   -1.43118232e-01   4.86549875e-03   5.64639986e-01   6.84994832e-02
    8.15961137e-02  -4.72300593e-03   9.08494368e-02  -1.17632560e-02
   -6.69613837e-06   3.32956711e-06   9.69043512e-08  -2.77160268e-08
    1.42902729e-06   7.34622631e-07  -1.70781041e-05  -7.03282331e-06
   -3.60724061e-06  -4.40247004e-06  -9.96036192e-07  -2.00349200e-06
   -6.75806701e-01]
 [  1.92904281e-06  -4.56136746e-07  -2.85321772e-02  -8.10271576e-02
   -9.75653827e-02   5.65608498e-03   5.62728584e-01   6.24704696e-02
    7.64279217e-02   0.00000000e+00   7.54820108e-02   0.00000000e+00
   -6.66864617e-06   3.13863620e-06   7.29369333e-07  -1.29122827e-06
    2.28572208e-06   7.43850819e-07  -1.70064341e-05  -7.32524541e-06
   -2.70869259e-06  -5.49549850e-06  -9.86616101e-07  -2.00756676e-06
   -8.98487151e-01]
 [  1.84303371e-06  -4.61499525e-07  -4.14962620e-02  -8.78860578e-02
   -1.14572413e-01   5.33862831e-03   5.6

In [10]:
#Make predictions on validation
#The evaluation loop indexes the result as preds[i], one entry per alpha.
#NOTE(review): the validation labels are passed to predict() as well - confirm
#the solver API needs them and that they do not influence the predictions.
preds = fit.predict(valid_x, valid_y)
print(preds)

[[ 0.768255    0.14183101  0.14306544 ...,  0.26625088  0.24452013
   0.17846589]
 [ 0.76835787  0.14378332  0.14954805 ...,  0.25559866  0.24205837
   0.17870741]
 [ 0.77359319  0.14255883  0.14806251 ...,  0.25893533  0.24191481
   0.17713974]]


In [11]:
#Report the validation logloss for each alpha
for i in range(n_alphas):
    alpha_i = alphas[i]
    print("Logloss for alpha = ", alpha_i)
    valid_logloss = log_loss(valid_y, preds[i])
    print(valid_logloss)

Logloss for alpha =  [ 0.]
0.438951469811
Logloss for alpha =  [ 0.5]
0.439302527004
Logloss for alpha =  [ 1.]
0.439180472172
