In [None]:
import h2ogpuml
import time
import sys
import os
import feather
import numpy as np
import pandas as pd

In [None]:
# ---------------------------------------------------------------
# Notebook configuration: everything a reader may want to tune.
# ---------------------------------------------------------------

# Reader used for the data file: pandas (CSV) when True, feather when False.
use_pandas = False

# Dataset for the logistic regression example.
#data_file = "../../data/creditcard.csv" # CSV variant, for use_pandas = True
data_file = "../../data/credit.feather"

# Dataset for the regression example (uncomment one and set classification = False).
#data_file = "../../../h2oai-prototypes/glm-bench/ipums.csv" # CSV variant, for use_pandas = True
#data_file = "../../data/ipums.feather"

# Problem type: True -> classification, False -> regression.
classification = True

# Share of the rows held out as the validation set.
valid_fraction = 0.2

# Whether an intercept term is used.
intercept = True

# Parameters handed to the GPU GLM solver constructor further down.
lambda_min_ratio = 1e-9
n_folds = 5
n_lambdas = 20
n_alphas = 3
give_full_path = 0
n_gpus = 1
verbose = 0

# The solver family follows from the problem type.
if classification:
    family = "logistic"
else:
    family = "elasticnet"

In [None]:
#Util to calculate logloss & rmse

def ll(actual, predicted):
    """
    Computes the element-wise log loss (negative log likelihood).
    This function computes the negative Bernoulli log likelihood between
    two numbers, or element by element between a pair of lists or numpy
    arrays. Predictions are clipped to [1e-15, 1 - 1e-15] before the
    logarithm is taken so that the result stays finite.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             The ground truth value
    predicted : same type as actual
                The predicted value (probability of the positive class)
    Returns
    -------
    score : double or array of doubles
            The log loss between actual and predicted; NaN entries
            (e.g. caused by NaN inputs) are reported as 0
    """
    eps = 1e-15
    # Work in float64: clipping inside an integer array truncates the bounds
    # to 0, and in float32 the upper bound 1 - 1e-15 rounds to exactly 1.0.
    actual = np.asarray(actual, dtype=np.float64)
    # np.clip returns a new array, so the caller's data is never modified,
    # and it also handles scalars (0-d input) and multi-dimensional input.
    predicted = np.clip(np.asarray(predicted, dtype=np.float64), eps, 1 - eps)
    # errstate restores the previous floating point error settings even if
    # the computation raises.
    with np.errstate(all='ignore'):
        score = -(actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted))
    if isinstance(score, np.ndarray):
        score[np.isnan(score)] = 0
    elif np.isnan(score):
        score = 0.0
    return score

def log_loss(actual, predicted):
    """
    Mean log loss over all observations.
    Delegates the per-element computation to ``ll`` and averages
    the resulting scores.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Average of the element-wise scores returned by ``ll``
    """
    per_element = ll(actual, predicted)
    return np.mean(per_element)

def se(actual, predicted):
    """
    Element-wise squared error.
    Both inputs are converted to numpy arrays, subtracted, and the
    difference is raised to the second power.
    Parameters
    ----------
    actual : int, float, list of numbers, numpy array
             Ground truth value(s)
    predicted : same type as actual
                Predicted value(s)
    Returns
    -------
    score : double or list of doubles
            Squared difference between actual and predicted
    """
    truth = np.array(actual)
    estimate = np.array(predicted)
    residual = truth - estimate
    return np.power(residual, 2)

def mse(actual, predicted):
    """
    Mean squared error.
    Averages the element-wise squared errors produced by ``se``.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Mean of the squared errors between actual and predicted
    """
    squared_errors = se(actual, predicted)
    return np.mean(squared_errors)

def rmse(actual, predicted):
    """
    Root mean squared error.
    Takes the square root of the value returned by ``mse``.
    Parameters
    ----------
    actual : list of numbers, numpy array
             Ground truth values
    predicted : same type as actual
                Predicted values
    Returns
    -------
    score : double
            Square root of the mean squared error
    """
    mean_squared = mse(actual, predicted)
    return np.sqrt(mean_squared)

In [None]:
# Read the data file with the reader selected by `use_pandas`.
if not use_pandas:
    print("Reading Data with Feather")
    data = feather.read_dataframe(data_file)
else:
    print("Reading Data with Pandas")
    data = pd.read_csv(data_file)
print(data.shape)

# Every column except the last goes into data_x, the last column into data_y;
# both are materialised as C-ordered float32 arrays.
data_x = np.array(data.iloc[:, :-1], dtype='float32', order='C')
data_y = np.array(data.iloc[:, -1], dtype='float32', order='C')

In [None]:
#Setup train/validation set split (assuming form of mxn where m=row count and n=col count)
morig = data_x.shape[0]
norig = data_x.shape[1]
print("Original m=%d n=%d" % (morig, norig))
sys.stdout.flush()

#Do train/valid split
# A fraction outside [0, 1) would silently produce negative or empty slices
# below, so reject it up front.
if not 0.0 <= valid_fraction < 1.0:
    raise ValueError("valid_fraction must be in [0, 1), got %r" % (valid_fraction,))
HO = int(valid_fraction * morig)  # number of validation (hold-out) rows
H = morig - HO                    # number of training rows
print("Size of Train rows=%d & valid rows=%d" % (H, HO))
sys.stdout.flush()
# The split is positional: the first H rows are used for training and the
# remaining HO rows for validation; rows are not shuffled.
train_x = np.copy(data_x[0:H, :])
train_y = np.copy(data_y[0:H])
valid_x = np.copy(data_x[H:morig, :])
valid_y = np.copy(data_y[H:morig])
print("Size of Train cols=%d valid cols=%d" % (train_x.shape[1], valid_x.shape[1]))

#Using intercept
if intercept:
    # Append a constant column of ones to both feature matrices.
    train_x = np.hstack([train_x, np.ones((train_x.shape[0], 1), dtype=train_x.dtype)])
    valid_x = np.hstack([valid_x, np.ones((valid_x.shape[0], 1), dtype=valid_x.dtype)])
    print("Size of Train cols=%d & valid cols=%d after adding intercept column" % (train_x.shape[1], valid_x.shape[1]))

# Final number of training columns; defined whether or not the intercept
# column was added so that later cells can rely on it.
n = train_x.shape[1]

In [None]:
#Choose solver: alias the h2ogpuml GLM class under the name `Solver`,
#which is the name the solver-setup cell below instantiates.
Solver = h2ogpuml.GLM

In [None]:
# True when train_x is Fortran (column-major) contiguous in memory.
fortran = train_x.flags['F_CONTIGUOUS']
print("Setting up solver")
sys.stdout.flush()
# `order` is 'c' for a column-major train_x and 'r' otherwise; all remaining
# arguments come straight from the configuration cell.
solver = Solver(
    n_gpus=n_gpus,
    order='r' if not fortran else 'c',
    intercept=intercept,
    lambda_min_ratio=lambda_min_ratio,
    n_lambdas=n_lambdas,
    n_folds=n_folds,
    n_alphas=n_alphas,
    verbose=verbose,
    family=family,
    give_full_path=give_full_path,
)

In [None]:
# Fit the solver on the training split. The returned object is kept as `fit`;
# the following cells read its result attributes and call fit.predict on it.
print("Solving")
fit = solver.fit(train_x, train_y)
print("Done Solving")

In [None]:
# Print the fitted results exposed by `fit`.
# NOTE(review): x_vs_alphapure is presumably the solution per alpha - confirm
# against the h2ogpuml documentation.
print("Xvsalpha")
print(fit.x_vs_alphapure)

print("np.shape(Xvsalpha)")
print(np.shape(fit.x_vs_alphapure))

# Training error per alpha, labelled by the metric matching the problem type.
error_train = fit.error_vs_alpha
print("logloss_train" if classification else "rmse_train")
print(error_train)

# Selected lambdas, alphas and tolerances; `alphas` is reused when scoring
# the validation predictions later on.
print("Best lambdas")
lambdas = fit.lambdas_best
print(lambdas)

print("Best alphas")
alphas = fit.alphas_best
print(alphas)

print("Best tols")
tols = fit.tols_best
print(tols)

In [None]:
#Make predictions on validation
# `preds` is indexed per alpha by the scoring cell below (preds[i]).
# NOTE(review): valid_y is passed to predict alongside the features - confirm
# against the h2ogpuml API what the targets are used for here.
preds = fit.predict(valid_x, valid_y)
print(preds)

In [None]:
# Score the validation predictions once per alpha: log loss for
# classification, RMSE for regression.
for i in range(n_alphas):
    print("Logloss for alpha = " if classification else "RMSE for alpha = ", alphas[i])
    print((log_loss if classification else rmse)(valid_y, preds[i]))