In [14]:
import h2o4gpu as h2o4gpu
from h2o4gpu.types import *

'''
Elastic Net

   minimize    (1/2) ||Ax - b||_2^2 + \alpha * \lambda ||x||_1 + 0.5 * (1-\alpha) * \lambda ||x||_2^2

   for 100 values of \lambda, and alpha in [0,1]
   See <h2o4gpu>/matlab/examples/lasso_path.m for detailed description.
'''


def elastic_net(X, y, nGPUs=0, nlambda=100, nfolds=5, nalpha=5, validFraction=0.2, family="elasticnet", verbose=0):
    """Fit an ``h2o4gpu.GLM`` on the leading rows of ``X``/``y`` and print diagnostics.

    The rows are split by position (no shuffling): the first
    ``morig - int(validFraction * morig)`` rows are used for training and all
    remaining rows form the hold-out set that is passed to ``predict``. When
    the hold-out set is empty, predictions are made on the training rows.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix; its memory layout (``X.flags.f_contiguous``)
        selects the ``'c'``/``'r'`` order flag handed to the solver.
    y : array-like
        Response values, sliced by row in step with ``X``.
    nGPUs, nlambda, nfolds, nalpha, verbose, family
        Forwarded unchanged to the ``h2o4gpu.GLM`` constructor. ``family`` also
        selects whether the reported error is labelled "rmse" or "logloss".
    validFraction : float
        Fraction of rows (taken from the end of ``X``) held out from training.

    Returns
    -------
    tuple
        ``(trainX, trainY)``: copies of the rows that were used for fitting.

    Raises
    ------
    NotImplementedError
        If standardization is switched on (it is hard-coded off below).
    """
    # choose solver
    Solver = h2o4gpu.GLM

    # Fixed solver settings that are not exposed through the signature.
    sharedA = 0
    nThreads = None  # let internal method figure this out
    intercept = 0
    standardize = 0
    lambda_min_ratio = 1e-9
    nFolds = nfolds
    nLambdas = nlambda
    nAlphas = nalpha

    if standardize:
        # Raise instead of calling exit(): exit() would tear down the whole
        # notebook kernel rather than just reporting the missing feature.
        raise NotImplementedError("implement standardization transformer")

    # Setup Train/validation Set Split
    morig = X.shape[0]
    norig = X.shape[1]
    print("Original m=%d n=%d" % (morig, norig))
    fortran = X.flags.f_contiguous
    print("fortran=%d" % fortran)

    # Do train/valid split
    HO = int(validFraction * morig)
    H = morig - HO
    print("Size of Train rows=%d valid rows=%d" % (H, HO))
    trainX = np.copy(X[0:H, :])
    trainY = np.copy(y[0:H])
    # Slice to the end of the array: a stop index of -1 would drop the last
    # hold-out row.
    validX = np.copy(X[H:, :])

    mTrain = trainX.shape[0]
    mvalid = validX.shape[0]
    print("mTrain=%d mvalid=%d" % (mTrain, mvalid))

    if intercept == 1:
        # Append a constant column of ones so the solver can learn a bias term.
        trainX = np.hstack([trainX, np.ones((trainX.shape[0], 1), dtype=trainX.dtype)])
        validX = np.hstack([validX, np.ones((validX.shape[0], 1), dtype=validX.dtype)])
        n = trainX.shape[1]
        print("New n=%d" % n)

    ## Constructor
    print("Setting up solver")
    enet = Solver(sharedA, nThreads, nGPUs, 'c' if fortran else 'r', intercept, standardize, lambda_min_ratio, nLambdas, nFolds, nAlphas, verbose=verbose, family=family)

    print("trainX")
    print(trainX)
    print("trainY")
    print(trainY)

    ## Solve
    print("Solving")
    Xvsalpha = enet.fit(trainX, trainY)
    # Other fit() call shapes that were tried during development:
    # Xvsalphalambda, Xvsalpha = enet.fit(trainX, trainY, validX, validY)
    # Xvsalphalambda, Xvsalpha = enet.fit(trainX, trainY, validX, validY, trainW)
    # Xvsalphalambda, Xvsalpha = enet.fit(trainX, trainY, validX, validY, trainW, 0)
    # givefullpath=1
    #  Xvsalphalambda, Xvsalpha = enet.fit(trainX, trainY, validX, validY, trainW, givefullpath)
    print("Done Solving")

    # show something about Xvsalphalambda or Xvsalpha
    print("Xvsalpha")
    print(Xvsalpha)
    print("np.shape(Xvsalpha)")
    print(np.shape(Xvsalpha))

    # The solver exposes a single error accessor; only the printed label
    # depends on the family.
    rmse = enet.getrmse()
    if family == "elasticnet":
        print("rmse")
        print(rmse)
    else:
        print("logloss")
        print(rmse)

    print("lambdas")
    lambdas = enet.getlambdas()
    print(lambdas)

    print("alphas")
    alphas = enet.getalphas()
    print(alphas)

    print("tols")
    tols = enet.gettols()
    print(tols)

    # Linear predictor computed by hand from the fitted coefficients, printed
    # as a reference for the solver's own predictions below.
    testvalidY = np.dot(trainX, Xvsalpha.T)
    print("testvalidY (newvalidY should be this)")
    print(testvalidY)

    print("Predicting, assuming unity weights")
    # Identity check: `validX == None` on an ndarray is an elementwise
    # comparison whose truth value is ambiguous for non-empty arrays.
    if validX is None or mvalid == 0:
        print("Using trainX for validX")
        newvalidY = enet.predict(trainX)  # for testing
    else:
        print("Using validX for validX")
        newvalidY = enet.predict(validX)
    print("newvalidY")
    print(newvalidY)

    print("Done Reporting")
    return trainX, trainY


In [19]:
import numpy as np
#from numpy.random import randn
#  m=1000
#  n=100
#  A=randn(m,n)
#  x_true=(randn(n)/n)*float64(randn(n)<0.8)
#  b=A.dot(x_true)+0.5*randn(m)
import pandas as pd
import feather

# NOTE: cd ~/h2oai-prototypes/glm-bench/ ; gunzip ipums.csv.gz ; Rscript h2oai-prototypes/glm-bench/ipums.R to produce ipums.feather
# Alternative inputs used during development (swap in for the read below):
#   feather.read_dataframe("../../../h2oai-prototypes/glm-bench/ipums.feather")
#   pd.read_csv("../cpp/train.txt", sep=" ", header=None)
#   pd.read_csv("../cpp/simple.txt", sep=" ", header=None)
#   pd.read_csv("Hyatt_Subset.csv")
#   pd.read_csv("Hyatt_Subset.nohead.csv")
df = feather.read_dataframe("../../../h2oai-prototypes/glm-bench/credit.feather")
print(df.shape)

# Every column except the last is used as a feature; the last column is the
# response. Both are materialised as C-ordered float32 arrays.
X = np.array(df.iloc[:, :-1], dtype='float32', order='C')
y = np.array(df.iloc[:, -1], dtype='float32', order='C')

# Logistic fit on one GPU using every row for training (validFraction=0).
# The returned (trainX, trainY) pair is kept in `x_y` for later cells.
x_y = elastic_net(
    X, y,
    nGPUs=1, nlambda=100, nfolds=1, nalpha=1,
    validFraction=0, family="logistic", verbose=0,
)
# Other configurations that were tried:
#   elastic_net(X, y, nGPUs=2, nlambda=100, nfolds=5, nalpha=5, validFraction=0.2)
#   elastic_net(X, y, nGPUs=0, nlambda=100, nfolds=1, nalpha=1, validFraction=0)

(23999, 25)
Original m=23999 n=24
fortran=0
Size of Train rows=23999 valid rows=0
mTrain=23999 mvalid=0
Setting up solver

Using GPU GLM solver with 1 GPUs

trainX
[[  1.00000000e+00   2.00000000e+04   2.00000000e+00 ...,   0.00000000e+00
    0.00000000e+00   0.00000000e+00]
 [  2.00000000e+00   1.20000000e+05   2.00000000e+00 ...,   1.00000000e+03
    0.00000000e+00   2.00000000e+03]
 [  3.00000000e+00   9.00000000e+04   2.00000000e+00 ...,   1.00000000e+03
    1.00000000e+03   5.00000000e+03]
 ..., 
 [  2.39970000e+04   2.00000000e+04   1.00000000e+00 ...,   6.06000000e+02
    5.00000000e+02   1.00000000e+03]
 [  2.39980000e+04   1.00000000e+04   1.00000000e+00 ...,   3.20000000e+02
    1.82000000e+03   1.00000000e+03]
 [  2.39990000e+04   2.00000000e+04   1.00000000e+00 ...,   1.00000000e+03
    0.00000000e+00   0.00000000e+00]]
trainY
[ 1.  1.  0. ...,  0.  0.  0.]
Solving
Done Solving
Xvsalpha
[[  6.37776679e-07   3.02116332e-09   2.79653966e-02   7.12255016e-04
    3.50422710e-02

In [21]:
import h2o
# Connect to (or start) the local H2O instance.
h2o.init()

# Unpack the (trainX, trainY) pair returned by elastic_net.
trainX, trainY = x_y

# Upload features and response as separate H2O frames, then append the
# response to the right of the feature columns.
h2o_df = h2o.H2OFrame(trainX)
y = h2o.H2OFrame(trainY)  # NOTE: rebinds `y` (previously the numpy response array)
h2o_df = h2o_df.cbind(y)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,14 mins 42 secs
H2O cluster version:,3.10.4.8
H2O cluster version age:,2 months and 11 days
H2O cluster name:,H2O_from_python_navdeep_ma4ybp
H2O cluster total nodes:,1
H2O cluster free memory:,21.25 Gb
H2O cluster total cores:,40
H2O cluster allowed cores:,40
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [9]:
# Show the combined frame; in the printed header the appended response column
# is named "C110", after the feature columns C1..C24.
print(h2o_df)
# Replace the response column with its factor (categorical) version.
# NOTE(review): this mutates h2o_df in place, so the frame printed above no
# longer reflects the column type after this cell runs.
h2o_df["C110"] = h2o_df["C110"].asfactor()

C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C110
6.37777e-07,6.04233e-05,0.0559308,0.00142451,0.0350423,0.104108,0.19575,0.0404353,-0.0144231,0.00528441,-0.0287625,0.00171544,-0.00302385,0.000586489,-8.49447e-06,0.0,0.0,0.0,0.0,-0.000223817,0.0,0.0,0.0,0.0,1
1.27555e-06,0.00036254,0.0559308,0.00142451,0.0700845,0.112784,-0.097875,0.0404353,0.0,0.0,0.0,-0.00171544,-0.00207257,0.000326142,-3.30656e-05,0.000513597,-0.000357343,0.000367918,0.0,-0.000324844,-0.000545739,-0.000215467,0.0,-0.000475551,1
1.91333e-06,0.000271905,0.0559308,0.00142451,0.0700845,0.147487,0.0,0.0,0.0,0.0,0.0,0.0,-0.022595,0.00265206,-0.000167165,0.0022495,-0.00154604,0.00175429,-0.00143525,-0.000487266,-0.000545739,-0.000215467,-0.000372314,-0.00118888,0
2.55111e-06,0.000151058,0.0559308,0.00142451,0.0350423,0.1605,0.0,0.0,0.0,0.0,0.0,0.0,-0.0363125,0.00911931,-0.000607694,0.00444437,-0.00299517,0.0033336,-0.00189097,-0.00065586,-0.000654886,-0.000237014,-0.000398004,-0.000237775,0
3.18888e-06,0.000151058,0.0279654,0.00142451,0.0350423,0.247257,-0.097875,0.0,-0.0144231,0.0,0.0,0.0,-0.00665896,0.00107202,-0.000441799,0.0032869,-0.00198023,0.00215843,-0.00189097,-0.0119156,-0.00545739,-0.0019392,-0.000256525,-0.00016145,0
3.82666e-06,0.000151058,0.0279654,0.000712255,0.0700845,0.1605,0.0,0.0,0.0,0.0,0.0,0.0,-0.0497664,0.0107899,-0.000710232,0.00304423,-0.00202915,0.00225918,-0.00236372,-0.000589591,-0.00035855,-0.000215467,-0.000372314,-0.00019022,0
4.46444e-06,0.00151058,0.0279654,0.000712255,0.0700845,0.125797,0.0,0.0,0.0,0.0,0.0,0.0,-0.284352,0.0779003,-0.00548636,0.0851788,-0.0499559,0.0534721,-0.0520018,-0.0129937,-0.0207381,-0.00436084,-0.00511932,-0.00327417,0
5.10221e-06,0.000302116,0.0559308,0.00142451,0.0700845,0.0997703,0.0,-0.0202176,-0.0144231,0.0,0.0,0.000857718,-0.00917742,7.18458e-05,-7.40955e-06,3.46898e-05,1.6445e-05,6.3971e-05,-0.000359285,-0.000195231,0.0,-0.000125186,-0.000628094,-0.00036665,0
5.73999e-06,0.000422963,0.0559308,0.00213677,0.0350423,0.12146,0.0,0.0,0.0288461,0.0,0.0,0.0,-0.00872071,0.0026651,-0.000149276,0.00191673,-0.00121972,0.000419591,-0.00314753,0.0,-0.000235759,-0.000215467,-0.000372314,-0.000237775,0
6.37777e-06,6.04233e-05,0.0279654,0.00213677,0.0700845,0.151824,-0.19575,-0.0404353,-0.0288461,0.0105688,-0.0143812,0.000857718,0.0,0.0,0.0,0.0,-0.00134529,0.0015696,0.0,0.0,0.0,-0.00280258,-0.000417737,0.0,0





In [10]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
# Binomial GLM trained on the whole frame: every column except the last is a
# predictor, and "C110" (the last column) is the response.
h2o_model = H2OGeneralizedLinearEstimator(family="binomial")
h2o_model.train(
    x=h2o_df.columns[:-1],
    y="C110",
    training_frame=h2o_df,
)


glm Model Build progress: |███████████████████████████████████████████████| 100%


In [9]:
# Bare expression: the notebook renders the trained model's summary as the cell output.
h2o_model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1501618727152_1


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.146489020603
RMSE: 0.382738841251
LogLoss: 0.468923868548
Null degrees of freedom: 23998
Residual degrees of freedom: 23975
Null deviance: 25514.4486641
Residual deviance: 22507.4078426
AIC: 22555.4078426
AUC: 0.721087149974
Gini: 0.442174299949
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.276534117996: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,16123.0,2507.0,0.1346,(2507.0/18630.0)
1,2646.0,2723.0,0.4928,(2646.0/5369.0)
Total,18769.0,5230.0,0.2147,(5153.0/23999.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2765341,0.5138221,199.0
max f2,0.0971304,0.5943267,343.0
max f0point5,0.3900551,0.5652477,150.0
max accuracy,0.4139961,0.8146173,140.0
max precision,0.7131259,0.7453184,39.0
max recall,0.0008697,1.0,399.0
max specificity,0.9908562,0.9995706,0.0
max absolute_mcc,0.3900551,0.3953803,150.0
max min_per_class_accuracy,0.2157707,0.6596350,247.0


Gains/Lift Table: Avg response rate: 22.37 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100004,0.7211343,3.3338153,3.3338153,0.7458333,0.7458333,0.0333395,0.0333395,233.3815267,233.3815267
,2,0.0200008,0.6384574,3.2406919,3.2872536,0.725,0.7354167,0.0324083,0.0657478,224.0691935,228.7253601
,3,0.0300013,0.6061510,3.2965659,3.2903577,0.7375,0.7361111,0.0329670,0.0987148,229.6565934,229.0357712
,4,0.0400017,0.5772870,3.3151906,3.2965659,0.7416667,0.7375,0.0331533,0.1318681,231.5190600,229.6565934
,5,0.0500021,0.5526400,3.2220673,3.2816662,0.7208333,0.7341667,0.0322220,0.1640901,222.2067269,228.1666201
,6,0.1000042,0.4543369,2.8532989,3.0674825,0.6383333,0.68625,0.1426709,0.3067610,185.3298876,206.7482539
,7,0.1500063,0.3665800,2.1045873,2.7465175,0.4708333,0.6144444,0.1052337,0.4119948,110.4587291,174.6517456
,8,0.2000083,0.2872431,1.4825234,2.4305190,0.3316667,0.54375,0.0741293,0.4861240,48.2523437,143.0518951
,9,0.3000125,0.2424670,0.9777950,1.9462776,0.21875,0.4354167,0.0977836,0.5839076,-2.2205020,94.6277628



Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iteration,negative_log_likelihood,objective
,2017-08-01 13:19:31,0.000 sec,0,12757.2243320,0.5315732
,2017-08-01 13:19:31,0.084 sec,1,11332.2686168,0.4724727
,2017-08-01 13:19:31,0.091 sec,2,11257.7179512,0.4694486
,2017-08-01 13:19:31,0.101 sec,3,11254.0082798,0.4693228
,2017-08-01 13:19:31,0.107 sec,4,11253.7039213,0.4693187


