In [None]:
#%reset -f
%matplotlib inline
import os
import sys
import pandas as pd
import numpy as np
import feather
import time

import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
# The glmnet_python modules imported below are loaded from this directory.
# Set the GLMNET_PYTHON_LIB environment variable to point at a different
# checkout; the original hard-coded location is kept as the default.
sys.path.insert(0, os.environ.get("GLMNET_PYTHON_LIB", "/home/arno/glmnet_python/lib"))
from glmnet import glmnet
from glmnetPlot import glmnetPlot 
from glmnetPrint import glmnetPrint
from glmnetCoef import glmnetCoef
from glmnetPredict import glmnetPredict
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
from cvglmnetPredict import cvglmnetPredict

import pogs as pogs

### Import Data Frame and create raw X and y arrays

In [None]:
t0 = time.time()
if not os.path.exists("ipums.feather"):
    !R -f ../R/ipums_prep.R
df = feather.read_dataframe("ipums.feather")
#df = pd.read_csv("../R/data.csv")
t1 = time.time()
print("Time to read data via feather: %r" % (t1-t0))

In [None]:
# The response is the final column of the frame; every other column is a predictor.
target = df.columns[-1]
cols = [name for name in df.columns if name != target]

In [None]:
# Materialise the predictors and the response as NumPy arrays and show their shapes.
# .loc replaces the .ix indexer, which pandas deprecated in 0.20 and removed in 1.0;
# selecting columns by label is exactly what .loc does.
X = np.array(df.loc[:, cols])
y = df[target].values
print(X.shape)
print(y.shape)

### H2O AI GLM

In [None]:
import elastic_net as enet

In [None]:
# Time one H2O AI elastic-net run, called with gpu=True, double_precision=True,
# nlambda=100 and alpha=0.5. The returned value is not kept.
start = time.time()
enet.ElasticNet(X, y, gpu=True, double_precision=True, nlambda=100, alpha=0.5)
elapsed = time.time() - start
print("Time to train H2O AI GLM: %r" % elapsed)

### H2O AI GLM using the GPU Pointers

In [None]:
#gpu_ptr_X, gpu_ptr_y = upload_to_gpu(X, y, w_dev)
#h2oglm = H2OAIGLM(valid_fraction=0.2, alpha=[0,0.1,0.2], lambda_search=True, nGPUs=8)
#h2oglm.fit(gpu_ptr_X, gpu_ptr_y, w_dev)

### GLMNET

In [None]:
# glmnet is fed float64 copies of X and y. The "scipy arrays" it expects are
# ordinary NumPy ndarrays: scipy.array was only a re-export of numpy.array,
# deprecated by SciPy and absent from newer releases, so NumPy is used directly.
Xscipy = np.array(X, dtype='float64')
yscipy = np.array(y, dtype='float64')

In [None]:
# Time a glmnet fit (family="gaussian", alpha=0.5, nlambda=100).
# `fit` is reused by the cells that follow.
start = time.time()
fit = glmnet(x=Xscipy, y=yscipy, family="gaussian", alpha=0.5, nlambda=100)
elapsed = time.time() - start
print("Time to train glmnet: %r" % elapsed)

In [None]:
# Take the last column of the coefficient matrix and drop its first row
# (presumably the intercept row and the final lambda on the path; confirm
# against the glmnetCoef documentation).
coef_matrix = glmnetCoef(fit)
c = coef_matrix[1:, -1]
print(c)

In [None]:
# Plot the fitted coefficients with lambda as the x variable and label=True.
glmnetPlot(fit, xvar='lambda', label=True)

In [None]:
# Print the fitted glmnet object via glmnetPrint.
glmnetPrint(fit)

In [None]:
# Predict on the training matrix at one explicitly chosen lambda value and
# flatten the result to 1-D.
# The lambda is passed as a float64 NumPy array; scipy.float64 was only a
# re-export of the NumPy type, deprecated by SciPy and absent from newer releases.
s_value = np.array([0.005466], dtype=np.float64)
glmpred = glmnetPredict(fit, Xscipy, ptype='response', s=s_value).reshape(-1)

In [None]:
# Display the float64 response vector.
yscipy

In [None]:
# Display the flattened glmnet predictions.
glmpred

In [None]:
#pp = np.dot(Xscipy, c) + glmnetCoef(fit)[0, -1]
#np.sqrt(np.mean(np.square(pp - yscipy)))

In [None]:
# Manual prediction from the coefficients in `c`: X·c plus glmnetCoef(fit)[0, -1]
# as the offset term.
# `pp` was previously assigned only in commented-out code, so displaying it
# raised NameError on a fresh kernel; it is now defined here.
pp = np.dot(Xscipy, c) + glmnetCoef(fit)[0, -1]
pp

In [None]:
# RMSE of the glmnet predictions against the observed response.
# glmpred has already been flattened to 1-D, so glmpred[0] is a single scalar
# prediction; subtracting it from yscipy compared one prediction with every
# response. The residual must use the whole prediction vector.
np.sqrt(np.mean(np.square(glmpred - yscipy)))

### Vowpal Wabbit

In [None]:
if not os.path.exists("train.vw"):
    vw = np.concatenate([y.reshape(y.shape[0],1),X], axis=1)
    np.savetxt("train.vw", vw, delimiter=" ", fmt="%g")
    !sed -i -e 's/ / |/' train.vw

In [None]:
t0 = time.time()
!./vw-8.20170116 -d train.vw #--l1 1 --l2 1 --ftrl --passes 10 --cache_file cache.vw
t1 = time.time()
print("Time to run one model through Vowpal Wabbit: %r" % (t1-t0))

In [None]:
# Square root of a hard-coded number, presumably the squared-error loss copied
# from the Vowpal Wabbit output above and converted to an RMSE.
# TODO confirm where the value came from; it will go stale if the run changes.
np.sqrt(1639961193.259483)