In [1]:
#%reset -f
%matplotlib inline
import os
import sys
import pandas as pd
import numpy as np
import feather
import time

import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
sys.path.insert(0, "/home/arno/glmnet_python/lib")
from glmnet import glmnet
from glmnetPlot import glmnetPlot 
from glmnetPrint import glmnetPrint
from glmnetCoef import glmnetCoef
from glmnetPredict import glmnetPredict
from cvglmnet import cvglmnet
from cvglmnetCoef import cvglmnetCoef
from cvglmnetPlot import cvglmnetPlot
from cvglmnetPredict import cvglmnetPredict

import pogs as pogs


Loaded POGS CPU library

Loaded POGS GPU library.


### Import Data Frame and create raw X and y arrays

In [2]:
t0 = time.time()
if not os.path.exists("../R/ipums_200-2015.csv"):
    print("Please download https://drive.google.com/file/d/0By-R0tLVMSykckNocHZPeEN1VU0/view?usp=sharing")
if not os.path.exists("ipums.feather"):
    !R -f ../R/ipums_prep.R 
df = feather.read_dataframe("ipums.feather")
#df = pd.read_csv("../R/data.csv")
t1 = time.time()
print("Time to read data via feather: %r" % (t1-t0))

Please download https://drive.google.com/file/d/0By-R0tLVMSykckNocHZPeEN1VU0/view?usp=sharing
Time to read data via feather: 4.056962490081787


In [3]:
# The response is the last column of the frame; every column whose name
# differs from it is used as a predictor.
column_names = list(df.columns)
target = column_names[-1]
cols = [name for name in column_names if name != target]

In [4]:
# Materialise the predictor columns as a NumPy array and pull the response
# values out of the frame, then show both shapes as a sanity check.
# `DataFrame.ix` was deprecated in pandas 0.20 and removed in pandas 1.0;
# `.loc` performs the same label-based column selection for this list of names.
X = np.array(df.loc[:, cols])
y = df[target].values
print(X.shape)
print(y.shape)

(55776, 9732)
(55776,)


### H2O AI GLM using the GPU Pointers

In [6]:
# --- Solver settings (passed to the POGS solver constructor / fit below) ---
sourceDev = 0
gpu = False
nGPUs = 2
intercept = 1
lambda_min_ratio = 1e-4
nLambdas = 100
nAlphas = 1

# --- Sequential split: the leading 80% of the rows train, the remainder validates ---
H = int(0.8 * X.shape[0])
trainX, validX = X[:H, :], X[H:, :]
trainY, validY = y[:H], y[H:]

## TODO: compute these in C++ (CPU or GPU)
# Largest absolute entry of X^T y on the training rows, plus the standard
# deviation and mean of the training response.
lambda_max0 = max(abs(trainX.T.dot(trainY)))
sdTrainY = np.sqrt(np.var(trainY))
meanTrainY = np.mean(trainY)

# Row counts of both partitions and the shared column count.
mTrain, n = trainX.shape
mValid = validX.shape[0]
print(mTrain, mValid, n, sep="\n")

44620
11156
9732


In [7]:
# Choose the POGS elastic-net solver class from the `gpu` flag, then build it
# with the settings defined above.
# NOTE(review): the meaning of the 'r' argument is not visible here — confirm
# against the pogs API.
if gpu:
    Solver = pogs.ElasticNetSolverGPU
else:
    Solver = pogs.ElasticNetSolverCPU
enet = Solver(nGPUs, 'r', intercept, lambda_min_ratio, nLambdas, nAlphas)

In [8]:
# Hand the train/validation arrays to the solver; the four returned handles are
# passed back into enet.fit in the next cell.
# NOTE(review): `c` is rebound to glmnet coefficients further down in this
# notebook, so re-run this cell before re-running enet.fit out of order.
a,b,c,d = enet.upload_data(sourceDev, trainX, trainY, validX, validY)

c_void_p(140321571274768)
c_void_p(41163440)
c_void_p(1260111680)
c_void_p(4385220864)


In [None]:
# Measure the wall-clock time of the POGS elastic-net fit on the uploaded data.
t0 = time.time()
enet.fit(
    sourceDev,
    mTrain, n, mValid,
    lambda_max0, sdTrainY, meanTrainY,
    a, b, c, d,
)
t1 = time.time()
print("Time to train H2O AI GLM: %r" % (t1 - t0,))

### GLMNET

In [None]:
# glmnet is fed float64 copies of the full design matrix and response.
# `scipy.array` was only an alias of `numpy.array`; those root-namespace aliases
# have been deprecated since SciPy 1.4 and are not available in newer releases,
# so call NumPy directly (same result).
Xscipy = np.array(X, dtype='float64')
yscipy = np.array(y, dtype='float64')

In [None]:
# Measure the wall-clock time of a Gaussian-family glmnet fit (alpha=0.5,
# 100 lambda values requested) on the full arrays.
t0 = time.time()
fit = glmnet(
    x=Xscipy,
    y=yscipy,
    family="gaussian",
    alpha=0.5,
    nlambda=100,
)
t1 = time.time()
print("Time to train glmnet: %r" % (t1 - t0,))

In [None]:
# Take the last column of the glmnet coefficient matrix, skipping its first row
# (the commented-out prediction further down adds that first-row entry
# separately as a constant term).
# NOTE(review): this rebinds `c`, which earlier held one of the handles returned
# by enet.upload_data and is passed to enet.fit — consider a distinct name.
coef_matrix = glmnetCoef(fit)
c = coef_matrix[1:, -1]
print(c)

In [None]:
# Plot the glmnet fit with 'lambda' as the x variable and labelling switched on.
glmnetPlot(fit, xvar = 'lambda', label = True)

In [None]:
# Print glmnet's own textual summary of the fit object.
glmnetPrint(fit)

In [None]:
# Predict responses for every row at the single penalty value s=0.005466 and
# flatten the result to 1-D.
# `scipy.float64` was only an alias of `numpy.float64` (deprecated since
# SciPy 1.4 and not available in newer releases); build the one-element
# float64 array with NumPy instead.
glmpred = glmnetPredict(fit, Xscipy, ptype = 'response', s = np.array([0.005466], dtype=np.float64)).reshape(-1)

In [None]:
# Display the float64 response array (for eyeballing against the predictions).
yscipy

In [None]:
# Display the flattened glmnet predictions computed above.
glmpred

In [None]:
#pp = np.dot(Xscipy, c) + glmnetCoef(fit)[0, -1]
#np.sqrt(np.mean(np.square(pp - yscipy)))

In [None]:
# Manual linear prediction from the last column of the glmnet coefficient
# matrix: design matrix times the coefficients below the first row, plus the
# first-row entry as the constant term.
# `pp` was previously never assigned (its only definition is commented out in
# the cell above), so this cell raised NameError on a fresh kernel. The
# coefficients are read from `fit` directly rather than from `c`, because `c`
# is also used for a solver handle earlier in the notebook.
pp = np.dot(Xscipy, glmnetCoef(fit)[1:, -1]) + glmnetCoef(fit)[0, -1]
pp

In [None]:
# Root-mean-square error of the glmnet predictions against the response.
# `glmpred` is already flattened to 1-D when it is created, so `glmpred[0]` was
# a single scalar prediction being compared against every response value; the
# element-wise residual needs the whole prediction vector.
np.sqrt(np.mean(np.square(glmpred - yscipy)))

### Vowpal Wabbit

In [None]:
if not os.path.exists("train.vw"):
    vw = np.concatenate([y.reshape(y.shape[0],1),X], axis=1)
    np.savetxt("train.vw", vw, delimiter=" ", fmt="%g")
    !sed -i -e 's/ / |/' train.vw

In [None]:
t0 = time.time()
!./vw-8.20170116 -d train.vw #--l1 1 --l2 1 --ftrl --passes 10 --cache_file cache.vw
t1 = time.time()
print("Time to run one model through Vowpal Wabbit: %r" % (t1-t0))

In [None]:
# Square root of a hard-coded number.
# NOTE(review): presumably the average squared loss reported by the Vowpal
# Wabbit run above, pasted in by hand — confirm, and ideally capture it
# programmatically so the value cannot go stale.
np.sqrt(1639961193.259483)