[WIP] Adjust all algorithms to work with CuPy #75

Open · wants to merge 12 commits into base: main
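Almost every change in this diff follows one pattern: explicit np.zeros(p) / np.ones(p) allocations are replaced with the *_like variants so that array creation dispatches on the type of X via __array_function__, and CuPy inputs stay on the GPU. A minimal sketch of that dispatch, assuming NumPy >= 1.17 and a CuPy version whose *_like functions accept the shape= keyword (which these changes rely on); init_coefficients is only an illustrative name, not something from the diff:

import numpy as np
import cupy

def init_coefficients(X):
    # np.zeros_like dispatches on the type of X: a NumPy input yields a
    # NumPy vector, a CuPy input yields a CuPy vector, with no branching here.
    n, p = X.shape
    return np.zeros_like(X, shape=(p,))

type(init_coefficients(np.random.random((10, 3))))    # numpy.ndarray
type(init_coefficients(cupy.random.random((10, 3))))  # cupy.ndarray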
57 changes: 36 additions & 21 deletions dask_glm/algorithms.py
@@ -11,6 +11,7 @@
from scipy.optimize import fmin_l_bfgs_b


+from dask.array.utils import normalize_to_array
from dask_glm.utils import dot, normalize
from dask_glm.families import Logistic
from dask_glm.regularizers import Regularizer
@@ -97,7 +98,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs):
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
-    beta = np.zeros(p)
+    beta = np.zeros_like(X, shape=(p,))

    for k in range(max_iter):
        # how necessary is this recalculation?
@@ -161,7 +162,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):
"""
gradient, hessian = family.gradient, family.hessian
n, p = X.shape
beta = np.zeros(p) # always init to zeros?
beta = np.zeros_like(X, shape=(p,)) # always init to zeros?
Xbeta = dot(X, beta)

iter_count = 0
@@ -178,8 +179,10 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs):

        # should this be dask or numpy?
        # currently uses Python 3 specific syntax
-        step, _, _, _ = np.linalg.lstsq(hess, grad)
-        beta = (beta_old - step)
+        step, _, _, _ = np.linalg.lstsq(normalize_to_array(hess), normalize_to_array(grad))
+        step_like = np.empty_like(X, shape=step.shape)
+        step_like[:] = step
+        beta = (beta_old - step_like)

        iter_count += 1

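The lstsq lines above show the host round-trip used whenever a SciPy/LAPACK routine is involved: the operands are demoted to NumPy with normalize_to_array, solved on the CPU, and the solution is copied back into an array of X's type. A condensed sketch of that round trip under the same assumptions as the diff (normalize_to_array is the helper imported from dask.array.utils at the top of this file; if your dask build does not provide it, treat this as pseudocode, and newton_step is only an illustrative name):

import numpy as np
from dask.array.utils import normalize_to_array

def newton_step(X, hess, grad):
    # np.linalg.lstsq only accepts host memory, so demote the operands first...
    step_host, _, _, _ = np.linalg.lstsq(normalize_to_array(hess),
                                         normalize_to_array(grad),
                                         rcond=None)
    # ...then promote the solution back to whatever array type X uses,
    # exactly as the step_like[:] = step assignment above does.
    step = np.empty_like(X, shape=step_host.shape)
    step[:] = step_host
    return step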
@@ -225,14 +228,19 @@ def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1,
    def create_local_gradient(func):
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
-            return func(beta, X, y) + rho * (beta - z + u)
+            beta_like = np.empty_like(X, shape=beta.shape)
+            beta_like[:] = beta
+            return normalize_to_array(func(beta_like, X, y) + rho *
+                                      (beta_like - z + u))
        return wrapped

    def create_local_f(func):
        @functools.wraps(func)
        def wrapped(beta, X, y, z, u, rho):
-            return func(beta, X, y) + (rho / 2) * np.dot(beta - z + u,
-                                                         beta - z + u)
+            beta_like = np.empty_like(X, shape=beta.shape)
+            beta_like[:] = beta
+            return normalize_to_array(func(beta_like, X, y) + (rho / 2) *
+                                      np.dot(beta_like - z + u, beta_like - z + u))
        return wrapped

    f = create_local_f(pointwise_loss)
@@ -252,23 +260,23 @@ def wrapped(beta, X, y, z, u, rho):
    else:
        yD = [y]

-    z = np.zeros(p)
-    u = np.array([np.zeros(p) for i in range(nchunks)])
-    betas = np.array([np.ones(p) for i in range(nchunks)])
+    z = np.zeros_like(X, shape=(p,))
+    u = np.stack([np.zeros_like(X, shape=(p,)) for i in range(nchunks)])
+    betas = np.stack([np.ones_like(X, shape=(p,)) for i in range(nchunks)])

    for k in range(max_iter):

        # x-update step
        new_betas = [delayed(local_update)(xx, yy, bb, z, uu, rho, f=f,
                                           fprime=fprime) for
                     xx, yy, bb, uu in zip(XD, yD, betas, u)]
-        new_betas = np.array(da.compute(*new_betas))
+        new_betas = np.stack(da.compute(*new_betas), axis=0)

        beta_hat = over_relax * new_betas + (1 - over_relax) * z

        # z-update step
        zold = z.copy()
-        ztilde = np.mean(beta_hat + np.array(u), axis=0)
+        ztilde = np.mean(beta_hat + u, axis=0)
        z = regularizer.proximal_operator(ztilde, lamduh / (rho * nchunks))

        # u-update step
@@ -295,11 +303,14 @@ def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b):
    u = u.ravel()
    z = z.ravel()
    solver_args = (X, y, z, u, rho)
-    beta, f, d = solver(f, beta, fprime=fprime, args=solver_args,
+    beta, f, d = solver(f, normalize_to_array(beta),
+                        fprime=fprime, args=solver_args,
                        maxiter=200,
                        maxfun=250)

-    return beta
+    beta_like = np.empty_like(X, shape=beta.shape)
+    beta_like[:] = beta
+    return beta_like


@normalize
@@ -335,21 +346,25 @@ def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4,
        pointwise_gradient = regularizer.add_reg_grad(pointwise_gradient, lamduh)

    n, p = X.shape
-    beta0 = np.zeros(p)
+    beta0 = np.zeros_like(X, shape=(p,))

    def compute_loss_grad(beta, X, y):
-        loss_fn = pointwise_loss(beta, X, y)
-        gradient_fn = pointwise_gradient(beta, X, y)
+        beta_like = np.empty_like(X, shape=beta.shape)
+        beta_like[:] = beta
+        loss_fn = pointwise_loss(beta_like, X, y)
+        gradient_fn = pointwise_gradient(beta_like, X, y)
        loss, gradient = compute(loss_fn, gradient_fn)
-        return loss, gradient.copy()
+        return normalize_to_array(loss), normalize_to_array(gradient.copy())

    with dask.config.set(fuse_ave_width=0):  # optimizations slows this down
        beta, loss, info = fmin_l_bfgs_b(
-            compute_loss_grad, beta0, fprime=None,
+            compute_loss_grad, normalize_to_array(beta0), fprime=None,
            args=(X, y),
            iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter)

-    return beta
+    beta_like = np.empty_like(X, shape=beta.shape)
+    beta_like[:] = beta
+    return beta_like


@normalize
@@ -384,7 +399,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic,
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
-    beta = np.zeros(p)
+    beta = np.zeros_like(X, shape=(p,))
    regularizer = Regularizer.get(regularizer)

    for k in range(max_iter):
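The other recurring pattern, visible in admm() and lbfgs() above, wraps objectives for scipy.optimize.fmin_l_bfgs_b: the solver always passes a NumPy beta and expects NumPy outputs, so the wrapper promotes beta to X's array type before evaluating the loss and gradient, then demotes the results with normalize_to_array. A sketch of that wrapper shape (make_scipy_objective and loss_and_grad are illustrative stand-ins, not names from the diff):

import numpy as np
from scipy.optimize import fmin_l_bfgs_b
from dask.array.utils import normalize_to_array  # as imported by this diff

def make_scipy_objective(loss_and_grad, X, y):
    def objective(beta_host):
        # fmin_l_bfgs_b hands us a NumPy vector; mirror it on X's device first.
        beta = np.empty_like(X, shape=beta_host.shape)
        beta[:] = beta_host
        loss, grad = loss_and_grad(beta, X, y)
        # ...and return NumPy values so the Fortran solver can consume them.
        return normalize_to_array(loss), normalize_to_array(grad)
    return objective

The final promotion of the solver's NumPy result back to X's type (the beta_like blocks above) is the mirror image of this wrapper.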
62 changes: 62 additions & 0 deletions dask_glm/tests/cupy/test_admm.py
@@ -0,0 +1,62 @@
import pytest

from dask import persist
import numpy as np
import cupy

from dask.array.utils import normalize_to_array
from dask_glm.algorithms import admm, local_update
from dask_glm.families import Logistic, Normal
from dask_glm.regularizers import L1
from dask_glm.utils import cupy_make_y


@pytest.mark.parametrize('N', [1000, 10000])
@pytest.mark.parametrize('beta',
[cupy.array([-1.5, 3]),
cupy.array([35, 2, 0, -3.2]),
cupy.array([-1e-2, 1e-4, 1.0, 2e-3, -1.2])])
@pytest.mark.parametrize('family', [Logistic, Normal])
def test_local_update(N, beta, family):
M = beta.shape[0]
X = cupy.random.random((N, M))
y = cupy.random.random(N) > 0.4
u = cupy.zeros(M)
z = cupy.random.random(M)
rho = 1e7

def create_local_gradient(func):
def wrapped(beta, X, y, z, u, rho):
beta_like = np.empty_like(X, shape=beta.shape)
beta_like[:] = beta
return normalize_to_array(func(beta_like, X, y) + rho *
(beta_like - z + u))
return wrapped

def create_local_f(func):
def wrapped(beta, X, y, z, u, rho):
beta_like = np.empty_like(X, shape=beta.shape)
beta_like[:] = beta
return normalize_to_array(func(beta_like, X, y) + (rho / 2) *
np.dot(beta_like - z + u, beta_like - z + u))
return wrapped

f = create_local_f(family.pointwise_loss)
fprime = create_local_gradient(family.pointwise_gradient)

result = local_update(X, y, beta, z, u, rho, f=f, fprime=fprime)

assert np.allclose(result, z, atol=2e-3)


@pytest.mark.parametrize('N', [1000, 10000])
@pytest.mark.parametrize('p', [1, 5, 10])
def test_admm_with_large_lamduh(N, p):
X = cupy.random.random((N, p))
beta = cupy.random.random(p)
y = cupy_make_y(X, beta=cupy.array(beta))

X, y = persist(X, y)
z = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=500)

assert np.allclose(z, np.zeros(p), atol=1e-4)
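For reference on the assertion in test_admm_with_large_lamduh: the z-update in admm() applies the L1 proximal operator (soft thresholding) with threshold lamduh / (rho * nchunks), and with lamduh=1e5 that threshold dwarfs every coordinate of ztilde, so all coefficients shrink to zero. A standalone sketch of that shrinkage, with soft_threshold written out only for illustration (it is not the regularizer's method name) and nchunks=10 picked arbitrarily:

import numpy as np

def soft_threshold(v, t):
    # proximal operator of t * ||.||_1: shrink each coordinate toward 0 by t
    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)

ztilde = np.array([0.3, -0.7, 1.2])
print(soft_threshold(ztilde, t=1e5 / (20 * 10)))  # [0. 0. 0.], hence z ~ 0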
143 changes: 143 additions & 0 deletions dask_glm/tests/cupy/test_algos_families.py
@@ -0,0 +1,143 @@
import pytest

from dask import persist
import numpy as np
import cupy

from dask_glm.algorithms import (newton, lbfgs, proximal_grad,
gradient_descent, admm)
from dask_glm.families import Logistic, Normal, Poisson
from dask_glm.regularizers import Regularizer
from dask_glm.utils import sigmoid, cupy_make_y


def add_l1(f, lam):
def wrapped(beta, X, y):
return f(beta, X, y) + lam * (np.abs(beta)).sum()
return wrapped


def make_intercept_data(N, p, seed=20009):
'''Given the desired number of observations (N) and
the desired number of variables (p), creates
random logistic data to test on.'''

# set the seeds
cupy.random.seed(seed)

X = cupy.random.random((N, p + 1))
col_sums = X.sum(axis=0)
X = X / col_sums[None, :]
X[:, p] = 1
y = cupy_make_y(X, beta=cupy.random.random(p + 1))

return X, y


@pytest.mark.parametrize('opt',
[lbfgs,
newton,
gradient_descent])
@pytest.mark.parametrize('N, p, seed',
[(100, 2, 20009),
(250, 12, 90210),
(95, 6, 70605)])
def test_methods(N, p, seed, opt):
X, y = make_intercept_data(N, p, seed=seed)
coefs = opt(X, y)
p = sigmoid(X.dot(coefs))

y_sum = y.sum()
p_sum = p.sum()
assert np.isclose(y_sum, p_sum, atol=1e-1)


@pytest.mark.parametrize('func,kwargs', [
(newton, {'tol': 1e-5}),
(lbfgs, {'tol': 1e-8}),
(gradient_descent, {'tol': 1e-7}),
])
@pytest.mark.parametrize('N', [1000])
@pytest.mark.parametrize('nchunks', [1, 10])
@pytest.mark.parametrize('family', [Logistic, Normal, Poisson])
def test_basic_unreg_descent(func, kwargs, N, nchunks, family):
beta = cupy.random.normal(size=2)
M = len(beta)
X = cupy.random.random((N, M))
y = cupy_make_y(X, beta=cupy.array(beta))

X, y = persist(X, y)

result = func(X, y, family=family, **kwargs)
test_vec = cupy.random.normal(size=2)

opt = family.pointwise_loss(result, X, y)
test_val = family.pointwise_loss(test_vec, X, y)

assert opt < test_val


@pytest.mark.parametrize('func,kwargs', [
(admm, {'abstol': 1e-4}),
(proximal_grad, {'tol': 1e-7}),
])
@pytest.mark.parametrize('N', [1000])
@pytest.mark.parametrize('nchunks', [1, 10])
@pytest.mark.parametrize('family', [Logistic, Normal, Poisson])
@pytest.mark.parametrize('lam', [0.01, 1.2, 4.05])
@pytest.mark.parametrize('reg', [r() for r in Regularizer.__subclasses__()])
def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg):
beta = cupy.random.normal(size=2)
M = len(beta)
X = cupy.random.random((N, M))
y = cupy_make_y(X, beta=cupy.array(beta))

X, y = persist(X, y)

result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs)
test_vec = cupy.random.normal(size=2)

f = reg.add_reg_f(family.pointwise_loss, lam)

opt = f(result, X, y)
test_val = f(test_vec, X, y)

assert opt < test_val


@pytest.mark.parametrize('func,kwargs', [
(admm, {'max_iter': 2}),
(proximal_grad, {'max_iter': 2}),
(newton, {'max_iter': 2}),
(gradient_descent, {'max_iter': 2}),
])
def test_determinism(func, kwargs):
X, y = make_intercept_data(1000, 10)

a = func(X, y, **kwargs)
b = func(X, y, **kwargs)

assert (a == b).all()


try:
from distributed import Client
from distributed.utils_test import cluster, loop # flake8: noqa
except ImportError:
pass
else:
@pytest.mark.parametrize('func,kwargs', [
(admm, {'max_iter': 2}),
(proximal_grad, {'max_iter': 2}),
(newton, {'max_iter': 2}),
(gradient_descent, {'max_iter': 2}),
])
def test_determinism_distributed(func, kwargs, loop):
with cluster() as (s, [a, b]):
with Client(s['address'], loop=loop) as c:
X, y = make_intercept_data(1000, 10)

a = func(X, y, **kwargs)
b = func(X, y, **kwargs)

assert (a == b).all()
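End to end, the workflow these tests exercise is: generate CuPy data, fit with one of the adapted solvers, and evaluate on the device. A usage sketch mirroring make_intercept_data and test_methods above, assuming this branch is installed and a CUDA device is available (cupy_make_y and sigmoid come from dask_glm.utils, as in the tests):

import cupy
import numpy as np
from dask_glm.algorithms import newton
from dask_glm.utils import sigmoid, cupy_make_y

cupy.random.seed(20009)
X = cupy.random.random((1000, 6))
X = X / X.sum(axis=0)[None, :]   # column-normalize, as make_intercept_data does
X[:, -1] = 1                     # intercept column
y = cupy_make_y(X, beta=cupy.random.random(6))

coefs = newton(X, y)             # result stays a cupy.ndarray end to end
preds = sigmoid(X.dot(coefs))
print(float(y.sum()), float(preds.sum()))  # roughly equal, as test_methods asserts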