Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
Work taken from combined effort with @mcg1969
  • Loading branch information
mrocklin committed Nov 18, 2016
0 parents commit 4a23b9a
Show file tree
Hide file tree
Showing 10 changed files with 215 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
30 changes: 30 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
language: python
sudo: false

env:
  matrix:
    - PYTHON=2.7
    - PYTHON=3.5

install:
  # Install conda
  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
  - bash miniconda.sh -b -p $HOME/miniconda
  - export PATH="$HOME/miniconda/bin:$PATH"
  - conda config --set always_yes yes --set changeps1 no
  - conda update conda

  # Install dependencies
  - conda create -n test-environment python=$PYTHON
  - source activate test-environment
  - conda install -c conda-forge numpy dask

  # Install dask-glm
  - pip install --no-deps -e .

script:
  # fix: the package directory is dask_glm (underscore); the previous
  # targets "dask-glm" and "dask" do not exist in this repository.
  - py.test dask_glm
  - flake8 dask_glm

notifications:
  email: false
28 changes: 28 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Copyright (c) 2016, Continuum Analytics, Inc. and contributors
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

Neither the name of Continuum Analytics nor the names of any contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.
9 changes: 9 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
recursive-include dask_glm *.py
recursive-include docs *.rst

include setup.py
include README.rst
include LICENSE.txt
include MANIFEST.in

prune docs/_build
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Generalized Linear Models in Dask
=================================

*This library is not ready for use.*
1 change: 1 addition & 0 deletions dask_glm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .gradient import gradient
75 changes: 75 additions & 0 deletions dask_glm/gradient.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Constants
# NOTE(review): gradient() below re-defines all of these as function-local
# variables, so these module-level copies appear to be unused duplicates --
# confirm nothing imports them before removing.

import numpy as np
import dask.array as da


firstBacktrackMult = 0.1  # step shrink factor on the first backtrack
nextBacktrackMult = 0.5   # step shrink factor on later backtracks
armijoMult = 0.1          # sufficient-decrease (Armijo) constant
stepGrowth = 1.25         # step growth factor after a successful step
stepSize = 1.0            # initial step size
recalcRate = 10           # recompute the loss from scratch every N outer steps
backtrackMult = firstBacktrackMult


def gradient(X, y, max_steps=100):
    """Fit logistic-regression coefficients by gradient descent.

    Minimizes the logistic loss ``sum(log1p(exp(X @ beta))) - y @ (X @ beta)``
    using gradient descent with an Armijo backtracking line search.  Progress
    is printed to stdout each step.

    Parameters
    ----------
    X : numpy or dask array, shape (N, M)
        Feature matrix.
    y : numpy or dask array, shape (N,)
        Binary (0/1) targets.
    max_steps : int, optional
        Maximum number of outer gradient steps (default 100).

    Returns
    -------
    beta : numpy array, shape (M,)
        Estimated coefficients.
    """
    N, M = X.shape
    # Line-search tuning constants (local copies of the module-level values).
    firstBacktrackMult = 0.1
    nextBacktrackMult = 0.5
    armijoMult = 0.1
    stepGrowth = 1.25
    stepSize = 1.0
    recalcRate = 10
    backtrackMult = firstBacktrackMult
    beta = np.zeros(M)

    print('## -f |df/f| |dx/x| step')
    print('----------------------------------------------')
    for k in range(max_steps):
        # Periodically recompute the loss terms from scratch to avoid drift
        # from the incremental updates performed in the line search below.
        if k % recalcRate == 0:
            Xbeta = X.dot(beta)
            eXbeta = da.exp(Xbeta)
            func = da.log1p(eXbeta).sum() - y.dot(Xbeta)
        e1 = eXbeta + 1.0
        # Gradient of the logistic loss: X^T (sigmoid(X beta) - y).
        # (Renamed from `gradient`, which shadowed this function's name.)
        grad = X.T.dot(eXbeta / e1 - y)
        steplen = (grad ** 2).sum() ** 0.5
        Xgradient = X.dot(grad)

        # Materialize every lazy dask expression in a single pass.
        Xbeta, eXbeta, func, grad, steplen, Xgradient = da.compute(
            Xbeta, eXbeta, func, grad, steplen, Xgradient)

        obeta = beta
        oXbeta = Xbeta

        # Backtracking line search (Armijo sufficient-decrease condition).
        lf = func
        # fix: `df` was unbound after the loop whenever the overflow guard
        # below rejected every trial step (or the search exhausted), raising
        # NameError at the `df /= ...` line.  Zero means "no progress".
        df = 0.0
        for ii in range(100):
            beta = obeta - stepSize * grad
            if ii and np.array_equal(beta, obeta):
                # The step has underflowed to nothing; give up entirely.
                stepSize = 0
                break
            Xbeta = oXbeta - stepSize * Xgradient
            # This prevents overflow in exp()
            if np.all(Xbeta < 700):
                eXbeta = np.exp(Xbeta)
                func = np.sum(np.log1p(eXbeta)) - np.dot(y, Xbeta)
                df = lf - func
                if df >= armijoMult * stepSize * steplen ** 2:
                    break
            stepSize *= backtrackMult
        if stepSize == 0:
            print('No more progress')
            break
        # Relative decrease of the objective and relative change of beta.
        df /= max(func, lf)
        db = stepSize * steplen / (np.linalg.norm(beta) + stepSize * steplen)
        print('%2d %.6e %9.2e %.2e %.1e'%(k+1,func,df,db,stepSize))
        if df < 1e-14:
            print('Converged')
            break
        stepSize *= stepGrowth
        backtrackMult = nextBacktrackMult

    return beta

45 changes: 45 additions & 0 deletions dask_glm/tests/test_gradient.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@

import math

import dask.array as da
import numpy as np
import pytest

from dask_glm import gradient


def logit(y):
    """Elementwise logistic sigmoid: 1 / (1 + exp(-y))."""
    growth = da.exp(-y)
    return 1.0 / (1.0 + growth)


M = 100     # number of features
N = 100000  # number of samples
S = 2       # target standard deviation of the linear predictor (see make_y)

X = np.random.randn(N,M)
X[:,1] = 1.0  # constant column -- serves as an intercept term
beta0 = np.random.randn(M)  # "true" coefficients used to simulate y


def make_y(X, beta0=beta0):
    """Simulate binary targets from a logistic model z0 = X.dot(beta0).

    The linear predictor is rescaled so its standard deviation equals S,
    then each target is drawn as a Bernoulli with probability logit(z0).

    Returns
    -------
    y : bool numpy array, shape (N,)
        Simulated targets.
    z0 : numpy array, shape (N,)
        The rescaled linear predictor.
    """
    N, M = X.shape
    z0 = X.dot(beta0)
    z0 = da.compute(z0)[0]  # ensure z0 is a numpy array
    scl = S / z0.std()
    # fix: previously `beta0 *= scl` mutated the mutable default argument
    # (the module-level beta0 array) in place as a side effect, which is
    # unsafe across repeated calls.  Nothing in this module reads the
    # scaled beta0, so the mutation is dropped rather than copied.
    z0 *= scl
    y = np.random.rand(N) < logit(z0)
    return y, z0


# Simulated targets, plus the loss of the trivial model beta = 0 (each
# sample contributes log 2).  L0 is unused in this file -- presumably kept
# for interactive comparison against gradient()'s reported loss.
y, z0 = make_y(X)
L0 = N * math.log(2.0)


# Dask copies of the fixtures, split into 10 row-chunks.
# fix: use integer division -- under Python 3, N / 10 is a float, and
# dask.array.from_array expects integer chunk sizes.
dX = da.from_array(X, chunks=(N // 10, M))
dy = da.from_array(y, chunks=(N // 10,))


@pytest.mark.parametrize('X,y', [(X, y), (dX, dy)])
def test_gradient(X, y):
    """Smoke-test gradient() on both numpy and dask inputs.

    The original test asserted nothing; now verify that the fit returns a
    finite coefficient vector of the expected shape (M,).
    """
    beta = gradient(X, y)
    beta = np.asarray(beta)
    assert beta.shape == (M,)
    assert np.all(np.isfinite(beta))
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
dask[array]
21 changes: 21 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env python

from os.path import exists
from setuptools import setup
import versioneer


# NOTE(review): versioneer is imported above, but no versioneer.py is part
# of this commit's file list -- confirm it is vendored before releasing.
setup(name='dask-glm',
      version=versioneer.get_version(),
      cmdclass=versioneer.get_cmdclass(),
      description='Generalized Linear Models with Dask',
      url='http://github.com/dask/dask-glm/',
      maintainer='Matthew Rocklin',
      maintainer_email='mrocklin@gmail.com',
      license='BSD',
      keywords='dask,glm',
      # fix: the missing comma after `packages` was a SyntaxError.
      packages=['dask_glm'],
      long_description=(open('README.rst').read() if exists('README.rst')
                        else ''),
      install_requires=list(open('requirements.txt').read().strip().split('\n')),
      zip_safe=False)

0 comments on commit 4a23b9a

Please sign in to comment.