DOC, TST: Wrapping of PyTorch models #699

Merged
merged 43 commits into from Jul 29, 2020

Changes from 28 commits

43 commits
52f929d
ENH: Wrap PyTorch/Keras models
stsievert Jul 15, 2020
57a12d6
MAINT: ci requirements
stsievert Jul 15, 2020
b766c60
Remove wrapper impl
stsievert Jul 15, 2020
69a7886
delete unused note for now
stsievert Jul 15, 2020
1f54dad
Add doc framework for integration
stsievert Jul 15, 2020
36209a2
Combine XGBoost and LightGBM
stsievert Jul 15, 2020
0c27c66
Skip if package not installed
stsievert Jul 15, 2020
33449f7
Start to fill out docs
stsievert Jul 15, 2020
ea6e95c
Don't depend on dask-examples
stsievert Jul 15, 2020
3dfd914
Build on master
stsievert Jul 15, 2020
c3aa74f
Add note to tests
stsievert Jul 15, 2020
77a0e26
REVERT: run on this PR too
stsievert Jul 15, 2020
aa53c21
Update note
stsievert Jul 15, 2020
102a5aa
typo
stsievert Jul 15, 2020
c071c9e
isort
stsievert Jul 15, 2020
2786099
Temporarily install from source
stsievert Jul 15, 2020
419ef97
remove print statement; resolve warning
stsievert Jul 16, 2020
12ec08b
MAINT: allow models to be scattered
stsievert Jul 16, 2020
76ab03b
Update docs/source/keras.rst
stsievert Jul 16, 2020
4118de0
Update docs/source/keras.rst
stsievert Jul 16, 2020
770c28b
DOC: title
stsievert Jul 16, 2020
7b3a693
Merge branch 'ms-model-docs' of https://github.com/stsievert/dask-ml …
stsievert Jul 16, 2020
2605a8c
remove ci
stsievert Jul 16, 2020
1ea5217
await in hyperband too
stsievert Jul 16, 2020
da3ba8d
Rename to test_{keras, pytorch}.py
stsievert Jul 20, 2020
d7eccf2
tmp
stsievert Jul 21, 2020
f9e50a6
Merge branch 'master' into ms-model-docs
stsievert Jul 21, 2020
19ec22e
Pass check_scoring to submit
stsievert Jul 21, 2020
645353f
Update ci/posix.yaml
stsievert Jul 24, 2020
4d30692
remove keras, give joblib edits
stsievert Jul 26, 2020
c31c600
Remove extra installs
stsievert Jul 26, 2020
4dc1d0a
lint
stsievert Jul 26, 2020
bfaf65a
skip isort for pytest importskip
stsievert Jul 26, 2020
b944890
isort
stsievert Jul 26, 2020
96489f5
isort skip
stsievert Jul 26, 2020
99f2fd0
clean
stsievert Jul 27, 2020
580d77e
isort
stsievert Jul 27, 2020
5b6e20c
lint
TomAugspurger Jul 27, 2020
9478728
try install deps in right env
stsievert Jul 28, 2020
79b8891
no cuda on ci
stsievert Jul 29, 2020
ea2b4f5
remove --no-deps for skorch
stsievert Jul 29, 2020
262815d
quiet
stsievert Jul 29, 2020
58b105c
Update ci/posix.yaml
TomAugspurger Jul 29, 2020
11 changes: 11 additions & 0 deletions ci/posix.yaml
@@ -20,6 +20,7 @@ jobs:
sklearnDev:
envFile: 'ci/environment-3.7.yaml'
SKLEARN_DEV: "yes"
WRAPPERS: "no"

steps:
- bash: echo "##vso[task.prependpath]$CONDA/bin"
@@ -33,6 +34,16 @@
- bash: conda env create --quiet --file=$(envFile) --name=dask-ml-test && conda list -n dask-ml-test
displayName: "install"

- bash: |
conda install pytorch torchvision -c pytorch
pip install skorch
pip install tensorflow scikeras keras
pip install -U git+https://github.com/adriangb/scikeras.git
displayName: "install Tensorflow/PyTorch"
# condition: eq(variables['Build.SourceBranch'], 'refs/heads/master')
# Installing from git is temporary: see
# https://github.com/adriangb/scikeras/pull/17#issuecomment-659064357

- script: |
source activate dask-ml-test
conda uninstall -y --force scikit-learn
2 changes: 1 addition & 1 deletion dask_ml/model_selection/_hyperband.py
@@ -388,7 +388,7 @@ def _get_SHAs(self, brackets):
return SHAs

async def _fit(self, X, y, **fit_params):
X, y, scorer = self._validate_parameters(X, y)
X, y, scorer = await self._validate_parameters(X, y)

brackets = _get_hyperband_params(self.max_iter, eta=self.aggressiveness)
SHAs = self._get_SHAs(brackets)
14 changes: 9 additions & 5 deletions dask_ml/model_selection/_incremental.py
@@ -515,21 +515,25 @@ def __init__(
self.prefix = prefix
super(BaseIncrementalSearchCV, self).__init__(estimator, scoring=scoring)

def _validate_parameters(self, X, y):
async def _validate_parameters(self, X, y):
if (self.max_iter is not None) and self.max_iter < 1:
raise ValueError(
"Received max_iter={}. max_iter < 1 is not supported".format(
self.max_iter
)
)

# Make sure dask arrays are passed so an error on unknown chunk size is raised
kwargs = dict(accept_unknown_chunks=True, accept_dask_dataframe=True)
if not isinstance(X, dd.DataFrame):
X = self._check_array(X, **kwargs)
if not isinstance(y, dd.Series):
if not isinstance(y, dd.DataFrame):
y = self._check_array(y, ensure_2d=False, **kwargs)
scorer = check_scoring(self.estimator, scoring=self.scoring)
estimator = self.estimator
if isinstance(estimator, Future):
client = default_client()
scorer = await client.submit(check_scoring, estimator, scoring=self.scoring)
else:
scorer = check_scoring(self.estimator, scoring=self.scoring)
return X, y, scorer

@property
@@ -640,7 +644,7 @@ async def _fit(self, X, y, **fit_params):
else:
context = dummy_context()

X, y, scorer = self._validate_parameters(X, y)
X, y, scorer = await self._validate_parameters(X, y)

X_train, X_test, y_train, y_test = self._get_train_test_split(X, y)

2 changes: 1 addition & 1 deletion dask_ml/wrappers.py
@@ -481,7 +481,7 @@ def _fit_for_estimator(self, estimator, X, y, **fit_kwargs):
random_state=self.random_state,
shuffle_blocks=self.shuffle_blocks,
assume_equal_chunks=self.assume_equal_chunks,
**fit_kwargs
**fit_kwargs,
)

copy_learned_attributes(result, self)
2 changes: 1 addition & 1 deletion docs/source/hyper-parameter-search.rst
@@ -403,7 +403,7 @@ generalized to any of the above estimators.

.. note::

These estimators require that the model implement ``partial_fit``
These estimators require that the model implement ``partial_fit``.

By default, these classes will call ``partial_fit`` on each chunk of the data.
These classes can stop training any models if their score stops increasing
12 changes: 10 additions & 2 deletions docs/source/index.rst
@@ -120,13 +120,21 @@ Scikit-Learn should feel at home with Dask-ML.
hyper-parameter-search.rst
compose.rst
glm.rst
joblib.rst
meta-estimators.rst
incremental.rst
clustering.rst
xgboost.rst
modules/api.rst

.. toctree::
:maxdepth: 2
:hidden:
:caption: Integration

keras.rst
pytorch.rst
xgboost.rst
joblib.rst

.. toctree::
:maxdepth: 2
:hidden:
45 changes: 45 additions & 0 deletions docs/source/keras.rst
@@ -0,0 +1,45 @@
Keras and TensorFlow
====================

The SciKeras_ package brings a Scikit-learn API to Keras. Installation instructions
are at https://github.com/adriangb/scikeras/blob/master/README.md#installation.

Example usage
-------------

First, let's define a function to create our model. This is the standard way
to create a `Keras Sequential model`_:

.. _Keras Sequential model: https://keras.io/api/models/sequential/

.. code-block:: python

import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential

def build_model(lr=0.01):
layers = [Dense(512, input_shape=(784,), activation="relu"),
Dense(10, input_shape=(512,), activation="softmax")]
model = Sequential(layers)

opt = tf.keras.optimizers.SGD(learning_rate=lr)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
return model

Now, we can use SciKeras to create a Scikit-learn compatible model:

.. code-block:: python

from scikeras.wrappers import KerasClassifier, KerasRegressor
model = KerasClassifier(build_fn=build_model, lr=0.1)

This model will work with all of Dask-ML: it expects NumPy arrays as inputs
and obeys the Scikit-learn API. For example, it's possible to use Dask-ML to
do the following (a sketch follows this list):

* Use Keras with Dask-ML's model selection, including
:class:`~dask_ml.model_selection.HyperbandSearchCV`.
* Use Keras with Dask-ML's :class:`~dask_ml.wrappers.Incremental`.
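
For example, here is a minimal sketch of a hyperparameter search over the
``lr`` argument of ``build_model``. It assumes a Dask ``Client`` is already
connected to a cluster; the toy data and searched range below are
illustrative, not prescriptive:

.. code-block:: python

    import dask.array as da
    import numpy as np
    from scipy.stats import loguniform

    from dask_ml.model_selection import HyperbandSearchCV

    # Toy data: Dask arrays with known chunk sizes
    X = da.random.random((1000, 784), chunks=100).astype("float32")
    y = da.from_array(np.random.randint(0, 10, size=1000), chunks=100)

    # Sample the learning rate passed to build_model
    params = {"lr": loguniform(1e-3, 1e-1)}

    search = HyperbandSearchCV(model, params, max_iter=9)
    search.fit(X, y)
    search.best_params_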

.. _SciKeras: https://github.com/adriangb/scikeras
64 changes: 64 additions & 0 deletions docs/source/pytorch.rst
@@ -0,0 +1,64 @@
PyTorch
=======

Skorch_ brings a Scikit-learn API to PyTorch_. We encourage you to look at
the Skorch documentation for complete details.

Example usage
-------------

First, let's create a normal PyTorch model:

.. code-block:: python


import torch.nn as nn
import torch.nn.functional as F

class ShallowNet(nn.Module):
def __init__(self, n_features=5):
super().__init__()
self.layer1 = nn.Linear(n_features, 1)

def forward(self, x):
return F.relu(self.layer1(x))

With this, it's easy to use Skorch:

.. code-block:: python

from skorch import NeuralNetRegressor
import torch.optim as optim

niceties = {
"callbacks": False,
"warm_start": False,
"train_split": None,
"max_epochs": 1,
}

model = NeuralNetRegressor(
module=ShallowNet,
module__n_features=5,
criterion=nn.MSELoss,
optimizer=optim.SGD,
optimizer__lr=0.1,
optimizer__momentum=0.9,
batch_size=64,
**niceties,
)

Each parameter that the PyTorch ``nn.Module`` takes is prefixed with
``module__``, and the same goes for the optimizer (``optim.SGD`` takes ``lr``
and ``momentum`` parameters). The ``niceties`` make sure Skorch uses all the
data for training and doesn't print excessive logs.

Now, this model can be used with Dask-ML. For example, it's possible to do the
following (a sketch follows this list):

* Use PyTorch with Dask-ML's model selection, including
:class:`~dask_ml.model_selection.HyperbandSearchCV`.
* Use PyTorch with Dask-ML's :class:`~dask_ml.wrappers.Incremental`.
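
For example, here is a minimal sketch of a hyperparameter search with this
model. It assumes a Dask ``Client`` is already connected to a cluster; the toy
data is illustrative:

.. code-block:: python

    import dask.array as da
    from scipy.stats import loguniform

    from dask_ml.model_selection import HyperbandSearchCV

    # NeuralNetRegressor expects float32 features and a 2-d float32 target
    X = da.random.random((1000, 5), chunks=100).astype("float32")
    y = da.random.random((1000, 1), chunks=100).astype("float32")

    # Tune the SGD learning rate through Skorch's optimizer__ prefix
    params = {"optimizer__lr": loguniform(1e-3, 1e-1)}

    search = HyperbandSearchCV(model, params, max_iter=9)
    search.fit(X, y)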

.. _Skorch: https://skorch.readthedocs.io/en/stable/
.. _PyTorch: https://pytorch.org
25 changes: 18 additions & 7 deletions docs/source/xgboost.rst
@@ -1,19 +1,29 @@
XGBoost
=======
XGBoost & LightGBM
==================

.. currentmodule:: dask_ml.xgboost

XGBoost_ is a powerful and popular library for gradient boosted trees. For
larger datasets or faster training, XGBoost also provides a distributed
computing solution. LightGBM_ is a similar library; it also natively supports
distributed training for decision trees.

Dask-ML can set up distributed XGBoost or LightGBM for you and hand off data
from distributed dask.dataframes. This automates much of the hassle of
preprocessing and setup while still letting XGBoost/LightGBM do what they do
well.

Below, we'll refer to an example with XGBoost. Here are the relevant XGBoost
classes/functions, followed by a short sketch of their use:

.. autosummary::
train
predict
XGBClassifier
XGBRegressor
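
For instance, here is a minimal sketch of the ``train``/``predict`` flow. It
assumes a running ``Client`` and a Dask DataFrame with a ``target`` column;
the scheduler address, file pattern, and column name are illustrative:

.. code-block:: python

    from dask.distributed import Client
    import dask.dataframe as dd

    from dask_ml.xgboost import train, predict

    client = Client("scheduler-address:8786")  # hypothetical scheduler

    df = dd.read_csv("data-*.csv")    # hypothetical files
    labels = df["target"]             # hypothetical label column
    data = df.drop("target", axis=1)

    params = {"objective": "binary:logistic"}
    bst = train(client, params, data, labels)  # distributed training
    predictions = predict(client, bst, data)   # a Dask Series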

XGBoost_ is a powerful and popular library for gradient boosted trees. For
larger datasets or faster training XGBoost also provides a distributed
computing solution. Dask-ML can set up distributed XGBoost for you and hand
off data from distributed dask.dataframes. This automates much of the hassle
of preprocessing and setup while still letting XGBoost do what it does well.
The LightGBM implementation and documentation can be found at
https://github.com/dask/dask-lightgbm.

Example
-------
@@ -63,3 +73,4 @@ relevant GitHub issue here: `dmlc/xgboost #2032 <https://github.com/dmlc/xgboost
See the ":doc:`Dask-ML examples <examples>`" for an example usage.

.. _XGBoost: https://xgboost.readthedocs.io/
.. _LightGBM: https://lightgbm.readthedocs.io/
15 changes: 15 additions & 0 deletions tests/model_selection/test_incremental.py
@@ -853,3 +853,18 @@ def test_warns_scores_per_fit(c, s, a, b):
search = IncrementalSearchCV(model, params, scores_per_fit=2)
with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
yield search.fit(X, y)


@gen_cluster(client=True)
async def test_model_future(c, s, a, b):
X, y = make_classification(n_samples=100, n_features=5, chunks=10)

params = {"value": np.random.RandomState(42).rand(1000)}
model = ConstantFunction()
model_future = await c.scatter(model)

search = IncrementalSearchCV(model_future, params, max_iter=10)

await search.fit(X, y, classes=[0, 1])
assert search.history_
assert search.best_score_ > 0
67 changes: 67 additions & 0 deletions tests/model_selection/test_keras.py
@@ -0,0 +1,67 @@
import pickle
from typing import Tuple

import numpy as np
import pytest
from distributed.utils_test import gen_cluster
from scipy.stats import loguniform, uniform
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import RandomizedSearchCV

from dask_ml.model_selection import IncrementalSearchCV

pytest.importorskip("tensorflow")
pytest.importorskip("scikeras")

import tensorflow as tf
from tensorflow.keras.datasets import mnist as keras_mnist
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier, KerasRegressor


def mnist() -> Tuple[np.ndarray, np.ndarray]:
(X_train, y_train), _ = keras_mnist.load_data()
X_train = X_train[:100]
y_train = y_train[:100]
X_train = X_train.reshape(X_train.shape[0], 784)
X_train = X_train.astype("float32")
X_train /= 255
return X_train, y_train


def _keras_build_fn(lr=0.01):
layers = [
Dense(512, input_shape=(784,), activation="relu"),
Dense(10, input_shape=(512,), activation="softmax"),
]

# Workaround for https://github.com/adriangb/scikeras/issues/24: the first
# Sequential call may raise a spurious TypeError, so it is retried once
try:
model = Sequential(layers)
except TypeError:
model = Sequential(layers)

opt = tf.keras.optimizers.SGD(learning_rate=lr)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
return model


@gen_cluster(client=True)
def test_keras(c, s, a, b):
X, y = mnist()
assert X.ndim == 2 and X.shape[-1] == 784
assert y.ndim == 1 and len(X) == len(y)
assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)

model = KerasClassifier(build_fn=_keras_build_fn, epochs=1, lr=0.1)
params = {"lr": loguniform(1e-3, 1e-1)}

search = IncrementalSearchCV(model, params, max_iter=2, decay_rate=None)
yield search.fit(X, y, epochs=1)
assert search.best_score_ >= 0