DOC, TST: Wrapping of PyTorch models #699

Merged
merged 43 commits into from Jul 29, 2020

Changes from 28 commits

43 commits
52f929d
ENH: Wrap PyTorch/Keras models
stsievert Jul 15, 2020
57a12d6
MAINT: ci requirements
stsievert Jul 15, 2020
b766c60
Remove wrapper impl
stsievert Jul 15, 2020
69a7886
delete unused note for now
stsievert Jul 15, 2020
1f54dad
Add doc framework for integration
stsievert Jul 15, 2020
36209a2
Combine XGBoost and LightGBM
stsievert Jul 15, 2020
0c27c66
Skip if package not installed
stsievert Jul 15, 2020
33449f7
Start to fill out docs
stsievert Jul 15, 2020
ea6e95c
Don't depend on dask-examples
stsievert Jul 15, 2020
3dfd914
Build on master
stsievert Jul 15, 2020
c3aa74f
Add note to tests
stsievert Jul 15, 2020
77a0e26
REVERT: run on this PR too
stsievert Jul 15, 2020
aa53c21
Update note
stsievert Jul 15, 2020
102a5aa
typo
stsievert Jul 15, 2020
c071c9e
isort
stsievert Jul 15, 2020
2786099
Temporarily install from source
stsievert Jul 15, 2020
419ef97
remove print statement; resolve warning
stsievert Jul 16, 2020
12ec08b
MAINT: allow models to be scattered
stsievert Jul 16, 2020
76ab03b
Update docs/source/keras.rst
stsievert Jul 16, 2020
4118de0
Update docs/source/keras.rst
stsievert Jul 16, 2020
770c28b
DOC: title
stsievert Jul 16, 2020
7b3a693
Merge branch 'ms-model-docs' of https://github.com/stsievert/dask-ml …
stsievert Jul 16, 2020
2605a8c
remove ci
stsievert Jul 16, 2020
1ea5217
await in hyperband too
stsievert Jul 16, 2020
da3ba8d
Rename to test_{keras, pytorch}.py
stsievert Jul 20, 2020
d7eccf2
tmp
stsievert Jul 21, 2020
f9e50a6
Merge branch 'master' into ms-model-docs
stsievert Jul 21, 2020
19ec22e
Pass check_scoring to submit
stsievert Jul 21, 2020
645353f
Update ci/posix.yaml
stsievert Jul 24, 2020
4d30692
remove keras, give joblib edits
stsievert Jul 26, 2020
c31c600
Remove extra installs
stsievert Jul 26, 2020
4dc1d0a
lint
stsievert Jul 26, 2020
bfaf65a
skip isort for pytest importskip
stsievert Jul 26, 2020
b944890
isort
stsievert Jul 26, 2020
96489f5
isort skip
stsievert Jul 26, 2020
99f2fd0
clean
stsievert Jul 27, 2020
580d77e
isort
stsievert Jul 27, 2020
5b6e20c
lint
TomAugspurger Jul 27, 2020
9478728
try install deps in right env
stsievert Jul 28, 2020
79b8891
no cuda on ci
stsievert Jul 29, 2020
ea2b4f5
remove --no-deps for skorch
stsievert Jul 29, 2020
262815d
quiet
stsievert Jul 29, 2020
58b105c
Update ci/posix.yaml
TomAugspurger Jul 29, 2020
11 changes: 11 additions & 0 deletions ci/posix.yaml
@@ -20,6 +20,7 @@ jobs:
sklearnDev:
envFile: 'ci/environment-3.7.yaml'
SKLEARN_DEV: "yes"
WRAPPERS: "no"

steps:
- bash: echo "##vso[task.prependpath]$CONDA/bin"
@@ -33,6 +34,16 @@
- bash: conda env create --quiet --file=$(envFile) --name=dask-ml-test && conda list -n dask-ml-test
displayName: "install"

- bash: |
conda install pytorch torchvision -c pytorch
pip install skorch
pip install tensorflow scikeras keras
pip install -U git+https://github.com/adriangb/scikeras.git
displayName: "install Tensorflow/PyTorch"
# condition: eq(variables['Build.SourceBranch'], 'refs/heads/master')
# Installing from git is temporary: see
# https://github.com/adriangb/scikeras/pull/17#issuecomment-659064357

- script: |
source activate dask-ml-test
conda uninstall -y --force scikit-learn
2 changes: 1 addition & 1 deletion dask_ml/model_selection/_hyperband.py
@@ -388,7 +388,7 @@ def _get_SHAs(self, brackets):
return SHAs

async def _fit(self, X, y, **fit_params):
X, y, scorer = self._validate_parameters(X, y)
X, y, scorer = await self._validate_parameters(X, y)

brackets = _get_hyperband_params(self.max_iter, eta=self.aggressiveness)
SHAs = self._get_SHAs(brackets)
14 changes: 9 additions & 5 deletions dask_ml/model_selection/_incremental.py
@@ -515,21 +515,25 @@ def __init__(
self.prefix = prefix
super(BaseIncrementalSearchCV, self).__init__(estimator, scoring=scoring)

def _validate_parameters(self, X, y):
async def _validate_parameters(self, X, y):
if (self.max_iter is not None) and self.max_iter < 1:
raise ValueError(
"Received max_iter={}. max_iter < 1 is not supported".format(
self.max_iter
)
)

# Make sure dask arrays are passed so an error on unknown chunk size is raised
kwargs = dict(accept_unknown_chunks=True, accept_dask_dataframe=True)
if not isinstance(X, dd.DataFrame):
X = self._check_array(X, **kwargs)
if not isinstance(y, dd.Series):
if not isinstance(y, dd.DataFrame):
y = self._check_array(y, ensure_2d=False, **kwargs)
scorer = check_scoring(self.estimator, scoring=self.scoring)
estimator = self.estimator
if isinstance(estimator, Future):
client = default_client()
scorer = await client.submit(check_scoring, estimator, scoring=self.scoring)
else:
scorer = check_scoring(self.estimator, scoring=self.scoring)
return X, y, scorer

@property
@@ -640,7 +644,7 @@ async def _fit(self, X, y, **fit_params):
else:
context = dummy_context()

X, y, scorer = self._validate_parameters(X, y)
X, y, scorer = await self._validate_parameters(X, y)

X_train, X_test, y_train, y_test = self._get_train_test_split(X, y)

2 changes: 1 addition & 1 deletion dask_ml/wrappers.py
@@ -481,7 +481,7 @@ def _fit_for_estimator(self, estimator, X, y, **fit_kwargs):
random_state=self.random_state,
shuffle_blocks=self.shuffle_blocks,
assume_equal_chunks=self.assume_equal_chunks,
**fit_kwargs
**fit_kwargs,
)

copy_learned_attributes(result, self)
2 changes: 1 addition & 1 deletion docs/source/hyper-parameter-search.rst
@@ -403,7 +403,7 @@ generalized to any of the above estimators.

.. note::

These estimators require that the model implement ``partial_fit``
These estimators require that the model implement ``partial_fit``.

By default, these classes will call ``partial_fit`` on each chunk of the data.
These classes can stop training any models if their score stops increasing
12 changes: 10 additions & 2 deletions docs/source/index.rst
@@ -120,13 +120,21 @@ Scikit-Learn should feel at home with Dask-ML.
hyper-parameter-search.rst
compose.rst
glm.rst
joblib.rst
meta-estimators.rst
incremental.rst
clustering.rst
xgboost.rst
modules/api.rst

.. toctree::
:maxdepth: 2
:hidden:
:caption: Integration

keras.rst
pytorch.rst
xgboost.rst
joblib.rst

.. toctree::
:maxdepth: 2
:hidden:
45 changes: 45 additions & 0 deletions docs/source/keras.rst
@@ -0,0 +1,45 @@
Keras and TensorFlow
====================

The SciKeras_ package brings a Scikit-learn API to Keras. Installation instructions
are at https://github.com/adriangb/scikeras/blob/master/README.md#installation.

Example usage
-------------

First, let's define a function to create our model. This is the standard way
to create a `Keras Sequential model`_:

.. _Keras Sequential model: https://keras.io/api/models/sequential/

.. code-block:: python

import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential

def build_model(lr=0.01):
layers = [Dense(512, input_shape=(784,), activation="relu"),
Dense(10, input_shape=(512,), activation="softmax")]
model = Sequential(layers)

opt = tf.keras.optimizers.SGD(learning_rate=lr)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
return model

Now, we can use SciKeras to create a Scikit-learn compatible model:

.. code-block:: python

from scikeras.wrappers import KerasClassifier, KerasRegressor
model = KerasClassifier(build_fn=build_model, lr=0.1)

This model will work with all of Dask-ML: it expects NumPy arrays as inputs
and obeys the Scikit-learn API. For example, it's possible to use Dask-ML to
do the following (a sketch follows this list):

* Use Keras with Dask-ML's model selection, including
:class:`~dask_ml.model_selection.HyperbandSearchCV`.
* Use Keras with Dask-ML's :class:`~dask_ml.wrappers.Incremental`.
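
For example, here is a minimal sketch of a hyperparameter search over the
``lr`` argument of ``build_model``. It assumes a Dask ``Client`` is already
connected to a cluster; the toy data and searched range below are
illustrative, not prescriptive:

.. code-block:: python

    import dask.array as da
    import numpy as np
    from scipy.stats import loguniform

    from dask_ml.model_selection import HyperbandSearchCV

    # Toy data: Dask arrays with known chunk sizes
    X = da.random.random((1000, 784), chunks=100).astype("float32")
    y = da.from_array(np.random.randint(0, 10, size=1000), chunks=100)

    # Sample the learning rate passed to build_model
    params = {"lr": loguniform(1e-3, 1e-1)}

    search = HyperbandSearchCV(model, params, max_iter=9)
    search.fit(X, y)
    search.best_params_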

.. _SciKeras: https://github.com/adriangb/scikeras
64 changes: 64 additions & 0 deletions docs/source/pytorch.rst
@@ -0,0 +1,64 @@
PyTorch
=======

Skorch_ brings a Scikit-learn API to PyTorch_. We encourage you to look at
the Skorch documentation for complete details.

Example usage
-------------

First, let's create a normal PyTorch model:

.. code-block:: python


import torch.nn as nn
import torch.nn.functional as F

class ShallowNet(nn.Module):
def __init__(self, n_features=5):
super().__init__()
self.layer1 = nn.Linear(n_features, 1)

def forward(self, x):
return F.relu(self.layer1(x))

With this, it's easy to use Skorch:

.. code-block:: python

from skorch import NeuralNetRegressor
import torch.optim as optim

niceties = {
"callbacks": False,
"warm_start": False,
"train_split": None,
"max_epochs": 1,
}

model = NeuralNetRegressor(
module=ShallowNet,
module__n_features=5,
criterion=nn.MSELoss,
optimizer=optim.SGD,
optimizer__lr=0.1,
optimizer__momentum=0.9,
batch_size=64,
**niceties,
)

Each parameter that the PyTorch ``nn.Module`` takes is prefixed with
``module__``, and the same goes for the optimizer (``optim.SGD`` takes ``lr``
and ``momentum`` parameters). The ``niceties`` make sure Skorch uses all the
data for training and doesn't print excessive logs.

Now, this model can be used with Dask-ML. For example, it's possible to do the
following (a sketch follows this list):

* Use PyTorch with Dask-ML's model selection, including
:class:`~dask_ml.model_selection.HyperbandSearchCV`.
* Use PyTorch with Dask-ML's :class:`~dask_ml.wrappers.Incremental`.
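
For example, here is a minimal sketch of a hyperparameter search with this
model. It assumes a Dask ``Client`` is already connected to a cluster; the toy
data is illustrative:

.. code-block:: python

    import dask.array as da
    from scipy.stats import loguniform

    from dask_ml.model_selection import HyperbandSearchCV

    # NeuralNetRegressor expects float32 features and a 2-d float32 target
    X = da.random.random((1000, 5), chunks=100).astype("float32")
    y = da.random.random((1000, 1), chunks=100).astype("float32")

    # Tune the SGD learning rate through Skorch's optimizer__ prefix
    params = {"optimizer__lr": loguniform(1e-3, 1e-1)}

    search = HyperbandSearchCV(model, params, max_iter=9)
    search.fit(X, y)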

.. _Skorch: https://skorch.readthedocs.io/en/stable/
.. _PyTorch: https://pytorch.org
25 changes: 18 additions & 7 deletions docs/source/xgboost.rst
@@ -1,19 +1,29 @@
XGBoost
=======
XGBoost & LightGBM
==================

.. currentmodule:: dask_ml.xgboost

XGBoost_ is a powerful and popular library for gradient boosted trees. For
larger datasets or faster training, XGBoost also provides a distributed
computing solution. LightGBM_ is a similar library; it also natively supports
distributed training for decision trees.

Dask-ML can set up distributed XGBoost or LightGBM for you and hand off data
from distributed dask.dataframes. This automates much of the hassle of
preprocessing and setup while still letting XGBoost/LightGBM do what they do
well.

Below, we'll refer to an example with XGBoost. Here are the relevant XGBoost
classes/functions, followed by a short sketch of their use:

.. autosummary::
train
predict
XGBClassifier
XGBRegressor
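
For instance, here is a minimal sketch of the ``train``/``predict`` flow. It
assumes a running ``Client`` and a Dask DataFrame with a ``target`` column;
the scheduler address, file pattern, and column name are illustrative:

.. code-block:: python

    from dask.distributed import Client
    import dask.dataframe as dd

    from dask_ml.xgboost import train, predict

    client = Client("scheduler-address:8786")  # hypothetical scheduler

    df = dd.read_csv("data-*.csv")    # hypothetical files
    labels = df["target"]             # hypothetical label column
    data = df.drop("target", axis=1)

    params = {"objective": "binary:logistic"}
    bst = train(client, params, data, labels)  # distributed training
    predictions = predict(client, bst, data)   # a Dask Series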

XGBoost_ is a powerful and popular library for gradient boosted trees. For
larger datasets or faster training XGBoost also provides a distributed
computing solution. Dask-ML can set up distributed XGBoost for you and hand
off data from distributed dask.dataframes. This automates much of the hassle
of preprocessing and setup while still letting XGBoost do what it does well.
The LightGBM implementation and documentation can be found at
https://github.com/dask/dask-lightgbm.

Example
-------
@@ -63,3 +73,4 @@ relevant GitHub issue here: `dmlc/xgboost #2032 <https://github.com/dmlc/xgboost
See the ":doc:`Dask-ML examples <examples>`" for an example usage.

.. _XGBoost: https://xgboost.readthedocs.io/
.. _LightGBM: https://lightgbm.readthedocs.io/
15 changes: 15 additions & 0 deletions tests/model_selection/test_incremental.py
@@ -853,3 +853,18 @@ def test_warns_scores_per_fit(c, s, a, b):
search = IncrementalSearchCV(model, params, scores_per_fit=2)
with pytest.warns(UserWarning, match="deprecated since Dask-ML v1.4.0"):
yield search.fit(X, y)


@gen_cluster(client=True)
async def test_model_future(c, s, a, b):
X, y = make_classification(n_samples=100, n_features=5, chunks=10)

params = {"value": np.random.RandomState(42).rand(1000)}
model = ConstantFunction()
model_future = await c.scatter(model)

search = IncrementalSearchCV(model_future, params, max_iter=10)

await search.fit(X, y, classes=[0, 1])
assert search.history_
assert search.best_score_ > 0
67 changes: 67 additions & 0 deletions tests/model_selection/test_keras.py
@@ -0,0 +1,67 @@
import pickle
from typing import Tuple

import numpy as np
import pytest
from distributed.utils_test import gen_cluster
from scipy.stats import loguniform, uniform
from sklearn.base import clone
from sklearn.datasets import make_classification, make_regression
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import RandomizedSearchCV

from dask_ml.model_selection import IncrementalSearchCV

pytest.importorskip("tensorflow")
pytest.importorskip("scikeras")

import tensorflow as tf
from tensorflow.keras.datasets import mnist as keras_mnist
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from scikeras.wrappers import KerasClassifier, KerasRegressor


def mnist() -> Tuple[np.ndarray, np.ndarray]:
(X_train, y_train), _ = keras_mnist.load_data()
X_train = X_train[:100]
y_train = y_train[:100]
X_train = X_train.reshape(X_train.shape[0], 784)
X_train = X_train.astype("float32")
X_train /= 255
return X_train, y_train


def _keras_build_fn(lr=0.01):
layers = [
Dense(512, input_shape=(784,), activation="relu"),
Dense(10, input_shape=(512,), activation="softmax"),
]

# Workaround for https://github.com/adriangb/scikeras/issues/24: the first
# Sequential call may raise a spurious TypeError, so it is retried once
try:
model = Sequential(layers)
except TypeError:
model = Sequential(layers)

opt = tf.keras.optimizers.SGD(learning_rate=lr)
model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
return model


@gen_cluster(client=True)
def test_keras(c, s, a, b):
X, y = mnist()
assert X.ndim == 2 and X.shape[-1] == 784
assert y.ndim == 1 and len(X) == len(y)
assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)

model = KerasClassifier(build_fn=_keras_build_fn, epochs=1, lr=0.1)
params = {"lr": loguniform(1e-3, 1e-1)}

search = IncrementalSearchCV(model, params, max_iter=2, decay_rate=None)
yield search.fit(X, y, epochs=1)
assert search.best_score_ >= 0