Basic

dask · May 17, 2018 · 2403401 · 2403401
1 parent 4e0212b
commit 2403401
Show file tree

Hide file tree

Showing 14 changed files with 280 additions and 26 deletions.
diff --git a/README.md b/README.md
@@ -0,0 +1,3 @@
+# Dask-ML Benchmarks
+
+Documenting the scalability and performance of Dask-ML.
diff --git a/dask_ml_benchmarks/plot_parallel_post_fit_scaling.py b/dask_ml_benchmarks/plot_parallel_post_fit_scaling.py
@@ -1,4 +1,6 @@
-"""Parallelizing Predicion.
+"""
+Parallelizing Prediction
+========================
 
 This example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`.
 A :class:`sklearn.svm.SVC` is fit on a small dataset that easily fits
@@ -28,7 +30,7 @@
 from dask_ml.wrappers import ParallelPostFit
 
 X, y = sklearn.datasets.make_classification(n_samples=1000)
-clf = ParallelPostFit(SVC())
+clf = ParallelPostFit(SVC(gamma='scale'))
 clf.fit(X, y)
 
 

diff --git a/docs/.gitignore b/docs/.gitignore
@@ -0,0 +1,2 @@
+build/
+./auto_examples/
diff --git a/docs/source/auto_examples/auto_examples_jupyter.zip b/docs/source/auto_examples/auto_examples_jupyter.zip
diff --git a/docs/source/auto_examples/auto_examples_python.zip b/docs/source/auto_examples/auto_examples_python.zip
diff --git a/docs/source/auto_examples/images/sphx_glr_plot_parallel_post_fit_scaling_001.png b/docs/source/auto_examples/images/sphx_glr_plot_parallel_post_fit_scaling_001.png
diff --git a/...ce/auto_examples/images/thumb/sphx_glr_plot_parallel_post_fit_scaling_thumb.png b/...ce/auto_examples/images/thumb/sphx_glr_plot_parallel_post_fit_scaling_thumb.png
diff --git a/docs/source/auto_examples/index.rst b/docs/source/auto_examples/index.rst
@@ -0,0 +1,51 @@
+:orphan:
+
+
+
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="This example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`. A :class:`sklearn.svm.SVC`...">
+
+.. only:: html
+
+    .. figure:: /auto_examples/images/thumb/sphx_glr_plot_parallel_post_fit_scaling_thumb.png
+
+        :ref:`sphx_glr_auto_examples_plot_parallel_post_fit_scaling.py`
+
+.. raw:: html
+
+    </div>
+
+
+.. toctree::
+   :hidden:
+
+   /auto_examples/plot_parallel_post_fit_scaling
+.. raw:: html
+
+    <div style='clear:both'></div>
+
+
+
+.. only :: html
+
+ .. container:: sphx-glr-footer
+
+
+  .. container:: sphx-glr-download
+
+    :download:`Download all examples in Python source code: auto_examples_python.zip </auto_examples/auto_examples_python.zip>`
+
+
+
+  .. container:: sphx-glr-download
+
+    :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip </auto_examples/auto_examples_jupyter.zip>`
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_
diff --git a/docs/source/auto_examples/plot_parallel_post_fit_scaling.ipynb b/docs/source/auto_examples/plot_parallel_post_fit_scaling.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\nParallelizing Prediction\n========================\n\nThis example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`.\nA :class:`sklearn.svm.SVC` is fit on a small dataset that easily fits\nin memory.\n\nAfter training, we predict for successively larger datasets. We compare\n\n* The serial prediction time using the regular SVC.predict method\n* The parallel prediction time using\n  :meth:`dask_ml.wrappers.ParallelPostFit.predict`\n\nWe see that the parallel version is faster, especially for larger datasets.\nAdditionally, the parallel version from ParallelPostFit scales out to larger\nthan memory datasets.\n\nWhile only predict is demonstrated here, wrappers.ParallelPostFit is equally\nuseful for predict_proba and transform.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from timeit import default_timer as tic\n\nimport pandas as pd\nimport seaborn as sns\nimport sklearn.datasets\nfrom sklearn.svm import SVC\n\nimport dask_ml.datasets\nfrom dask_ml.wrappers import ParallelPostFit\n\nX, y = sklearn.datasets.make_classification(n_samples=1000)\nclf = ParallelPostFit(SVC(gamma='scale'))\nclf.fit(X, y)\n\n\nNs = [100_000, 200_000, 400_000, 800_000]\ntimings = []\n\n\nfor n in Ns:\n    X, y = dask_ml.datasets.make_classification(n_samples=n,\n                                                random_state=n,\n                                                chunks=n // 20)\n    t1 = tic()\n    # Serial scikit-learn version\n    clf.estimator.predict(X)\n    timings.append(('Scikit-Learn', n, tic() - t1))\n\n    t1 = tic()\n    # Parallelized scikit-learn version\n    clf.predict(X).compute()\n    timings.append(('dask-ml', n, tic() - t1))\n\n\ndf = pd.DataFrame(timings,\n                  columns=['method', 'Number of Samples', 'Predict Time'])\nax = sns.factorplot(x='Number of Samples', y='Predict Time', hue='method',\n                    data=df, aspect=1.5)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/docs/source/auto_examples/plot_parallel_post_fit_scaling.py b/docs/source/auto_examples/plot_parallel_post_fit_scaling.py
@@ -0,0 +1,59 @@
+"""
+Parallelizing Prediction
+========================
+
+This example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`.
+A :class:`sklearn.svm.SVC` is fit on a small dataset that easily fits
+in memory.
+
+After training, we predict for successively larger datasets. We compare
+
+* The serial prediction time using the regular SVC.predict method
+* The parallel prediction time using
+  :meth:`dask_ml.wrappers.ParallelPostFit.predict`
+
+We see that the parallel version is faster, especially for larger datasets.
+Additionally, the parallel version from ParallelPostFit scales out to larger
+than memory datasets.
+
+While only predict is demonstrated here, wrappers.ParallelPostFit is equally
+useful for predict_proba and transform.
+"""
+from timeit import default_timer as tic
+
+import pandas as pd
+import seaborn as sns
+import sklearn.datasets
+from sklearn.svm import SVC
+
+import dask_ml.datasets
+from dask_ml.wrappers import ParallelPostFit
+
+X, y = sklearn.datasets.make_classification(n_samples=1000)
+clf = ParallelPostFit(SVC(gamma='scale'))
+clf.fit(X, y)
+
+
+Ns = [100_000, 200_000, 400_000, 800_000]
+timings = []
+
+
+for n in Ns:
+    X, y = dask_ml.datasets.make_classification(n_samples=n,
+                                                random_state=n,
+                                                chunks=n // 20)
+    t1 = tic()
+    # Serial scikit-learn version
+    clf.estimator.predict(X)
+    timings.append(('Scikit-Learn', n, tic() - t1))
+
+    t1 = tic()
+    # Parallelized scikit-learn version
+    clf.predict(X).compute()
+    timings.append(('dask-ml', n, tic() - t1))
+
+
+df = pd.DataFrame(timings,
+                  columns=['method', 'Number of Samples', 'Predict Time'])
+ax = sns.factorplot(x='Number of Samples', y='Predict Time', hue='method',
+                    data=df, aspect=1.5)
diff --git a/docs/source/auto_examples/plot_parallel_post_fit_scaling.py.md5 b/docs/source/auto_examples/plot_parallel_post_fit_scaling.py.md5
@@ -0,0 +1 @@
+6d397165ad9f798ec5746ebf2f35d717
diff --git a/docs/source/auto_examples/plot_parallel_post_fit_scaling.rst b/docs/source/auto_examples/plot_parallel_post_fit_scaling.rst
@@ -0,0 +1,101 @@
+
+
+.. _sphx_glr_auto_examples_plot_parallel_post_fit_scaling.py:
+
+
+Parallelizing Prediction
+========================
+
+This example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`.
+A :class:`sklearn.svm.SVC` is fit on a small dataset that easily fits
+in memory.
+
+After training, we predict for successively larger datasets. We compare
+
+* The serial prediction time using the regular SVC.predict method
+* The parallel prediction time using
+  :meth:`dask_ml.wrappers.ParallelPostFit.predict`
+
+We see that the parallel version is faster, especially for larger datasets.
+Additionally, the parallel version from ParallelPostFit scales out to larger
+than memory datasets.
+
+While only predict is demonstrated here, wrappers.ParallelPostFit is equally
+useful for predict_proba and transform.
+
+
+
+
+.. image:: /auto_examples/images/sphx_glr_plot_parallel_post_fit_scaling_001.png
+    :align: center
+
+
+
+
+
+.. code-block:: python
+
+    from timeit import default_timer as tic
+
+    import pandas as pd
+    import seaborn as sns
+    import sklearn.datasets
+    from sklearn.svm import SVC
+
+    import dask_ml.datasets
+    from dask_ml.wrappers import ParallelPostFit
+
+    X, y = sklearn.datasets.make_classification(n_samples=1000)
+    clf = ParallelPostFit(SVC(gamma='scale'))
+    clf.fit(X, y)
+
+
+    Ns = [100_000, 200_000, 400_000, 800_000]
+    timings = []
+
+
+    for n in Ns:
+        X, y = dask_ml.datasets.make_classification(n_samples=n,
+                                                    random_state=n,
+                                                    chunks=n // 20)
+        t1 = tic()
+        # Serial scikit-learn version
+        clf.estimator.predict(X)
+        timings.append(('Scikit-Learn', n, tic() - t1))
+
+        t1 = tic()
+        # Parallelized scikit-learn version
+        clf.predict(X).compute()
+        timings.append(('dask-ml', n, tic() - t1))
+
+
+    df = pd.DataFrame(timings,
+                      columns=['method', 'Number of Samples', 'Predict Time'])
+    ax = sns.factorplot(x='Number of Samples', y='Predict Time', hue='method',
+                        data=df, aspect=1.5)
+
+**Total running time of the script:** ( 0 minutes  22.372 seconds)
+
+
+
+.. only :: html
+
+ .. container:: sphx-glr-footer
+
+
+  .. container:: sphx-glr-download
+
+     :download:`Download Python source code: plot_parallel_post_fit_scaling.py <plot_parallel_post_fit_scaling.py>`
+
+
+
+  .. container:: sphx-glr-download
+
+     :download:`Download Jupyter notebook: plot_parallel_post_fit_scaling.ipynb <plot_parallel_post_fit_scaling.ipynb>`
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -73,32 +73,11 @@
 
 # -- Options for HTML output -------------------------------------------------
 
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'alabaster'
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further.  For a list of options available for each theme, see the
-# documentation.
-#
-# html_theme_options = {}
+import sphinx_rtd_theme  # noqa
 
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself.  Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-# html_sidebars = {}
+html_theme = "sphinx_rtd_theme"
 
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
 
 # -- Options for HTMLHelp output ---------------------------------------------
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -10,6 +10,8 @@ Welcome to Dask-ML Benchmarks's documentation!
    :maxdepth: 2
    :caption: Contents:
 
+   auto_examples/plot_parallel_post_fit_scaling
+
 
 
 Indices and tables