Added more

dask · May 17, 2018 · 1adcb4a · 1adcb4a
1 parent 00bf7b3
commit 1adcb4a
Show file tree

Hide file tree

Showing 18 changed files with 452 additions and 0 deletions.
diff --git a/dask_ml_benchmarks/plot_logistic_regression.py b/dask_ml_benchmarks/plot_logistic_regression.py
@@ -0,0 +1,41 @@
+"""
+.. _plot_logistic_regression_example.py:
+
+Logistic Regression Example
+===========================
+
+Compare how long scikit-learn's and dask-ml's ``LogisticRegression`` take
+to fit as the number of samples grows.
+"""
+from dask_ml.datasets import make_classification
+import pandas as pd
+
+from timeit import default_timer as tic
+import sklearn.linear_model
+import dask_ml.linear_model
+import seaborn as sns
+
+Ns = [2500, 5000, 7500, 10000]
+
+# (plot label, estimator class) pairs, fitted in this order for every size.
+estimators = [
+    ('Scikit-Learn', sklearn.linear_model.LogisticRegression),
+    ('dask-ml', dask_ml.linear_model.LogisticRegression),
+]
+
+timings = []
+
+for n in Ns:
+    # New synthetic dataset for every size, seeded with the size itself.
+    X, y = make_classification(n_samples=n, n_features=1_000, random_state=n,
+                               chunks=n // 20)
+    for label, estimator in estimators:
+        started = tic()
+        estimator().fit(X, y)
+        timings.append((label, n, tic() - started))
+
+
+# Long-form table of the measurements, plotted with one hue per method.
+df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
+sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
+               data=df, aspect=1.5)
diff --git a/dask_ml_benchmarks/plot_spectral_clustering.py b/dask_ml_benchmarks/plot_spectral_clustering.py
@@ -0,0 +1,39 @@
+"""
+.. _plot_spectral_clustering_example.py:
+
+Spectral Clustering Example
+===========================
+
+This example shows how dask-ml's ``SpectralClustering`` scales with the
+number of samples, compared to scikit-learn's implementation. The dask
+version uses an approximation to the affinity matrix, which avoids an
+expensive computation at the cost of some approximation error.
+"""
+from sklearn.datasets import make_circles
+import pandas as pd
+
+from timeit import default_timer as tic
+import sklearn.cluster
+import dask_ml.cluster
+import seaborn as sns
+
+Ns = [2500, 5000, 7500, 10000]
+
+timings = []
+for n in Ns:
+    # A new dataset is drawn for every size, seeded with the size itself.
+    # NOTE(review): noise=0.5 equals the gap between the two circle radii
+    # implied by factor=0.5, so the circles overlap heavily -- check
+    # whether 0.05 was intended.
+    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
+    t1 = tic()
+    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)
+    timings.append(('Scikit-Learn (exact)', n, tic() - t1))
+    t1 = tic()
+    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
+    timings.append(('dask-ml (approximate)', n, tic() - t1))
+
+
+df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
+sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
+               data=df, aspect=1.5)
diff --git a/docs/source/auto_examples/auto_examples_jupyter.zip b/docs/source/auto_examples/auto_examples_jupyter.zip
diff --git a/docs/source/auto_examples/auto_examples_python.zip b/docs/source/auto_examples/auto_examples_python.zip
diff --git a/docs/source/auto_examples/images/sphx_glr_plot_logistic_regression_001.png b/docs/source/auto_examples/images/sphx_glr_plot_logistic_regression_001.png
diff --git a/docs/source/auto_examples/images/sphx_glr_plot_spectral_clustering_001.png b/docs/source/auto_examples/images/sphx_glr_plot_spectral_clustering_001.png
diff --git a/docs/source/auto_examples/images/thumb/sphx_glr_plot_logistic_regression_thumb.png b/docs/source/auto_examples/images/thumb/sphx_glr_plot_logistic_regression_thumb.png
diff --git a/docs/source/auto_examples/images/thumb/sphx_glr_plot_spectral_clustering_thumb.png b/docs/source/auto_examples/images/thumb/sphx_glr_plot_spectral_clustering_thumb.png
diff --git a/docs/source/auto_examples/index.rst b/docs/source/auto_examples/index.rst
@@ -3,6 +3,46 @@
 
 
 
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="Comparison of scaling. ">
+
+.. only:: html
+
+    .. figure:: /auto_examples/images/thumb/sphx_glr_plot_logistic_regression_thumb.png
+
+        :ref:`sphx_glr_auto_examples_plot_logistic_regression.py`
+
+.. raw:: html
+
+    </div>
+
+
+.. toctree::
+   :hidden:
+
+   /auto_examples/plot_logistic_regression
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="This example shows how dask-ml&#x27;s ``SpectralClustering`` scales with the number of samples, comp...">
+
+.. only:: html
+
+    .. figure:: /auto_examples/images/thumb/sphx_glr_plot_spectral_clustering_thumb.png
+
+        :ref:`sphx_glr_auto_examples_plot_spectral_clustering.py`
+
+.. raw:: html
+
+    </div>
+
+
+.. toctree::
+   :hidden:
+
+   /auto_examples/plot_spectral_clustering
+
 .. raw:: html
 
     <div class="sphx-glr-thumbcontainer" tooltip="This example demonstrates :class:`dask_ml.wrappers.ParallelPostFit`. A :class:`sklearn.svm.SVC`...">

diff --git a/docs/source/auto_examples/plot_logistic_regression.ipynb b/docs/source/auto_examples/plot_logistic_regression.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nLogistic Regression Example\n===========================\n\nComparison of scaling.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from dask_ml.datasets import make_classification\nimport pandas as pd\n\nfrom timeit import default_timer as tic\nimport sklearn.linear_model\nimport dask_ml.linear_model\nimport seaborn as sns\n\nNs = [2500, 5000, 7500, 10000]\n\ntimings = []\n\nfor n in Ns:\n    X, y = make_classification(n_samples=n, n_features=1_000, random_state=n,\n                               chunks=n // 20)\n    t1 = tic()\n    sklearn.linear_model.LogisticRegression().fit(X, y)\n    timings.append(('Scikit-Learn', n, tic() - t1))\n    t1 = tic()\n    dask_ml.linear_model.LogisticRegression().fit(X, y)\n    timings.append(('dask-ml', n, tic() - t1))\n\n\ndf = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])\nsns.factorplot(x='Number of Samples', y='Fit Time', hue='method',\n               data=df, aspect=1.5)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/docs/source/auto_examples/plot_logistic_regression.py b/docs/source/auto_examples/plot_logistic_regression.py
@@ -0,0 +1,34 @@
+"""
+.. _plot_logistic_regression_example.py:
+
+Logistic Regression Example
+===========================
+
+Comparison of scaling.
+"""
+from dask_ml.datasets import make_classification
+import pandas as pd
+
+from timeit import default_timer as tic
+import sklearn.linear_model
+import dask_ml.linear_model
+import seaborn as sns
+
+Ns = [2500, 5000, 7500, 10000]
+
+timings = []
+
+for n in Ns:
+    X, y = make_classification(n_samples=n, n_features=1_000, random_state=n,
+                               chunks=n // 20)
+    t1 = tic()
+    sklearn.linear_model.LogisticRegression().fit(X, y)
+    timings.append(('Scikit-Learn', n, tic() - t1))
+    t1 = tic()
+    dask_ml.linear_model.LogisticRegression().fit(X, y)
+    timings.append(('dask-ml', n, tic() - t1))
+
+
+df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
+sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
+               data=df, aspect=1.5)
diff --git a/docs/source/auto_examples/plot_logistic_regression.py.md5 b/docs/source/auto_examples/plot_logistic_regression.py.md5
@@ -0,0 +1 @@
+34aaf9e462bc041dcf9c87ceacb96d0f
diff --git a/docs/source/auto_examples/plot_logistic_regression.rst b/docs/source/auto_examples/plot_logistic_regression.rst
@@ -0,0 +1,76 @@
+
+
+.. _sphx_glr_auto_examples_plot_logistic_regression.py:
+
+
+.. _plot_logistic_regression_example.py:
+
+Logistic Regression Example
+===========================
+
+Comparison of scaling.
+
+
+
+
+.. image:: /auto_examples/images/sphx_glr_plot_logistic_regression_001.png
+    :align: center
+
+
+
+
+
+.. code-block:: python
+
+    from dask_ml.datasets import make_classification
+    import pandas as pd
+
+    from timeit import default_timer as tic
+    import sklearn.linear_model
+    import dask_ml.linear_model
+    import seaborn as sns
+
+    Ns = [2500, 5000, 7500, 10000]
+
+    timings = []
+
+    for n in Ns:
+        X, y = make_classification(n_samples=n, n_features=1_000, random_state=n,
+                                   chunks=n // 20)
+        t1 = tic()
+        sklearn.linear_model.LogisticRegression().fit(X, y)
+        timings.append(('Scikit-Learn', n, tic() - t1))
+        t1 = tic()
+        dask_ml.linear_model.LogisticRegression().fit(X, y)
+        timings.append(('dask-ml', n, tic() - t1))
+
+
+    df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
+    sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
+                   data=df, aspect=1.5)
+
+**Total running time of the script:** ( 5 minutes  0.900 seconds)
+
+
+
+.. only :: html
+
+ .. container:: sphx-glr-footer
+
+
+  .. container:: sphx-glr-download
+
+     :download:`Download Python source code: plot_logistic_regression.py <plot_logistic_regression.py>`
+
+
+
+  .. container:: sphx-glr-download
+
+     :download:`Download Jupyter notebook: plot_logistic_regression.ipynb <plot_logistic_regression.ipynb>`
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.readthedocs.io>`_
diff --git a/docs/source/auto_examples/plot_spectral_clustering.ipynb b/docs/source/auto_examples/plot_spectral_clustering.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n\nSpectral Clustering Example\n===========================\n\nThis example shows how dask-ml's ``SpectralClustering`` scales with the\nnumber of samples, compared to scikit-learn's implementation. The dask\nversion uses an approximation to the affinity matrix, which avoids an\nexpensive computation at the cost of some approximation error.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.datasets import make_circles\nfrom sklearn.utils import shuffle\nimport pandas as pd\n\nfrom timeit import default_timer as tic\nimport sklearn.cluster\nimport dask_ml.cluster\nimport seaborn as sns\n\nNs = [2500, 5000, 7500, 10000]\nX, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)\nX, y = shuffle(X, y)\n\ntimings = []\nfor n in Ns:\n    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)\n    t1 = tic()\n    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)\n    timings.append(('Scikit-Learn (exact)', n, tic() - t1))\n    t1 = tic()\n    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)\n    timings.append(('dask-ml (approximate)', n, tic() - t1))\n\n\ndf = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])\nsns.factorplot(x='Number of Samples', y='Fit Time', hue='method',\n               data=df, aspect=1.5)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/docs/source/auto_examples/plot_spectral_clustering.py b/docs/source/auto_examples/plot_spectral_clustering.py
@@ -0,0 +1,38 @@
+"""
+.. _plot_spectral_clustering_example.py:
+
+Spectral Clustering Example
+===========================
+
+This example shows how dask-ml's ``SpectralClustering`` scales with the
+number of samples, compared to scikit-learn's implementation. The dask
+version uses an approximation to the affinity matrix, which avoids an
+expensive computation at the cost of some approximation error.
+"""
+from sklearn.datasets import make_circles
+from sklearn.utils import shuffle
+import pandas as pd
+
+from timeit import default_timer as tic
+import sklearn.cluster
+import dask_ml.cluster
+import seaborn as sns
+
+Ns = [2500, 5000, 7500, 10000]
+X, y = make_circles(n_samples=10_000, noise=0.05, random_state=0, factor=0.5)
+X, y = shuffle(X, y)
+
+timings = []
+for n in Ns:
+    X, y = make_circles(n_samples=n, random_state=n, noise=0.5, factor=0.5)
+    t1 = tic()
+    sklearn.cluster.SpectralClustering(n_clusters=2).fit(X)
+    timings.append(('Scikit-Learn (exact)', n, tic() - t1))
+    t1 = tic()
+    dask_ml.cluster.SpectralClustering(n_clusters=2, n_components=100).fit(X)
+    timings.append(('dask-ml (approximate)', n, tic() - t1))
+
+
+df = pd.DataFrame(timings, columns=['method', 'Number of Samples', 'Fit Time'])
+sns.factorplot(x='Number of Samples', y='Fit Time', hue='method',
+               data=df, aspect=1.5)
diff --git a/docs/source/auto_examples/plot_spectral_clustering.py.md5 b/docs/source/auto_examples/plot_spectral_clustering.py.md5
@@ -0,0 +1 @@
+649f44189a9a45d720be774f4d86979d