From 399ffbfc0ed991caed276e0c3ceb6e6bdd1251ab Mon Sep 17 00:00:00 2001
From: Matthew Rocklin
Date: Tue, 16 Apr 2019 13:47:19 -0500
Subject: [PATCH] Create dataframe best practices doc

This moves over the dataframe-performance doc, and updates it slightly.
---
 docs/source/conf.py                           |  1 +
 ...mance.rst => dataframe-best-practices.rst} | 40 +++++++++++++++----
 docs/source/dataframe.rst                     |  2 +-
 3 files changed, 35 insertions(+), 8 deletions(-)
 rename docs/source/{dataframe-performance.rst => dataframe-best-practices.rst} (88%)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 33269228ab8..3508e784bd0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -311,6 +311,7 @@
     ('array-overview.html', 'array.html'),
     ('array-ghost.html', 'array-overlap.html'),
     ('dataframe-overview.html', 'dataframe.html'),
+    ('dataframe-performance.html', 'dataframe-best-practices.html'),
     ('delayed-overview.html', 'delayed.html'),
     ('scheduler-choice.html', 'setup.html'),
     ('diagnostics.html', 'diagnostics-local.html'),
diff --git a/docs/source/dataframe-performance.rst b/docs/source/dataframe-best-practices.rst
similarity index 88%
rename from docs/source/dataframe-performance.rst
rename to docs/source/dataframe-best-practices.rst
index dda3ac6a5b7..d408d8620b4 100644
--- a/docs/source/dataframe-performance.rst
+++ b/docs/source/dataframe-best-practices.rst
@@ -1,7 +1,12 @@
 .. _dataframe.performance:

-Dask DataFrame Performance Tips
-===============================
+Best Practices
+==============
+
+It is easy to get started with Dask DataFrame, but using it *well* does require
+some experience. This page contains suggestions for best practices, and
+includes solutions to common problems.
+

 Use Pandas
 ----------
@@ -10,6 +15,22 @@ For data that fits into RAM, Pandas can often be faster and easier to use than
 Dask DataFrame. While "Big Data" tools can be exciting, they are almost always
 worse than normal data tools while those remain appropriate.

+
+Reduce, and then use Pandas
+---------------------------
+
+Similar to above, even if you have a large dataset there may be a point in your
+computation where you've reduced things to a more manageable level. You may
+want to switch to Pandas at this point.
+
+.. code-block:: python
+
+   df = dd.read_parquet('my-giant-file.parquet')
+   df = df[df.name == 'Alice']              # Select a subsection
+   result = df.groupby('id').value.mean()   # Reduce to a smaller size
+   result = result.compute()                # Convert to Pandas dataframe
+   result...                                # Continue working with Pandas
+
 Pandas Performance Tips Apply to Dask DataFrame
 -----------------------------------------------

@@ -36,8 +57,11 @@ it sparingly (see below):
    df.loc['2001-01-05':'2001-01-12']  # this is very fast if you have an index
    df.merge(df2, left_index=True, right_index=True)  # this is also very fast

-Avoid Shuffles
---------------
+For more information, see documentation on :ref:`dataframe partitions `.
+
+
+Avoid Full-Data Shuffling
+-------------------------

 Setting an index is an important but expensive operation (see above). You
 should do it infrequently and you should persist afterwards (see below).
@@ -146,7 +170,7 @@ using the ``repartition`` method:
    df = df[df.name == 'Alice']  # only 1/100th of the data
    df = df.repartition(npartitions=df.npartitions // 100)

-   df = client.persist(df)  # if on a distributed system
+   df = df.persist()  # if on a distributed system

 This helps to reduce overhead and increase the effectiveness of vectorized
 Pandas operations. You should aim for partitions that have around 100MB of
@@ -183,6 +207,8 @@ operation:
    dd.merge(a, b, left_index=True, right_on='id')  # half-fast, half-slow
    dd.merge(a, b, left_on='id', right_on='id')  # slow

+For more information see :doc:`Joins `.
+
 Store Data in Apache Parquet Format
 -----------------------------------

@@ -202,7 +228,7 @@ systems:
    df.to_parquet('path/to/my-results/')
    df = dd.read_parquet('path/to/my-results/')

-Dask supports reading parquet files with different engine implementations of
+Dask supports reading parquet files with different engine implementations of
 the Apache Parquet format for Python:

 .. code-block:: python
@@ -216,7 +242,7 @@ These libraries can be installed using:

    conda install fastparquet pyarrow -c conda-forge

-`fastparquet `_ is a Python-based
+`fastparquet `_ is a Python-based
 implementation that uses the `Numba `_
 Python-to-LLVM compiler. PyArrow is part of the `Apache Arrow
 `_ project and uses the `C++
diff --git a/docs/source/dataframe.rst b/docs/source/dataframe.rst
index dfe30e7aa91..e3e79167884 100644
--- a/docs/source/dataframe.rst
+++ b/docs/source/dataframe.rst
@@ -7,7 +7,7 @@ DataFrame

    dataframe-api.rst
    dataframe-create.rst
-   dataframe-performance.rst
+   dataframe-best-practices.rst
    dataframe-design.rst
    dataframe-groupby.rst
    dataframe-joins.rst
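For reference, the recommendations in the renamed document compose into a single workflow: read from Parquet, filter, repartition and persist the reduced data, then finish in Pandas. The sketch below is not part of the patch; the file name ``events.parquet``, its ``name``/``id``/``value`` columns, and the local ``dask.distributed`` client are illustrative assumptions.

.. code-block:: python

   import dask.dataframe as dd
   from dask.distributed import Client

   client = Client()                        # assumption: a local distributed scheduler

   df = dd.read_parquet('events.parquet')   # lazily read a large dataset
   df = df[df.name == 'Alice']              # filter down to a small fraction

   # Consolidate the now-small partitions and keep the result in memory
   df = df.repartition(npartitions=max(df.npartitions // 100, 1))
   df = df.persist()

   # Reduce further, then continue with the resulting Pandas object
   result = df.groupby('id').value.mean().compute()
   print(result.head())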