From a2f31bb475a4c060d54d1de5de444ba9b852c9e0 Mon Sep 17 00:00:00 2001 From: enstenr Date: Wed, 19 Nov 2025 19:05:11 +0000 Subject: [PATCH 1/4] docs: Fix typos and spacing issues in Dask bag documentation --- bag.ipynb | 44 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/bag.ipynb b/bag.ipynb index 7def4ac6..06b3648a 100644 --- a/bag.ipynb +++ b/bag.ipynb @@ -7,9 +7,9 @@ "# Dask Bags\n", "\n", "\n", - "Dask Bag implements operations like `map`, `filter`, `groupby` and aggregations on collections of Python objects. It does this in parallel and in small memory using Python iterators. It is similar to a parallel version of itertools or a Pythonic version of the PySpark RDD.\n", + "Dask Bag implements operations like `map`, `filter`, `groupby` and aggregations on collections of Python objects. It does this in parallel and with a small memory footprint using Python iterators. It is similar to a parallel version of itertools or a Pythonic version of the PySpark RDD.\n", "\n", - "Dask Bags are often used to do simple preprocessing on log files, JSON records, or other user defined Python objects.\n", + "Dask Bags are often used to do simple preprocessing on log files, JSON records, or other user-defined Python objects.\n", "\n", "Full API documentation is available here: http://docs.dask.org/en/latest/bag-api.html" ] @@ -23,14 +23,26 @@ "Starting the Dask Client is optional. It will provide a dashboard which \n", "is useful to gain insight on the computation. \n", "\n", - "The link to the dashboard will become visible when you create the client below. We recommend having it open on one side of your screen while using your notebook on the other side. This can take some effort to arrange your windows, but seeing them both at the same is very useful when learning." + "The link to the dashboard will become visible when you create the client below. 
We recommend having it open on one side of your screen while using your notebook on the other side. This can take some effort to arrange your windows, but seeing them both at the same time is very useful when learning." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'dask'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdistributed\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client, progress\n\u001b[32m 2\u001b[39m client = Client(n_workers=\u001b[32m4\u001b[39m, threads_per_worker=\u001b[32m1\u001b[39m)\n\u001b[32m 3\u001b[39m client\n", + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'dask'" + ] + } + ], "source": [ "from dask.distributed import Client, progress\n", "client = Client(n_workers=4, threads_per_worker=1)\n", @@ -68,7 +80,7 @@ "source": [ "## Read JSON data\n", "\n", - "Now that we have some JSON data in a file lets take a look at it with Dask Bag and Python JSON module." + "Now that we have some JSON data in a file let's take a look at it with Dask Bag and Python JSON module." ] }, { @@ -204,7 +216,7 @@ "\n", "Dask Bags are good for reading in initial data, doing a bit of pre-processing, and then handing off to some other more efficient form like Dask Dataframes. Dask Dataframes use Pandas internally, and so can be much faster on numeric data and also have more complex algorithms. 
\n", "\n", - "However, Dask Dataframes also expect data that is organized as flat columns. It does not support nested JSON data very well (Bag is better for this).\n", + "However, Dask Dataframes also expect data that is organized as flat columns. It does not support nested JSON data very well (Bag is better for this). For deeply nested data, consider flattening it or using Bag first, then converting to a Dask Dataframe.\n", "\n", "Here we make a function to flatten down our nested data structure, map that across our records, and then convert that to a Dask Dataframe." ] @@ -241,9 +253,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'b' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df = \u001b[43mb\u001b[49m.map(flatten).to_dataframe()\n\u001b[32m 2\u001b[39m df.head()\n", + "\u001b[31mNameError\u001b[39m: name 'b' is not defined" + ] + } + ], "source": [ "df = b.map(flatten).to_dataframe()\n", "df.head()" @@ -295,7 +319,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.12.3" } }, "nbformat": 4, From 2bf7827cce135ce077f3537c6bfbe0b6d82f4bd2 Mon Sep 17 00:00:00 2001 From: enstenr Date: Fri, 21 Nov 2025 12:43:57 +0000 Subject: [PATCH 2/4] removed the output cell that came up during execution --- bag.ipynb | 45 --------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/bag.ipynb b/bag.ipynb index 06b3648a..ab7a344a 100644 --- a/bag.ipynb +++ b/bag.ipynb @@ -26,29 +26,6 @@ "The link to the dashboard will become visible when 
you create the client below. We recommend having it open on one side of your screen while using your notebook on the other side. This can take some effort to arrange your windows, but seeing them both at the same time is very useful when learning." ] }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'dask'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdistributed\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client, progress\n\u001b[32m 2\u001b[39m client = Client(n_workers=\u001b[32m4\u001b[39m, threads_per_worker=\u001b[32m1\u001b[39m)\n\u001b[32m 3\u001b[39m client\n", - "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'dask'" - ] - } - ], - "source": [ - "from dask.distributed import Client, progress\n", - "client = Client(n_workers=4, threads_per_worker=1)\n", - "client" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -251,28 +228,6 @@ "b.map(flatten).take(1)" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'b' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> 
\u001b[39m\u001b[32m1\u001b[39m df = \u001b[43mb\u001b[49m.map(flatten).to_dataframe()\n\u001b[32m 2\u001b[39m df.head()\n", - "\u001b[31mNameError\u001b[39m: name 'b' is not defined" - ] - } - ], - "source": [ - "df = b.map(flatten).to_dataframe()\n", - "df.head()" - ] - }, { "cell_type": "markdown", "metadata": {}, From 67551af7d97d6ee8e772e7ff9c9aff9fdab3390e Mon Sep 17 00:00:00 2001 From: enstenr Date: Fri, 21 Nov 2025 12:52:58 +0000 Subject: [PATCH 3/4] Revert " removed the output cell that came up during execution" This reverts commit 2bf7827cce135ce077f3537c6bfbe0b6d82f4bd2. --- bag.ipynb | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/bag.ipynb b/bag.ipynb index ab7a344a..06b3648a 100644 --- a/bag.ipynb +++ b/bag.ipynb @@ -26,6 +26,29 @@ "The link to the dashboard will become visible when you create the client below. We recommend having it open on one side of your screen while using your notebook on the other side. This can take some effort to arrange your windows, but seeing them both at the same time is very useful when learning." 
] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'dask'", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdistributed\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client, progress\n\u001b[32m 2\u001b[39m client = Client(n_workers=\u001b[32m4\u001b[39m, threads_per_worker=\u001b[32m1\u001b[39m)\n\u001b[32m 3\u001b[39m client\n", + "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'dask'" + ] + } + ], + "source": [ + "from dask.distributed import Client, progress\n", + "client = Client(n_workers=4, threads_per_worker=1)\n", + "client" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -228,6 +251,28 @@ "b.map(flatten).take(1)" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'b' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mNameError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df = \u001b[43mb\u001b[49m.map(flatten).to_dataframe()\n\u001b[32m 2\u001b[39m df.head()\n", + "\u001b[31mNameError\u001b[39m: name 'b' is not defined" + ] + } + ], + "source": [ + "df = b.map(flatten).to_dataframe()\n", + "df.head()" + 
] + }, { "cell_type": "markdown", "metadata": {}, From 440d0fc2d21cc3b9be1c140c53f0214e1715eb24 Mon Sep 17 00:00:00 2001 From: enstenr Date: Fri, 21 Nov 2025 12:54:08 +0000 Subject: [PATCH 4/4] removed the output cell that came up during execution --- bag.ipynb | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/bag.ipynb b/bag.ipynb index 06b3648a..663baf4b 100644 --- a/bag.ipynb +++ b/bag.ipynb @@ -28,21 +28,9 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'dask'", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[4]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mdask\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdistributed\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Client, progress\n\u001b[32m 2\u001b[39m client = Client(n_workers=\u001b[32m4\u001b[39m, threads_per_worker=\u001b[32m1\u001b[39m)\n\u001b[32m 3\u001b[39m client\n", - "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'dask'" - ] - } - ], + "outputs": [], "source": [ "from dask.distributed import Client, progress\n", "client = Client(n_workers=4, threads_per_worker=1)\n", @@ -253,21 +241,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'b' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - 
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[5]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df = \u001b[43mb\u001b[49m.map(flatten).to_dataframe()\n\u001b[32m 2\u001b[39m df.head()\n", - "\u001b[31mNameError\u001b[39m: name 'b' is not defined" - ] - } - ], + "outputs": [], "source": [ "df = b.map(flatten).to_dataframe()\n", "df.head()"