Skip to content

Commit

Permalink
Add blocksize to task name (#6818)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsignell committed Nov 12, 2020
1 parent 5c3d7c0 commit 061613e
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
4 changes: 3 additions & 1 deletion dask/dataframe/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def text_blocks_to_pandas(
enforce=False,
specified_dtypes=None,
path=None,
blocksize=None,
):
"""Convert blocks of bytes to a dask.dataframe
Expand Down Expand Up @@ -380,7 +381,7 @@ def text_blocks_to_pandas(
# Create mask of first blocks from nested block_lists
is_first = tuple(block_mask(block_lists))

name = "read-csv-" + tokenize(reader, columns, enforce, head)
name = "read-csv-" + tokenize(reader, columns, enforce, head, blocksize)

if path:
block_file_names = [basename(b[1].path) for b in blocks]
Expand Down Expand Up @@ -604,6 +605,7 @@ def read_pandas(
enforce=enforce,
specified_dtypes=specified_dtypes,
path=path,
blocksize=blocksize,
)


Expand Down
8 changes: 8 additions & 0 deletions dask/dataframe/io/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from dask.utils import filetexts, filetext, tmpfile, tmpdir
from fsspec.compression import compr


fmt_bs = [(fmt, None) for fmt in compr] + [(fmt, 10) for fmt in compr]


Expand Down Expand Up @@ -827,6 +828,13 @@ def test_multiple_read_csv_has_deterministic_name():
assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)


def test_read_csv_has_different_names_based_on_blocksize():
    """Reading the same file with different blocksizes must produce
    distinct dask graph names (blocksize is part of the token)."""
    with filetext(csv_text) as path:
        small_blocks = dd.read_csv(path, blocksize="10kB")
        large_blocks = dd.read_csv(path, blocksize="20kB")
        assert small_blocks._name != large_blocks._name


def test_csv_with_integer_names():
with filetext("alice,1\nbob,2") as fn:
df = dd.read_csv(fn, header=None)
Expand Down

0 comments on commit 061613e

Please sign in to comment.