Skip to content

Commit

Permalink
Add blocksize to task name (#6818)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsignell committed Nov 12, 2020
1 parent 5c3d7c0 commit 061613e
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
4 changes: 3 additions & 1 deletion dask/dataframe/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ def text_blocks_to_pandas(
enforce=False,
specified_dtypes=None,
path=None,
blocksize=None,
):
"""Convert blocks of bytes to a dask.dataframe
Expand Down Expand Up @@ -380,7 +381,7 @@ def text_blocks_to_pandas(
# Create mask of first blocks from nested block_lists
is_first = tuple(block_mask(block_lists))

name = "read-csv-" + tokenize(reader, columns, enforce, head)
name = "read-csv-" + tokenize(reader, columns, enforce, head, blocksize)

if path:
block_file_names = [basename(b[1].path) for b in blocks]
Expand Down Expand Up @@ -604,6 +605,7 @@ def read_pandas(
enforce=enforce,
specified_dtypes=specified_dtypes,
path=path,
blocksize=blocksize,
)


Expand Down
8 changes: 8 additions & 0 deletions dask/dataframe/io/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from dask.utils import filetexts, filetext, tmpfile, tmpdir
from fsspec.compression import compr


fmt_bs = [(fmt, None) for fmt in compr] + [(fmt, 10) for fmt in compr]


Expand Down Expand Up @@ -827,6 +828,13 @@ def test_multiple_read_csv_has_deterministic_name():
assert sorted(a.dask.keys(), key=str) == sorted(b.dask.keys(), key=str)


def test_read_csv_has_different_names_based_on_blocksize():
    """Reading the same file with different blocksizes must produce
    distinct dask graph names (blocksize is part of the token)."""
    with filetext(csv_text) as path:
        small_blocks = dd.read_csv(path, blocksize="10kB")
        large_blocks = dd.read_csv(path, blocksize="20kB")
        assert small_blocks._name != large_blocks._name


def test_csv_with_integer_names():
with filetext("alice,1\nbob,2") as fn:
df = dd.read_csv(fn, header=None)
Expand Down

0 comments on commit 061613e

Please sign in to comment.