You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When trying to use map_partitions and passing GeoDataFrames as an argument I run into inconsistent behaviour of DASK when the DASK DataFrame is created from delayed. When passing a raw DataFrame it throws an error about unknown divisions; when first scattering this DataFrame and only then passing it as an argument, everything works as (un)expected. Sorry for the long description, but I spent quite a long time trying to figure out what went wrong after the DASK upgrade (from 0.17)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-30-5088b0035b38> in <module>()
1 ddf[['port_id', 'anch_id', 'fac_id', 'berth_id', 'boy_id']] = ddf.map_partitions(
2 find_fids, harbours, meta=pd.DataFrame({'port_id':[-1], 'anch_id':[-1], 'fac_id':[-1],
----> 3 'berth_id':[-1], 'boy_id':[-1]}))
4 df = ddf.compute()
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/dataframe/core.py in map_partitions(self, func, *args, **kwargs)
579 >>> ddf.map_partitions(func).clear_divisions() # doctest: +SKIP
580 """
--> 581 return map_partitions(func, self, *args, **kwargs)
582
583 @insert_meta_param_description(pad=12)
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/dataframe/core.py in map_partitions(func, *args, **kwargs)
3614 from .multi import _maybe_align_partitions
3615 args = _maybe_from_pandas(args)
-> 3616 args = _maybe_align_partitions(args)
3617
3618 if meta is no_default:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/dataframe/multi.py in _maybe_align_partitions(args)
145 divisions = dfs[0].divisions
146 if not all(df.divisions == divisions for df in dfs):
--> 147 dfs2 = iter(align_partitions(*dfs)[0])
148 return [a if not isinstance(a, _Frame) else next(dfs2) for a in args]
149 return args
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/dataframe/multi.py in align_partitions(*dfs)
101 raise ValueError("dfs contains no DataFrame and Series")
102 if not all(df.known_divisions for df in dfs1):
--> 103 raise ValueError("Not all divisions are known, can't align "
104 "partitions. Please use `set_index` "
105 "to set the index.")
ValueError: Not all divisions are known, can't align partitions. Please use `set_index` to set the index.
If I try to set index (adding) the following line
ddf = client.persist(ddf.reset_index().set_index('index'))
it takes forever and when killed gives the following trace
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-12-2911203558a9> in <module>()
----> 1 ddf = client.persist(ddf.reset_index().set_index('index'))
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in persist(self, collections, optimize_graph, workers, allow_other_workers, resources, retries, priority, fifo_timeout, actors, **kwargs)
2469 user_priority=priority,
2470 fifo_timeout=fifo_timeout,
-> 2471 actors=actors)
2472
2473 postpersists = [c.__dask_postpersist__() for c in collections]
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
2109 dsk = dask.optimization.inline(dsk, keys=values)
2110
-> 2111 d = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
2112 extra_futures = set.union(*[v[1] for v in d.values()]) if d else set()
2113 extra_keys = {tokey(future.key) for future in extra_futures}
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in <dictcomp>(.0)
2109 dsk = dask.optimization.inline(dsk, keys=values)
2110
-> 2111 d = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
2112 extra_futures = set.union(*[v[1] for v in d.values()]) if d else set()
2113 extra_keys = {tokey(future.key) for future in extra_futures}
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
173 if myset is None:
174 myset = set()
--> 175 out = unpack_remotedata(o, byte_keys, myset)
176 return out, myset
177
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
176 return out, myset
177
--> 178 typ = type(o)
179
180 if typ in collection_types:
if done without persist, i.e. ddf = ddf.reset_index().set_index('index')
then it got stuck on map_partitions with an unscattered geodataframe and, when killed, produces the following trace:
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-14-5088b0035b38> in <module>()
2 find_fids, harbours, meta=pd.DataFrame({'port_id':[-1], 'anch_id':[-1], 'fac_id':[-1],
3 'berth_id':[-1], 'boy_id':[-1]}))
----> 4 df = ddf.compute()
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
393 keys = [x.__dask_keys__() for x in collections]
394 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 395 results = schedule(dsk, keys, **kwargs)
396 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
397
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, **kwargs)
2202 fifo_timeout=fifo_timeout,
2203 retries=retries,
-> 2204 user_priority=priority,
2205 )
2206 packed = pack_data(keys, futures)
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
2107 and k not in keyset}
2108 if values:
-> 2109 dsk = dask.optimization.inline(dsk, keys=values)
2110
2111 d = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/optimization.py in inline(dsk, keys, inline_constants, dependencies)
249 if dependencies is None:
250 dependencies = {k: get_dependencies(dsk, k)
--> 251 for k in dsk}
252
253 if inline_constants:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/optimization.py in <dictcomp>(.0)
249 if dependencies is None:
250 dependencies = {k: get_dependencies(dsk, k)
--> 251 for k in dsk}
252
253 if inline_constants:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/core.py in get_dependencies(dsk, key, task, as_list)
199 new_work = []
200 for w in work:
--> 201 typ = type(w)
202 if typ is tuple and w and callable(w[0]): # istask(w)
203 new_work += w[1:]
KeyboardInterrupt:
I ran with a scattered geodataframe; it then got stuck on map_partitions, consumed all the memory (in the real-life example), and when killed produced the following trace:
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-12-9df5c63458da> in <module>()
2 find_fids, sharbours, meta=pd.DataFrame({'port_id':[-1], 'anch_id':[-1], 'fac_id':[-1],
3 'berth_id':[-1], 'boy_id':[-1]}))
----> 4 df = ddf.compute()
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
/anaconda3/envs/mariquant/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
393 keys = [x.__dask_keys__() for x in collections]
394 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 395 results = schedule(dsk, keys, **kwargs)
396 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
397
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, **kwargs)
2202 fifo_timeout=fifo_timeout,
2203 retries=retries,
-> 2204 user_priority=priority,
2205 )
2206 packed = pack_data(keys, futures)
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout, actors)
2109 dsk = dask.optimization.inline(dsk, keys=values)
2110
-> 2111 d = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
2112 extra_futures = set.union(*[v[1] for v in d.values()]) if d else set()
2113 extra_keys = {tokey(future.key) for future in extra_futures}
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/client.py in <dictcomp>(.0)
2109 dsk = dask.optimization.inline(dsk, keys=values)
2110
-> 2111 d = {k: unpack_remotedata(v, byte_keys=True) for k, v in dsk.items()}
2112 extra_futures = set.union(*[v[1] for v in d.values()]) if d else set()
2113 extra_keys = {tokey(future.key) for future in extra_futures}
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
173 if myset is None:
174 myset = set()
--> 175 out = unpack_remotedata(o, byte_keys, myset)
176 return out, myset
177
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in <listcomp>(.0)
181 if not o:
182 return o
--> 183 outs = [unpack_remotedata(item, byte_keys, myset) for item in o]
184 return type(o)(outs)
185 elif typ is dict:
/anaconda3/envs/mariquant/lib/python3.6/site-packages/distributed/utils_comm.py in unpack_remotedata(o, byte_keys, myset)
176 return out, myset
177
--> 178 typ = type(o)
179
180 if typ in collection_types:
KeyboardInterrupt:
The text was updated successfully, but these errors were encountered:
passing GeoPandas as an argument for map_partitions was working fine at least in 0.17.x. It seems that this was working for quite a while, as there are numerous examples of map_partitions with geodataframes (not necessarily as arguments) literally all over the net.
passing Geo DataFrame with the from_pandas dask DataFrame (in the toy example) is still working,
map_partitions with a scattered GeoDataFrame works in the toy example, and I almost managed to make it work in real life.
Just for the cross-reference - my answer to your comment on probably another issue #3972 (comment)
When trying to use map_partitions and passing GeoDataFrames as an argument I run into inconsistent behaviour of DASK when the DASK DataFrame is created from delayed. When passing a raw DataFrame it throws an error about unknown divisions; when first scattering this DataFrame and only then passing it as an argument, everything works as (un)expected. Sorry for the long description, but I spent quite a long time trying to figure out what went wrong after the DASK upgrade (from 0.17)
Works fine. However, if the last part is only
then I receive the following error
If I try to set index (adding) the following line
ddf = client.persist(ddf.reset_index().set_index('index'))
it takes forever and when killed gives the following trace
if done without persist, i.e.
ddf = ddf.reset_index().set_index('index')
then it got stuck on map_partitions with an unscattered geodataframe and, when killed, produces the following trace:
I ran with a scattered geodataframe; it then got stuck on map_partitions, consumed all the memory (in the real-life example), and when killed produced the following trace:
The text was updated successfully, but these errors were encountered: