---------------------------------------------------------------------------
KilledWorker Traceback (most recent call last)
/tmp/ipykernel_29/1086427464.py in <module>
38 ddf = ddf.groupby(by=group_by_columns)
39 ddf = ddf.mean()
---> 40 ddf.compute()
41
42 # Equivalent computation using bags
/usr/local/lib/python3.9/site-packages/dask/base.py in compute(self, **kwargs)
286 dask.base.compute
287 """
--> 288 (result,) = compute(self, traverse=False, **kwargs)
289 return result
290
/usr/local/lib/python3.9/site-packages/dask/base.py in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
569 postcomputes.append(x.__dask_postcompute__())
570
--> 571 results = schedule(dsk, keys, **kwargs)
572 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
573
/usr/local/lib/python3.9/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2723 should_rejoin = False
2724 try:
-> 2725 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2726 finally:
2727 for f in futures.values():
/usr/local/lib/python3.9/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1978 else:
1979 local_worker = None
-> 1980 return self.sync(
1981 self._gather,
1982 futures,
/usr/local/lib/python3.9/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
866 return future
867 else:
--> 868 return sync(
869 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
870 )
/usr/local/lib/python3.9/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
330 if error[0]:
331 typ, exc, tb = error[0]
--> 332 raise exc.with_traceback(tb)
333 else:
334 return result[0]
/usr/local/lib/python3.9/site-packages/distributed/utils.py in f()
313 if callback_timeout is not None:
314 future = asyncio.wait_for(future, callback_timeout)
--> 315 result[0] = yield future
316 except Exception:
317 error[0] = sys.exc_info()
/usr/local/lib/python3.9/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/usr/local/lib/python3.9/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1843 exc = CancelledError(key)
1844 else:
-> 1845 raise exception.with_traceback(traceback)
1846 raise exc
1847 if errors == "skip":
KilledWorker: ("('dataframe-groupby-sum-combine-615fc68191b82bd112a35cf039ec8c0c', 3, 1, 0)", <WorkerState 'tcp://10.95.251.71:9000', name: dask-stress-worker-1273515da29b4d70a26965b1fd0b589d-58576d46229, status: closed, memory: 0, processing: 3>)
What happened:
KilledWorker exception for a rather small computation using dask dataframes.
We schedule a dask computation based on simulated data as part of our integration tests. In our test code, we create 1k partitions filled with random data, each with a fixed size of 7k rows and 50 columns (see the example below). The worker appears to run out of memory, and the task is retried unsuccessfully on a different worker a couple of times until a KilledWorker exception is raised.
What you expected to happen:
The computation succeeds, especially because the partition size seems to be small enough and the workers have enough memory (more information on that below).
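For reference, a rough size estimate of the simulated data, assuming float64 values (the dtype is an assumption, it is not stated above):

```python
# Back-of-the-envelope estimate, assuming float64 values.
rows_per_partition = 7_000
n_columns = 50
n_partitions = 1_000
bytes_per_value = 8  # float64

partition_mb = rows_per_partition * n_columns * bytes_per_value / 1e6
total_gb = partition_mb * n_partitions / 1e3
print(f"~{partition_mb:.1f} MB per partition, ~{total_gb:.1f} GB in total")
# -> ~2.8 MB per partition, ~2.8 GB in total, versus 10 workers with 32 GB each
```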
Minimal Complete Verifiable Example:
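A minimal sketch of the computation, reconstructed from the description above and the user code visible in the traceback (groupby, then mean, then compute). The column names, group-by keys, key cardinality, partition construction, and scheduler address are assumptions made for illustration.

```python
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask import delayed
from distributed import Client

client = Client("tcp://<scheduler-address>:8786")  # assumed: existing cluster

n_partitions = 1_000        # 1k partitions
rows_per_partition = 7_000  # 7k rows each
n_value_columns = 48        # plus 2 key columns = 50 columns total
group_by_columns = ["key_0", "key_1"]  # assumed group-by keys


def make_partition(i: int) -> pd.DataFrame:
    """One partition of simulated data: 2 integer key columns + 48 random float columns."""
    rng = np.random.default_rng(i)
    df = pd.DataFrame(
        rng.random((rows_per_partition, n_value_columns)),
        columns=[f"col_{j}" for j in range(n_value_columns)],
    )
    df["key_0"] = rng.integers(0, 100, size=rows_per_partition)
    df["key_1"] = rng.integers(0, 100, size=rows_per_partition)
    return df


# Build the dask dataframe from delayed partitions
ddf = dd.from_delayed(
    [delayed(make_partition)(i) for i in range(n_partitions)],
    meta=make_partition(0).head(0),
)

ddf = ddf.groupby(by=group_by_columns)
ddf = ddf.mean()
ddf.compute()  # raises KilledWorker as shown in the traceback
```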
Anything else we need to know?:
In the worker logs, before the KilledWorker exception, we see messages such as:

distributed.utils_comm - INFO - Retrying get_data_from_worker after exception in attempt 3/5: Timed out during handshake while connecting to tcp://172.20.10.118:9000 after 60 s

Things we have tried to debug/mitigate the issue:
--> result doesn't change: the worker still goes out of memory and finally a KilledWorker exception is raised.
Environment:
10 Dask workers with 32 GB memory each, deployed on a Kubernetes cluster with Istio as the service mesh
Operating system: debian9 / debian10
Cluster Dump State:
The computation fails, and we can't seem to retrieve the cluster dump after that.
Traceback:
Logs:
Screenshots of dask dashboard: