I am intermittently getting the following error when using dask-kubernetes on an AWS-based Pangeo deployment.
The cluster has 50 workers and the error seems to happen randomly.
I have searched online but have not found any useful references for this error. Could someone please point me in the right direction?
distributed.scheduler - ERROR - error from worker tcp://100.96.102.208:45331: bad marshal data (invalid reference)
distributed.utils - ERROR - bad marshal data (invalid reference)
Traceback (most recent call last):
File "/opt/conda/lib/python3.6/site-packages/distributed/utils.py", line 238, in f
result[0] = yield make_coro()
File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1055, in run
value = future.result()
File "/opt/conda/lib/python3.6/site-packages/tornado/concurrent.py", line 238, in result
raise_exc_info(self._exc_info)
File "<string>", line 4, in raise_exc_info
File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1063, in run
yielded = self.gen.throw(*exc_info)
File "/opt/conda/lib/python3.6/site-packages/distributed/client.py", line 1364, in _gather
traceback)
File "/opt/conda/lib/python3.6/site-packages/six.py", line 692, in reraise
raise value.with_traceback(tb)
File "/opt/conda/lib/python3.6/site-packages/distributed/protocol/pickle.py", line 59, in loads
return pickle.loads(x)
File "<frozen importlib._bootstrap>", line 971, in _find_and_load
File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 674, in exec_module
File "<frozen importlib._bootstrap_external>", line 779, in get_code
File "<frozen importlib._bootstrap_external>", line 487, in _compile_bytecode
ValueError: bad marshal data (invalid reference)
distributed.scheduler - ERROR - error from worker tcp://100.96.102.210:34063: bad marshal data (invalid reference)
distributed.scheduler - ERROR - error from worker tcp://100.96.106.207:45867: bad marshal data (unknown type code)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<timed exec> in <module>()
~/eupheme/distributed_version/region_attn.py in load_process_data(startyear, endyear, eventyear, startmonth, endmonth, region, obs, modelroot, modelname, varn, varname_o, freq, lc)
620
621 def load_process_data(startyear,endyear,eventyear,startmonth,endmonth,region,obs,modelroot,modelname,varn,varname_o,freq,lc):
--> 622 all_anom,nat_anom=loadandmask_cmip5da(startyear,endyear,region,obs,modelroot,modelname,varn,varname_o,freq)#GENERIC ATTRIBUTION load routine
623
624 #Look at a single season. Event attribution example. Make sure this is within the model run period!
~/eupheme/distributed_version/region_attn.py in loadandmask_cmip5da(startyear, endyear, region, obs, modelroot, modelname, varn, varname_o, freq)
523 def loadandmask_cmip5da(startyear,endyear,region,obs,modelroot,modelname,varn,varname_o,freq):
524 all_ens=load_region_allforcings_ensemble(region,startyear,endyear,obs[0],modelroot,modelname,varn)#All forcings
--> 525 nat_ens=load_region_ensemble('historicalNat',region,startyear,endyear,obs[0],modelroot,modelname,varn,freq)#Natural forcings
526
527 notlandmask=fl.get_landsea_mask(modelroot+modelname+'/sftlf_fx_'+modelname+'_historical_r0i0p0.nc',obs)
~/eupheme/distributed_version/region_attn.py in load_region_ensemble(forcinglabel, region, startyear, endyear, obs, modelroot, modelname, diagstr, freq, save)
175
176 list_of_cubes = [regridit(i, i_memb) for i,i_memb in enumerate(uniquemembs)]
--> 177 regionens=dask.delayed(iris.cube.CubeList)(list_of_cubes).compute()
178
179
/opt/conda/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
141 dask.base.compute
142 """
--> 143 (result,) = compute(self, traverse=False, **kwargs)
144 return result
145
/opt/conda/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
390 postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
391 else (None, a) for a in args]
--> 392 results = get(dsk, keys, **kwargs)
393 results_iter = iter(results)
394 return tuple(a if f is None else f(next(results_iter), *a)
/opt/conda/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
2051 should_rejoin = False
2052 try:
-> 2053 results = self.gather(packed, asynchronous=asynchronous)
2054 finally:
2055 for f in futures.values():
/opt/conda/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1484 return self.sync(self._gather, futures, errors=errors,
1485 direct=direct, local_worker=local_worker,
-> 1486 asynchronous=asynchronous)
1487
1488 @gen.coroutine
/opt/conda/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
606 return future
607 else:
--> 608 return sync(self.loop, func, *args, **kwargs)
609
610 def __repr__(self):
/opt/conda/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(10)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
/opt/conda/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/opt/conda/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
/opt/conda/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/opt/conda/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/opt/conda/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
/opt/conda/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
/opt/conda/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1362 six.reraise(type(exception),
1363 exception,
-> 1364 traceback)
1365 if errors == 'skip':
1366 bad_keys.add(key)
/opt/conda/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
/opt/conda/lib/python3.6/site-packages/distributed/protocol/pickle.py in loads()
57 def loads(x):
58 try:
---> 59 return pickle.loads(x)
60 except Exception:
61 logger.info("Failed to deserialize %s", x[:10000], exc_info=True)
/opt/conda/lib/python3.6/importlib/_bootstrap.py in _find_and_load()
/opt/conda/lib/python3.6/importlib/_bootstrap.py in _find_and_load_unlocked()
/opt/conda/lib/python3.6/importlib/_bootstrap.py in _load_unlocked()
/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in exec_module()
/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in get_code()
/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in _compile_bytecode()
ValueError: bad marshal data (invalid reference)
I am intermittently getting the following error when using dask-kubernetes on an AWS-based Pangeo deployment.
The cluster has 50 workers and the error seems to happen randomly.
I have searched online but have not found any useful references for this error. Could someone please point me in the right direction?