Skip to content

bad marshal data (invalid reference) #2080

@jacobtomlinson

Description

@jacobtomlinson

I am intermittently getting the following error when using dask-kubernetes on an AWS based Pangeo deployment.

The cluster has 50 workers and the error seems to happen randomly.

I've had a google but have not found useful references for that error. Could someone please point me in the right direction?

distributed.scheduler - ERROR - error from worker tcp://100.96.102.208:45331: bad marshal data (invalid reference)
distributed.utils - ERROR - bad marshal data (invalid reference)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/distributed/utils.py", line 238, in f
    result[0] = yield make_coro()
  File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1055, in run
    value = future.result()
  File "/opt/conda/lib/python3.6/site-packages/tornado/concurrent.py", line 238, in result
    raise_exc_info(self._exc_info)
  File "<string>", line 4, in raise_exc_info
  File "/opt/conda/lib/python3.6/site-packages/tornado/gen.py", line 1063, in run
    yielded = self.gen.throw(*exc_info)
  File "/opt/conda/lib/python3.6/site-packages/distributed/client.py", line 1364, in _gather
    traceback)
  File "/opt/conda/lib/python3.6/site-packages/six.py", line 692, in reraise
    raise value.with_traceback(tb)
  File "/opt/conda/lib/python3.6/site-packages/distributed/protocol/pickle.py", line 59, in loads
    return pickle.loads(x)
  File "<frozen importlib._bootstrap>", line 971, in _find_and_load
  File "<frozen importlib._bootstrap>", line 955, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 665, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 779, in get_code
  File "<frozen importlib._bootstrap_external>", line 487, in _compile_bytecode
ValueError: bad marshal data (invalid reference)
distributed.scheduler - ERROR - error from worker tcp://100.96.102.210:34063: bad marshal data (invalid reference)
distributed.scheduler - ERROR - error from worker tcp://100.96.106.207:45867: bad marshal data (unknown type code)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<timed exec> in <module>()

~/eupheme/distributed_version/region_attn.py in load_process_data(startyear, endyear, eventyear, startmonth, endmonth, region, obs, modelroot, modelname, varn, varname_o, freq, lc)
    620 
    621 def load_process_data(startyear,endyear,eventyear,startmonth,endmonth,region,obs,modelroot,modelname,varn,varname_o,freq,lc):
--> 622     all_anom,nat_anom=loadandmask_cmip5da(startyear,endyear,region,obs,modelroot,modelname,varn,varname_o,freq)#GENERIC ATTRIBUTION load routine
    623 
    624     #Look at a single season. Event attribution example. Make sure this is within the model run period!

~/eupheme/distributed_version/region_attn.py in loadandmask_cmip5da(startyear, endyear, region, obs, modelroot, modelname, varn, varname_o, freq)
    523 def loadandmask_cmip5da(startyear,endyear,region,obs,modelroot,modelname,varn,varname_o,freq):
    524     all_ens=load_region_allforcings_ensemble(region,startyear,endyear,obs[0],modelroot,modelname,varn)#All forcings
--> 525     nat_ens=load_region_ensemble('historicalNat',region,startyear,endyear,obs[0],modelroot,modelname,varn,freq)#Natural forcings
    526 
    527     notlandmask=fl.get_landsea_mask(modelroot+modelname+'/sftlf_fx_'+modelname+'_historical_r0i0p0.nc',obs)

~/eupheme/distributed_version/region_attn.py in load_region_ensemble(forcinglabel, region, startyear, endyear, obs, modelroot, modelname, diagstr, freq, save)
    175 
    176     list_of_cubes = [regridit(i, i_memb) for i,i_memb in enumerate(uniquemembs)]
--> 177     regionens=dask.delayed(iris.cube.CubeList)(list_of_cubes).compute()
    178 
    179 

/opt/conda/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
    141         dask.base.compute
    142         """
--> 143         (result,) = compute(self, traverse=False, **kwargs)
    144         return result
    145 

/opt/conda/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    390     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    391                     else (None, a) for a in args]
--> 392     results = get(dsk, keys, **kwargs)
    393     results_iter = iter(results)
    394     return tuple(a if f is None else f(next(results_iter), *a)

/opt/conda/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
   2051                     should_rejoin = False
   2052             try:
-> 2053                 results = self.gather(packed, asynchronous=asynchronous)
   2054             finally:
   2055                 for f in futures.values():

/opt/conda/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
   1484             return self.sync(self._gather, futures, errors=errors,
   1485                              direct=direct, local_worker=local_worker,
-> 1486                              asynchronous=asynchronous)
   1487 
   1488     @gen.coroutine

/opt/conda/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
    606             return future
    607         else:
--> 608             return sync(self.loop, func, *args, **kwargs)
    609 
    610     def __repr__(self):

/opt/conda/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
    252             e.wait(10)
    253     if error[0]:
--> 254         six.reraise(*error[0])
    255     else:
    256         return result[0]

/opt/conda/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

/opt/conda/lib/python3.6/site-packages/distributed/utils.py in f()
    236             yield gen.moment
    237             thread_state.asynchronous = True
--> 238             result[0] = yield make_coro()
    239         except Exception as exc:
    240             logger.exception(exc)

/opt/conda/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1053 
   1054                     try:
-> 1055                         value = future.result()
   1056                     except Exception:
   1057                         self.had_exception = True

/opt/conda/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
    236         if self._exc_info is not None:
    237             try:
--> 238                 raise_exc_info(self._exc_info)
    239             finally:
    240                 self = None

/opt/conda/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)

/opt/conda/lib/python3.6/site-packages/tornado/gen.py in run(self)
   1061                     if exc_info is not None:
   1062                         try:
-> 1063                             yielded = self.gen.throw(*exc_info)
   1064                         finally:
   1065                             # Break up a reference to itself

/opt/conda/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
   1362                             six.reraise(type(exception),
   1363                                         exception,
-> 1364                                         traceback)
   1365                     if errors == 'skip':
   1366                         bad_keys.add(key)

/opt/conda/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
    690                 value = tp()
    691             if value.__traceback__ is not tb:
--> 692                 raise value.with_traceback(tb)
    693             raise value
    694         finally:

/opt/conda/lib/python3.6/site-packages/distributed/protocol/pickle.py in loads()
     57 def loads(x):
     58     try:
---> 59         return pickle.loads(x)
     60     except Exception:
     61         logger.info("Failed to deserialize %s", x[:10000], exc_info=True)

/opt/conda/lib/python3.6/importlib/_bootstrap.py in _find_and_load()

/opt/conda/lib/python3.6/importlib/_bootstrap.py in _find_and_load_unlocked()

/opt/conda/lib/python3.6/importlib/_bootstrap.py in _load_unlocked()

/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in exec_module()

/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in get_code()

/opt/conda/lib/python3.6/importlib/_bootstrap_external.py in _compile_bytecode()

ValueError: bad marshal data (invalid reference)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions