Error loading model from checkpoint on Apple M1 #446

Closed
ibulu opened this issue Apr 24, 2022 · 9 comments
ibulu commented Apr 24, 2022

I am trying to load a LongT5 model from a checkpoint and I'm getting the following error. Any help is much appreciated.

```

RuntimeError Traceback (most recent call last)
Input In [9], in <cell line: 1>()
----> 1 t5x_checkpoint = t5x.checkpoints.load_t5x_checkpoint(checkpoint_dir)

File ~/t5x/t5x/checkpoints.py:1594, in load_t5x_checkpoint(path, step, state_transformation_fns, remap, restore_dtype, lazy_parameters)
1592 if not lazy_parameters:
1593 future_state_dict = jax.tree_map(lambda x: x.get_async(), state_dict)
-> 1594 state_dict = _run_future_tree(future_state_dict)
1596 if restore_dtype is not None:
1597 state_dict['target'] = _cast(state_dict['target'], restore_dtype)

File ~/t5x/t5x/checkpoints.py:167, in _run_future_tree(future_tree)
165 # TODO(adarob): Use asyncio.run in py3.7+.
166 loop = asyncio.get_event_loop()
--> 167 leaves = loop.run_until_complete(asyncio.gather(*future_leaves))
168 return jax.tree_unflatten(treedef, leaves)

File ~/opt/miniconda3/lib/python3.9/asyncio/base_events.py:623, in BaseEventLoop.run_until_complete(self, future)
612 """Run until the Future is done.
613
614 If the argument is a coroutine, it is wrapped in a Task.
(...)
620 Return the Future's result, or raise its exception.
621 """
622 self._check_closed()
--> 623 self._check_running()
625 new_task = not futures.isfuture(future)
626 future = tasks.ensure_future(future, loop=self)

File ~/opt/miniconda3/lib/python3.9/asyncio/base_events.py:583, in BaseEventLoop._check_running(self)
581 def _check_running(self):
582 if self.is_running():
--> 583 raise RuntimeError('This event loop is already running')
584 if events._get_running_loop() is not None:
585 raise RuntimeError(
586 'Cannot run the event loop while another loop is running')

RuntimeError: This event loop is already running
```

adarob (Collaborator) commented Apr 24, 2022

Is this in a notebook?

ibulu (Author) commented Apr 24, 2022

> Is this in a notebook?

Yes.

adarob (Collaborator) commented Apr 24, 2022

See https://medium.com/@vyshali.enukonda/how-to-get-around-runtimeerror-this-event-loop-is-already-running-3f26f67e762e
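
The workaround described there is to patch the notebook's already-running event loop with nest_asyncio before calling `load_t5x_checkpoint`. A minimal sketch, assuming `nest_asyncio` is installed and `checkpoint_dir` is the checkpoint directory from the traceback above:

```python
# Sketch of the nest_asyncio workaround from the article above; assumes
# nest_asyncio is installed (pip install nest_asyncio) and that
# checkpoint_dir points at the t5x checkpoint directory.
import nest_asyncio
nest_asyncio.apply()  # allow run_until_complete() inside Jupyter's running loop

import t5x

t5x_checkpoint = t5x.checkpoints.load_t5x_checkpoint(checkpoint_dir)
```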

adarob closed this as completed Apr 24, 2022
ibulu (Author) commented Apr 25, 2022

Awesome, that worked! I am now running into the following issue:

```
ValueError Traceback (most recent call last)
Input In [5], in <cell line: 1>()
----> 1 t5x_checkpoint = t5x.checkpoints.load_t5x_checkpoint(checkpoint_dir)

File ~/t5x/t5x/checkpoints.py:1594, in load_t5x_checkpoint(path, step, state_transformation_fns, remap, restore_dtype, lazy_parameters)
1592 if not lazy_parameters:
1593 future_state_dict = jax.tree_map(lambda x: x.get_async(), state_dict)
-> 1594 state_dict = _run_future_tree(future_state_dict)
1596 if restore_dtype is not None:
1597 state_dict['target'] = _cast(state_dict['target'], restore_dtype)

File ~/t5x/t5x/checkpoints.py:167, in _run_future_tree(future_tree)
165 # TODO(adarob): Use asyncio.run in py3.7+.
166 loop = asyncio.get_event_loop()
--> 167 leaves = loop.run_until_complete(asyncio.gather(*future_leaves))
168 return jax.tree_unflatten(treedef, leaves)

File ~/opt/miniconda3/lib/python3.9/site-packages/nest_asyncio.py:89, in _patch_loop.<locals>.run_until_complete(self, future)
86 if not f.done():
87 raise RuntimeError(
88 'Event loop stopped before Future completed.')
---> 89 return f.result()

File ~/opt/miniconda3/lib/python3.9/asyncio/tasks.py:258, in Task.__step(failed resolving arguments)
256 result = coro.send(None)
257 else:
--> 258 result = coro.throw(exc)
259 except StopIteration as exc:
260 if self._must_cancel:
261 # Task is cancelled right before coro stops.

File ~/t5x/t5x/checkpoint_importer.py:114, in LazyAwaitableArray.get_async.<locals>._get_and_cast()
109 async def _get_and_cast():
110 # Pytype has a false positive here, where it treats our _get_fn (_read_ts
111 # in this case) as having a return type of np.ndarray instead of
112 # wrapping it in an Awaitable. Related to this bug
113 # google/pytype#527
--> 114 arr = await self._get_fn() # pytype: disable=bad-return-type
115 if arr.dtype != self.dtype:
116 arr = arr.astype(self.dtype)

File ~/t5x/t5x/checkpoints.py:1422, in _read_ts(param_info, maybe_tspec, ckpt_path, restore_dtype, mesh, axes)
1414 tmp_ts_spec_dict = {
1415 'base': tmp_ts_spec_dict,
1416 'driver': 'cast',
1417 'dtype': jnp.dtype(restore_dtype).name
1418 }
1420 if mesh is None or axes is None:
1421 # Read the array.
-> 1422 t = await ts.open(tmp_ts_spec_dict, open=True)
1423 if param_info.local_chunk_info is not None:
1424 # Just read the subsection we care about.
1425 t = t[param_info.local_chunk_info.slice]

File ~/opt/miniconda3/lib/python3.9/asyncio/futures.py:284, in Future.__await__(self)
282 if not self.done():
283 self._asyncio_future_blocking = True
--> 284 yield self # This tells Task to wait for completion.
285 if not self.done():
286 raise RuntimeError("await wasn't used with future")

File ~/opt/miniconda3/lib/python3.9/asyncio/tasks.py:328, in Task.__wakeup(self, future)
326 def __wakeup(self, future):
327 try:
--> 328 future.result()
329 except BaseException as exc:
330 # This may also be a cancellation.
331 self.__step(exc)

File ~/opt/miniconda3/lib/python3.9/asyncio/futures.py:201, in Future.result(self)
199 self.__log_traceback = False
200 if self._exception is not None:
--> 201 raise self._exception
202 return self._result

ValueError: Error opening "zarr" driver: Error reading local file "./longt5_base_transient_checkpoint_1000000/target.decoder.layers_0.encoder_decoder_attention.key.kernel/.zarray": Invalid key: "./longt5_base_transient_checkpoint_1000000/target.decoder.layers_0.encoder_decoder_attention.key.kernel/.zarray"
```

adarob (Collaborator) commented Apr 25, 2022

Are you sure that file exists?

ibulu (Author) commented Apr 25, 2022

Yep, the directory and files exist. I am not very familiar with the TensorStore library (I think that's what's being used here). Is there a quick test you can suggest that would help isolate the issue?
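
One quick probe (a sketch, not an official test; it assumes TensorStore's `zarr` driver over a `file` kvstore, which is what the traceback above shows) is to open a single parameter's array directly with TensorStore, bypassing t5x, and compare relative vs. absolute paths:

```python
# Hypothetical probe: open one parameter's zarr array directly with
# TensorStore to see whether the path itself is the problem.
import os
import tensorstore as ts

param_dir = ('longt5_base_transient_checkpoint_1000000/'
             'target.decoder.layers_0.encoder_decoder_attention.key.kernel')

spec = {
    'driver': 'zarr',
    'kvstore': {
        'driver': 'file',
        # An absolute path; swapping in the relative form reproduces the
        # "Invalid key" error on some setups.
        'path': os.path.abspath(param_dir),
    },
}

arr = ts.open(spec, open=True).result()
print(arr.shape, arr.dtype)
```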

ibulu (Author) commented Apr 25, 2022

Giving the full (absolute) path instead of a relative path solved the issue!
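
For reference, a minimal sketch of that fix, assuming the directory name from the earlier traceback:

```python
import os
import t5x

# Resolve the checkpoint directory to an absolute path before loading;
# the relative form ("./longt5_base_...") triggered the "Invalid key" error above.
checkpoint_dir = os.path.abspath('longt5_base_transient_checkpoint_1000000')
t5x_checkpoint = t5x.checkpoints.load_t5x_checkpoint(checkpoint_dir)
```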

kunfang98927 commented Apr 26, 2022

Which version of TensorStore did you use? I ran into a similar problem when saving a checkpoint. My TensorStore version is 0.1.19.

At first I used a relative path and got the same error you mentioned above:

ValueError: Error opening "zarr" driver: Error reading local file "./pretrain_model/checkpoint_5000.tmp-1650694933/state.param_states.decoder.decoder_norm.scale.v/.zarray": Invalid key: "./pretrain_model/checkpoint_5000.tmp-1650694933/state.param_states.decoder.decoder_norm.scale.v/.zarray"
In call to configurable 'train' (<function train at 0x7fa6818348c0>)

Then I changed the path to an absolute path and that issue was solved, but a new one occurred.

The error happened after training for 100 steps, when saving the checkpoint to the absolute path '/gpfsnyu/scratch/kf2395/jukemir_t5/pretrain/'. I tried deleting all the files in that directory and training again, but the issue persisted.

The error message:

```
I0426 13:05:36.808195 140074531202880 train.py:516] Epoch 0 of 10000
I0426 13:05:36.808564 140055117031168 logging_writer.py:48] [0] collection=train timing/compilation_seconds=160.272345
I0426 13:05:36.828166 140074531202880 train.py:522] BEGIN Train loop.
I0426 13:05:36.828350 140074531202880 train.py:527] Training for 100 steps.
I0426 13:05:36.833504 140074531202880 trainer.py:517] Training: step 0
I0426 13:05:47.585027 140074531202880 trainer.py:517] Training: step 12
I0426 13:05:58.556400 140074531202880 trainer.py:517] Training: step 23
I0426 13:06:09.237899 140074531202880 trainer.py:517] Training: step 34
I0426 13:06:19.734536 140074531202880 trainer.py:517] Training: step 45
I0426 13:06:30.668152 140074531202880 trainer.py:517] Training: step 56
I0426 13:06:41.496444 140074531202880 trainer.py:517] Training: step 67
I0426 13:06:52.412244 140074531202880 trainer.py:517] Training: step 78
I0426 13:07:03.236425 140074531202880 trainer.py:517] Training: step 89
I0426 13:07:13.692245 140074531202880 train.py:550] END Train loop.
I0426 13:07:13.727353 140055117031168 logging_writer.py:48] [100] collection=train accuracy=0.12926435470581055, cross_ent_loss=3456.254063, cross_ent_loss_per_all_target_tokens=0.337525, learning_rate=0.001000000280328095, learning_rate/current=0.0010000000474974513, loss=3460.679688, loss_per_all_target_tokens=0.337957, loss_per_nonpadding_target_token=5.071336, nonpadding_fraction=0.066641, timing/seconds=96.861853, timing/seqs=1000, timing/seqs_per_second=10.323982, timing/seqs_per_second_per_core=10.323982, timing/steps_per_second=1.032398, timing/target_tokens_per_second=10571.757297, timing/target_tokens_per_second_per_core=10571.757297, z_loss=4.426097, z_loss_per_all_target_tokens=0.000432
I0426 13:07:13.728666 140074531202880 train.py:565] Saving checkpoint.
I0426 13:07:13.730171 140074531202880 checkpoints.py:631] Saving checkpoint for step 100 to /gpfsnyu/scratch/kf2395/jukemir_t5/pretrain/checkpoint_100.tmp-1650949633
Traceback (most recent call last):
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/runpy.py", line 193, in _run_module_as_main
"main", mod_spec)
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/train.py", line 663, in
gin_utils.run(main)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/gin_utils.py", line 107, in run
flags_parser=lambda a: app.parse_flags_with_usage(rewrite_gin_args(a)))
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/site-packages/absl/app.py", line 312, in run
_run_main(main, args)
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main
sys.exit(main(argv))
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/train.py", line 641, in main
_main(argv)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/train.py", line 661, in _main
train_using_gin()
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/site-packages/gin/config.py", line 1605, in gin_wrapper
utils.augment_exception_message_and_reraise(e, err_str)
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/site-packages/gin/utils.py", line 41, in augment_exception_message_and_reraise
raise proxy.with_traceback(exception.__traceback__) from None
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/site-packages/gin/config.py", line 1582, in gin_wrapper
return fn(*new_args, **new_kwargs)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/train.py", line 568, in train
checkpoint_cfg.save.state_transformation_fns)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/checkpoints.py", line 639, in save
tmp_dir, train_state, concurrent_gb, state_transformation_fns)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/checkpoints.py", line 806, in _write_state_to_tensorstore
written_state_dict = _run_future_tree(future_written_state)
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/checkpoints.py", line 167, in _run_future_tree
leaves = loop.run_until_complete(asyncio.gather(*future_leaves))
File "/gpfsnyu/scratch/kf2395/.cache/env/tf2-gpu-py3.7/lib/python3.7/asyncio/base_events.py", line 587, in run_until_complete
return future.result()
File "/gpfsnyu/scratch/kf2395/jukemir_t5/t5x/checkpoints.py", line 770, in _write_array
'limit': 128
ValueError: Error opening "zarr" driver: Error writing local file "/gpfsnyu/scratch/kf2395/jukemir_t5/pretrain/checkpoint_100.tmp-1650949633/state.param_states.decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray": Failed to acquire lock on file: /gpfsnyu/scratch/kf2395/jukemir_t5/pretrain/checkpoint_100.tmp-1650949633/state.param_states.decoder.layers_0.pre_cross_attention_layer_norm.scale.v/.zarray.__lock [OS error: Invalid argument]
In call to configurable 'train' (<function train at 0x7f651e1e78c0>)
```

ibulu (Author) commented Apr 30, 2022

I am using TensorStore version 0.1.18.
