What was working until just recently started crashing when using a ZeRO-3 config:
Traceback (most recent call last):
  File "/mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/seq2seq/run_seq2seq.py", line 650, in <module>
    main()
  File "/mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/seq2seq/run_seq2seq.py", line 590, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/mnt/nvme1/code/huggingface/transformers-ds-zero-3/src/transformers/trainer.py", line 1094, in train
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch)
  File "/mnt/nvme1/code/huggingface/transformers-ds-zero-3/src/transformers/trainer.py", line 1185, in _maybe_log_save_evaluate
    self._save_checkpoint(model, trial, metrics=metrics)
  File "/mnt/nvme1/code/huggingface/transformers-ds-zero-3/src/transformers/trainer.py", line 1212, in _save_checkpoint
    self.deepspeed.save_checkpoint(output_dir)
  File "/mnt/nvme1/code/github/00optimize/DeepSpeed/deepspeed/runtime/engine.py", line 1595, in save_checkpoint
    self._save_zero_checkpoint(save_dir, tag)
  File "/mnt/nvme1/code/github/00optimize/DeepSpeed/deepspeed/runtime/engine.py", line 1667, in _save_zero_checkpoint
    zero_sd = {'optimizer_state_dict': self.optimizer.state_dict()}
  File "/mnt/nvme1/code/github/00optimize/DeepSpeed/deepspeed/runtime/zero/stage3.py", line 2664, in state_dict
    return self._rigid_state_dict()
  File "/mnt/nvme1/code/github/00optimize/DeepSpeed/deepspeed/runtime/zero/stage3.py", line 2642, in _rigid_state_dict
    state_dict['optimizer_state_dict'] = self.optimizer.state_dict()
  File "/home/stas/anaconda3/envs/main-38/lib/python3.8/site-packages/torch/optim/optimizer.py", line 119, in state_dict
    packed_state = {(param_mappings[id(k)] if isinstance(k, torch.Tensor) else k): v
  File "/home/stas/anaconda3/envs/main-38/lib/python3.8/site-packages/torch/optim/optimizer.py", line 119, in <dictcomp>
    packed_state = {(param_mappings[id(k)] if isinstance(k, torch.Tensor) else k): v
KeyError: 140171684780672
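For reference on what this KeyError means (the mechanism, not necessarily the root cause here): `torch.optim.Optimizer.state_dict()` assigns indices only to the tensors currently listed in `param_groups`, then remaps `self.state` by `id()`; any state entry keyed by a tensor that is no longer in a param group has no mapping, which raises exactly this error. A minimal standalone sketch of that mechanism follows (the `stray` tensor is an illustrative stand-in for however stage 3 ends up with an untracked param, not taken from the repro):

```python
import torch

# state_dict() numbers only the params in param_groups; any tensor present
# in optimizer.state but absent from param_groups has no entry in
# param_mappings, so the dict comprehension fails with KeyError: id(tensor).
param = torch.zeros(2, requires_grad=True)
opt = torch.optim.AdamW([param])

# Simulate state keyed by a tensor the optimizer no longer tracks,
# e.g. after params were swapped/partitioned behind its back.
stray = torch.zeros(2, requires_grad=True)
opt.state[stray] = {"step": 0}

try:
    opt.state_dict()
except KeyError as e:
    print(f"KeyError: {e}")  # prints id() of the untracked tensor
```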
This happens with 1 or 2 GPUs; this particular run was with one GPU, on the transformers example (the crash fires at the first checkpoint save, per --save_steps 2):
PYTHONPATH=src deepspeed --num_gpus 1 \
  /mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/seq2seq/run_seq2seq.py \
  --model_name_or_path t5-small \
  --train_file /mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/test_data/wmt_en_ro/train.json \
  --validation_file /mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/test_data/wmt_en_ro/val.json \
  --output_dir /tmp/zero3 --overwrite_output_dir \
  --max_train_samples 10 --max_val_samples 10 \
  --max_source_length 12 --max_target_length 12 --val_max_target_length 12 \
  --do_train --do_eval --num_train_epochs 1 \
  --per_device_train_batch_size 2 \
  --learning_rate 3e-3 --warmup_steps 8 \
  --predict_with_generate \
  --logging_steps 0 --save_steps 2 --eval_steps 1 \
  --group_by_length --label_smoothing_factor 0.1 --adafactor \
  --dataset_name wmt16 --dataset_config ro-en \
  --task translation_en_to_ro \
  --source_prefix "translate English to Romanian: " \
  --deepspeed /mnt/nvme1/code/huggingface/transformers-ds-zero-3/examples/tests/deepspeed/ds_config_zero3.json
Config used:
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 1,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "zero_optimization": {
        "stage": 3,
        "cpu_offload": true,
        "cpu_offload_params": true,
        "cpu_offload_use_pin_memory": true,
        "overlap_comm": true,
        "contiguous_gradients": true,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e8,
        "stage3_prefetch_bucket_size": 2e5,
        "stage3_param_persistence_threshold": 1e5,
        "reduce_bucket_size": 3e6,
        "prefetch_bucket_size": 3e6,
        "sub_group_size": 1e6
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 3e-5,
            "betas": [0.8, 0.999],
            "eps": 1e-8,
            "weight_decay": 3e-7
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 3e-5,
            "warmup_num_steps": 500
        }
    },

    "steps_per_print": 2000,
    "wall_clock_breakdown": false
}
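Aside: the config originally misspelled the persistence key as "stage3_param_persitance_threshold" (corrected above to "stage3_param_persistence_threshold"). Since a misspelled key is liable to be silently ignored rather than rejected, a quick check against a hand-maintained allowlist can catch this class of mistake early. This is only an illustrative sketch: KNOWN_ZERO_KEYS is an assumption kept in sync by hand for the keys used in this config, not something exposed by DeepSpeed:

```python
import json

# Sketch: flag unknown "zero_optimization" keys to catch typos early.
# KNOWN_ZERO_KEYS is a hand-maintained assumption (covering the keys
# used in this config), not queried from DeepSpeed itself.
KNOWN_ZERO_KEYS = {
    "stage", "cpu_offload", "cpu_offload_params",
    "cpu_offload_use_pin_memory", "overlap_comm", "contiguous_gradients",
    "stage3_max_live_parameters", "stage3_max_reuse_distance",
    "stage3_prefetch_bucket_size", "stage3_param_persistence_threshold",
    "reduce_bucket_size", "prefetch_bucket_size", "sub_group_size",
}

with open("ds_config_zero3.json") as f:
    cfg = json.load(f)

for key in cfg.get("zero_optimization", {}):
    if key not in KNOWN_ZERO_KEYS:
        print(f"unrecognized zero_optimization key: {key!r}")
```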
Thank you!