In [2]:
# Commented-out shell form of the proxy setup, kept for reference only;
# the active configuration is done through os.environ in the next cell.
#!export https_proxy=http://127.0.0.1:7890
#!export http_proxy=http://127.0.0.1:7890
#!export all_proxy=socks5://127.0.0.1:7890

In [4]:
import os
# Route HTTP(S) traffic from this kernel through the local proxy.
# The value carries an explicit scheme: a bare 'host:port' string is not a
# valid proxy URL for clients that require the scheme (e.g. requests).
PROXY_URL = 'http://127.0.0.1:7890'
os.environ['http_proxy'] = PROXY_URL
os.environ['https_proxy'] = PROXY_URL

## misc

- offload optimizer states & parameters
    - cpu
    - nvme
- ZeRO stage-3 与 `device_map`、`low_cpu_mem_usage` 均不兼容
    - 让 Accelerate/DeepSpeed 自动完成设备映射。
    - `device_map = 'auto'`: 在所有可用的 gpu 上均匀分配模型；

## gpu memory

- https://deepspeed.readthedocs.io/en/latest/memory.html

```
from transformers import AutoModel
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
model = AutoModel.from_pretrained("meta-llama/Llama-2-7b-hf")
estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)
```

```
Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 2 GPUs per node.
SW: Model with 6607M total params.
  per CPU  |  per GPU |   Options
  147.69GB |  12.31GB | offload_optimizer=cpu
   73.84GB |  73.84GB | offload_optimizer=none
```

```bash
python -c 'from transformers import AutoModel; \
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
model = AutoModel.from_pretrained("meta-llama/Llama-2-7b-hf"); \
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)'
```

```
Estimated memory needed for params, optim states and gradients for a:
HW: Setup with 1 node, 2 GPUs per node.
SW: Model with 6607M total params, 131M largest layer params.
  per CPU  |  per GPU |   Options
  166.15GB |   0.49GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
  166.15GB |   0.49GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
  147.69GB |   6.64GB | offload_param=none, offload_optimizer=cpu , zero_init=1
  147.69GB |   6.64GB | offload_param=none, offload_optimizer=cpu , zero_init=0
    1.46GB |  55.87GB | offload_param=none, offload_optimizer=none, zero_init=1
   73.84GB |  55.87GB | offload_param=none, offload_optimizer=none, zero_init=0
```

## ds_config

- https://www.deepspeed.ai/docs/config-json/

In [1]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers.integrations import HfDeepSpeedConfig
import deepspeed
import os
import torch

[2023-12-20 22:59:22,719] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably this silences the tokenizers fork/parallelism warning — confirm.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [3]:
# Read this process's position in the distributed job from the environment,
# defaulting to rank 0 of a single-process job when the variables are unset.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

# Select the CUDA device for this rank, then initialise DeepSpeed's
# distributed state.
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()

[2023-12-20 22:59:28,990] [INFO] [comm.py:637:init_distributed] cdb=None
[2023-12-20 22:59:28,993] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2023-12-20 22:59:29,450] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=169.254.3.1, master_port=29500
[2023-12-20 22:59:29,453] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [8]:
# Checkpoint whose configuration supplies the hidden size used by ds_config.
model_name = "bigscience/T0_3B"
config = AutoConfig.from_pretrained(model_name)

# Hidden size as exposed by the checkpoint config's `d_model` attribute.
model_hidden_size = config.d_model

# Global batch size: one sample for each of the `world_size` processes.
train_batch_size = world_size * 1

In [9]:
# DeepSpeed configuration: ZeRO stage 3 with parameters offloaded to pinned
# CPU memory. Bucket sizes and the persistence threshold are derived from the
# model's hidden size.
ds_config = {
    # Both reduced-precision modes are switched off.
    "fp16": {
        "enabled": False
    },
    "bf16": {
        "enabled": False
    },
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        # 0.9 * h * h is a float with a fractional part for typical hidden
        # sizes; DeepSpeed documents this field as an integer, so truncate
        # explicitly instead of relying on implicit coercion.
        "stage3_prefetch_bucket_size": int(0.9 * model_hidden_size * model_hidden_size),
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "steps_per_print": 2000,
    # NOTE(review): DeepSpeed expects train_batch_size to equal
    # train_micro_batch_size_per_gpu * gradient_accumulation_steps * world size
    # — confirm this still holds if either value is changed.
    "train_batch_size": train_batch_size,
    "train_micro_batch_size_per_gpu": 1,
    "wall_clock_breakdown": False
}

In [11]:
# Wrap the config dict in HfDeepSpeedConfig and keep a reference in `dschf`.
# NOTE(review): per the Hugging Face DeepSpeed integration docs this object
# must be created before the model is instantiated and must stay alive for
# ZeRO-3 to be detected — confirm against the installed transformers version.
dschf = HfDeepSpeedConfig(ds_config) 