In [4]:
import os
# Send both HTTP and HTTPS traffic from this kernel through the same local
# proxy endpoint; the two variables are set to the identical address.
os.environ.update(dict.fromkeys(('http_proxy', 'https_proxy'), '127.0.0.1:7890'))

## basics

- https://github.com/microsoft/DeepSpeed
    - https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/
- huggingface DeepSpeed Integration
    - https://huggingface.co/docs/transformers/main_classes/deepspeed
- pip 安装之后，可以通过 `ds_report` 命令查看环境配置信息；
- bag of tricks
    - self-attention 的计算与显存复杂度为 $O(n^2)$（$n$ 表示序列长度） => sparse attention；
        - 10x longer seq，up to 6x faster；

## ZeRO: Zero Redundancy Optimizer

## 使用

- https://huggingface.co/docs/transformers/main_classes/deepspeed

### cli

- https://colab.research.google.com/github/stas00/porting/blob/master/transformers/deepspeed/DeepSpeed_on_colab_CLI.ipynb#scrollTo=vSlYvQWLwblN

### inference

- inference-tutorial
    - https://github.com/microsoft/DeepSpeed/blob/master/docs/_tutorials/inference-tutorial.md

In [1]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers.integrations import HfDeepSpeedConfig
import deepspeed
import os
import torch

[2023-12-20 22:59:22,719] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
os.environ.update(TOKENIZERS_PARALLELISM="false")

In [3]:
# Rank / world size come from the environment; the fallbacks ("0" and "1")
# apply when the variables are not set.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

# Select the CUDA device matching this process's local rank, then bring up
# DeepSpeed's distributed communication backend.
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()

[2023-12-20 22:59:28,990] [INFO] [comm.py:637:init_distributed] cdb=None
[2023-12-20 22:59:28,993] [INFO] [comm.py:652:init_distributed] Not using the DeepSpeed or dist launchers, attempting to detect MPI environment...
[2023-12-20 22:59:29,450] [INFO] [comm.py:702:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=169.254.3.1, master_port=29500
[2023-12-20 22:59:29,453] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl


In [8]:
# Checkpoint to load; only its configuration is fetched in this cell.
model_name = "bigscience/T0_3B"
config = AutoConfig.from_pretrained(model_name)

# Hidden size taken from the config's `d_model`; used below to size the
# ZeRO buckets.
model_hidden_size = config.d_model

# One sample per process, scaled by the number of processes.
train_batch_size = 1 * world_size

In [9]:
# DeepSpeed configuration for ZeRO stage 3 with parameters offloaded to CPU.
# Both fp16 and bf16 are disabled here.
ds_config = {
    "fp16": {
        "enabled": False
    },
    "bf16": {
        "enabled": False
    },
    "zero_optimization": {
        "stage": 3,
        # Keep parameters on the CPU, in pinned memory.
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        # Bucket sizes / thresholds are derived from the model's hidden size.
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        # The 0.9 factor yields a float (often with a fractional part), while
        # DeepSpeed expects an integer element count here; truncate explicitly
        # so config validation does not reject the value.
        "stage3_prefetch_bucket_size": int(0.9 * model_hidden_size * model_hidden_size),
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "steps_per_print": 2000,
    "train_batch_size": train_batch_size,
    "train_micro_batch_size_per_gpu": 1,
    "wall_clock_breakdown": False
}

In [10]:
# Wrap the config dict in an HfDeepSpeedConfig and hold on to the result.
# Per the Hugging Face DeepSpeed integration docs, this object should be
# created before the model is loaded and must stay referenced (kept alive).
dschf = HfDeepSpeedConfig(ds_config)