[legacy] move engine to legacy (hpcaitech#4560)
* [legacy] move engine to legacy

* [example] fix seq parallel example

* [example] fix seq parallel example

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [test] test gemini pluging hang

* [example] update seq parallel requirements
ver217 authored and flybird11111 committed Sep 6, 2023
1 parent cbfd84a commit d434c24
Showing 39 changed files with 93 additions and 105 deletions.
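
In practice, the whole commit boils down to one import-path change for downstream code. The sketch below collects the old and new paths exactly as they appear in the hunks that follow; the specific classes shown are simply the ones touched in this diff:

```python
# Before this commit, the engine lived at the package root:
#   from colossalai.engine import Engine
#   from colossalai.engine.schedule import PipelineSchedule
#   from colossalai.engine.gradient_handler import BaseGradientHandler
# After this commit, the same objects are imported from the legacy namespace:
from colossalai.legacy.engine import Engine
from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
from colossalai.legacy.engine.schedule import (
    InterleavedPipelineSchedule,
    NonPipelineSchedule,
    PipelineSchedule,
)
```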
2 changes: 1 addition & 1 deletion colossalai/builder/builder.py
@@ -71,7 +71,7 @@ def build_gradient_handler(config, model, optimizer):
optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
Returns:
- An object of :class:`colossalai.engine.BaseGradientHandler`
+ An object of :class:`colossalai.legacy.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
6 changes: 3 additions & 3 deletions colossalai/initialize.py
@@ -21,9 +21,9 @@
from colossalai.context import Config, ConfigException, ParallelMode
from colossalai.context.moe_context import MOE_CONTEXT
from colossalai.core import global_context as gpc
- from colossalai.engine import Engine
- from colossalai.engine.gradient_accumulation import accumulate_gradient
- from colossalai.engine.schedule import (
+ from colossalai.legacy.engine import Engine
+ from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient
+ from colossalai.legacy.engine.schedule import (
InterleavedPipelineSchedule,
NonPipelineSchedule,
PipelineSchedule,
File renamed without changes.
@@ -8,11 +8,17 @@
from torch.nn import Module
from torch.nn.modules.loss import _Loss

- from colossalai.engine.gradient_handler import BaseGradientHandler
- from colossalai.engine.schedule import BaseSchedule, InterleavedPipelineSchedule, NonPipelineSchedule, PipelineSchedule
+ from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
+ from colossalai.legacy.engine.schedule import (
+     BaseSchedule,
+     InterleavedPipelineSchedule,
+     NonPipelineSchedule,
+     PipelineSchedule,
+ )
from colossalai.logging import get_dist_logger
- from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively
from colossalai.nn.optimizer import ColossalaiOptimizer
+ from colossalai.zero.legacy.gemini import BaseOpHook, register_ophooks_recursively


class Engine:
"""Basic engine class for training and evaluation. It runs a specific process method
@@ -4,7 +4,7 @@
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler

- from colossalai.engine import BaseGradientHandler
+ from colossalai.legacy.engine import BaseGradientHandler

from ._gradient_accumulation import (
GradAccumDataloader,
@@ -33,7 +33,7 @@ def accumulate_gradient(model: nn.Module,
dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
your dataloader object, would be called like iter(dataloader)
accumulate_size (int): the number of steps to accumulate gradients
- gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+ gradient_handlers (List[:class:`colossalai.legacy.engine.BaseGradientHandler`]):
list of gradient handler objects. Default is None.
lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
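
The docstring above only lists the parameters, so here is a minimal, hedged usage sketch. The toy model/optimizer/dataloader and the unpacking of the return value into the `GradAccum*` wrappers imported in this hunk are assumptions for illustration, not part of the commit:

```python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from colossalai.legacy.engine.gradient_accumulation import accumulate_gradient

# Toy objects just to make the call concrete.
model = nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
dataloader = DataLoader(TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,))), batch_size=8)

# Assumed return order: wrapped optimizer, dataloader, gradient handlers, lr scheduler
# (the GradAccum* classes imported at the top of this file).
optimizer, dataloader, gradient_handlers, lr_scheduler = accumulate_gradient(
    model=model,
    optimizer=optimizer,
    dataloader=dataloader,
    accumulate_size=4,        # number of steps to accumulate gradients
    gradient_handlers=None,   # optional list of BaseGradientHandler objects
    lr_scheduler=None,        # optional lr scheduler stepped per effective batch
)
```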
@@ -10,7 +10,7 @@
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader

- from colossalai.engine import BaseGradientHandler
+ from colossalai.legacy.engine import BaseGradientHandler
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.utils import conditional_context

@@ -262,7 +262,7 @@ class GradAccumGradientHandler:
before accumulation size is reached.
Args:
- grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+ grad_handler (:class:`colossalai.legacy.engine.BaseGradientHandler`):
Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
accumulate_size (int): The number of steps to accumulate gradients.
@@ -1,7 +1,7 @@
+ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import GRADIENT_HANDLER

- from ...context.parallel_mode import ParallelMode
from ._base_gradient_handler import BaseGradientHandler
from .utils import bucket_allreduce

@@ -1,9 +1,9 @@
from colossalai.context.moe_context import MOE_CONTEXT
+ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import GRADIENT_HANDLER
from colossalai.utils.moe import get_moe_epsize_param_dict

- from ...context.parallel_mode import ParallelMode
from ._base_gradient_handler import BaseGradientHandler
from .utils import bucket_allreduce

@@ -1,7 +1,7 @@
+ from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import GRADIENT_HANDLER

- from ...context.parallel_mode import ParallelMode
from ._base_gradient_handler import BaseGradientHandler
from .utils import bucket_allreduce

File renamed without changes.
File renamed without changes.
@@ -95,7 +95,7 @@ def forward_backward_step(self,
"""The process function over a batch of dataset for training or evaluation.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
forward_only (bool): If True, the process won't include backward.
return_loss (bool, optional): If False, the loss won't be returned.
@@ -54,7 +54,7 @@ def forward_backward_step(self,
The returned labels and loss will None if :attr:`return_loss` is False.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
If True, the model is run for the forward pass, else back propagation will be executed.
@@ -236,7 +236,7 @@ def _forward_step(self, engine, input_obj, return_tensors, return_output_label=T
Returns output tensor. This is a helper function and can be ignored by users.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
return_output_label (bool, optional): Whether returns output labels.
@@ -274,7 +274,7 @@ def _backward_step(self, engine, input_obj, output_obj, output_obj_grad):
This is a helper function and can be ignored by users.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): input tensor for this pipeline stage.
output_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): output tensor for this pipeline stage.
output_obj_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): gradient of output tensor for this pipeline stage.
@@ -314,7 +314,7 @@ def forward_backward_step(self, engine, data_iter, forward_only=False, return_lo
Returns a tuple with losses if the last stage, an empty tuple otherwise.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether run forward step only. Default is false. If true, no backward will be run.
@@ -518,7 +518,7 @@ def _forward_step(self,
Returns output tensor. This is a helper function and can be ignored by users.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
model_chunk_id (int): The id of model chunks.
input_obj (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Input tensor for this pipeline stage.
return_tensors (List[:class:`torch.Tensor`]): A list of tensors to return.
@@ -555,7 +555,7 @@ def forward_backward_step(self, engine, data_iter, forward_only=False, return_lo
communication between pipeline stages as needed.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether run forward step only. Default is false. If true, no backward will be run.
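
For orientation, a hedged sketch of the call pattern these docstrings describe. The `engine` and `train_dataloader` objects are assumed to come from `colossalai.initialize(...)`, and both the `num_microbatches` constructor argument and the shape of the return value are assumptions, not taken from this diff:

```python
from colossalai.legacy.engine.schedule import PipelineSchedule

# Assumed constructor argument; not shown in this diff.
schedule = PipelineSchedule(num_microbatches=4)

engine.train()
data_iter = iter(train_dataloader)

# Per the docstrings above: runs forward (and backward unless forward_only=True)
# over one batch, handling inter-stage communication; losses are only meaningful
# on the last pipeline stage.
outputs = schedule.forward_backward_step(engine, data_iter, forward_only=False, return_loss=True)
engine.step()
```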
@@ -69,7 +69,7 @@ def forward_backward_step(self,
Returns a tuple with losses if the last stage, an empty tuple otherwise.
Args:
- engine (colossalai.engine.Engine): Colossalai engine for training and inference.
+ engine (colossalai.legacy.engine.Engine): Colossalai engine for training and inference.
data_iter (Iterable): Dataloader as the form of an iterator, obtained by calling iter(dataloader).
forward_only (bool, optional):
Whether run forward step only. Default is false. If true, no backward will be run.
2 changes: 1 addition & 1 deletion colossalai/legacy/trainer/_trainer.py
@@ -4,7 +4,7 @@
from torch.utils.data import DataLoader
from tqdm import tqdm

- from colossalai.engine import Engine
+ from colossalai.legacy.engine import Engine
from colossalai.legacy.trainer.hooks import BaseHook
from colossalai.logging import DistributedLogger
from colossalai.utils import MultiTimer, is_dp_rank_0, is_no_pp_or_last_stage, is_tp_rank_0
18 changes: 9 additions & 9 deletions colossalai/utils/profiler/profiler.py
@@ -1,17 +1,17 @@
- import os
- from typing import List
- from colossalai.engine import Engine
- from torch.profiler import profile as torch_profile
- from torch.profiler.profiler import ProfilerAction
- from typing import Any, Callable, Iterable, Optional
- from torch.autograd import ProfilerActivity
+ import gzip
import json
import os
import tempfile
- import gzip
+ from typing import Any, Callable, Iterable, List, Optional
+
+ from torch.autograd import ProfilerActivity
+ from torch.profiler import profile as torch_profile
+ from torch.profiler.profiler import ProfilerAction
+
+ from colossalai.legacy.engine import Engine
+ from colossalai.logging import get_dist_logger
from colossalai.utils.profiler.extention import ProfilerExtension
from colossalai.utils.profiler.stateful_tensor_mem_extention import StatefulTensorMemoryProfilerExtention
- from colossalai.logging import get_dist_logger


class profile(torch_profile):
8 changes: 5 additions & 3 deletions colossalai/utils/profiler/stateful_tensor_mem_extention.py
@@ -1,12 +1,14 @@
import os
import threading
import time
- import torch
from enum import Enum
from typing import List
- from colossalai.gemini.stateful_tensor import StatefulTensor
+
+ import torch
+
from colossalai.gemini.ophooks import BaseOpHook
- from colossalai.engine import Engine
+ from colossalai.gemini.stateful_tensor import StatefulTensor
+ from colossalai.legacy.engine import Engine
from colossalai.utils.profiler.extention import ProfilerExtension

7 changes: 4 additions & 3 deletions docs/source/en/advanced_tutorials/add_your_parallel.md
@@ -92,14 +92,14 @@ follow the steps below to create a new distributed initialization.

Gradient handlers are objects which execute the all-reduce operations on parameters' gradients. As different all-reduce
strategies may be executed for different kinds of parallelism, users can
- inherit `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
+ inherit `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, the library
uses the normal data parallel gradient handler which all-reduces the gradients across data parallel ranks. The data
parallel gradient handler is added to the engine automatically if data parallel is detected. You can add your own
gradient handler like below:

```python
from colossalai.registry import GRADIENT_HANDLER
- from colossalai.engine import BaseGradientHandler
+ from colossalai.legacy.engine import BaseGradientHandler

@GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler):
@@ -121,4 +121,5 @@ gradient_handlers = [

Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and non-pipeline
schedules. If you want to modify how the forward and backward passes are executed, you can
- inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+ inherit `colossalai.legacy.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+ <!-- doc-test-command: echo -->
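
Outside the diff itself, a minimal sketch of what such a subclass can look like under the new path. The `forward_backward_step` name and signature follow the schedule hunks earlier in this commit; the `load_batch` helper and the engine calls in the body are assumptions for illustration only:

```python
from colossalai.legacy.engine.schedule import BaseSchedule


class NaiveSchedule(BaseSchedule):
    """Illustrative schedule: one plain forward/backward pass per step, no pipelining."""

    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        data, label = self.load_batch(data_iter)      # assumed BaseSchedule helper
        output = engine(data)                         # forward through the wrapped model
        loss = engine.criterion(output, label) if return_loss else None
        if not forward_only:
            engine.backward(loss)                     # backward through the engine
        if not return_output_label:
            output, label = None, None
        return output, label, loss
```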
@@ -39,7 +39,7 @@ from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
@@ -35,7 +35,7 @@ import colossalai.nn as col_nn
import torch
import torch.nn as nn
from colossalai.builder import build_pipeline_model
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.legacy.trainer import Trainer, hooks
@@ -415,7 +415,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw

#### Import modules
```python
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.utils import MultiTimer
import os
3 changes: 2 additions & 1 deletion docs/source/en/features/gradient_handler.md
@@ -29,7 +29,7 @@ To implement a customized gradient handler, you need to follow these steps.

```python
from colossalai.registry import GRADIENT_HANDLER
- from colossalai.engine.gradient_handler import BaseGradientHandler
+ from colossalai.legacy.engine.gradient_handler import BaseGradientHandler


@GRADIENT_HANDLER.register_module
@@ -61,3 +61,4 @@ to demonstrate the use of gradient handler. In this example, we used `DataParall
```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
```
+ <!-- doc-test-command: echo -->
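
As a complement to the snippet in the hunk above, a hedged sketch of a complete handler under the new import path. The `handle_gradient` method name and the `self._model` attribute follow the built-in handlers and are assumptions here, not part of this diff:

```python
import torch.distributed as dist

from colossalai.legacy.engine.gradient_handler import BaseGradientHandler
from colossalai.registry import GRADIENT_HANDLER


@GRADIENT_HANDLER.register_module
class NaiveAllReduceGradientHandler(BaseGradientHandler):
    """Illustrative handler: all-reduce every gradient over the default process group."""

    def handle_gradient(self):
        for param in self._model.parameters():   # assumed attribute set by BaseGradientHandler
            if param.grad is not None:
                dist.all_reduce(param.grad)
```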
7 changes: 4 additions & 3 deletions docs/source/zh-Hans/advanced_tutorials/add_your_parallel.md
@@ -81,14 +81,14 @@ Colossal-AI provides users with a global context so that they can easily manage
## Gradient Handler

A gradient handler is an object that performs all-reduce operations on parameter gradients. Since different all-reduce strategies may be used in different parallel settings, users can inherit
- `colossalai.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, Colossal-AI uses the plain data-parallel gradient handler, which all-reduces gradients across data-parallel ranks.
+ `colossalai.legacy.engine.gradient_handler.BaseGradientHandler` to implement their strategies. Currently, Colossal-AI uses the plain data-parallel gradient handler, which all-reduces gradients across data-parallel ranks.
The gradient handler is added to the engine automatically if data parallelism is detected.

You can add your own gradient handler as shown below:

```python
from colossalai.registry import GRADIENT_HANDLER
- from colossalai.engine import BaseGradientHandler
+ from colossalai.legacy.engine import BaseGradientHandler

@GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler):
@@ -109,4 +109,5 @@ gradient_handlers = [

## Schedule

A schedule defines how the forward and backward passes are executed. Currently, Colossal-AI provides pipeline and non-pipeline schedules.
- If you want to change how the forward and backward passes are executed, you can inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+ If you want to change how the forward and backward passes are executed, you can inherit `colossalai.legacy.engine.schedule.BaseSchedule` and implement the `forward_back_step` function.
+ <!-- doc-test-command: echo -->
@@ -39,7 +39,7 @@ from colossalai.amp import AMP_TYPE
from colossalai.builder.pipeline import partition_uniform
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn.layer.wrapper import PipelineSharedModuleWrapper
@@ -33,7 +33,7 @@ import colossalai.nn as col_nn
import torch
import torch.nn as nn
from colossalai.builder import build_pipeline_model
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.legacy.trainer import Trainer, hooks
@@ -380,7 +380,7 @@ def build_pipeline_vit(num_layers, num_chunks, device=torch.device('cuda'), **kw

#### Import modules
```python
- from colossalai.engine.schedule import (InterleavedPipelineSchedule,
+ from colossalai.legacy.engine.schedule import (InterleavedPipelineSchedule,
PipelineSchedule)
from colossalai.utils import MultiTimer
import os
3 changes: 2 additions & 1 deletion docs/source/zh-Hans/features/gradient_handler.md
@@ -26,7 +26,7 @@

```python
from colossalai.registry import GRADIENT_HANDLER
- from colossalai.engine.gradient_handler import BaseGradientHandler
+ from colossalai.legacy.engine.gradient_handler import BaseGradientHandler


@GRADIENT_HANDLER.register_module
@@ -57,3 +57,4 @@ gradient_handler = [dict(type='MyGradientHandler')]
```shell
python -m torch.distributed.launch --nproc_per_node 4 --master_addr localhost --master_port 29500 train_with_engine.py
```
+ <!-- doc-test-command: echo -->