deepspeed (#288)
* deepspeed

* shard

* full param deepspeed works by this commit

* offload optimizer & documentation

* format & fix save deepspeed weight

* format & update save_checkpoint

* update pipfile

* update pipfile

* zero init for transformers

* add some new config

* fix bug

* min 1e6

* update deepspeed config

* Update requirements.txt

* remove duplicate code

* throw warning when compile w/ deepspeed

* black

* integrate deepspeed into wrap_model_distributed

* remove unused code

* style

* fix bug

* fix bug

* max token len to 16k

* deepspeed save lora

* update get optimizer

* fix check disk

* comment out offload CPU

* Pipfile.lock

* Update requirements.txt

* make black

* add default

* minor fix

* minor fix

* minor fix

* fix val loader

* potential val loader fix

* update

* lock

* Update requirements.txt

* improve model saving for deepspeed

* solved INFLIGHT problem

* update doc

* deepspeed default push to hub by cpu

* Revert "improve model saving for deepspeed"

This reverts commit 62fc9c5.

* remove unused code

* Update requirements.txt

* deepspeed==0.11.1

* Update requirements.txt

* temp fix for deepspeed slow gen

* style

* style

* fix

---------

Co-authored-by: haqishen <haqishen@gmail.com>
Co-authored-by: Philipp Singer <killver@gmail.com>
Co-authored-by: psinger <psinger@users.noreply.github.com>
3 people committed Oct 24, 2023
1 parent 08475e3 commit 67d3a3c
Showing 15 changed files with 814 additions and 494 deletions.
1 change: 1 addition & 0 deletions Pipfile
@@ -50,6 +50,7 @@ tiktoken = "==0.5.1"
hf-transfer = "==0.1.3"
peft = "==0.5.0"
azure-storage-file-datalake = ">=12.12.0"
deepspeed = "==0.11.1"
keyring = "==24.2.0"

[dev-packages]
823 changes: 464 additions & 359 deletions Pipfile.lock

Large diffs are not rendered by default.

@@ -0,0 +1 @@
Whether to offload the optimizer to CPU to save more GPU RAM during training. Note that turning on offload_optimizer will further slow down training.
@@ -0,0 +1 @@
Number of elements reduced/allreduced at a time. Limits the memory required for the allgather for large model sizes. Smaller values use less memory, but slow down training.
@@ -0,0 +1 @@
The maximum number of parameters resident per GPU before releasing. Smaller values use less memory, but slow down training.
@@ -0,0 +1 @@
Do not release a parameter if it will be reused within this threshold of parameters. Smaller values use less memory, but slow down training.
@@ -0,0 +1 @@
Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication and slow down training (especially for latency-bound messages).
@@ -0,0 +1 @@
Maximum number of parameter elements to fetch ahead of use. Smaller values use less memory, but slow down training.
1 change: 1 addition & 0 deletions documentation/docs/tooltips/experiments/_use-deepspeed.mdx
@@ -0,0 +1 @@
Whether to use DeepSpeed to save GPU RAM during training. Note that turning on DeepSpeed can slow down training.
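These tooltips all describe knobs of DeepSpeed's ZeRO stage 3 engine. For orientation, here is a minimal sketch of where such values sit in a DeepSpeed config dict; the key names under `zero_optimization` are DeepSpeed's own, but the surrounding structure and the example numbers are illustrative, not the exact config this commit constructs.

```python
# Illustrative ZeRO stage 3 section of a DeepSpeed config; the numbers are
# placeholders, not the defaults introduced by this commit.
ds_config = {
    # DeepSpeed requires a batch size setting alongside the ZeRO options.
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {
        "stage": 3,
        # Number of elements reduced/allreduced at a time
        # (cfg.environment.deepspeed_reduce_bucket_size).
        "reduce_bucket_size": 1e6,
        # Parameter elements to fetch ahead of use
        # (cfg.environment.deepspeed_stage3_prefetch_bucket_size).
        "stage3_prefetch_bucket_size": 1e6,
        # Do not partition parameters smaller than this threshold
        # (cfg.environment.deepspeed_stage3_param_persistence_threshold).
        "stage3_param_persistence_threshold": 1e6,
        # Max parameters resident per GPU before releasing, and the reuse
        # window within which parameters are kept (currently commented out
        # in the config dataclass below).
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        # Optimizer state offload to CPU RAM (also commented out below).
        "offload_optimizer": {"device": "cpu"},
    },
}
```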
4 changes: 3 additions & 1 deletion llm_studio/app_utils/sections/experiment.py
@@ -1680,7 +1680,9 @@ async def experiment_push_to_huggingface_dialog(q: Q, error: str = ""):
num_running_queued = len(
experiments[experiments["status"].isin(["queued", "running"])]
)
if num_running_queued > 0:
experiment_path = q.client["experiment/display/experiment_path"]
cfg = load_config_yaml(os.path.join(experiment_path, "cfg.yaml"))
if num_running_queued > 0 or cfg.environment.use_deepspeed:
default_device = "cpu"

try:
73 changes: 42 additions & 31 deletions llm_studio/app_utils/utils.py
@@ -114,36 +114,41 @@ def start_process(
env = {**os.environ, **env_vars}

if num_gpus == 0:
p = subprocess.Popen(
[
"python",
"train_wave.py",
"-Y",
config_name,
"-Q",
",".join([str(x) for x in process_queue]),
],
env=env,
)
cmd = [
"python",
"train_wave.py",
"-Y",
config_name,
]
# Do not delete for debug purposes
# elif num_gpus == 1:
# p = subprocess.Popen(
# [
# "env",
# f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
# "python",
# "-u",
# "train_wave.py",
# "-P",
# config_name,
# "-Q",
# ",".join([str(x) for x in process_queue]),
# ]
# )
# cmd = [
# "env",
# f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
# "python",
# "-u",
# "train_wave.py",
# "-P",
# config_name,
# ]
else:
free_port = find_free_port()
p = subprocess.Popen(
[
if cfg.environment.use_deepspeed:
logger.info("Starting deepspeed...")
cmd = [
"env",
"deepspeed",
"--include",
f"localhost:{','.join(gpu_list)}",
"--master_port",
f"{str(free_port)}",
"train_wave.py",
"-Y",
config_name,
]
else:
logger.info("Starting torchrun...")
cmd = [
"env",
f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
"torchrun",
@@ -152,11 +157,17 @@
"train_wave.py",
"-Y",
config_name,
"-Q",
",".join([str(x) for x in process_queue]),
],
env=env,
)
]

if len(process_queue) > 0:
cmd.append("-Q")
cmd.append(",".join([str(x) for x in process_queue]))

p = subprocess.Popen(
cmd,
env=env,
)

logger.info(f"Percentage of RAM memory used: {psutil.virtual_memory().percent}")

return p
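To make the two launch paths above concrete, here is a minimal sketch of the commands this logic would produce for a hypothetical two-GPU run. The config name, port, and queue values are made up, and the torchrun flags hidden in the collapsed part of the diff (e.g. `--nproc_per_node`) are assumptions for illustration.

```python
# Hypothetical inputs, not taken from the diff.
gpu_list = ["0", "1"]
config_name = "my_experiment_cfg"
free_port = 29500
process_queue = ["1234"]

# DeepSpeed path: DeepSpeed selects devices itself via --include,
# so CUDA_VISIBLE_DEVICES is not set on this branch.
deepspeed_cmd = [
    "env",
    "deepspeed",
    "--include", f"localhost:{','.join(gpu_list)}",
    "--master_port", str(free_port),
    "train_wave.py", "-Y", config_name,
]

# DDP path via torchrun; --nproc_per_node is an assumed flag here.
torchrun_cmd = [
    "env",
    f"CUDA_VISIBLE_DEVICES={','.join(gpu_list)}",
    "torchrun",
    f"--nproc_per_node={len(gpu_list)}",
    "train_wave.py", "-Y", config_name,
]

# As in the diff, the process queue is appended only when non-empty.
if process_queue:
    deepspeed_cmd += ["-Q", ",".join(process_queue)]
    torchrun_cmd += ["-Q", ",".join(process_queue)]
```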
44 changes: 41 additions & 3 deletions llm_studio/python_configs/text_causal_language_modeling_config.py
@@ -227,9 +227,9 @@ class ConfigNLPCausalLMTokenizer(DefaultConfig):

def __post_init__(self):
super().__post_init__()
self._possible_values["max_length_prompt"] = (32, 8192, 32)
self._possible_values["max_length_answer"] = (32, 8192, 32)
self._possible_values["max_length"] = (32, 8192, 32)
self._possible_values["max_length_prompt"] = (32, 1024 * 16, 32)
self._possible_values["max_length_answer"] = (32, 1024 * 16, 32)
self._possible_values["max_length"] = (32, 1024 * 16, 32)
self._possible_values["padding_quantile"] = (0, 1, 0.01)
self._padding_side = "left"

@@ -343,6 +343,13 @@ class ConfigNLPCausalLMEnvironment(DefaultConfig):

compile_model: bool = False
use_fsdp: bool = False
use_deepspeed: bool = False
deepspeed_reduce_bucket_size: int = 1e6
deepspeed_stage3_prefetch_bucket_size: int = 1e6
deepspeed_stage3_param_persistence_threshold: int = 1e6
# deepspeed_offload_optimizer: bool = False
# deepspeed_stage3_max_live_parameters: int = 1e9
# deepspeed_stage3_max_reuse_distance: int = 1e9

find_unused_parameters: bool = False
trust_remote_code: bool = True
@@ -376,6 +383,37 @@ def __post_init__(self):

self._possible_values["number_of_workers"] = (1, multiprocessing.cpu_count(), 1)
self._possible_values["seed"] = possible_values.Number(step=1, min=-1)
self._possible_values["deepspeed_reduce_bucket_size"] = possible_values.Number(
step=1, min=1e6
)
self._possible_values[
"deepspeed_stage3_prefetch_bucket_size"
] = possible_values.Number(step=1, min=1e6)
self._possible_values[
"deepspeed_stage3_param_persistence_threshold"
] = possible_values.Number(step=1, min=1e6)
self._possible_values[
"deepspeed_stage3_max_live_parameters"
] = possible_values.Number(step=1, min=1e6)
self._possible_values[
"deepspeed_stage3_max_reuse_distance"
] = possible_values.Number(step=1, min=1e6)
self._nesting.add(
[
"deepspeed_reduce_bucket_size",
"deepspeed_stage3_prefetch_bucket_size",
"deepspeed_stage3_param_persistence_threshold",
# "deepspeed_offload_optimizer",
],
[Dependency(key="use_deepspeed", value=False, is_set=False)],
)
# self._nesting.add(
# [
# "deepspeed_stage3_max_live_parameters",
# "deepspeed_stage3_max_reuse_distance",
# ],
# [Dependency(key="deepspeed_offload_optimizer", value=False, is_set=False)], # noqa: E501
# )


@dataclass
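Per the commit log, DeepSpeed is wired up inside `wrap_model_distributed`, which is not part of the rendered diff. As a rough, non-authoritative sketch of how fields like these are usually consumed, a ZeRO config built from `cfg.environment` would be handed to `deepspeed.initialize`; the helper below and the `cfg.training.batch_size` attribute are assumptions for illustration, not the commit's actual code.

```python
import deepspeed


def init_deepspeed(model, optimizer, cfg):
    # Hypothetical helper: build a ZeRO stage 3 config from the new
    # cfg.environment fields (shaped like the earlier sketch) and let
    # DeepSpeed wrap model and optimizer.
    ds_config = {
        "train_micro_batch_size_per_gpu": cfg.training.batch_size,  # assumed field
        "zero_optimization": {
            "stage": 3,
            "reduce_bucket_size": cfg.environment.deepspeed_reduce_bucket_size,
            "stage3_prefetch_bucket_size": (
                cfg.environment.deepspeed_stage3_prefetch_bucket_size
            ),
            "stage3_param_persistence_threshold": (
                cfg.environment.deepspeed_stage3_param_persistence_threshold
            ),
        },
    }
    # deepspeed.initialize returns (engine, optimizer, dataloader, lr_scheduler);
    # the engine replaces the DDP/FSDP-wrapped model in the training loop.
    model_engine, optimizer, _, _ = deepspeed.initialize(
        model=model, optimizer=optimizer, config=ds_config
    )
    return model_engine, optimizer
```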