From a1cf86fd8bfd05dbc371a057b43ae4a383143c50 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Thu, 27 Aug 2020 23:14:32 +0000 Subject: [PATCH 1/6] Adding gradient accumulation support for ZeRO Stage 2. Changing all Megatron-LM tests to also test gradient accumulation --- deepspeed/pt/deepspeed_light.py | 4 +- deepspeed/pt/deepspeed_zero_optimizer.py | 38 ++++++++++++++----- .../ds_config_func_bs8_no_zero.json | 2 +- .../ds_config_func_bs8_zero1.json | 2 +- .../ds_config_func_bs8_zero2.json | 2 +- tests/model/Megatron_GPT2/run_func_test.py | 16 ++++---- 6 files changed, 42 insertions(+), 22 deletions(-) diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py index c6e7623b1792..cffdbd4d67fc 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/pt/deepspeed_light.py @@ -598,8 +598,8 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, mpu=self.mpu) elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS: - assert self.gradient_accumulation_steps( - ) == 1, "ZeRO stage 2 does not support gradient accumulation, if you need gradient accumulation please use stage 1" + #assert self.gradient_accumulation_steps( + #) == 1, "ZeRO stage 2 does not support gradient accumulation, if you need gradient accumulation please use stage 1" optimizer = FP16_DeepSpeedZeroOptimizer( optimizer, timers=self.timers, diff --git a/deepspeed/pt/deepspeed_zero_optimizer.py b/deepspeed/pt/deepspeed_zero_optimizer.py index cbfb249b501d..b1e44b90f1a6 100755 --- a/deepspeed/pt/deepspeed_zero_optimizer.py +++ b/deepspeed/pt/deepspeed_zero_optimizer.py @@ -446,16 +446,35 @@ def independent_gradient_partition_epilogue(self): torch.cuda.synchronize() for i, _ in enumerate(self.fp16_groups): - self.averaged_gradients[i] = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - + if 
self.averaged_gradients[i] is None: + self.averaged_gradients[i] = self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + else: + #When gradient accumulation is greater that 1 + #This code path will be triggered and will add + #to the accumulated averaged gradients + avg_new = self.get_flat_partition( + self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + + for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): + accumulated_grad.add_(new_avg_new) + self._release_ipg_buffers() - + + # No need to keep the gradients anymore. + # All gradients required by the step + # are in self.averaged_gradients + self.zero_grad() see_memory_usage(f"End ipg_epilogue") # resets all partition to no reduced @@ -1103,6 +1122,7 @@ def step(self, closure=None): if self.overflow: see_memory_usage('After overflow before clearing gradients') self.zero_grad() + self.averaged_gradients[i] = None see_memory_usage('After overflow after clearing gradients') logger.info( diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json index 99637973cd60..5bef6f56b2f9 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json @@ -1,6 +1,6 @@ { "train_batch_size": 8, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "steps_per_print": 1, "zero_optimization": { "stage":0 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json index 8d44659a9ee3..2b6730fbf419 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json @@ -1,6 +1,6 
@@ { "train_batch_size": 8, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "steps_per_print": 1, "zero_optimization":{ "stage":1 diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json index fde90e8274b8..aaf75daae57c 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json @@ -1,6 +1,6 @@ { "train_batch_size": 8, - "gradient_accumulation_steps": 1, + "gradient_accumulation_steps": 2, "steps_per_print": 1, "zero_optimization": { "stage":2, diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index cf4034e585f0..d0824aea1575 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -8,7 +8,7 @@ import os import time import re -from .test_common import BaseTestCase +from test_common import BaseTestCase LAYERS = 2 HIDDEN_SIZE = 128 @@ -295,15 +295,15 @@ def check_parity(self, base_file, test_file, r_tol): def suite(): suite = unittest.TestSuite() - suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) - suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) - suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) - suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero1')) + # suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) + # suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) + # suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) + # suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero1')) - suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2')) - suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2')) + # suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2')) + # suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2')) - 
suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2')) + # suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2')) suite.addTest(GPT2FuncTestCase('test_optimizer_scheduler')) return suite From 0b12dc1e525415b11b8840c4bbaa8e9812d17ab1 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Mon, 31 Aug 2020 19:28:01 +0000 Subject: [PATCH 2/6] Gradient Accumulation support for Stage 2. Model tests added to test the feature --- deepspeed/pt/deepspeed_light.py | 13 +++- deepspeed/pt/deepspeed_zero_optimizer.py | 8 +- .../ds_config_func_bs8_no_zero.json | 2 +- .../ds_config_func_bs8_zero0_gas10.json | 30 ++++++++ .../ds_config_func_bs8_zero2.json | 2 +- .../ds_config_func_bs8_zero2_gas10.json | 30 ++++++++ tests/model/Megatron_GPT2/run_func_test.py | 76 ++++++++++++++++--- tests/model/Megatron_GPT2/test_common.py | 4 +- 8 files changed, 142 insertions(+), 23 deletions(-) create mode 100755 tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas10.json create mode 100755 tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas10.json diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py index cffdbd4d67fc..8f1d06794cb4 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/pt/deepspeed_light.py @@ -721,18 +721,23 @@ def forward(self, *inputs, **kwargs): return loss def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - if self.is_gradient_accumulation_boundary(): + + #Zero stage 2 communicates during non gradient accumulation boundaries as well + if self.zero_optimization_partition_gradients(): + self.optimizer.overlapping_partition_gradients_reduce_epilogue() + + #Communicate only at gradient accumulation boundaries + elif self.is_gradient_accumulation_boundary(): if self.zero_optimization_stage() == ZERO_OPTIMIZATION_OPTIMIZER_STATES: assert self.zero_reduce_scatter() self.optimizer.reduce_scatter_gradients( postscale_gradients=self.postscale_gradients(), gradient_predivide_factor=self.gradient_predivide_factor(), 
gradient_average=self.gradient_average) - elif self.zero_optimization_partition_gradients(): - self.optimizer.overlapping_partition_gradients_reduce_epilogue() else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) - + + def backward(self, loss, allreduce_gradients=True): r"""Execute backward pass on the loss diff --git a/deepspeed/pt/deepspeed_zero_optimizer.py b/deepspeed/pt/deepspeed_zero_optimizer.py index b1e44b90f1a6..b54c7830ce71 100755 --- a/deepspeed/pt/deepspeed_zero_optimizer.py +++ b/deepspeed/pt/deepspeed_zero_optimizer.py @@ -446,7 +446,7 @@ def independent_gradient_partition_epilogue(self): torch.cuda.synchronize() for i, _ in enumerate(self.fp16_groups): - if self.averaged_gradients[i] is None: + if not i in self.averaged_gradients or self.averaged_gradients[i] is None: self.averaged_gradients[i] = self.get_flat_partition( self.params_in_partition[i], self.first_offset[i], @@ -467,7 +467,7 @@ def independent_gradient_partition_epilogue(self): return_tensor_list=True) for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): - accumulated_grad.add_(new_avg_new) + accumulated_grad.add_(new_avg_grad) self._release_ipg_buffers() @@ -1122,7 +1122,9 @@ def step(self, closure=None): if self.overflow: see_memory_usage('After overflow before clearing gradients') self.zero_grad() - self.averaged_gradients[i] = None + for key in self.averaged_gradients: + self.averaged_gradients[key] = None + see_memory_usage('After overflow after clearing gradients') logger.info( diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json index 5bef6f56b2f9..99637973cd60 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json @@ -1,6 +1,6 @@ { "train_batch_size": 8, - "gradient_accumulation_steps": 2, + "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { "stage":0 
diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas10.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas10.json new file mode 100755 index 000000000000..5c5efb0a7e57 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero0_gas10.json @@ -0,0 +1,30 @@ +{ + "train_micro_batch_size_per_gpu":8, + "gradient_accumulation_steps": 3, + "steps_per_print": 1, + "zero_optimization": { + "stage":0, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } + +} diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json index aaf75daae57c..fde90e8274b8 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2.json @@ -1,6 +1,6 @@ { "train_batch_size": 8, - "gradient_accumulation_steps": 2, + "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization": { "stage":2, diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas10.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas10.json new file mode 100755 index 000000000000..e245309603b8 --- /dev/null +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero2_gas10.json @@ -0,0 +1,30 @@ +{ + "train_micro_batch_size_per_gpu":8, + "gradient_accumulation_steps": 3, + "steps_per_print": 1, + "zero_optimization": { + "stage":2, + "reduce_bucket_size": 7000000, + "allgather_bucket_size": 7000000, + "reduce_scatter": true + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "gradient_clipping": 1.0, + "fp16": { + "enabled": true, + 
"loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "activation_checkpointing": { + "partition_activations": true, + "contiguous_memory_optimization": true + } + +} diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index d0824aea1575..6d6a65529312 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -8,7 +8,7 @@ import os import time import re -from test_common import BaseTestCase +from .test_common import BaseTestCase LAYERS = 2 HIDDEN_SIZE = 128 @@ -198,6 +198,29 @@ def test_mp4_gpu4_node1_zero2(self): succ = self.run_partition_activations_test(test_config, 0.01) self.assertTrue(succ) + def test_mp2_gpu4_node1_zero2_gas(self): + test_config = { + "mp": 2, + "gpus": 4, + "nodes": 1, + "bs": 8, + "steps": 1000, + "layers": LAYERS, + "hidden_size": HIDDEN_SIZE, + "seq_length": SEQ_LEN, + "heads": ATTN_HEADS, + "deepspeed": True, + "json": "ds_config_func_bs8_zero2_gas10.json", + "baseline": "ds_config_func_bs8_zero0_gas10.json", + + } + + succ = self.run_test(test_config, 0.01) + self.assertTrue(succ) + + succ = self.run_partition_activations_test(test_config, 0.01) + self.assertTrue(succ) + def test_optimizer_scheduler(self): test_config = { "mp": 1, @@ -224,9 +247,20 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix = "gpt2_func_" prefix = "gpt2_partition_activation_" + deepspeed_config=test_config["json"] + baseline_deepspeed_config=False + # baseline run... 
- test_config["deepspeed"] = False - base_file = self.gen_output_name(test_config, baseline_prefix) + # turnoff deepspeed if baseline deepspeed config + # is not provided + if not "baseline" in test_config: + test_config["deepspeed"] = False + else: + test_config["json"] = test_config["baseline"] + baseline_prefix += test_config["json"][0:-5] + baseline_deepspeed_config = True + + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -238,6 +272,7 @@ def run_partition_activations_test(self, test_config, r_tol): # DeepSpeed run... test_config["deepspeed"] = True test_config["other_args"] = "--deepspeed-activation-checkpointing" + test_config["json"] = deepspeed_config print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) self.run_gpt2_test(test_config, test_file) @@ -250,9 +285,21 @@ def run_test(self, test_config, r_tol): prefix = "gpt2_func" + deepspeed_config=test_config["json"] + baseline_deepspeed_config = False + + # baseline run... + # turn off deepspeed if a baseline deepspeed config + # is not provided + if not "baseline" in test_config: + test_config["deepspeed"] = False + else: + test_config["json"] = test_config["baseline"] + baseline_prefix = prefix + test_config["json"][0:-5] + baseline_deepspeed_config = True + # baseline run... - test_config["deepspeed"] = False - base_file = self.gen_output_name(test_config, prefix) + base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config = baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -263,6 +310,8 @@ def run_test(self, test_config, r_tol): # DeepSpeed run... 
test_config["deepspeed"] = True + test_config["json"] = deepspeed_config + print("{0}: DeepSpeed run.".format(self.id())) test_file = self.gen_output_name(test_config, prefix) self.run_gpt2_test(test_config, test_file) @@ -295,17 +344,20 @@ def check_parity(self, base_file, test_file, r_tol): def suite(): suite = unittest.TestSuite() - # suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) - # suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) - # suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) - # suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero1')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero1')) - # suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2')) - # suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2')) + suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2')) - # suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2')) + suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2')) + + suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_gas')) suite.addTest(GPT2FuncTestCase('test_optimizer_scheduler')) + return suite diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index 7567b5a14f46..ae1dd328de2e 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -16,7 +16,7 @@ def __init__(self, methodName="DeepSpeed performance test"): self.baseline_dir = "./baseline" self.timestr = time.strftime("%Y%m%d-%H%M%S") - def gen_output_name(self, test_config, prefix): + def gen_output_name(self, test_config, prefix, baseline_config=False): other_args = test_config["other_args"] if 
"other_args" in test_config else "" zero_args = "_zero" if "zero" in test_config and test_config["zero"] else "" other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "") @@ -24,7 +24,7 @@ def gen_output_name(self, test_config, prefix): if other_args: other_args = "_" + other_args - if test_config["deepspeed"]: + if test_config["deepspeed"] and not baseline_config: file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format( test_config["mp"], test_config["gpus"], From cafd9f2de7a97fbf45c742d99aec3d9e6eea8649 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Mon, 31 Aug 2020 19:30:47 +0000 Subject: [PATCH 3/6] formatting --- deepspeed/pt/deepspeed_light.py | 5 ++-- deepspeed/pt/deepspeed_zero_optimizer.py | 27 +++++++++++----------- tests/model/Megatron_GPT2/run_func_test.py | 23 ++++++++++-------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py index 8f1d06794cb4..561965ff2ed4 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/pt/deepspeed_light.py @@ -721,7 +721,7 @@ def forward(self, *inputs, **kwargs): return loss def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - + #Zero stage 2 communicates during non gradient accumulation boundaries as well if self.zero_optimization_partition_gradients(): self.optimizer.overlapping_partition_gradients_reduce_epilogue() @@ -736,8 +736,7 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): gradient_average=self.gradient_average) else: self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) - - + def backward(self, loss, allreduce_gradients=True): r"""Execute backward pass on the loss diff --git a/deepspeed/pt/deepspeed_zero_optimizer.py b/deepspeed/pt/deepspeed_zero_optimizer.py index b54c7830ce71..245ecb774097 100755 --- a/deepspeed/pt/deepspeed_zero_optimizer.py +++ b/deepspeed/pt/deepspeed_zero_optimizer.py @@ 
-454,24 +454,23 @@ def independent_gradient_partition_epilogue(self): dtype=torch.half, device=torch.cuda.current_device(), return_tensor_list=True) - else: + else: #When gradient accumulation is greater that 1 - #This code path will be triggered and will add + #This code path will be triggered and will add #to the accumulated averaged gradients - avg_new = self.get_flat_partition( - self.params_in_partition[i], - self.first_offset[i], - self.partition_size[i], - dtype=torch.half, - device=torch.cuda.current_device(), - return_tensor_list=True) - + avg_new = self.get_flat_partition(self.params_in_partition[i], + self.first_offset[i], + self.partition_size[i], + dtype=torch.half, + device=torch.cuda.current_device(), + return_tensor_list=True) + for accumulated_grad, new_avg_grad in zip(self.averaged_gradients[i],avg_new): accumulated_grad.add_(new_avg_grad) - + self._release_ipg_buffers() - - # No need to keep the gradients anymore. + + # No need to keep the gradients anymore. # All gradients required by the step # are in self.averaged_gradients self.zero_grad() @@ -1124,7 +1123,7 @@ def step(self, closure=None): self.zero_grad() for key in self.averaged_gradients: self.averaged_gradients[key] = None - + see_memory_usage('After overflow after clearing gradients') logger.info( diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 6d6a65529312..7077a05bfe07 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -212,7 +212,6 @@ def test_mp2_gpu4_node1_zero2_gas(self): "deepspeed": True, "json": "ds_config_func_bs8_zero2_gas10.json", "baseline": "ds_config_func_bs8_zero0_gas10.json", - } succ = self.run_test(test_config, 0.01) @@ -247,8 +246,8 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix = "gpt2_func_" prefix = "gpt2_partition_activation_" - deepspeed_config=test_config["json"] - baseline_deepspeed_config=False + 
deepspeed_config = test_config["json"] + baseline_deepspeed_config = False # baseline run... # turnoff deepspeed if baseline deepspeed config @@ -260,7 +259,9 @@ def run_partition_activations_test(self, test_config, r_tol): baseline_prefix += test_config["json"][0:-5] baseline_deepspeed_config = True - base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config=baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, + baseline_prefix, + baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. if not self.has_loss_data(base_file): @@ -285,11 +286,11 @@ def run_test(self, test_config, r_tol): prefix = "gpt2_func" - deepspeed_config=test_config["json"] + deepspeed_config = test_config["json"] baseline_deepspeed_config = False - + # baseline run... - # turn off deepspeed if a baseline deepspeed config + # turn off deepspeed if a baseline deepspeed config # is not provided if not "baseline" in test_config: test_config["deepspeed"] = False @@ -297,9 +298,11 @@ def run_test(self, test_config, r_tol): test_config["json"] = test_config["baseline"] baseline_prefix = prefix + test_config["json"][0:-5] baseline_deepspeed_config = True - + # baseline run... - base_file = self.gen_output_name(test_config, baseline_prefix, baseline_config = baseline_deepspeed_config) + base_file = self.gen_output_name(test_config, + baseline_prefix, + baseline_config=baseline_deepspeed_config) # skip baseline run if it exists. 
if not self.has_loss_data(base_file): @@ -357,7 +360,7 @@ def suite(): suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_gas')) suite.addTest(GPT2FuncTestCase('test_optimizer_scheduler')) - + return suite From 1c88e9d07131f860955c224fb780e0441d1da9b6 Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Mon, 31 Aug 2020 12:33:08 -0700 Subject: [PATCH 4/6] Update deepspeed_light.py removing comment --- deepspeed/pt/deepspeed_light.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepspeed/pt/deepspeed_light.py b/deepspeed/pt/deepspeed_light.py index 561965ff2ed4..ef2761ac4f51 100755 --- a/deepspeed/pt/deepspeed_light.py +++ b/deepspeed/pt/deepspeed_light.py @@ -598,8 +598,6 @@ def _configure_zero_optimizer(self, optimizer): dp_process_group=self.data_parallel_group, mpu=self.mpu) elif zero_stage == ZERO_OPTIMIZATION_GRADIENTS: - #assert self.gradient_accumulation_steps( - #) == 1, "ZeRO stage 2 does not support gradient accumulation, if you need gradient accumulation please use stage 1" optimizer = FP16_DeepSpeedZeroOptimizer( optimizer, timers=self.timers, From 2f07fac9bddbab389f0afe1383b6320bf8580b1e Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Mon, 31 Aug 2020 13:40:17 -0700 Subject: [PATCH 5/6] Update ds_config_func_bs8_zero1.json reverting this file back. 
It's not needed for this PR --- tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json index 2b6730fbf419..8d44659a9ee3 100755 --- a/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json +++ b/tests/model/Megatron_GPT2/ds_config_func_bs8_zero1.json @@ -1,6 +1,6 @@ { "train_batch_size": 8, - "gradient_accumulation_steps": 2, + "gradient_accumulation_steps": 1, "steps_per_print": 1, "zero_optimization":{ "stage":1 From 67cf72809c5d24891a7f5e5f6ce56cb1e0242f2e Mon Sep 17 00:00:00 2001 From: Samyam Rajbhandari Date: Tue, 1 Sep 2020 00:42:43 +0000 Subject: [PATCH 6/6] defining baseline prefix --- tests/model/Megatron_GPT2/run_func_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/model/Megatron_GPT2/run_func_test.py b/tests/model/Megatron_GPT2/run_func_test.py index 7077a05bfe07..dc8ac07ad74e 100755 --- a/tests/model/Megatron_GPT2/run_func_test.py +++ b/tests/model/Megatron_GPT2/run_func_test.py @@ -285,6 +285,7 @@ def run_test(self, test_config, r_tol): print("{0}: starting......".format(self.id())) prefix = "gpt2_func" + baseline_prefix = prefix deepspeed_config = test_config["json"] baseline_deepspeed_config = False