Cleared backward hooks to avoid accumulating over iterations (#1143)
awgu committed Oct 10, 2023
1 parent 71aeffe commit 17ecf4a
Showing 1 changed file with 1 addition and 0 deletions: fairscale/nn/data_parallel/fully_sharded_data_parallel.py
@@ -1865,6 +1865,7 @@ def _finalize_parameters(fsdp_module: FullyShardedDataParallel) -> None:
     if hasattr(p, "_shard_bwd_hooks") and self._require_backward_grad_sync:
         for _, handle in p._shard_bwd_hooks:
             handle.remove()
+    p._shard_bwd_hooks.clear()

     # Leave the gradient accumulation state as-is if not synchronizing this pass. This ensures p.grad
     # remains the unsharded gradient accumulated from prior no-sync passes, and p._saved_grad_shard
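The effect of the one-line fix can be sketched in isolation. Calling `handle.remove()` detaches each hook, but it does not empty the list that stores the `(grad_acc, handle)` pairs, so without `clear()` the list grows by one entry per iteration and stale handles are re-visited every pass. The classes below (`FakeHandle`, `FakeParam`, `run_iteration`, `registry`) are hypothetical stand-ins for illustration, not fairscale or PyTorch APIs:

```python
class FakeHandle:
    """Illustrative stand-in for a removable hook handle."""

    def __init__(self, hooks, key):
        self.hooks, self.key = hooks, key

    def remove(self):
        # Detach the hook from the registry; does NOT touch any list
        # that happens to hold a reference to this handle.
        self.hooks.pop(self.key, None)


class FakeParam:
    """Illustrative stand-in for a parameter carrying _shard_bwd_hooks."""

    def __init__(self):
        self._shard_bwd_hooks = []  # list of (grad_acc, handle) pairs


def run_iteration(p, registry, it):
    # Register one backward hook for this iteration, as FSDP does per parameter.
    registry[it] = lambda grad: grad
    p._shard_bwd_hooks.append((None, FakeHandle(registry, it)))

    # Finalize: remove the handles...
    for _, handle in p._shard_bwd_hooks:
        handle.remove()
    # ...and clear the list so entries do not accumulate across iterations
    # (the line added in this commit).
    p._shard_bwd_hooks.clear()


registry = {}
p = FakeParam()
for it in range(3):
    run_iteration(p, registry, it)

# With clear() in place, neither the hook registry nor the per-parameter
# list retains stale entries after three iterations.
print(len(p._shard_bwd_hooks), len(registry))
```

Dropping the `clear()` call in this sketch leaves `p._shard_bwd_hooks` with one entry per iteration, which is the accumulation the commit title describes.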
