[Fix][FSDP] Don't remove post backward hooks for multiple backward fix #923

Closed
wants to merge 1 commit into from
4 changes: 0 additions & 4 deletions fairscale/nn/data_parallel/fully_sharded_data_parallel.py
@@ -1543,8 +1543,6 @@ def _register_post_backward_hooks(self) -> None:
             return  # don't register grad hooks if grad isn't enabled
         for p in self.params:
             if p.requires_grad:
-                if hasattr(p, "_shard_bwd_hook"):

Contributor:
Do the comments above need to be updated? It seems like we no longer need to remove the hook at the end of the backward pass.

Contributor Author:
Ohh yeah, definitely, will do that.

Contributor:
If the two lines are removed, will the hooks fire multiple times in cases with multiple forward passes (e.g. multiple activation checkpointing)?

Contributor:
Even for a single activation checkpointing case like Checkpoint(FSDP(module)): if there is forward recomputation in the backward pass, won't the hooks be registered twice and fired twice unexpectedly? (See the sketch after this hunk.)

-                    continue
                 # Register a hook on the first call, empirically, autograd
                 # fires it at the end for this param, which makes sense.
                 p_tmp = p.expand_as(p)  # Get a grad_fn on p_tmp.
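
For readers following the thread above, here is a minimal standalone sketch (plain PyTorch, not FSDP code; the variable and hook names are made up for illustration) of the registration pattern in this hunk and of the concern raised in the comments: a hook attached to a parameter's AccumulateGrad node fires once per registration during a backward pass, so if the registration path runs again without a guard (for example on forward recomputation under activation checkpointing), it fires more than once.

```python
import torch

# A parameter plays the role of `p` in the hunk above.
p = torch.nn.Parameter(torch.ones(3))

# Same trick as the diff: expand_as gives a non-leaf view whose grad_fn leads
# back to the AccumulateGrad node that autograd runs last for this parameter.
p_tmp = p.expand_as(p)
grad_acc = p_tmp.grad_fn.next_functions[0][0]

fired = []

def post_backward_hook(*unused):
    fired.append(1)

# Register twice with no hasattr-style guard, mimicking a second pass through
# the registration code (e.g. forward recomputation in the backward pass).
grad_acc.register_hook(post_backward_hook)
grad_acc.register_hook(post_backward_hook)

(p * 2).sum().backward()
print(len(fired))  # 2 -- the hook fires once per registration in one backward
```

This is only a toy reproduction of the question being asked, not a claim about how FSDP behaves after this change.
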
@@ -1751,8 +1749,6 @@ def _finalize_parameters(fsdp_module: FullyShardedDataParallel) -> None:
                     continue
                 if hasattr(p, "_shard_bwd_hook"):
                     p_assert(len(p._shard_bwd_hook) == 2, f"WFPB: incorrect hook num: {len(p._shard_bwd_hook)}")
-                    p._shard_bwd_hook[1].remove()
-                    delattr(p, "_shard_bwd_hook")

                 # Leave the gradient accumulation state as-is if not synchronizing this pass. This ensures p.grad
                 # remains the unsharded gradient accumulated from prior no-sync passes, and p._saved_grad_shard
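
To see how the two hunks relate, here is a rough sketch of the register/remove pair this diff touches. The helper function names are hypothetical; the `(grad_acc, handle)` tuple stored as `p._shard_bwd_hook` and the `hasattr` guard follow the code shown in the diff.

```python
import torch

def register_post_backward_hook(p: torch.nn.Parameter, hook) -> None:
    # The hasattr guard deleted in the first hunk: skip if already registered.
    if hasattr(p, "_shard_bwd_hook"):
        return
    p_tmp = p.expand_as(p)                          # non-leaf view with a grad_fn
    grad_acc = p_tmp.grad_fn.next_functions[0][0]   # AccumulateGrad node for p
    handle = grad_acc.register_hook(hook)
    # Keep grad_acc alive and remember the handle so the hook can be removed.
    p._shard_bwd_hook = (grad_acc, handle)

def remove_post_backward_hook(p: torch.nn.Parameter) -> None:
    # Mirrors the cleanup in _finalize_parameters whose remove()/delattr lines
    # this diff deletes, i.e. the hook now stays registered across backwards.
    if hasattr(p, "_shard_bwd_hook"):
        assert len(p._shard_bwd_hook) == 2
        p._shard_bwd_hook[1].remove()
        delattr(p, "_shard_bwd_hook")
```

Before this change, the removal ran at the end of each backward pass; the PR title indicates the intent is to keep the hooks registered so that multiple backward passes can reuse them.
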