Fix: Fix the problem of inconsistent training times across processes and adjust the distributed training synchronization lock.
chairc committed Sep 11, 2023
1 parent 5c4b49a commit c0157d9
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions tools/train.py
@@ -249,10 +249,6 @@ def train(rank=None, args=None):
tb_logger.add_scalar(tag=f"[{device}]: MSE", scalar_value=loss.item(),
global_step=epoch * len_dataloader + i)

-# Synchronization during distributed training
-if distributed:
-    dist.barrier()

# Saving and validating models in the main process
if save_models:
# Saving model
@@ -295,6 +291,12 @@ def train(rank=None, args=None):
logger.info(msg=f"Save the {save_name}.pt, ema_{save_name}.pt, and optim_{save_name}.pt.")
logger.info(msg="Save the model.")
logger.info(msg=f"[{device}]: Finish epoch {epoch}:")

+# Synchronization during distributed training
+if distributed:
+    logger.info(msg=f"[{device}]: Synchronization during distributed training.")
+    dist.barrier()

logger.info(msg=f"[{device}]: Finish training.")

# Clean up the distributed environment
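For context, below is a minimal, self-contained sketch of the barrier placement this commit adopts. It is not the repository's actual tools/train.py: it assumes a launch via torchrun (which sets WORLD_SIZE), and the checkpoint-saving step is only indicated by a comment. Placing dist.barrier() at the end of the epoch, after the main process has saved, keeps the per-epoch timing of all ranks aligned.

import os
import torch
import torch.distributed as dist

def main():
    # Detect a distributed launch (e.g. torchrun sets WORLD_SIZE).
    distributed = int(os.environ.get("WORLD_SIZE", "1")) > 1
    if distributed:
        dist.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo")
    rank = dist.get_rank() if distributed else 0

    for epoch in range(3):
        # ... the training steps for this epoch run here on every rank ...

        # Only the main process saves checkpoints; other ranks skip ahead.
        if rank == 0:
            pass  # e.g. torch.save(model.state_dict(), f"ckpt_{epoch}.pt")

        # Barrier at the END of the epoch: every rank waits until rank 0 has
        # finished saving, so per-epoch wall-clock time stays consistent
        # across processes instead of drifting apart.
        if distributed:
            dist.barrier()

    if distributed:
        dist.destroy_process_group()

if __name__ == "__main__":
    main()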
