Fix test failures on 1 GPU
Flamefire committed Nov 28, 2022
1 parent 8b5a673 commit 168ff80
Showing 3 changed files with 69 additions and 17 deletions.

@@ -65,7 +65,7 @@ checksums = [
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
# PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
- 'dce66cae4f3e196fbeb5293a9664c9962bb3920dfa9f9531d4ce39663f431bb4', # PyTorch-1.12.1_fix-skip-decorators.patch
+ '32fc722f5f93b9a89b10809c2b38718de9c847a96ffdd540d2c71eae154e4201', # PyTorch-1.12.1_fix-skip-decorators.patch
# PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
# PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch

@@ -61,7 +61,7 @@ checksums = [
'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
# PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
- 'dce66cae4f3e196fbeb5293a9664c9962bb3920dfa9f9531d4ce39663f431bb4', # PyTorch-1.12.1_fix-skip-decorators.patch
+ '32fc722f5f93b9a89b10809c2b38718de9c847a96ffdd540d2c71eae154e4201', # PyTorch-1.12.1_fix-skip-decorators.patch
# PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
# PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch

@@ -2,15 +2,15 @@ The decorators are implemented to run when the function is called which is after
the test `setup` method spawned subprocesses which may use NCCL to sync failing when there are
not enough GPUs available.
So replace the custom code by calls to the `unittest` skip decorators.
See https://github.com/pytorch/pytorch/issues/89686
See https://github.com/pytorch/pytorch/pull/89750

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 8baf7d03d9f..d4c17c80c5b 100644
index 8baf7d03d9f..3dc922ee923 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -124,46 +124,19 @@ def skip_if_odd_worldsize(func):
@@ -124,46 +124,11 @@ def skip_if_odd_worldsize(func):
return wrapper

def require_n_gpus_for_nccl_backend(n, backend):
@@ -25,9 +25,8 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
-
+ return unittest.skipIf(backend == "nccl" and torch.cuda.device_count() < n,
+ TEST_SKIPS[f"multi-gpu-{n}"].message)
+ return skip_if_lt_x_gpu(n) if backend == "nccl" else unittest.skipIf(False, None)


def skip_if_lt_x_gpu(x):
- def decorator(func):
@@ -40,12 +39,10 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
+ return unittest.skipUnless(torch.cuda.is_available() and torch.cuda.device_count() >= x,
+ TEST_SKIPS[f"multi-gpu-{n}"].message)


# This decorator helps avoiding initializing cuda while testing other backends
def nccl_skip_if_lt_x_gpu(backend, x):
-
-
-# This decorator helps avoiding initializing cuda while testing other backends
-def nccl_skip_if_lt_x_gpu(backend, x):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
@@ -58,9 +55,64 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
+ return unittest.skipUnless(backend != "nccl" or
+ (torch.cuda.is_available() and torch.cuda.device_count() >= x),
+ TEST_SKIPS[f"multi-gpu-{n}"].message)
+ return unittest.skipIf(torch.cuda.device_count() < x, TEST_SKIPS[f"multi-gpu-{x}"].message)


def verify_ddp_error_logged(model_DDP, err_substr):
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 1414a0376b1..1f6b00e6edf 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -56,7 +56,6 @@ from torch.testing._internal.common_distributed import (
skip_if_small_worldsize,
skip_if_odd_worldsize,
skip_if_lt_x_gpu,
- nccl_skip_if_lt_x_gpu,
skip_if_no_gpu,
require_n_gpus_for_nccl_backend,
requires_nccl_version,
@@ -4604,7 +4603,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -4615,7 +4614,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_grad_is_view(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -4626,7 +4625,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync
@@ -4654,7 +4653,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce
@@ -4688,7 +4687,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_get_future(self):
def mult(fut):
return [t * 3 for t in fut.wait()]
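
For reference, the timing difference called out in the patch description above can be illustrated with a minimal, self-contained sketch. This is not the PyTorch code: gpu_count stands in for torch.cuda.device_count(), and a plain setUp stands in for the subprocess-spawning setup of MultiProcessTestCase. A decorator that checks resources only inside the wrapped test body skips after setUp has already run, whereas a plain unittest skip decorator lets unittest skip the test before setUp is ever called.

import functools
import unittest

gpu_count = 1  # stand-in for torch.cuda.device_count(); pretend one GPU is visible


def skip_at_call_time(n):
    # Old style: the check only runs when the test body is called,
    # i.e. after unittest has already executed setUp().
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if gpu_count < n:
                raise unittest.SkipTest(f"need {n} GPUs")
            return func(*args, **kwargs)
        return wrapper
    return decorator


def skip_if_lt_x_gpu(x):
    # New style: a plain unittest skip decorator; the condition is evaluated
    # up front and unittest never calls setUp() for a skipped test.
    return unittest.skipIf(gpu_count < x, f"need {x} GPUs")


class Demo(unittest.TestCase):
    def setUp(self):
        # In MultiProcessTestCase this is where worker processes are spawned.
        print(f"setUp runs for {self._testMethodName}")

    @skip_at_call_time(2)
    def test_old_style(self):  # setUp runs first, then the body bails out
        pass

    @skip_if_lt_x_gpu(2)
    def test_new_style(self):  # skipped before setUp runs
        pass


if __name__ == "__main__":
    unittest.main(verbosity=2)

With one visible GPU both tests are reported as skipped, but only test_old_style prints the setUp line, which is the behaviour the patch removes.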
