Fix test failures on 1 GPU
Flamefire committed Nov 28, 2022
1 parent 8b5a673 commit 168ff80
Showing 3 changed files with 69 additions and 17 deletions.

@@ -65,7 +65,7 @@ checksums = [
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7',
# PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
- 'dce66cae4f3e196fbeb5293a9664c9962bb3920dfa9f9531d4ce39663f431bb4', # PyTorch-1.12.1_fix-skip-decorators.patch
+ '32fc722f5f93b9a89b10809c2b38718de9c847a96ffdd540d2c71eae154e4201', # PyTorch-1.12.1_fix-skip-decorators.patch
# PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
# PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch

@@ -61,7 +61,7 @@ checksums = [
'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch
# PyTorch-1.12.1_fix-cuda-gcc-version-check.patch
'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab',
- 'dce66cae4f3e196fbeb5293a9664c9962bb3920dfa9f9531d4ce39663f431bb4', # PyTorch-1.12.1_fix-skip-decorators.patch
+ '32fc722f5f93b9a89b10809c2b38718de9c847a96ffdd540d2c71eae154e4201', # PyTorch-1.12.1_fix-skip-decorators.patch
# PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83',
# PyTorch-1.12.1_fix-TestCudaFuser.test_unary_ops.patch

@@ -2,15 +2,15 @@ The decorators are implemented to run when the function is called which is after
the test `setup` method spawned subprocesses which may use NCCL to sync failing when there are
not enough GPUs available.
So replace the custom code by calls to the `unittest` skip decorators.
See https://github.com/pytorch/pytorch/issues/89686
See https://github.com/pytorch/pytorch/pull/89750

Author: Alexander Grund (TU Dresden)

diff --git a/torch/testing/_internal/common_distributed.py b/torch/testing/_internal/common_distributed.py
index 8baf7d03d9f..d4c17c80c5b 100644
index 8baf7d03d9f..3dc922ee923 100644
--- a/torch/testing/_internal/common_distributed.py
+++ b/torch/testing/_internal/common_distributed.py
@@ -124,46 +124,19 @@ def skip_if_odd_worldsize(func):
@@ -124,46 +124,11 @@ def skip_if_odd_worldsize(func):
return wrapper

def require_n_gpus_for_nccl_backend(n, backend):
@@ -25,9 +25,8 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
-
+ return unittest.skipIf(backend == "nccl" and torch.cuda.device_count() < n,
+ TEST_SKIPS[f"multi-gpu-{n}"].message)
+ return skip_if_lt_x_gpu(n) if backend == "nccl" else unittest.skipIf(False, None)


def skip_if_lt_x_gpu(x):
- def decorator(func):
@@ -40,12 +39,10 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
+ return unittest.skipUnless(torch.cuda.is_available() and torch.cuda.device_count() >= x,
+ TEST_SKIPS[f"multi-gpu-{n}"].message)


# This decorator helps avoiding initializing cuda while testing other backends
def nccl_skip_if_lt_x_gpu(backend, x):
-
-
-# This decorator helps avoiding initializing cuda while testing other backends
-def nccl_skip_if_lt_x_gpu(backend, x):
- def decorator(func):
- @wraps(func)
- def wrapper(*args, **kwargs):
@@ -58,9 +55,64 @@ index 8baf7d03d9f..d4c17c80c5b 100644
- return wrapper
-
- return decorator
+ return unittest.skipUnless(backend != "nccl" or
+ (torch.cuda.is_available() and torch.cuda.device_count() >= x),
+ TEST_SKIPS[f"multi-gpu-{n}"].message)
+ return unittest.skipIf(torch.cuda.device_count() < x, TEST_SKIPS[f"multi-gpu-{x}"].message)


def verify_ddp_error_logged(model_DDP, err_substr):
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 1414a0376b1..1f6b00e6edf 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -56,7 +56,6 @@ from torch.testing._internal.common_distributed import (
skip_if_small_worldsize,
skip_if_odd_worldsize,
skip_if_lt_x_gpu,
- nccl_skip_if_lt_x_gpu,
skip_if_no_gpu,
require_n_gpus_for_nccl_backend,
requires_nccl_version,
@@ -4604,7 +4603,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -4615,7 +4614,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_grad_is_view(self):
"""
Runs _test_accumulate_gradients_no_sync using default inputs
@@ -4626,7 +4625,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync
@@ -4654,7 +4653,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_accumulate_gradients_no_sync_allreduce_with_then_hook(self):
"""
Runs multiple iterations on _test_accumulate_gradients_no_sync using allreduce
@@ -4688,7 +4687,7 @@ class DistributedTest:
BACKEND != "mpi" and BACKEND != "nccl" and BACKEND != "gloo",
"get_future is only supported on mpi, nccl and gloo",
)
- @nccl_skip_if_lt_x_gpu(BACKEND, 2)
+ @require_n_gpus_for_nccl_backend(BACKEND, 2)
def test_get_future(self):
def mult(fut):
return [t * 3 for t in fut.wait()]
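
For reference, the timing difference called out in the patch description above can be illustrated with a minimal, self-contained sketch. This is not the PyTorch code: gpu_count stands in for torch.cuda.device_count(), and a plain setUp stands in for the subprocess-spawning setup of MultiProcessTestCase. A decorator that checks resources only inside the wrapped test body skips after setUp has already run, whereas a plain unittest skip decorator lets unittest skip the test before setUp is ever called.

import functools
import unittest

gpu_count = 1  # stand-in for torch.cuda.device_count(); pretend one GPU is visible


def skip_at_call_time(n):
    # Old style: the check only runs when the test body is called,
    # i.e. after unittest has already executed setUp().
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if gpu_count < n:
                raise unittest.SkipTest(f"need {n} GPUs")
            return func(*args, **kwargs)
        return wrapper
    return decorator


def skip_if_lt_x_gpu(x):
    # New style: a plain unittest skip decorator; the condition is evaluated
    # up front and unittest never calls setUp() for a skipped test.
    return unittest.skipIf(gpu_count < x, f"need {x} GPUs")


class Demo(unittest.TestCase):
    def setUp(self):
        # In MultiProcessTestCase this is where worker processes are spawned.
        print(f"setUp runs for {self._testMethodName}")

    @skip_at_call_time(2)
    def test_old_style(self):  # setUp runs first, then the body bails out
        pass

    @skip_if_lt_x_gpu(2)
    def test_new_style(self):  # skipped before setUp runs
        pass


if __name__ == "__main__":
    unittest.main(verbosity=2)

With one visible GPU both tests are reported as skipped, but only test_old_style prints the setUp line, which is the behaviour the patch removes.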
