From 4c3d9388d70781a11c4adf5f06878ffa701936b5 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Tue, 19 Mar 2024 16:55:07 +0100 Subject: [PATCH 1/2] Fix test issues in PyTorch-2.1.2-foss-2023a-CUDA-12.1.1 --- .../PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb | 17 ++- ...-locale-issue-in-nvrtcCompileProgram.patch | 106 ++++++++++++++++++ ...ch-2.1.2_fix-with_temp_dir-decorator.patch | 19 ++++ ..._fix-wrong-device-mesh-size-in-tests.patch | 28 +++++ ...-test_fsdp_tp_checkpoint_integration.patch | 33 ++++++ 5 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-with_temp_dir-decorator.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb index 225928ed9cbb..fdb202fa951e 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb @@ -47,15 +47,19 @@ patches = [ 'PyTorch-2.1.2_add-cuda-skip-markers.patch', 'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch', 'PyTorch-2.1.2_fix-device-mesh-check.patch', + 'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch', 'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch', 'PyTorch-2.1.2_fix-test_memory_profiler.patch', 'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch', 'PyTorch-2.1.2_fix-vsx-vector-abs.patch', 'PyTorch-2.1.2_fix-vsx-vector-div.patch', + 'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch', + 'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch', 'PyTorch-2.1.2_relax-cuda-tolerances.patch', 'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch', 'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch', 'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch', + 'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch', 'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch', ] checksums = [ @@ -120,6 +124,8 @@ checksums = [ {'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch': 'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'}, {'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'}, + {'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch': + 'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'}, {'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch': 'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'}, {'PyTorch-2.1.2_fix-test_memory_profiler.patch': @@ -128,6 +134,10 @@ checksums = [ 'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'}, {'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'}, {'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'}, + {'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch': + '90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'}, + {'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch': + '07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'}, 
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'}, {'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch': 'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'}, @@ -135,6 +145,8 @@ checksums = [ '7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'}, {'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch': '6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'}, + {'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch': + '943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'}, {'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch': 'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'}, ] @@ -191,6 +203,8 @@ excluded_tests = { # intermittent failures on various systems # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 'distributed/rpc/test_tensorpipe_agent', + # Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184 + 'distributed/tensor/parallel/test_tp_random_state.py', # failures on OmniPath systems, which don't support some optional InfiniBand features # See https://github.com/pytorch/tensorpipe/issues/413 'distributed/pipeline/sync/skip/test_gpipe', @@ -206,8 +220,9 @@ runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-throu # Especially test_quantization has a few corner cases that are triggered by the random input values, # those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030 +# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294 # So allow a low number of tests to fail as the tests "usually" succeed -max_failed_tests = 50 +max_failed_tests = 2 # The readelf sanity check command can be taken out once the TestRPATH test from # https://github.com/pytorch/pytorch/pull/109493 is accepted, since it is then checked as part of the PyTorch test suite diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch new file mode 100644 index 000000000000..a8aa712d4ada --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch @@ -0,0 +1,106 @@ +There is a bug in CUDA 11.7 through at least CUDA 12.4 that changes the current thread locale +when nvrtcCompileProgram is called. +See e.g. https://stackoverflow.com/questions/74044994 +This also affects the encoding Python uses by default, e.g. for subsequent `subprocess` invocations. +When the user environment is set to e.g. UTF-8 and CUDA changes the locale (to ASCII/ANSI_X3.4-1968), +Python will fail to decode UTF-8 output from the programs it invokes. +This happens e.g. 
in `test_torch`, which calls `from scipy import stats`, which runs `lscpu` and errors with something like > /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1253, in <module> > _SUPPORTS_SVE = check_support_sve() > /software/SciPy-bundle/2023.07-gfbf-2023a/lib/python3.11/site-packages/numpy/testing/_private/utils.py", line 1247, in check_support_sve > output = subprocess.run(cmd, capture_output=True, text=True) > /software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 2113, in _communicate > stdout = self._translate_newlines(stdout, > UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 96: ordinal not in range(128) + +Fix this by wrapping nvrtcCompileProgram in a function that saves and restores the thread locale. + +Author: Alexander Grund (TU Dresden) + +diff --git a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +index 1b85e7776e2..1c13a3b1168 100644 +--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp ++++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp +@@ -2,6 +2,7 @@ + + #include + #include ++#include <locale.h> + #include + + namespace at { +@@ -143,6 +144,29 @@ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + return fn(prog, src, name, numHeaders, headers, includeNames); + } + ++nvrtcResult nvrtcCompileProgram_wrapped(nvrtcProgram prog, ++ int numOptions, ++ const char * const *options) { ++ // Save & restore current thread locale which can get modified by nvrtcCompileProgram ++ locale_t oldLocale = uselocale((locale_t) 0); ++ auto result = lazyNVRTC.nvrtcCompileProgram_real(prog, numOptions, options); ++ if (oldLocale != (locale_t) 0) ++ uselocale(oldLocale); ++ return result; ++} ++ ++nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, ++ int numOptions, ++ const char * const *options) { ++ auto fn = reinterpret_cast<decltype(&nvrtcCompileProgram)>(getNVRTCLibrary().sym(__func__)); ++ if (!fn) ++ throw std::runtime_error("Can't get nvrtcCompileProgram"); ++ lazyNVRTC.nvrtcCompileProgram_real = fn; ++ fn = &nvrtcCompileProgram_wrapped; ++ lazyNVRTC.nvrtcCompileProgram = fn; ++ return fn(prog, numOptions, options); ++} ++ + NVRTC_STUB1(nvrtcDestroyProgram, nvrtcProgram *); + NVRTC_STUB2(nvrtcGetPTXSize, nvrtcProgram, size_t *); + NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); +@@ -150,7 +174,6 @@ NVRTC_STUB2(nvrtcGetPTX, nvrtcProgram, char *); + NVRTC_STUB2(nvrtcGetCUBINSize, nvrtcProgram, size_t *); + NVRTC_STUB2(nvrtcGetCUBIN, nvrtcProgram, char *); + #endif +-NVRTC_STUB3(nvrtcCompileProgram, nvrtcProgram, int, const char * const *); + _STUB_1(NVRTC, nvrtcGetErrorString, const char *, nvrtcResult); + NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*); + NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *); +diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +index 574b2c41c26..4ddc5316dad 100644 +--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h ++++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +@@ -120,6 +120,8 @@ extern "C" typedef struct NVRTC { + #define CREATE_MEMBER(name) decltype(&name) name; + AT_FORALL_NVRTC(CREATE_MEMBER) + #undef CREATE_MEMBER ++ // Must be at end! 
++ decltype(nvrtcCompileProgram) nvrtcCompileProgram_real; + } NVRTC; + + extern "C" TORCH_CUDA_CPP_API NVRTC* load_nvrtc(); +diff --git a/caffe2/cuda_rtc/common_rtc.h b/caffe2/cuda_rtc/common_rtc.h +index 9d9582d34b6..562a653a67a 100644 +--- a/caffe2/cuda_rtc/common_rtc.h ++++ b/caffe2/cuda_rtc/common_rtc.h +@@ -1,6 +1,7 @@ + #ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_ + #define CAFFE2_CUDA_RTC_COMMON_RTC_H_ + ++#include <locale.h> + #include + #include + +@@ -46,7 +47,10 @@ class CudaRTCFunction { + // coding it? + const char* nvrtc_opts[] = { + "--gpu-architecture=compute_35", "--use_fast_math"}; ++ locale_t oldLocale = uselocale((locale_t) 0); + nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts); ++ if (oldLocale != (locale_t) 0) ++ uselocale(oldLocale); + if (compile_result != NVRTC_SUCCESS) { + size_t log_size; + NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size)); diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-with_temp_dir-decorator.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-with_temp_dir-decorator.patch new file mode 100644 index 000000000000..3834bdf3d258 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-with_temp_dir-decorator.patch @@ -0,0 +1,19 @@ +The decorator fails to pass the argument list on to the decorated function. +This breaks e.g. test/distributed/checkpoint/test_fsdp_optim_state.py, which fails with +> TypeError: FsdpOptimStateCheckpoint.test_load_sharded_optimizer_state_dict() missing 1 required positional argument: 'pass_planner' + +Author: Alexander Grund (TU Dresden) + +diff --git a/torch/testing/_internal/distributed/checkpoint_utils.py b/torch/testing/_internal/distributed/checkpoint_utils.py +index 1a6e43a038c..52f79b37bfd 100644 +--- a/torch/testing/_internal/distributed/checkpoint_utils.py ++++ b/torch/testing/_internal/distributed/checkpoint_utils.py +@@ -31,7 +31,7 @@ def with_temp_dir( + self.temp_dir = object_list[0] + + try: +- func(self) ++ func(self, *args, **kwargs) + finally: + if dist.get_rank() == 0: + shutil.rmtree(self.temp_dir, ignore_errors=True) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch new file mode 100644 index 000000000000..9d8676074721 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch @@ -0,0 +1,28 @@ +From 1d1308649298caf8884970fc57ed82a2d8ea6079 Mon Sep 17 00:00:00 2001 +From: Xilun Wu <12968408+XilunWu@users.noreply.github.com> +Date: Tue, 26 Dec 2023 17:48:11 -0800 +Subject: [PATCH] [BE] force DTensorTestBase.build_device_mesh to use + world_size rather than NUM_DEVICES constant (#116439) + +**Test**: +`python test/distributed/fsdp/test_shard_utils.py -k test_create_chunk_dtensor` + +Pull Request resolved: https://github.com/pytorch/pytorch/pull/116439 +Approved by: https://github.com/wanchaol +--- + torch/testing/_internal/distributed/_tensor/common_dtensor.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py +index ab86ecd1616a74f..05a3c0872878965 100644 +--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py ++++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py +@@ -192,7 +192,7 @@ def backend(self) -> str: + return PG_BACKEND + + def build_device_mesh(self) -> DeviceMesh: +- return DeviceMesh(DEVICE_TYPE, 
list(range(NUM_DEVICES))) ++ return DeviceMesh(DEVICE_TYPE, list(range(self.world_size))) + + def init_pg(self) -> None: + if "nccl" in self.backend and torch.cuda.device_count() < self.world_size: diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch new file mode 100644 index 000000000000..6a0461a7dcb4 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch @@ -0,0 +1,33 @@ +test_fsdp_tp_checkpoint_integration in distributed/fsdp/test_fsdp_tp_integration.py +fails due to a regression. See https://github.com/pytorch/pytorch/issues/101162 + +> RuntimeError: Error(s) in loading state_dict for FullyShardedDataParallel: +> size mismatch for _fsdp_wrapped_module.net1.weight: copying a param with shape torch.Size([4, 5]) from checkpoint, the shape in current model is torch.Size([8, 5]). +> size mismatch for _fsdp_wrapped_module.net1.bias: copying a param with shape torch.Size([4]) from checkpoint, the shape in current model is torch.Size([8]). +> size mismatch for _fsdp_wrapped_module.net2.weight: copying a param with shape torch.Size([4, 4]) from checkpoint, the shape in current model is torch.Size([4, 8]). + +Skip the test. This should already be fixed in 2.2.x. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py +index bc7a4aef4a3..aea16a1f1fb 100644 +--- a/test/distributed/fsdp/test_fsdp_tp_integration.py ++++ b/test/distributed/fsdp/test_fsdp_tp_integration.py +@@ -3,6 +3,7 @@ import copy + import sys + from collections import OrderedDict + from typing import Any, Dict, List, Optional, Tuple ++import unittest + + import torch + from torch import distributed as dist +@@ -306,7 +307,7 @@ class TestTPFSDPIntegration(FSDPTest): + tp_fsdp_out = tp_fsdp_model(inp) + self.assertEqual(fsdp_out, tp_fsdp_out) + +- @skip_if_lt_x_gpu(4) ++ @unittest.skip("Known failure: #101162") + def test_fsdp_tp_checkpoint_integration(self): + """Tests checkpointing for TP + FSDP integration.""" + self.assertTrue( From a8dab8e1eecee1b63aaa80907d0242bc8a4b78fb Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Wed, 20 Mar 2024 09:56:51 +0100 Subject: [PATCH 2/2] Remove extension from excluded test --- .../p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb index fdb202fa951e..7990dd1511b8 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb @@ -204,7 +204,7 @@ excluded_tests = { # See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 'distributed/rpc/test_tensorpipe_agent', # Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184 - 'distributed/tensor/parallel/test_tp_random_state.py', + 'distributed/tensor/parallel/test_tp_random_state', # failures on OmniPath systems, which don't support some optional InfiniBand features # See https://github.com/pytorch/tensorpipe/issues/413 'distributed/pipeline/sync/skip/test_gpipe',
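For reference, the locale save/restore idiom that PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch applies around nvrtcCompileProgram can be expressed as a small standalone helper. The sketch below is illustrative only: the name call_with_locale_restored and the generic template signature do not appear in the patch, and a POSIX uselocale() is assumed.

#include <locale.h>
#include <utility>

// Invoke fn(args...) and afterwards restore the calling thread's locale,
// guarding against callees (such as nvrtcCompileProgram in the affected
// CUDA versions) that change the locale as a side effect.
template <typename Fn, typename... Args>
auto call_with_locale_restored(Fn fn, Args&&... args) {
  locale_t oldLocale = uselocale((locale_t) 0);  // query only, changes nothing
  auto result = fn(std::forward<Args>(args)...);
  if (oldLocale != (locale_t) 0)  // (locale_t) 0 would indicate the query failed
    uselocale(oldLocale);
  return result;
}

With such a helper the guarded call site would read e.g. call_with_locale_restored(fn, prog, numOptions, options), instead of spelling out the save/restore around each call as the patch does in both LazyNVRTC.cpp and common_rtc.h.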