-
Notifications
You must be signed in to change notification settings - Fork 683
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #18424 from Flamefire/20230731160602_new_pr_PyTorc…
…h1131 {ai}[foss/2022a] PyTorch v1.13.1 w/ CUDA 11.7.0
- Loading branch information
Showing
7 changed files
with
385 additions
and
0 deletions.
There are no files selected for viewing
164 changes: 164 additions & 0 deletions
164
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022a-CUDA-11.7.0.eb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
name = 'PyTorch' | ||
version = '1.13.1' | ||
versionsuffix = '-CUDA-%(cudaver)s' | ||
|
||
homepage = 'https://pytorch.org/' | ||
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration. | ||
PyTorch is a deep learning framework that puts Python first.""" | ||
|
||
toolchain = {'name': 'foss', 'version': '2022a'} | ||
|
||
source_urls = [GITHUB_RELEASE] | ||
sources = ['%(namelower)s-v%(version)s.tar.gz'] | ||
patches = [ | ||
'PyTorch-1.7.0_disable-dev-shm-test.patch', | ||
'PyTorch-1.10.0_fix-kineto-crash.patch', | ||
'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch', | ||
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch', | ||
'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch', | ||
'PyTorch-1.12.1_add-hypothesis-suppression.patch', | ||
'PyTorch-1.12.1_fix-skip-decorators.patch', | ||
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch', | ||
'PyTorch-1.12.1_fix-test_wishart_log_prob.patch', | ||
'PyTorch-1.12.1_fix-TestTorch.test_to.patch', | ||
'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch', | ||
'PyTorch-1.12.1_fix-vsx-loadu.patch', | ||
'PyTorch-1.12.1_fix-vsx-vector-funcs.patch', | ||
'PyTorch-1.12.1_skip-test_round_robin.patch', | ||
'PyTorch-1.13.1_disable-test-sharding.patch', | ||
'PyTorch-1.13.1_fix-flaky-jit-test.patch', | ||
'PyTorch-1.13.1_fix-fsdp-fp16-test.patch', | ||
'PyTorch-1.13.1_fix-fsdp-tp-integration-test.patch', | ||
'PyTorch-1.13.1_fix-kineto-crash-on-exit.patch', | ||
'PyTorch-1.13.1_fix-pytest-args.patch', | ||
'PyTorch-1.13.1_fix-test-ops-conf.patch', | ||
'PyTorch-1.13.1_fix-wrong-check-in-fsdp-tests.patch', | ||
'PyTorch-1.13.1_increase-tolerance-test_jit.patch', | ||
'PyTorch-1.13.1_increase-tolerance-test_ops.patch', | ||
'PyTorch-1.13.1_increase-tolerance-test_optim.patch', | ||
'PyTorch-1.13.1_install-vsx-vec-headers.patch', | ||
'PyTorch-1.13.1_no-cuda-stubs-rpath.patch', | ||
'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch', | ||
'PyTorch-1.13.1_skip-failing-grad-test.patch', | ||
'PyTorch-1.13.1_skip-test-requiring-online-access.patch', | ||
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch', | ||
] | ||
checksums = [ | ||
{'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'}, | ||
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'}, | ||
{'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'}, | ||
{'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch': | ||
'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'}, | ||
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch': | ||
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'}, | ||
{'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch': | ||
'090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'}, | ||
{'PyTorch-1.12.1_add-hypothesis-suppression.patch': | ||
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'}, | ||
{'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'}, | ||
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch': | ||
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'}, | ||
{'PyTorch-1.12.1_fix-test_wishart_log_prob.patch': | ||
'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45'}, | ||
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'}, | ||
{'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch': | ||
'0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'}, | ||
{'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'}, | ||
{'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'}, | ||
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'}, | ||
{'PyTorch-1.13.1_disable-test-sharding.patch': 'df2074adeba47998ce2993d99ca64eb6f1c79ab7057f553b436efdec264d3572'}, | ||
{'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'}, | ||
{'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'}, | ||
{'PyTorch-1.13.1_fix-fsdp-tp-integration-test.patch': | ||
'31e2d63b54ae1a8c554575f46db79bf8bbda851b6ca0ffe623c4911207a3c2bc'}, | ||
{'PyTorch-1.13.1_fix-kineto-crash-on-exit.patch': | ||
'f1e6808ee8d91a2ad76e0caedb4685e5aec3008d5e2e3c3c3e88cbb25cbd71b4'}, | ||
{'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'}, | ||
{'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'}, | ||
{'PyTorch-1.13.1_fix-wrong-check-in-fsdp-tests.patch': | ||
'cbb5ca9ad668a504a456a2cc02d7254b79ddfd9a971a1648f0508fb103a9fc89'}, | ||
{'PyTorch-1.13.1_increase-tolerance-test_jit.patch': | ||
'b97913754a0ae0887b8137db0b0d57caff8c3d7bd96fe555ea27ea01ff14527a'}, | ||
{'PyTorch-1.13.1_increase-tolerance-test_ops.patch': | ||
'c909fdfc2b12df457e1eb5514265ffec3eab653994949416f3f048668421e223'}, | ||
{'PyTorch-1.13.1_increase-tolerance-test_optim.patch': | ||
'a079d824085eab89794f5ecfc67792f735ed8cfd3fe7db52e4dea62e583cfe06'}, | ||
{'PyTorch-1.13.1_install-vsx-vec-headers.patch': | ||
'7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'}, | ||
{'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'}, | ||
{'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch': | ||
'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'}, | ||
{'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'}, | ||
{'PyTorch-1.13.1_skip-test-requiring-online-access.patch': | ||
'61c3b7859dc06a9969981b07aa2789630de110d6d1d3633d27364be47af74712'}, | ||
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch': | ||
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'}, | ||
] | ||
|
||
osdependencies = [OS_PKG_IBVERBS_DEV] | ||
|
||
builddependencies = [ | ||
('CMake', '3.23.1'), | ||
('hypothesis', '6.46.7'), | ||
# For tests | ||
('pytest-rerunfailures', '11.1'), | ||
('pytest-shard', '0.1.2'), | ||
] | ||
|
||
dependencies = [ | ||
('CUDA', '11.7.0', '', SYSTEM), | ||
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions | ||
('Python', '3.10.4'), | ||
('protobuf', '3.19.4'), | ||
('protobuf-python', '3.19.4'), | ||
('pybind11', '2.9.2'), | ||
('SciPy-bundle', '2022.05'), | ||
('PyYAML', '6.0'), | ||
('MPFR', '4.1.0'), | ||
('GMP', '6.2.1'), | ||
('numactl', '2.0.14'), | ||
('FFmpeg', '4.4.2'), | ||
('Pillow', '9.1.1'), | ||
('cuDNN', '8.4.1.50', '-CUDA-%(cudaver)s', SYSTEM), | ||
('magma', '2.6.2', '-CUDA-%(cudaver)s'), | ||
('NCCL', '2.12.12', '-CUDA-%(cudaver)s'), | ||
('expecttest', '0.1.3'), | ||
] | ||
|
||
# default CUDA compute capabilities to use (override via --cuda-compute-capabilities) | ||
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6'] | ||
|
||
excluded_tests = { | ||
'': [ | ||
# This test seems to take too long on NVIDIA Ampere at least. | ||
'distributed/test_distributed_spawn', | ||
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375 | ||
'distributions/test_constraints', | ||
# no xdoctest | ||
'doctests', | ||
# failing on broadwell | ||
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 | ||
'test_native_mha', | ||
# intermittent failures on various systems | ||
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712 | ||
'distributed/rpc/test_tensorpipe_agent', | ||
] | ||
} | ||
|
||
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s' | ||
|
||
# The readelf sanity check command can be taken out once the TestRPATH test from | ||
# https://github.com/pytorch/pytorch/pull/87593 is accepted, since it is then checked as part of the PyTorch test suite | ||
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT | ||
sanity_check_commands = [ | ||
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2, | ||
] | ||
|
||
# Especially test_quantization has a few corner cases that are triggered by the random input values. | ||
# Those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030 | ||
# So allow a low number of tests to fail, as the tests "usually" succeed. | ||
max_failed_tests = 2 | ||
|
||
tests = ['PyTorch-check-cpp-extension.py'] | ||
|
||
moduleclass = 'ai' |
32 changes: 32 additions & 0 deletions
32
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-fsdp-tp-integration-test.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
Similar to the FP16 test, this test seems to expect at most 4 GPUs | ||
as indicated by `skip_if_lt_x_gpu(4)` decorators. | ||
|
||
Otherwise: | ||
- test_fsdp_tp_checkpoint_integration fails with | ||
File "/tmp/eb-tmp-2022a-cuda/lib/python3.10/site-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py", line 46, in tensor_device | ||
return self_st.local_shards()[0].tensor.device | ||
IndexError: list index out of range | ||
- test_fsdp_tp_integration fails with | ||
AssertionError: Tensor-likes are not close! | ||
|
||
Mismatched elements: 72 / 72 (100.0%) | ||
Greatest absolute difference: 0.958100214600563 at index (0, 2, 7) (up to 1e-05 allowed) | ||
Greatest relative difference: 52.01691657271702 at index (1, 2, 4) (up to 1.3e-06 allowed) | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/distributed/fsdp/test_fsdp_tp_integration.py b/test/distributed/fsdp/test_fsdp_tp_integration.py | ||
index e813966ec2f..ec2308c705b 100644 | ||
--- a/test/distributed/fsdp/test_fsdp_tp_integration.py | ||
+++ b/test/distributed/fsdp/test_fsdp_tp_integration.py | ||
@@ -202,6 +202,10 @@ class SimpleModel(torch.nn.Module): | ||
|
||
|
||
class TestTPFSDPIntegration(FSDPTest): | ||
+ @property | ||
+ def world_size(self): | ||
+ return min(4, super().world_size) | ||
+ | ||
def _get_params_and_sharding_info( | ||
self, | ||
model: SimpleModel, |
102 changes: 102 additions & 0 deletions
102
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-kineto-crash-on-exit.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
Fix a crash in libkineto on shutdown observed in e.g. test_jit_cuda_fuser.py | ||
Taken from https://github.com/pytorch/kineto/pull/696 | ||
by Dmytro Dzhulgakov <dima.v.dzhulgakov@gmail.com> | ||
|
||
Backported: Alexander Grund (TU Dresden) | ||
|
||
Submodule third_party/kineto contains modified content | ||
diff --git a/third_party/kineto/libkineto/include/Config.h b/third_party/kineto/libkineto/include/Config.h | ||
index 3e7f2d4..10c80d9 100644 | ||
--- a/third_party/kineto/libkineto/include/Config.h | ||
+++ b/third_party/kineto/libkineto/include/Config.h | ||
@@ -315,6 +315,12 @@ class Config : public AbstractConfig { | ||
|
||
void print(std::ostream& s) const; | ||
|
||
+ // Config relies on some state with global static lifetime. If other | ||
+ // threads are using the config, it's possible that the global state | ||
+ // is destroyed before the threads stop. By hanging onto this handle, | ||
+ // correct destruction order can be ensured. | ||
+ static std::shared_ptr<void> getStaticObjectsLifetimeHandle(); | ||
+ | ||
private: | ||
explicit Config(const Config& other) = default; | ||
|
||
diff --git a/third_party/kineto/libkineto/src/Config.cpp b/third_party/kineto/libkineto/src/Config.cpp | ||
index 2b6e04e..b23abd6 100644 | ||
--- a/third_party/kineto/libkineto/src/Config.cpp | ||
+++ b/third_party/kineto/libkineto/src/Config.cpp | ||
@@ -155,10 +155,8 @@ struct FactoryMap { | ||
|
||
std::shared_ptr<FactoryMap> configFactories() { | ||
// Ensure this is safe to call during shutdown, even as static | ||
- // destructors are invoked. Once factories destructor has been | ||
- // invoked, weak_ptr.lock() will return nullptr. | ||
- // But calls before that point will have a valid shared_ptr, | ||
- // delaying destruction of the underlying FactoryMap. | ||
+ // destructors are invoked. getStaticObjectLifetimeHandle hangs onto | ||
+ // FactoryMap delaying its destruction. | ||
static auto factories = std::make_shared<FactoryMap>(); | ||
static std::weak_ptr<FactoryMap> weak_ptr = factories; | ||
return weak_ptr.lock(); | ||
@@ -210,6 +208,10 @@ Config::Config() | ||
} | ||
} | ||
|
||
+std::shared_ptr<void> Config::getStaticObjectsLifetimeHandle() { | ||
+ return configFactories(); | ||
+} | ||
+ | ||
uint8_t Config::createDeviceMask(const string& val) { | ||
uint8_t res = 0; | ||
for (const auto& d : splitAndTrim(val, ',')) { | ||
diff --git a/third_party/kineto/libkineto/src/ConfigLoader.cpp b/third_party/kineto/libkineto/src/ConfigLoader.cpp | ||
index 4080b67..ca72b32 100644 | ||
--- a/third_party/kineto/libkineto/src/ConfigLoader.cpp | ||
+++ b/third_party/kineto/libkineto/src/ConfigLoader.cpp | ||
@@ -157,7 +157,7 @@ void ConfigLoader::startThread() { | ||
} | ||
} | ||
|
||
-ConfigLoader::~ConfigLoader() { | ||
+void ConfigLoader::stopThread() { | ||
if (updateThread_) { | ||
stopFlag_ = true; | ||
{ | ||
@@ -165,7 +165,12 @@ ConfigLoader::~ConfigLoader() { | ||
updateThreadCondVar_.notify_one(); | ||
} | ||
updateThread_->join(); | ||
+ updateThread_ = nullptr; | ||
} | ||
+} | ||
+ | ||
+ConfigLoader::~ConfigLoader() { | ||
+ stopThread(); | ||
#if !USE_GOOGLE_LOG | ||
Logger::clearLoggerObservers(); | ||
#endif // !USE_GOOGLE_LOG | ||
@@ -249,6 +254,11 @@ void ConfigLoader::configureFromDaemon( | ||
} | ||
|
||
void ConfigLoader::updateConfigThread() { | ||
+ // It's important to hang to this reference until the thread stops. | ||
+ // Otherwise, the Config's static members may be destroyed before this | ||
+ // function finishes. | ||
+ auto handle = Config::getStaticObjectsLifetimeHandle(); | ||
+ | ||
auto now = system_clock::now(); | ||
auto next_config_load_time = now; | ||
auto next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; | ||
diff --git a/third_party/kineto/libkineto/src/ConfigLoader.h b/third_party/kineto/libkineto/src/ConfigLoader.h | ||
index 4680816..83a94ac 100644 | ||
--- a/third_party/kineto/libkineto/src/ConfigLoader.h | ||
+++ b/third_party/kineto/libkineto/src/ConfigLoader.h | ||
@@ -108,6 +108,7 @@ class ConfigLoader { | ||
DaemonConfigLoader* daemonConfigLoader(); | ||
|
||
void startThread(); | ||
+ void stopThread(); | ||
void updateConfigThread(); | ||
void updateBaseConfig(); | ||
|
21 changes: 21 additions & 0 deletions
21
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_fix-wrong-check-in-fsdp-tests.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
A check in the test code was inverted and incomplete. | ||
See https://github.com/pytorch/pytorch/pull/90252 & https://github.com/pytorch/pytorch/pull/90785 | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/torch/testing/_internal/common_fsdp.py b/torch/testing/_internal/common_fsdp.py | ||
index eda1b953d95..56a95961371 100644 | ||
--- a/torch/testing/_internal/common_fsdp.py | ||
+++ b/torch/testing/_internal/common_fsdp.py | ||
@@ -1014,7 +1014,10 @@ class FSDPTest(MultiProcessTestCase): | ||
# the DDP parameters are in FP16 (from `half()`) while the FSDP | ||
# parameters are in FP32 (from `summon_full_params()`) and (2) DDP runs | ||
# the optimizer in FP16 while FSDP runs it in FP32 | ||
- if mixed_precision is not None: | ||
+ # TODO: Disable checking the parameters for pure FP16 due to floating | ||
+ # point inaccuracy. Note that this means that the backward pass is not | ||
+ # checked: https://github.com/pytorch/pytorch/issues/90784 | ||
+ if mixed_precision is None and not use_pure_fp16: | ||
self.assertEqual( | ||
ddp_params, | ||
fsdp_unsharded_params, |
24 changes: 24 additions & 0 deletions
24
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_increase-tolerance-test_jit.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
Fix the following failure: | ||
|
||
FAIL: test_freeze_conv_relu_fusion (jit.test_freezing.TestFrozenOptimizations) | ||
---------------------------------------------------------------------- | ||
... | ||
Mismatched elements: 10 / 30 (33.3%) | ||
Greatest absolute difference: 3.057718276977539e-05 at index (2, 3, 0, 0, 0) (up to 1e-05 allowed) | ||
Greatest relative difference: 8.758584417742737e-05 at index (0, 3, 0, 0, 0) (up to 1.3e-06 allowed) | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/jit/test_freezing.py b/test/jit/test_freezing.py | ||
index cdb2187f6bb..900c7dc5f53 100644 | ||
--- a/test/jit/test_freezing.py | ||
+++ b/test/jit/test_freezing.py | ||
@@ -2255,7 +2255,7 @@ class TestFrozenOptimizations(JitTestCase): | ||
else: | ||
FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph) | ||
|
||
- self.assertEqual(mod_eager(inp), frozen_mod(inp)) | ||
+ self.assertEqual(mod_eager(inp), frozen_mod(inp), atol=5e-5, rtol=1e-4) | ||
|
||
@unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") | ||
def test_freeze_conv_relu_fusion_not_forward(self): |
17 changes: 17 additions & 0 deletions
17
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_increase-tolerance-test_optim.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
Increase tolerance for failing test(s). | ||
See https://github.com/pytorch/pytorch/issues/98414 | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/test_optim.py b/test/test_optim.py | ||
index 2cb27376919..ce7cf2cb160 100644 | ||
--- a/test/test_optim.py | ||
+++ b/test/test_optim.py | ||
@@ -777,6 +777,7 @@ class TestOptim(TestCase): | ||
) | ||
|
||
def test_nadam(self): | ||
+ self.precision, self.rel_tol = (2e-05, 1e-05) | ||
self._test_basic_cases( | ||
lambda weight, bias, foreach: optim.NAdam([weight, bias], lr=1e-3, foreach=foreach), | ||
constructor_accepts_foreach=True, |
25 changes: 25 additions & 0 deletions
25
easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1_skip-test-requiring-online-access.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
This test downloads a Perl file from a GitHub repo, which may fail in: | ||
|
||
File "test/test_cuda.py", line 4632, in test_memory_snapshot | ||
torch.cuda.memory._save_segment_usage(f.name) | ||
File "/torch/cuda/memory.py", line 610, in _save_segment_usage | ||
f.write(_segments(snapshot)) | ||
File "/torch/cuda/_memory_viz.py", line 60, in segments | ||
return format_flamegraph(f.getvalue()) | ||
File "/torch/cuda/_memory_viz.py", line 21, in format_flamegraph | ||
urllib.request.urlretrieve( | ||
|
||
Author: Alexander Grund (TU Dresden) | ||
|
||
diff --git a/test/test_cuda.py b/test/test_cuda.py | ||
index 50644c22875..64d0eeb2fd4 100644 | ||
--- a/test/test_cuda.py | ||
+++ b/test/test_cuda.py | ||
@@ -4599,6 +4599,7 @@ class TestCudaComm(TestCase): | ||
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu'))) | ||
self.assertTrue(torch.equal(x, cat)) | ||
|
||
+ @unittest.skip("Requires internet access") | ||
def test_memory_snapshot(self): | ||
try: | ||
torch.cuda.memory.empty_cache() |