Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2022b] PyTorch v2.1.2 w/ CUDA 12.0.0 #20155

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
name = 'PyTorch'
version = '2.1.2'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
'PyTorch-2.1.0_disable-gcc12-warning.patch',
'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch',
'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch',
'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch',
'PyTorch-2.1.0_fix-validationError-output-test.patch',
'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch',
'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch',
'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch',
'PyTorch-2.1.0_remove-test-requiring-online-access.patch',
'PyTorch-2.1.0_skip-diff-test-on-ppc.patch',
'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch',
'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch',
'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch',
'PyTorch-2.1.0_skip-test_wrap_bad.patch',
'PyTorch-2.1.2_add-cuda-skip-markers.patch',
'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch',
'PyTorch-2.1.2_fix-device-mesh-check.patch',
'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch',
'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch',
'PyTorch-2.1.2_fix-test_memory_profiler.patch',
'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch',
'PyTorch-2.1.2_fix-vsx-vector-abs.patch',
'PyTorch-2.1.2_fix-vsx-vector-div.patch',
'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch',
'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch',
'PyTorch-2.1.2_relax-cuda-tolerances.patch',
'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch',
'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch',
'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch',
'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch',
'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch',
'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch',
]
checksums = [
{'pytorch-v2.1.2.tar.gz': '85effbcce037bffa290aea775c9a4bad5f769cb229583450c40055501ee1acd7'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
'1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
{'PyTorch-2.1.0_disable-gcc12-warning.patch': 'c858b8db0010f41005dc06f9a50768d0d3dc2d2d499ccbdd5faf8a518869a421'},
{'PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch':
'd895018ebdfd46e65d9f7645444a3b4c5bbfe3d533a08db559a04be34e01e478'},
{'PyTorch-2.1.0_fix-bufferoverflow-in-oneDNN.patch':
'b15b1291a3c37bf6a4982cfbb3483f693acb46a67bc0912b383fd98baf540ccf'},
{'PyTorch-2.1.0_fix-test_numpy_torch_operators.patch':
'84bb51a719abc677031a7a3dfe4382ff098b0cbd8b39b8bed2a7fa03f80ac1e9'},
{'PyTorch-2.1.0_fix-validationError-output-test.patch':
'7eba0942afb121ed92fac30d1529447d892a89eb3d53c565f8e9d480e95f692b'},
{'PyTorch-2.1.0_fix-vsx-vector-shift-functions.patch':
'3793b4b878be1abe7791efcbd534774b87862cfe7dc4774ca8729b6cabb39e7e'},
{'PyTorch-2.1.0_increase-tolerance-functorch-test_vmapvjpvjp.patch':
'aef38adf1210d0c5455e91d7c7a9d9e5caad3ae568301e0ba9fc204309438e7b'},
{'PyTorch-2.1.0_remove-sparse-csr-nnz-overflow-test.patch':
'0ac36411e76506b3354c85a8a1260987f66af947ee52ffc64230aee1fa02ea8b'},
{'PyTorch-2.1.0_remove-test-requiring-online-access.patch':
'35184b8c5a1b10f79e511cc25db3b8a5585a5d58b5d1aa25dd3d250200b14fd7'},
{'PyTorch-2.1.0_skip-diff-test-on-ppc.patch': '394157dbe565ffcbc1821cd63d05930957412156cc01e949ef3d3524176a1dda'},
{'PyTorch-2.1.0_skip-dynamo-test_predispatch.patch':
'6298daf9ddaa8542850eee9ea005f28594ab65b1f87af43d8aeca1579a8c4354'},
{'PyTorch-2.1.0_skip-test_jvp_linalg_det_singular.patch':
'5229ca88a71db7667a90ddc0b809b2c817698bd6e9c5aaabd73d3173cf9b99fe'},
{'PyTorch-2.1.0_skip-test_linear_fp32-without-MKL.patch':
'5dcc79883b6e3ec0a281a8e110db5e0a5880de843bb05653589891f16473ead5'},
{'PyTorch-2.1.0_skip-test_wrap_bad.patch': 'b8583125ee94e553b6f77c4ab4bfa812b89416175dc7e9b7390919f3b485cb63'},
{'PyTorch-2.1.2_add-cuda-skip-markers.patch': 'd007d6d0cdb533e7d01f503e9055218760123a67c1841c57585385144be18c9a'},
{'PyTorch-2.1.2_fix-conj-mismatch-test-failures.patch':
'c164357efa4ce88095376e590ba508fc1daa87161e1e59544eda56daac7f2847'},
{'PyTorch-2.1.2_fix-device-mesh-check.patch': 'c0efc288bf3d9a9a3c8bbd2691348a589a2677ea43880a8c987db91c8de4806b'},
{'PyTorch-2.1.2_fix-locale-issue-in-nvrtcCompileProgram.patch':
'f7adafb4e4d3b724b93237a259797b6ed6f535f83be0e34a7b759c71c6a8ddf2'},
{'PyTorch-2.1.2_fix-test_extension_backend-without-vectorization.patch':
'cd1455495886a7d6b2d30d48736eb0103fded21e2e36de6baac719b9c52a1c92'},
{'PyTorch-2.1.2_fix-test_memory_profiler.patch':
'30b0c9355636c0ab3dedae02399789053825dc3835b4d7dac6e696767772b1ce'},
{'PyTorch-2.1.2_fix-test_torchinductor-rounding.patch':
'a0ef99192ee2ad1509c78a8377023d5be2b5fddb16f84063b7c9a0b53d979090'},
{'PyTorch-2.1.2_fix-vsx-vector-abs.patch': 'd67d32407faed7dc1dbab4bba0e2f7de36c3db04560ced35c94caf8d84ade886'},
{'PyTorch-2.1.2_fix-vsx-vector-div.patch': '11f497a6892eb49b249a15320e4218e0d7ac8ae4ce67de39e4a018a064ca1acc'},
{'PyTorch-2.1.2_fix-with_temp_dir-decorator.patch':
'90bd001e034095329277d70c6facc4026b4ce6d7f8b8d6aa81c0176eeb462eb1'},
{'PyTorch-2.1.2_fix-wrong-device-mesh-size-in-tests.patch':
'07a5e4233d02fb6348872838f4d69573c777899c6f0ea4e39ae23c08660d41e5'},
{'PyTorch-2.1.2_relax-cuda-tolerances.patch': '554ad09787f61080fafdb84216e711e32327aa357e2a9c40bb428eb6503dee6e'},
{'PyTorch-2.1.2_remove-nccl-backend-default-without-gpus.patch':
'e6a1efe3d127fcbf4723476a7a1c01cfcf2ccb16d1fb250f478192623e8b6a15'},
{'PyTorch-2.1.2_skip-cpu_repro-test-without-vectorization.patch':
'7ace835af60c58d9e0754a34c19d4b9a0c3a531f19e5d0eba8e2e49206eaa7eb'},
{'PyTorch-2.1.2_skip-failing-test_dtensor_ops-subtests.patch':
'6cf711bf26518550903b09ed4431de9319791e79d61aab065785d6608fd5cc88'},
{'PyTorch-2.1.2_skip-test_fsdp_tp_checkpoint_integration.patch':
'943ee92f5fd518f608a59e43fe426b9bb45d7e7ad0ba04639e516db2d61fa57d'},
{'PyTorch-2.1.2_skip-xfailing-test_dtensor_ops.patch':
'7f5befddcb006b6ab5377de6ee3c29df375c5f8ef5e42b998d35113585b983f3'},
{'PyTorch-2.1.2_workaround_dynamo_failure_without_nnpack.patch':
'fb96eefabf394617bbb3fbd3a7a7c1aa5991b3836edc2e5d2a30e708bfe49ba1'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.24.3'),
('hypothesis', '6.68.2'),
# For tests
('pytest-flakefinder', '1.1.0'),
('pytest-rerunfailures', '12.0'),
('pytest-shard', '0.1.2'),
]

dependencies = [
('CUDA', '12.0.0', '', SYSTEM),
('cuDNN', '8.8.0.121', '-CUDA-%(cudaver)s', SYSTEM),
('magma', '2.7.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.16.2', '-CUDA-%(cudaver)s'),
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.10.8'),
('protobuf', '23.0'),
('protobuf-python', '4.23.0'),
('pybind11', '2.10.3'),
('SciPy-bundle', '2023.02'),
('PyYAML', '6.0'),
('MPFR', '4.2.0'),
('GMP', '6.2.1'),
('numactl', '2.0.16'),
('FFmpeg', '5.1.2'),
('Pillow', '9.4.0'),
('expecttest', '0.1.3'),
('networkx', '3.0'),
('sympy', '1.12'),
('Z3', '4.12.2', '-Python-%(pyver)s'),
]

use_pip = True
buildcmd = '%(python)s setup.py build' # Run the (long) build in the build step

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
# Broken test, can't ever succeed, see https://github.com/pytorch/pytorch/issues/122184
'distributed/tensor/parallel/test_tp_random_state',
# failures on OmniPath systems, which don't support some optional InfiniBand features
# See https://github.com/pytorch/tensorpipe/issues/413
'distributed/pipeline/sync/skip/test_gpipe',
'distributed/pipeline/sync/skip/test_leak',
'distributed/pipeline/sync/test_bugs',
'distributed/pipeline/sync/test_inplace',
'distributed/pipeline/sync/test_pipe',
'distributed/pipeline/sync/test_transparency',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# Especially test_quantization has a few corner cases that are triggered by the random input values,
# those cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# test_nn is also prone to spurious failures: https://github.com/pytorch/pytorch/issues/118294
# So allow a low number of tests to fail as the tests "usually" succeed
max_failed_tests = 10

# The readelf sanity check command can be taken out once the TestRPATH test from
# https://github.com/pytorch/pytorch/pull/109493 is accepted, since it is then checked as part of the PyTorch test suite
local_libcaffe2 = "$EBROOTPYTORCH/lib/python%%(pyshortver)s/site-packages/torch/lib/libcaffe2_nvrtc.%s" % SHLIB_EXT
sanity_check_commands = [
"readelf -d %s | egrep 'RPATH|RUNPATH' | grep -v stubs" % local_libcaffe2,
]

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
This test has ~400 sub-tests marked as xfail, i.e. expected to fail.
A comment states
> these sometimes pass and sometimes fail

That doesn't match the definition of "xfail"/"unittest.expectedFailure" as some are really
"expected failures" which have to fail (like some operators which are not defined for the type tested)

To reduce the flakiness and verbosity of this test suite (every xfailed tests prints an error and a rather large backtrace)
just skip those tests by setting the last element in each tuple of the list to `False`,
see the definition of `fail` vs `xfail` in that file.

Author: Alexander Grund (TU Dresden)

diff --git a/test/distributed/_tensor/test_dtensor_ops.py b/test/distributed/_tensor/test_dtensor_ops.py
index b7d453e56be..15478840ae5 100644
--- a/test/distributed/_tensor/test_dtensor_ops.py
+++ b/test/distributed/_tensor/test_dtensor_ops.py
@@ -541,6 +541,7 @@ dtensor_fails = {
skip("squeeze"),
}

+dtensor_fails = [x[:-1] + (False,) for x in dtensor_fails]

# Add a list of ops that are currently failing BW pass
skip_bw = [
Loading