Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2022b] PyTorch v1.13.1 #18421

Merged
154 changes: 154 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-1.13.1-foss-2022b.eb
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# EasyBuild easyconfig describing PyTorch 1.13.1 built with the foss/2022b toolchain.
name = 'PyTorch'
version = '1.13.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

# GITHUB_RELEASE is not defined in this file
# NOTE(review): presumably an EasyBuild-provided constant for the GitHub release download location - confirm
source_urls = [GITHUB_RELEASE]
# With the name/version above the template corresponds to 'pytorch-v1.13.1.tar.gz',
# which is the file name the first entry of `checksums` is keyed on
sources = ['%(namelower)s-v%(version)s.tar.gz']
# Patch files for the PyTorch sources, in the order they are listed here.
# Every entry has a matching single-key dict in `checksums`.
patches = [
    # Patches whose file names carry earlier PyTorch versions (1.7.0 - 1.12.1)
    'PyTorch-1.7.0_disable-dev-shm-test.patch',
    'PyTorch-1.10.0_fix-kineto-crash.patch',
    'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch',
    'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
    'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch',
    'PyTorch-1.12.1_add-hypothesis-suppression.patch',
    'PyTorch-1.12.1_fix-skip-decorators.patch',
    'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
    'PyTorch-1.12.1_fix-test_wishart_log_prob.patch',
    'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
    'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch',
    'PyTorch-1.12.1_fix-vsx-loadu.patch',
    'PyTorch-1.12.1_fix-vsx-vector-funcs.patch',
    'PyTorch-1.12.1_skip-test_round_robin.patch',
    # Patches named after this version (1.13.1)
    'PyTorch-1.13.1_disable-test-sharding.patch',
    'PyTorch-1.13.1_fix-flaky-jit-test.patch',
    'PyTorch-1.13.1_fix-fsdp-fp16-test.patch',
    'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch',
    'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
    'PyTorch-1.13.1_fix-numpy-deprecations.patch',
    'PyTorch-1.13.1_fix-protobuf-dependency.patch',
    'PyTorch-1.13.1_fix-pytest-args.patch',
    'PyTorch-1.13.1_fix-python-3.11-compat.patch',
    'PyTorch-1.13.1_fix-test-ops-conf.patch',
    'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
    'PyTorch-1.13.1_increase-tolerance-test_ops.patch',
    'PyTorch-1.13.1_install-vsx-vec-headers.patch',
    'PyTorch-1.13.1_no-cuda-stubs-rpath.patch',
    'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch',
    'PyTorch-1.13.1_skip-failing-grad-test.patch',
    'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
    'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
]
# One {file name: checksum} dict per downloaded/applied file:
# the source tarball first, followed by the patch files in the same order as in `patches`.
checksums = [
    {'pytorch-v1.13.1.tar.gz': 'dbc229ee9750b02b514937d017744443a269ea0241ed3f32b9af0703589d25d4'},
    {'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
    {'PyTorch-1.10.0_fix-kineto-crash.patch': 'dc467333b28162149af8f675929d8c6bf219f23230bfc0d39af02ba4f6f882eb'},
    {'PyTorch-1.11.0_fix-fp16-quantization-without-fbgemm.patch':
        'cc526130b6446bbbf5f0f7372d3aeee3e7d4c4d6e471524dff028b430b152934'},
    {'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
        '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
    {'PyTorch-1.12.0_fix-EmbeddingBag-without-fbgemm.patch':
        '090598592283e3fc46ee08a68b6a6afe07be41b26514afba51834408bf1c98ed'},
    {'PyTorch-1.12.1_add-hypothesis-suppression.patch':
        'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
    {'PyTorch-1.12.1_fix-skip-decorators.patch': 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c'},
    {'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
        '1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
    {'PyTorch-1.12.1_fix-test_wishart_log_prob.patch':
        'cf475ae6e6234b96c8d1bf917597c5176c94b3ccd940b72f2e1cd0c979580f45'},
    {'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
    {'PyTorch-1.12.1_fix-use-after-free-in-tensorpipe-agent.patch':
        '0bd7e88b92c4c6f0fecf01746009858ba19f2df68b10b88c41485328a531875d'},
    {'PyTorch-1.12.1_fix-vsx-loadu.patch': '8bfe3c94ada1dd1f7974a1261a8b576fb7ae944050fa1c7830fca033831123b2'},
    {'PyTorch-1.12.1_fix-vsx-vector-funcs.patch': 'caccbf60f62eac313896c1eaec78b08f5d0fdfcb907079087490bb13d1561aa2'},
    {'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
    {'PyTorch-1.13.1_disable-test-sharding.patch': 'df2074adeba47998ce2993d99ca64eb6f1c79ab7057f553b436efdec264d3572'},
    {'PyTorch-1.13.1_fix-flaky-jit-test.patch': '71efdeb29b5e5b4982c9f5cb2182733654a34d52f85bb5487bc4d7d99b86101b'},
    {'PyTorch-1.13.1_fix-fsdp-fp16-test.patch': '8ae68e60d6e1f92f50322b7f0381c7e65251fba32d7606e3a238a36a2f55b5cf'},
    {'PyTorch-1.13.1_fix-gcc-12-missing-includes.patch':
        '18df8c61ecaa9fb659346c1e172828bca6b069f0145bb8f6a36b0a23b7bef0a6'},
    {'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
        '5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
    {'PyTorch-1.13.1_fix-numpy-deprecations.patch':
        'f461b570efe0434ddd806bf2fa7020eb213e3ed89d0eb4403e076f4276ba2a46'},
    {'PyTorch-1.13.1_fix-protobuf-dependency.patch':
        '8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
    {'PyTorch-1.13.1_fix-pytest-args.patch': 'd3e3c841cf8d73683750f29326f2be56ee0bb5df7ff522baf7d7c3f301a91ec2'},
    {'PyTorch-1.13.1_fix-python-3.11-compat.patch':
        'fa4eb0e27e00a90bb217b77c0023089c4659c03f37d781ab4a681bdcb4f0432f'},
    {'PyTorch-1.13.1_fix-test-ops-conf.patch': 'df652eec7753864ebebbfeca546929a53e3fb8f24259d5c9b964266a8551198c'},
    {'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
        'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
    {'PyTorch-1.13.1_increase-tolerance-test_ops.patch':
        'c909fdfc2b12df457e1eb5514265ffec3eab653994949416f3f048668421e223'},
    {'PyTorch-1.13.1_install-vsx-vec-headers.patch':
        '7b678f54bb947afd4767f5877ac424b4b94ce5db609ea20f5a869ccf4027035f'},
    {'PyTorch-1.13.1_no-cuda-stubs-rpath.patch': '4c636059850fc9d1ecb27ce275f8aad5d5b6fdc19e35aff0c25b86cb3201352a'},
    {'PyTorch-1.13.1_remove-flaky-test-in-testnn.patch':
        'be83ff61fe2dedab6d49c232936d5622df81ab49154264490021c6c828e53315'},
    {'PyTorch-1.13.1_skip-failing-grad-test.patch': '6681200f9509893cb9231b5c93ac9bc5e6d9d9ae4febefca52e7cbc843ba8f51'},
    {'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
        '72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
    {'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
        '481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
]

# Requirement on a package provided by the operating system.
# OS_PKG_IBVERBS_DEV is not defined in this file
# NOTE(review): presumably the ibverbs development package - confirm
osdependencies = [OS_PKG_IBVERBS_DEV]

# (name, version) pairs that are only needed while building
builddependencies = [
    ('CMake', '3.24.3'),
    ('hypothesis', '6.68.2'),
    # Needed by the test suite
    ('pytest-rerunfailures', '12.0'),
    ('pytest-shard', '0.1.2'),
]

# (name, version) pairs of the dependencies of the installed software
dependencies = [
    # JIT compilation of C++ extensions requires Ninja
    ('Ninja', '1.11.1'),
    ('Python', '3.10.8'),
    ('protobuf', '23.0'),
    ('protobuf-python', '4.23.0'),
    ('pybind11', '2.10.3'),
    ('SciPy-bundle', '2023.02'),
    ('PyYAML', '6.0'),
    ('MPFR', '4.2.0'),
    ('GMP', '6.2.1'),
    ('numactl', '2.0.16'),
    ('FFmpeg', '5.1.2'),
    ('Pillow', '9.4.0'),
    ('expecttest', '0.1.3'),
]

# Extra build option selecting the C++17 standard
# NOTE(review): presumably needed because of the protobuf 23.0 dependency - confirm
custom_opts = ['CMAKE_CXX_STANDARD=17']

# Test modules that are not run; the list is substituted for %(excluded_tests)s in `runtest`
# NOTE(review): the '' key presumably means "applies to every architecture" - confirm
excluded_tests = {
    '': [
        # Appears to run for too long, at least on NVIDIA Ampere
        'distributed/test_distributed_spawn',
        # Does not work with CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
        'distributions/test_constraints',
        # xdoctest is not available
        'doctests',
        # Fails on broadwell,
        # see https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'test_native_mha',
        # Fails intermittently on various systems,
        # see https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
        'distributed/rpc/test_tensorpipe_agent',
    ]
}

# Command for the test step: run PyTorch's own test driver from within the 'test' directory
runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# A few corner cases, especially in test_quantization, are triggered by random input values
# and cannot be easily avoided, see https://github.com/pytorch/pytorch/issues/107030
# As the tests "usually" succeed, tolerate a low number of failed tests
max_failed_tests = 2

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
Our error checking doesn't work well with the parallel/sharded pytorch test.
As the overall gain is low, disable it and always run the full test suite in a single process.
Additionally remove the fail-on-first-error flag (-x) to collect all errors.

Author: Alexander Grund (TU Dresden)

diff --git a/test/run_test.py b/test/run_test.py
index f7c80f3f0a6..9e930d774ed 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -718,18 +718,18 @@ def print_log_file(test: str, file_path: str, failed: bool) -> None:


def run_test_ops(test_module, test_directory, options):
- if 'slow-gradcheck' in os.getenv("BUILD_ENVIRONMENT", ""):
+ if True:
# there are a lot of tests that take up a lot of space in slowgrad check, so don't bother parallelizing
# it's also on periodic so we don't care about TTS as much
return run_test(test_module, test_directory, copy.deepcopy(options),
- extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX'],
+ extra_unittest_args=["--use-pytest", '-vv', '--reruns=2', '-rfEX'],
)
return_codes = []
os.environ["NUM_PARALLEL_PROCS"] = str(NUM_PROCS)
pool = get_context("spawn").Pool(NUM_PROCS)
for i in range(NUM_PROCS):
return_code = pool.apply_async(run_test, args=(test_module, test_directory, copy.deepcopy(options)),
- kwds={"extra_unittest_args": ["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
+ kwds={"extra_unittest_args": ["--use-pytest", '-vv', '--reruns=2', '-rfEX',
f'--shard-id={i}', f'--num-shards={NUM_PROCS}',
"-k=not _linalg_cholesky_"],
})
@@ -742,7 +742,7 @@ def run_test_ops(test_module, test_directory, options):
if return_code.get() != 0:
return return_code.get()
return_code = run_test(test_module, test_directory, copy.deepcopy(options),
- extra_unittest_args=["--use-pytest", '-vv', '-x', '--reruns=2', '-rfEX',
+ extra_unittest_args=["--use-pytest", '-vv', '--reruns=2', '-rfEX',
"-k=_linalg_cholesky_"],
)
return return_code
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
Add missing headers that are no longer transitively included in GCC 12

Author: Alexander Grund (TU Dresden)

Submodule third_party/gloo contains modified content
diff --git a/third_party/gloo/gloo/transport/ibverbs/pair.h b/third_party/gloo/gloo/transport/ibverbs/pair.h
index 1ccc050..54dbc7e 100644
--- a/third_party/gloo/gloo/transport/ibverbs/pair.h
+++ b/third_party/gloo/gloo/transport/ibverbs/pair.h
@@ -8,6 +8,7 @@

#pragma once

+#include <array>
#include <atomic>
#include <condition_variable>
#include <exception>
diff --git a/third_party/gloo/gloo/transport/tcp/device.cc b/third_party/gloo/gloo/transport/tcp/device.cc
index 05cf0a4..4408d60 100644
--- a/third_party/gloo/gloo/transport/tcp/device.cc
+++ b/third_party/gloo/gloo/transport/tcp/device.cc
@@ -8,6 +8,7 @@

#include "gloo/transport/tcp/device.h"

+#include <array>
#include <ifaddrs.h>
#include <netdb.h>
#include <netinet/in.h>
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
GCC 12 has a regression (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593)
leading to warnings/errors during build: https://github.com/pytorch/FBGEMM/issues/1666
Suppress the affected warning in FBGEMM.

Author: Alexander Grund (TU Dresden)

Submodule third_party/fbgemm contains modified content
diff --git a/third_party/fbgemm/CMakeLists.txt b/third_party/fbgemm/CMakeLists.txt
index 58dcb9ae..c888f0f8 100644
--- a/third_party/fbgemm/CMakeLists.txt
+++ b/third_party/fbgemm/CMakeLists.txt
@@ -147,10 +147,10 @@ else(MSVC)
string(APPEND CMAKE_CXX_FLAGS " -Werror")
string(APPEND CMAKE_CXX_FLAGS " -Wno-deprecated-declarations")
target_compile_options(fbgemm_avx2 PRIVATE
- "-m64" "-mavx2" "-mf16c" "-mfma")
+ "-m64" "-mavx2" "-mf16c" "-mfma" "-Wno-uninitialized")
target_compile_options(fbgemm_avx512 PRIVATE
"-m64" "-mavx2" "-mfma" "-mavx512f" "-mavx512bw" "-mavx512dq"
- "-mavx512vl")
+ "-mavx512vl" "-Wno-uninitialized")
set_source_files_properties(
src/FbgemmFP16UKernelsAvx2.cc
src/FbgemmFP16UKernelsAvx512.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
Numpy 1.24 removed some types deprecated earlier leading to failures in e.g. test_torch:
> ERROR: test_parsing_intlist (__main__.TestTorch)
> ...
> AttributeError: module 'numpy' has no attribute 'float'.

Backported from https://github.com/pytorch/pytorch/pull/93997
Author: Alexander Grund (TU Dresden)

diff --git a/test/quantization/core/test_quantized_op.py b/test/quantization/core/test_quantized_op.py
index 79297e073f0..827a781df10 100644
--- a/test/quantization/core/test_quantized_op.py
+++ b/test/quantization/core/test_quantized_op.py
@@ -3010,7 +3010,7 @@ class TestDynamicQuantizedOps(TestCase):
# W_scale = 1.0
# W_zp = 0
W_scales = np.ones(output_channels)
- W_zps = np.zeros(output_channels).astype(np.int)
+ W_zps = np.zeros(output_channels).astype(int)
W_value_min = -128
W_value_max = 127
W_q0 = np.round(
@@ -3581,9 +3581,9 @@ class TestQuantizedLinear(TestCase):
# xnnpack forces W_zp to 0 when using symmetric quantization
# ONEDNN only supports symmetric quantization of weight
if dtype == torch.qint8 or qengine_is_onednn():
- W_zps = np.zeros(output_channels).astype(np.int)
+ W_zps = np.zeros(output_channels).astype(int)
else:
- W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(np.int)
+ W_zps = np.round(np.random.rand(output_channels) * 100 - 50).astype(int)
# when using symmetric quantization
# special restriction for xnnpack fully connected op weight
# [-127, 127] instead of [-128, 127]
diff --git a/test/test_reductions.py b/test/test_reductions.py
index 0e36906f25f..20fcb6ef9ae 100644
--- a/test/test_reductions.py
+++ b/test/test_reductions.py
@@ -1323,7 +1323,7 @@ class TestReductions(TestCase):
vals = [[True, True], [True, False], [False, False], []]
for val in vals:
result = torch.prod(torch.tensor(val, device=device), dtype=torch.bool).item()
- expect = np.prod(np.array(val), dtype=np.bool)
+ expect = np.prod(np.array(val), dtype=bool)
self.assertEqual(result, expect)

result = torch.prod(torch.tensor(val, device=device)).item()
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index aab26452496..a71761ce670 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -1440,14 +1440,14 @@ class TestTensorCreation(TestCase):
def test_ctor_with_numpy_array(self, device):
correct_dtypes = [
np.double,
- np.float,
+ float,
np.float16,
np.int64,
np.int32,
np.int16,
np.int8,
np.uint8,
- np.bool,
+ bool,
]

incorrect_byteorder = '>' if sys.byteorder == 'little' else '<'
diff --git a/test/test_tensorboard.py b/test/test_tensorboard.py
index e836b0f1ba8..0857873a5fa 100644
--- a/test/test_tensorboard.py
+++ b/test/test_tensorboard.py
@@ -796,7 +796,7 @@ class TestTensorBoardNumpy(BaseTestCase):
model = ModelHelper(name="mnist")
# how come those inputs don't break the forward pass =.=a
workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
- workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+ workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))

with core.NameScope("conv1"):
conv1 = brew.conv(model, "data", 'conv1', dim_in=1, dim_out=20, kernel=5)
@@ -831,7 +831,7 @@ class TestTensorBoardNumpy(BaseTestCase):
def test_caffe2_simple_cnnmodel(self):
model = cnn.CNNModelHelper("NCHW", name="overfeat")
workspace.FeedBlob("data", np.random.randn(1, 3, 64, 64).astype(np.float32))
- workspace.FeedBlob("label", np.random.randn(1, 1000).astype(np.int))
+ workspace.FeedBlob("label", np.random.randn(1, 1000).astype(int))
with core.NameScope("conv1"):
conv1 = model.Conv("data", "conv1", 3, 96, 11, stride=4)
relu1 = model.Relu(conv1, conv1)
diff --git a/test/test_torch.py b/test/test_torch.py
index 8de5b822d00..3121e256b21 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -6182,7 +6182,7 @@ class TestTorch(TestCase):
# fail parse with float variables
self.assertRaises(TypeError, lambda: torch.ones((torch.tensor(3.), torch.tensor(4))))
# fail parse with numpy floats
- self.assertRaises(TypeError, lambda: torch.ones((np.float(3.), torch.tensor(4))))
+ self.assertRaises(TypeError, lambda: torch.ones((3., torch.tensor(4))))
self.assertRaises(TypeError, lambda: torch.ones((np.array(3.), torch.tensor(4))))

# fail parse with > 1 element variables
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Add proper dependency in libprotobuf target to get the correct set of flags.
Otherwise the build will fail with e.g.:
undefined reference to `google::protobuf::internal::ThreadSafeArena::thread_cache_'
This is caused by missing the `PROTOBUF_USE_DLLS` define required for libprotobuf as a shared library.
See https://github.com/pytorch/pytorch/issues/106297

Author: Alexander Grund (TU Dresden)

diff --git a/caffe2/proto/CMakeLists.txt b/caffe2/proto/CMakeLists.txt
index ba6b696dde4..5033e228119 100644
--- a/caffe2/proto/CMakeLists.txt
+++ b/caffe2/proto/CMakeLists.txt
@@ -7,6 +7,7 @@ endif()
caffe2_protobuf_generate_cpp_py(Caffe2_PROTO_SRCS Caffe2_PROTO_HEADERS Caffe2_PROTO_PY ${Caffe2_PROTOBUF_FILES})

add_library(Caffe2_PROTO OBJECT ${Caffe2_PROTO_HEADERS} ${Caffe2_PROTO_SRCS})
+target_link_libraries(Caffe2_PROTO PRIVATE protobuf::libprotobuf)

if(MSVC)
if(BUILD_SHARED_LIBS)
diff --git a/cmake/ProtoBuf.cmake b/cmake/ProtoBuf.cmake
index 8d7633c4ab0..fb0e9a0c5f7 100644
--- a/cmake/ProtoBuf.cmake
+++ b/cmake/ProtoBuf.cmake
@@ -122,10 +122,6 @@ if((NOT TARGET protobuf::libprotobuf) AND (NOT TARGET protobuf::libprotobuf-lite
# "Please set the proper paths so that I can find protobuf correctly.")
endif()

-get_target_property(__tmp protobuf::libprotobuf INTERFACE_INCLUDE_DIRECTORIES)
-message(STATUS "Caffe2 protobuf include directory: " ${__tmp})
-include_directories(BEFORE SYSTEM ${__tmp})
-
# If Protobuf_VERSION is known (true in most cases, false if we are building
# local protobuf), then we will add a protobuf version check in
# Caffe2Config.cmake.in.