Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{ai}[foss/2022b] PyTorch v2.1.2 #19445

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
GCC 12 emits a false-positive warning when compiling for some architectures, e.g. Intel Sapphire Rapids.
See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112370

Suppress this warning such that the build doesn't error.

Author: Alexander Grund (TU Dresden)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a48eaf4e29..faa06c8177a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -561,6 +561,7 @@ if(MSVC)
string(APPEND CMAKE_CXX_FLAGS " /FS")
string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler /FS")
endif(MSVC)
+append_cxx_flag_if_supported("-Wno-free-nonheap-object" CMAKE_CXX_FLAGS)

string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")

diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 4d48c0f0f3d..bda17319ce7 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -533,6 +533,8 @@ function(torch_update_find_cuda_flags)
endif()
endfunction()

+include(CheckCXXCompilerFlag)
+
##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
# Also define HAS_UPPER_CASE_FLAG_NAME variable
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
Fix a buffer overflow in oneDNN resulting in memory corruption.
Visible in e.g. test_scaled_dot_product_fused_attention_vs_math_cpu in test_transformers.py

See https://github.com/pytorch/pytorch/issues/115253
Fixed in oneDNN 3.3 with https://github.com/oneapi-src/oneDNN/commit/6518e55b025b75ea83c677c601cfb8b1f7df0a68
Use this commit to patch the submodule.

Author: Alexander Grund (TU Dresden)

diff -ur pytorch/third_party/ideep/mkl-dnn/src/cpu/matmul/gemm_based_common.hpp pytorch-orig/third_party/ideep/mkl-dnn/src/cpu/matmul/gemm_based_common.hpp
--- pytorch/third_party/ideep/mkl-dnn/src/cpu/matmul/gemm_based_common.hpp 2023-12-08 11:02:21.817449743 +0100
+++ pytorch-orig/third_party/ideep/mkl-dnn/src/cpu/matmul/gemm_based_common.hpp 2023-12-08 11:02:17.796537323 +0100
@@ -139,9 +139,15 @@
const int nthr) {
const int num_scratchpad_blocks
= use_single_gemm_call_optimization ? 1 : nthr;
- return get_scratchpad_block_elements(
- batch, M, N, use_single_gemm_call_optimization, nthr)
+ size_t buf_sz = get_scratchpad_block_elements(batch, M, N,
+ use_single_gemm_call_optimization, nthr)
* num_scratchpad_blocks;
+
+ // Buffer needs to be large enough to accommodate one thread buffer
+ // size requirement in case only one thread is used during execution.
+ size_t buf_sz_1thr = get_scratchpad_block_elements(
+ batch, M, N, use_single_gemm_call_optimization, 1);
+ return nstl::max(buf_sz_1thr, buf_sz);
}

inline void book_acc_scratchpad(matmul_pd_t &pd, const params_t &params,
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
test_trigger_on_error in dynamo/test_exc.py fails as the output lines are ordered differently.
Sort those to make them consistent as is done in https://github.com/pytorch/pytorch/commit/dfdc0b63c9a5befd363d304eb5e08ed1e592d7ca
Then update the expected output.

Author: Alexander Grund (TU Dresden)

diff --git a/test/dynamo/test_exc.py b/test/dynamo/test_exc.py
index c8390880aa9..a69c229a8ae 100644
--- a/test/dynamo/test_exc.py
+++ b/test/dynamo/test_exc.py
@@ -211,28 +211,28 @@ ReluCompileError:""",
translation validation failed.

Model:
+ ==> L['x'].size()[0]: 4
==> L['x'].storage_offset(): 0
- ==> s0: 4
==> L['x'].stride()[0]: 1
- ==> L['x'].size()[0]: 4
+ ==> s0: 4

Assertions:
+ ==> (== 0 L['x'].storage_offset())
+ ==> (== 1 L['x'].stride()[0])
==> (== L['x'].size()[0] s0)
==> (> s0 1)
==> (Not (And (< L['x'].size()[0] 4) (>= L['x'].size()[0] 0)))
==> (True)
- ==> (== 0 L['x'].storage_offset())
- ==> (== 1 L['x'].stride()[0])
==> (True)

Target Expressions:
- ==> (>= 9223372036854775806 s0)
- ==> (== 4 L['x'].size()[0])
+ ==> (<= 2 s0)
==> (== 0 L['x'].storage_offset())
- ==> (> s0 0)
==> (== 1 L['x'].stride()[0])
- ==> (<= 2 s0)
+ ==> (== 4 L['x'].size()[0])
==> (== 4 s0)
+ ==> (> s0 0)
+ ==> (>= 9223372036854775806 s0)

Failed Source Expressions:
==> (!= 4 L['x'].size()[0])""",
diff --git a/torch/fx/experimental/validator.py b/torch/fx/experimental/validator.py
index 313e299151c..3b5e1e81aa3 100644
--- a/torch/fx/experimental/validator.py
+++ b/torch/fx/experimental/validator.py
@@ -559,10 +559,10 @@ class ValidationException(TorchDynamoException):
def joinlines(xs) -> str:
return "\n".join(f" ==> {x}" for x in xs)

- model_str = joinlines(map(symbolstr, model))
- assertions_str = joinlines(map(z3str, assertions))
- target_exprs_str = joinlines(map(z3str, target_exprs))
- failed_source_exprs_str = joinlines(map(z3str, failed_source_exprs))
+ model_str = joinlines(sorted(map(symbolstr, model)))
+ assertions_str = joinlines(sorted(map(z3str, assertions)))
+ target_exprs_str = joinlines(sorted(map(z3str, target_exprs)))
+ failed_source_exprs_str = joinlines(sorted(map(z3str, failed_source_exprs)))

super().__init__(
"translation validation failed.\n\n"
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
The VSX vector shift operators don't handle out-of-bounds values correctly.
See https://github.com/pytorch/pytorch/issues/109777
Taken from https://github.com/pytorch/pytorch/pull/109886

Author: Alexander Grund (TU Dresden)

diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
index a71f50fc7aa..84c84286740 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int16_vsx.h
@@ -334,20 +334,6 @@ class Vectorized<int16_t> {
DEFINE_MEMBER_OP(operator^, int16_t, vec_xor)
};

-template <>
-Vectorized<int16_t> inline operator<<(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
- vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
- vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1());
- return Vectorized<int16_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
-}
-
-template <>
-Vectorized<int16_t> inline operator>>(const Vectorized<int16_t>& a, const Vectorized<int16_t>& b) {
- vuint16 shift_vec0 = reinterpret_cast<vuint16>(b.vec0());
- vuint16 shift_vec1 = reinterpret_cast<vuint16>(b.vec1()) ;
- return Vectorized<int16_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
-}
-
template <>
Vectorized<int16_t> inline maximum(
const Vectorized<int16_t>& a,
@@ -362,6 +348,7 @@ Vectorized<int16_t> inline minimum(
return a.minimum(b);
}

+DEFINE_SHIFT_FUNCS(int16_t)

} // namespace
} // namespace vec
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
index 1b6a82df39b..e1e86d3b53a 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int32_vsx.h
@@ -265,20 +265,6 @@ class Vectorized<int32_t> {
DEFINE_MEMBER_OP(operator^, int32_t, vec_xor)
};

-template <>
-Vectorized<int32_t> inline operator<<(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
- vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
- vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1()) ;
- return Vectorized<int32_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
-}
-
-template <>
-Vectorized<int32_t> inline operator>>(const Vectorized<int32_t>& a, const Vectorized<int32_t>& b) {
- vuint32 shift_vec0 = reinterpret_cast<vuint32>(b.vec0());
- vuint32 shift_vec1 = reinterpret_cast<vuint32>(b.vec1()) ;
- return Vectorized<int32_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
-}
-
template <>
Vectorized<int32_t> inline maximum(
const Vectorized<int32_t>& a,
@@ -293,6 +279,8 @@ Vectorized<int32_t> inline minimum(
return a.minimum(b);
}

+DEFINE_SHIFT_FUNCS(int32_t)
+
} // namespace
} // namespace vec
} // namespace at
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
index a7a376ee16e..70613d90443 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_int64_vsx.h
@@ -217,20 +217,6 @@ class Vectorized<int64_t> {
DEFINE_MEMBER_OP(operator^, int64_t, vec_xor)
};

-template <>
-Vectorized<int64_t> inline operator<<(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
- vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
- vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1()) ;
- return Vectorized<int64_t>{vec_sl(a.vec0(), shift_vec0), vec_sl(a.vec1(), shift_vec1)};
-}
-
-template <>
-Vectorized<int64_t> inline operator>>(const Vectorized<int64_t>& a, const Vectorized<int64_t>& b) {
- vuint64 shift_vec0 = reinterpret_cast<vuint64>(b.vec0());
- vuint64 shift_vec1 = reinterpret_cast<vuint64>(b.vec1()) ;
- return Vectorized<int64_t>{vec_sr(a.vec0(), shift_vec0), vec_sr(a.vec1(), shift_vec1)};
-}
-
template <>
Vectorized<int64_t> inline maximum(
const Vectorized<int64_t>& a,
@@ -245,6 +231,8 @@ Vectorized<int64_t> inline minimum(
return a.minimum(b);
}

+DEFINE_SHIFT_FUNCS(int64_t)
+
} // namespace
} // namespace vec
} // namespace at
diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
index dab38458184..52032cdd817 100644
--- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
+++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h
@@ -2,6 +2,7 @@
#include <cstdint>
#include <c10/macros/Macros.h>
#include <ATen/cpu/vec/intrinsics.h>
+#include <ATen/cpu/vec/vec_base.h>

using vbool8 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) char;
using vbool16 = __attribute__((altivec(vector__))) __attribute__((altivec(bool__))) short;
@@ -18,6 +19,11 @@ using vuint64 = __attribute__((altivec(vector__))) unsigned long long;
using vfloat32 = __attribute__((altivec(vector__))) float;
using vfloat64 = __attribute__((altivec(vector__))) double;

+inline auto make_vuint(vint8 v){ return reinterpret_cast<vuint8>(v); }
+inline auto make_vuint(vint16 v){ return reinterpret_cast<vuint16>(v); }
+inline auto make_vuint(vint32 v){ return reinterpret_cast<vuint32>(v); }
+inline auto make_vuint(vint64 v){ return reinterpret_cast<vuint64>(v); }
+
#if !defined(vec_float)
C10_ALWAYS_INLINE vfloat32 vec_float(const vint32& vec_in) {
vfloat32 vec_out;
@@ -448,6 +454,40 @@ const vfloat64 vd_imag_half = vfloat64{0.0, 0.5};
const vfloat64 vd_sqrt2_2 = vfloat64{0.70710678118654757, 0.70710678118654757};
const vfloat64 vd_pi_2 = vfloat64{M_PI / 2.0, 0.0};

+template<typename T>
+Vectorized<T> VsxShiftRightArith(const Vectorized<T>& a, const Vectorized<T>& b) {
+ const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT - std::is_signed_v<T>);
+ const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
+ const auto shift = Vectorized<T>::blendv(b, max_shift, mask);
+ return Vectorized<T>{
+ vec_sra(a.vec0(), make_vuint(shift.vec0())),
+ vec_sra(a.vec1(), make_vuint(shift.vec1()))};
+}
+
+template<typename T>
+Vectorized<T> VsxShiftLeftArith(const Vectorized<T>& a, const Vectorized<T>& b) {
+ const Vectorized<T> max_shift(sizeof(T) * CHAR_BIT);
+ const auto mask = (b < Vectorized<T>(0)) | (b >= max_shift);
+ Vectorized<T> ret(
+ vec_sl(a.vec0(), make_vuint(b.vec0())),
+ vec_sl(a.vec1(), make_vuint(b.vec1())));
+ return Vectorized<T>::blendv(ret, Vectorized<T>(0), mask);
+}
+
+#define DEFINE_SHIFT_FUNCS(operand_type) \
+ template <> \
+ Vectorized<operand_type> C10_ALWAYS_INLINE operator>>( \
+ const Vectorized<operand_type>& a, \
+ const Vectorized<operand_type>& b) { \
+ return VsxShiftRightArith(a, b); \
+ } \
+ template <> \
+ Vectorized<operand_type> C10_ALWAYS_INLINE operator<<( \
+ const Vectorized<operand_type>& a, \
+ const Vectorized<operand_type>& b) { \
+ return VsxShiftLeftArith(a, b); \
+ } \
+
} // namespace
} // namespace vec
} // namespace at
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Increase tolerance of test_vmapvjpvjp_linalg_tensorsolve_cpu_float32 which fails with the same diff on multiple machines.

_________________________________________ TestOperatorsCPU.test_vmapvjpvjp_linalg_tensorsolve_cpu_float32 _________________________________________
AssertionError: Tensor-likes are not close!

Mismatched elements: 6 / 288 (2.1%)
Greatest absolute difference: 0.0005550384521484375 at index (0, 2, 1, 0, 2) (up to 0.0001 allowed)
Greatest relative difference: 0.0002498962276149541 at index (0, 1, 0, 0, 2) (up to 0.0001 allowed)

The failure occurred for item [2]

See https://github.com/pytorch/pytorch/issues/114868

Author: Alexander Grund (TU Dresden)

diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index 923d75d53d9..1ae32bffba6 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -825,6 +825,8 @@ class TestOperators(TestCase):
@opsToleranceOverride('TestOperators', 'test_vmapvjpvjp', (
tol1('linalg.svd',
{torch.float32: tol(atol=1e-03, rtol=5e-04)}),
+ tol1('linalg.tensorsolve',
+ {torch.float32: tol(atol=1e-03, rtol=5e-04)}),
tol1('linalg.lu_factor',
{torch.float32: tol(atol=2e-03, rtol=2e-02)}),
tol1('svd',
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
The test causes an overflow before it gets to the point it wants to test.
Simply remove that part.
See https://github.com/pytorch/pytorch/pull/114940

Author: Alexander Grund (TU Dresden)

diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index 5d344953526..f0bf8535214 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -864,15 +864,6 @@ class TestSparseCompressed(TestCase):
torch.tensor([0], dtype=torch.int64),
torch.tensor([1]), (1, cols))

- nnz = 2 ** 31
- with self.assertRaisesRegex(RuntimeError, '32-bit integer overflow in nnz'):
- torch.sparse_csr_tensor(torch.tensor([0, nnz // 2, nnz], dtype=torch.int32),
- torch.arange(nnz // 2, dtype=torch.int32).repeat(2),
- torch.ones(nnz, dtype=torch.int8), (2, nnz // 2))
- torch.sparse_csr_tensor(torch.tensor([0, nnz // 2, nnz], dtype=torch.int64),
- torch.arange(nnz // 2, dtype=torch.int64).repeat(2),
- torch.ones(nnz, dtype=torch.int8), (2, nnz // 2))
-
@skipMeta
@onlyCPU
@all_sparse_compressed_layouts()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
This downloads a Perl file from a GitHub repo, which may fail in:

File "test/test_cuda.py", line 4632, in test_memory_snapshot
torch.cuda.memory._save_segment_usage(f.name)
File "/torch/cuda/memory.py", line 610, in _save_segment_usage
f.write(_segments(snapshot))
File "/torch/cuda/_memory_viz.py", line 60, in segments
return format_flamegraph(f.getvalue())
File "/torch/cuda/_memory_viz.py", line 21, in format_flamegraph
urllib.request.urlretrieve(

Author: Alexander Grund (TU Dresden)

diff --git a/test/test_cuda.py b/test/test_cuda.py
index e81c9365139..b95206bcff7 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -3372,7 +3372,7 @@ class TestCudaMallocAsync(TestCase):
self.assertEqual(x.untyped_storage().data_ptr(), b['address'])
self.assertTrue(found_it)

- if not IS_WINDOWS:
+ if False:
with tempfile.NamedTemporaryFile() as f:
torch.cuda.memory._save_segment_usage(f.name)
with open(f.name) as f2: