From bddd9bb12d8d1d6cffcdb69d19553cfd395d74bc Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Mon, 6 Dec 2021 21:37:42 +0000 Subject: [PATCH] Disabling MKLDNN based Eigen contraction kernels for ROCm build On some CI nodes (typically those with higher CPU core counts 128/256), the `//tensorflow/c/eager:c_api_distributed_test_gpu` test fails on an intermitent basis. When it does fail, the failures manifests as segfault at the end of the test, with the stack dump shown at the end of this commit message. The stack dump points the finger to a routine within the MKLDNN implementation. This is further confirmed by the observation that disabling the MKLDNN based Eigen contraction kernels (for ROCm) seems to make the crash go away. related JIRA ticket - https://ontrack-internal.amd.com/browse/SWDEV-313684 A previous commit disabled the `//tensorflow/c/eager:c_api_distributed_test` unit-test only in the CPU unit-tests CI job (for the same reason). That comit cannot be reverted, because this commit disables MKLDNN based Eigen contraction kernels *only* for the ROCm build. ``` Thread 191 "c_api_distribut" received signal SIGSEGV, Segmentation fault. [Switching to thread 191 (Thread 0x7ffc777fe700 (LWP 159004))] 0x00007fff54530000 in ?? () (gdb) where #0 0x00007fff54530000 in ?? () #1 0x00007fffd5d15ae4 in dnnl::impl::cpu::x64::avx_gemm_f32::sgemm_nocopy_driver(char const*, char const*, long, long, long, float const*, float const*, long, float const*, long, float const*, float*, long, float const*, float*) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/libexternal_Smkl_Udnn_Uv1_Slibmkl_Udnn.so #2 0x00007fffd5d166e1 in dnnl::impl::cpu::x64::jit_avx_gemm_f32(int, char const*, char const*, long const*, long const*, long const*, float const*, float const*, long const*, float const*, long const*, float const*, float*, long const*, float const*) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/libexternal_Smkl_Udnn_Uv1_Slibmkl_Udnn.so #3 0x00007fffd5e277ed in dnnl_status_t dnnl::impl::cpu::x64::gemm_driver(char const*, char const*, char const*, long const*, long const*, long const*, float const*, float const*, long const*, float const*, float const*, long const*, float const*, float const*, float*, long const*, float const*, bool, dnnl::impl::cpu::x64::pack_type, dnnl::impl::cpu::x64::gemm_pack_storage_t*, bool) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/libexternal_Smkl_Udnn_Uv1_Slibmkl_Udnn.so #4 0x00007fffd5665056 in dnnl::impl::cpu::extended_sgemm(char const*, char const*, long const*, long const*, long const*, float const*, float const*, long const*, float const*, long const*, float const*, float*, long const*, float const*, bool) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/libexternal_Smkl_Udnn_Uv1_Slibmkl_Udnn.so #5 0x00007fffd52fe983 in dnnl_sgemm () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/libexternal_Smkl_Udnn_Uv1_Slibmkl_Udnn.so #6 0x0000555557187b0b in Eigen::internal::TensorContractionKernel, Eigen::internal::TensorContractionInputMapper, 16, Eigen::MakePointer> const, Eigen::ThreadPoolDevice>, Eigen::array, Eigen::array, 4, true, false, 0, Eigen::MakePointer>, Eigen::internal::TensorContractionInputMapper, 16, Eigen::MakePointer> const, Eigen::ThreadPoolDevice>, Eigen::array, Eigen::array, 4, true, false, 0, Eigen::MakePointer> >::invoke(Eigen::internal::blas_data_mapper const&, Eigen::internal::ColMajorBlock const&, Eigen::internal::ColMajorBlock const&, long, long, long, float, float) () #7 0x000055555718dc76 in Eigen::TensorEvaluator, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::EvalParallelContext, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::NoCallback, true, true, false, 0>::kernel(long, long, long, bool) () #8 0x000055555718f327 in Eigen::TensorEvaluator, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::EvalParallelContext, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::NoCallback, true, true, false, 0>::signal_kernel(long, long, long, bool, bool) () #9 0x00005555571904cb in Eigen::TensorEvaluator, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::EvalParallelContext, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::NoCallback, true, true, false, 0>::pack_rhs(long, long) () #10 0x000055555718fd69 in Eigen::TensorEvaluator, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::EvalParallelContext, 1ul> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::TensorMap, 16, Eigen::MakePointer> const, Eigen::NoOpOutputKernel const> const, Eigen::ThreadPoolDevice>::NoCallback, true, true, false, 0>::enqueue_packing_helper(long, long, long, bool) () #11 0x00007ffff6b607a1 in Eigen::ThreadPoolTempl::WorkerLoop(int) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/_U_S_Stensorflow_Sc_Seager_Cc_Uapi_Udistributed_Utest_Ugpu___Utensorflow/libtensorflow_framework.so.2 #12 0x00007ffff6b5de93 in std::_Function_handler)::{lambda()#1}>::_M_invoke(std::_Any_data const&) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/_U_S_Stensorflow_Sc_Seager_Cc_Uapi_Udistributed_Utest_Ugpu___Utensorflow/libtensorflow_framework.so.2 #13 0x00007ffff6b40107 in tensorflow::(anonymous namespace)::PThread::ThreadFn(void*) () from /root/.cache/bazel/_bazel_root/efb88f6336d9c4a18216fb94287b8d97/execroot/org_tensorflow/bazel-out/k8-opt/bin/tensorflow/c/eager/../../../_solib_local/_U_S_Stensorflow_Sc_Seager_Cc_Uapi_Udistributed_Utest_Ugpu___Utensorflow/libtensorflow_framework.so.2 #14 0x00007fffd1ca86db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0 #15 0x00007fffd00b471f in clone () from /lib/x86_64-linux-gnu/libc.so.6 ``` --- .bazelrc | 1 + 1 file changed, 1 insertion(+) diff --git a/.bazelrc b/.bazelrc index 8c645260972dca..229647a60e0fe0 100644 --- a/.bazelrc +++ b/.bazelrc @@ -265,6 +265,7 @@ build:tensorrt --repo_env TF_NEED_TENSORRT=1 build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain build:rocm --define=using_rocm_hipcc=true +build:rocm --define=tensorflow_mkldnn_contraction_kernel=0 build:rocm --repo_env TF_NEED_ROCM=1 # Options extracted from configure script