From c67efd8316b1cd44a4234090bbe898fde8409163 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aleksey=20Kliger=20=28=CE=BBgeek=29?=
Date: Mon, 23 Mar 2020 23:13:58 -0400
Subject: [PATCH] [mono] retry mono_threads_pthread_kill if result == EAGAIN on Linux (#33966)

* [mono] retry mono_threads_pthread_kill if result == EAGAIN on Linux

Try to address https://github.com/dotnet/runtime/issues/32377 (signal queue overflow) by sleeping and retrying a few times.
---
 src/mono/mono/utils/mono-threads-posix.c | 34 ++++++++++++++++++++----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/src/mono/mono/utils/mono-threads-posix.c b/src/mono/mono/utils/mono-threads-posix.c
index cd32e6b042d25..33afc458a7928 100644
--- a/src/mono/mono/utils/mono-threads-posix.c
+++ b/src/mono/mono/utils/mono-threads-posix.c
@@ -170,16 +170,25 @@ mono_threads_pthread_kill (MonoThreadInfo *info, int signum)
 {
 	THREADS_SUSPEND_DEBUG ("sending signal %d to %p[%p]\n", signum, info, mono_thread_info_get_tid (info));
 
+	const int signal_queue_ovf_retry_count G_GNUC_UNUSED = 5;
+	const gulong signal_queue_ovf_sleep_us G_GNUC_UNUSED = 10 * 1000; /* 10 milliseconds */
+	int retry_count G_GNUC_UNUSED = 0;
 	int result;
 
+#if defined (__linux__)
+redo:
+#endif
+
 #ifdef USE_TKILL_ON_ANDROID
-	int old_errno = errno;
+	{
+		int old_errno = errno;
 
-	result = tkill (info->native_handle, signum);
+		result = tkill (info->native_handle, signum);
 
-	if (result < 0) {
-		result = errno;
-		mono_set_errno (old_errno);
+		if (result < 0) {
+			result = errno;
+			mono_set_errno (old_errno);
+		}
 	}
 #elif defined (HAVE_PTHREAD_KILL)
 	result = pthread_kill (mono_thread_info_get_tid (info), signum);
@@ -204,10 +213,25 @@ mono_threads_pthread_kill (MonoThreadInfo *info, int signum)
 	    result != ESRCH
 #if defined (__MACH__) && defined (ENOTSUP)
 	    && result != ENOTSUP
+#endif
+#if defined (__linux__)
+	    && !(result == EAGAIN && retry_count < signal_queue_ovf_retry_count)
 #endif
 	    )
 		g_error ("%s: pthread_kill failed with error %d - potential kernel OOM or signal queue overflow", __func__, result);
 
+#if defined (__linux__)
+	if (result == EAGAIN && retry_count < signal_queue_ovf_retry_count) {
+		/* HACK: if the signal queue overflows on linux, try again a couple of times.
+		 * Tries to address https://github.com/dotnet/runtime/issues/32377
+		 */
+		g_warning ("%s: pthread_kill failed with error %d - potential kernel OOM or signal queue overflow, sleeping for %ld microseconds", __func__, result, signal_queue_ovf_sleep_us);
+		g_usleep (signal_queue_ovf_sleep_us);
+		++retry_count;
+		goto redo;
+	}
+#endif
+
 	return result;
 }
 