From 777e342f37fe07ff8063055bc4c58d8ba89599fc Mon Sep 17 00:00:00 2001 From: Sam Gross Date: Fri, 26 Apr 2024 19:53:04 +0000 Subject: [PATCH] gh-118332: Fix deadlock involving stop the world Avoid detaching thread state when stopping the world. When re-attaching the thread state, the thread would attempt to resume the top-most critical section, which might now be held by a thread paused for our stop-the-world request. --- Include/internal/pycore_lock.h | 6 +- .../test_critical_sections.c | 85 +++++++++++++++++++ Modules/_threadmodule.c | 3 +- Python/lock.c | 6 +- Python/pystate.c | 3 +- 5 files changed, 96 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_lock.h b/Include/internal/pycore_lock.h index f993c95ecbf75a..a5b28e4bd4744e 100644 --- a/Include/internal/pycore_lock.h +++ b/Include/internal/pycore_lock.h @@ -150,8 +150,10 @@ PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt); // Wait for the event to be set, or until the timeout expires. If the event is // already set, then this returns immediately. Returns 1 if the event was set, -// and 0 if the timeout expired or thread was interrupted. -PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns); +// and 0 if the timeout expired or thread was interrupted. If `detach` is +// true, then the thread will detach/release the GIL while waiting. +PyAPI_FUNC(int) +PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach); // _PyRawMutex implements a word-sized mutex that that does not depend on the // parking lot API, and therefore can be used in the parking lot diff --git a/Modules/_testinternalcapi/test_critical_sections.c b/Modules/_testinternalcapi/test_critical_sections.c index 94da0468fcf149..cdf8a70fc79ff3 100644 --- a/Modules/_testinternalcapi/test_critical_sections.c +++ b/Modules/_testinternalcapi/test_critical_sections.c @@ -204,6 +204,90 @@ test_critical_sections_threads(PyObject *self, PyObject *Py_UNUSED(args)) Py_DECREF(test_data.obj1); Py_RETURN_NONE; } + +static void +pysleep(int ms) +{ +#ifdef MS_WINDOWS + Sleep(ms); +#else + usleep(ms * 1000); +#endif +} + +struct test_data_gc { + PyObject *obj; + Py_ssize_t num_threads; + Py_ssize_t id; + Py_ssize_t countdown; + PyEvent done_event; + PyEvent ready; +}; + +static void +thread_gc(void *arg) +{ + struct test_data_gc *test_data = arg; + PyGILState_STATE gil = PyGILState_Ensure(); + + Py_ssize_t id = _Py_atomic_add_ssize(&test_data->id, 1); + if (id == test_data->num_threads - 1) { + _PyEvent_Notify(&test_data->ready); + } + else { + // wait for all test threads to more reliably reproduce the issue. + PyEvent_Wait(&test_data->ready); + } + + if (id == 0) { + Py_BEGIN_CRITICAL_SECTION(test_data->obj); + // pause long enough that the lock would be handed off directly to + // a waiting thread. + pysleep(5); + PyGC_Collect(); + Py_END_CRITICAL_SECTION(); + } + else if (id == 1) { + pysleep(1); + Py_BEGIN_CRITICAL_SECTION(test_data->obj); + pysleep(1); + Py_END_CRITICAL_SECTION(); + } + else if (id == 2) { + // sleep long enough so that thread 0 is waiting to stop the world + pysleep(6); + Py_BEGIN_CRITICAL_SECTION(test_data->obj); + pysleep(1); + Py_END_CRITICAL_SECTION(); + } + + PyGILState_Release(gil); + if (_Py_atomic_add_ssize(&test_data->countdown, -1) == 1) { + // last thread to finish sets done_event + _PyEvent_Notify(&test_data->done_event); + } +} + +static PyObject * +test_critical_sections_gc(PyObject *self, PyObject *Py_UNUSED(args)) +{ + // gh-118332: Contended critical sections should not deadlock with GC + const Py_ssize_t NUM_THREADS = 3; + struct test_data_gc test_data = { + .obj = PyDict_New(), + .countdown = NUM_THREADS, + .num_threads = NUM_THREADS, + }; + assert(test_data.obj != NULL); + + for (int i = 0; i < NUM_THREADS; i++) { + PyThread_start_new_thread(&thread_gc, &test_data); + } + PyEvent_Wait(&test_data.done_event); + Py_DECREF(test_data.obj); + Py_RETURN_NONE; +} + #endif static PyMethodDef test_methods[] = { @@ -212,6 +296,7 @@ static PyMethodDef test_methods[] = { {"test_critical_sections_suspend", test_critical_sections_suspend, METH_NOARGS}, #ifdef Py_CAN_START_THREADS {"test_critical_sections_threads", test_critical_sections_threads, METH_NOARGS}, + {"test_critical_sections_gc", test_critical_sections_gc, METH_NOARGS}, #endif {NULL, NULL} /* sentinel */ }; diff --git a/Modules/_threadmodule.c b/Modules/_threadmodule.c index 5aa719c3834e61..f5e3b42600675e 100644 --- a/Modules/_threadmodule.c +++ b/Modules/_threadmodule.c @@ -501,7 +501,8 @@ ThreadHandle_join(ThreadHandle *self, PyTime_t timeout_ns) // Wait until the deadline for the thread to exit. PyTime_t deadline = timeout_ns != -1 ? _PyDeadline_Init(timeout_ns) : 0; - while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns)) { + int detach = 1; + while (!PyEvent_WaitTimed(&self->thread_is_exiting, timeout_ns, detach)) { if (deadline) { // _PyDeadline_Get will return a negative value if the deadline has // been exceeded. diff --git a/Python/lock.c b/Python/lock.c index 91c66df8fd9093..5ed95fcaf4188c 100644 --- a/Python/lock.c +++ b/Python/lock.c @@ -277,12 +277,12 @@ _PyEvent_Notify(PyEvent *evt) void PyEvent_Wait(PyEvent *evt) { - while (!PyEvent_WaitTimed(evt, -1)) + while (!PyEvent_WaitTimed(evt, -1, /*detach=*/1)) ; } int -PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns) +PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns, int detach) { for (;;) { uint8_t v = _Py_atomic_load_uint8(&evt->v); @@ -298,7 +298,7 @@ PyEvent_WaitTimed(PyEvent *evt, PyTime_t timeout_ns) uint8_t expected = _Py_HAS_PARKED; (void) _PyParkingLot_Park(&evt->v, &expected, sizeof(evt->v), - timeout_ns, NULL, 1); + timeout_ns, NULL, detach); return _Py_atomic_load_uint8(&evt->v) == _Py_LOCKED; } diff --git a/Python/pystate.c b/Python/pystate.c index 78b39c9a577404..9d7b73b3a071e1 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -2238,7 +2238,8 @@ stop_the_world(struct _stoptheworld_state *stw) } PyTime_t wait_ns = 1000*1000; // 1ms (arbitrary, may need tuning) - if (PyEvent_WaitTimed(&stw->stop_event, wait_ns)) { + int detach = 0; + if (PyEvent_WaitTimed(&stw->stop_event, wait_ns, detach)) { assert(stw->thread_countdown == 0); break; }