Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions pkg/tui/tui.go
Original file line number Diff line number Diff line change
Expand Up @@ -2376,15 +2376,36 @@ func (m *appModel) cleanupAll() {
ed.Cleanup()
}

// Safety net: force-exit if bubbletea's shutdown gets stuck.
// This can happen when the renderer's flush goroutine blocks on a
// stdout write (terminal buffer full) while holding the renderer
// mutex, preventing the event loop from completing the render call
// that follows tea.Quit.
// Safety net: bubbletea's renderer can deadlock on shutdown if stdout
// is wedged — the final flush re-acquires the mutex that the
// still-blocked previous flush is holding. Race Wait() against a deadline
// and force-exit if shutdown stalls. Snapshot the package globals so
// they can't race with t.Cleanup. Clear m.program so subsequent calls
// to cleanupAll (e.g. ExitSessionMsg followed by ExitConfirmedMsg) are
// no-ops and don't spawn parallel safety nets that would each call exit.
program := m.program
if program == nil {
return
}
m.program = nil
timeout := shutdownTimeout
exit := exitFunc
go func() {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] No guard prevents multiple concurrent safety-net goroutines from each calling exit(0)

cleanupAll() is called from at least three message-handler sites (lines ~697, ~768, ~772). Each call creates a fresh safety-net goroutine with its own done channel and snapshotted exit. If cleanupAll() is called N times before the process exits (e.g. ExitSessionMsg followed by ExitConfirmedMsg on the same model), N independent goroutines run concurrently; each can independently reach the time.After(timeout) branch and call exit(0).

In production exit is os.Exit so the second call never happens (process is gone). In tests, though, exit is replaced with a function like func(int) { close(exitDone) } — a double call panics with "close of closed channel" and crashes the entire test binary.

Consider adding a sync.Once or atomically nil-ing m.program inside cleanupAll (before launching the goroutine) so only the first caller races Wait against the deadline:

program := m.program
m.program = nil   // or use sync.Once at the appModel level
if program == nil {
    return
}

time.Sleep(shutdownTimeout)
slog.Warn("Graceful shutdown timed out, forcing exit")
exitFunc(0)
done := make(chan struct{})
go func() {
program.Wait()
close(done)
}()

select {
case <-done:
case <-time.After(timeout):
slog.Warn("Graceful shutdown timed out, forcing exit")
// ReleaseTerminal grabs the same mutex that's stuck, so
// fire-and-forget; exit either way.
go func() { _ = program.ReleaseTerminal() }()
exit(0)
}
}()
}

Expand Down
215 changes: 165 additions & 50 deletions pkg/tui/tui_exit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,16 +147,10 @@ func newTestModel() (*appModel, *mockEditor) {
return m, ed
}

// neutralizeExitFunc replaces the package-level exitFunc with a no-op for
// the duration of the test so that the safety-net goroutine spawned by
// cleanupAll doesn't call os.Exit. It also shrinks shutdownTimeout so the
// safety-net goroutine fires quickly, and waits for it to fire (or for a
// short timeout) before restoring the originals so that subsequent tests
// don't race against a pending background goroutine.
//
// Tests that call this helper must NOT use t.Parallel(): exitFunc and
// shutdownTimeout are package-level variables, and concurrent mutation
// from sibling tests would race with the cleanup that restores them.
// neutralizeExitFunc replaces exitFunc with a no-op for the duration of the
// test and waits for the safety-net goroutine to fire (or time out) before
// restoring the originals. Tests using this helper must NOT use t.Parallel()
// because exitFunc and shutdownTimeout are package globals.
func neutralizeExitFunc(t *testing.T) {
t.Helper()

Expand All @@ -171,9 +165,6 @@ func neutralizeExitFunc(t *testing.T) {
shutdownTimeout = 10 * time.Millisecond

t.Cleanup(func() {
// Wait for any pending safety-net goroutine to observe our no-op
// exitFunc, but with a deadline so tests that never trigger
// cleanupAll don't block.
select {
case <-fired:
case <-time.After(200 * time.Millisecond):
Expand Down Expand Up @@ -255,9 +246,8 @@ func (w *blockingWriter) unblock() {
w.mu.Unlock()
}

// quitModel is a minimal bubbletea model that requests alt-screen output
// and quits in response to a trigger message. An optional onQuit callback
// runs inside Update before tea.Quit is returned.
// quitModel is a minimal bubbletea model that requests alt-screen and quits
// on triggerQuitMsg. onQuit, if set, runs before tea.Quit.
type quitModel struct {
// onQuit is an optional hook; leave it nil when no pre-quit action is
// needed. When set, it is invoked before the model returns tea.Quit.
onQuit func()
}
Expand All @@ -282,9 +272,9 @@ func (m *quitModel) View() tea.View {
return v
}

// initBlockingBubbletea creates a bubbletea program whose output writer
// blocks. It lets the initial render complete (so the event loop is ready)
// then re-blocks the writer. Returns the program and the writer.
// initBlockingBubbletea starts a bubbletea program whose stdout will block
// the renderer on its next flush. Used to reproduce the wedged-renderer
// shutdown deadlock.
func initBlockingBubbletea(t *testing.T, model tea.Model) (*tea.Program, *blockingWriter, <-chan struct{}) {
t.Helper()

Expand All @@ -303,26 +293,24 @@ func initBlockingBubbletea(t *testing.T, model tea.Model) (*tea.Program, *blocki
_, _ = p.Run()
}()

// Wait for the initial render to hit the blocking writer.
select {
case <-w.blocked:
case <-time.After(5 * time.Second):
t.Fatal("timed out waiting for initial write to block")
}

// Let the initial writes through so the event loop starts.
// Let the initial writes through so the event loop starts, then re-block
// so the next flush stalls.
w.unblock()
time.Sleep(200 * time.Millisecond)

// Re-block so the next renderer flush will stall.
w.reblock()

return p, w, runDone
}

// TestCleanupAll_SpawnsSafetyNet verifies that cleanupAll spawns a goroutine
// that calls exitFunc after shutdownTimeout. Without the safety net, the
// process would hang when bubbletea's renderer deadlocks on exit.
// TestCleanupAll_SpawnsSafetyNet: an unstarted Program has a nil finished
// channel, so Wait() blocks forever — same shape as a real renderer
// deadlock. exitFunc must fire after shutdownTimeout.
func TestCleanupAll_SpawnsSafetyNet(t *testing.T) {
origTimeout := shutdownTimeout
origExitFunc := exitFunc
Expand All @@ -338,6 +326,7 @@ func TestCleanupAll_SpawnsSafetyNet(t *testing.T) {
}

m, _ := newTestModel()
m.program = tea.NewProgram(&quitModel{})
m.cleanupAll()

select {
Expand All @@ -348,38 +337,166 @@ func TestCleanupAll_SpawnsSafetyNet(t *testing.T) {
}
}

// TestExitDeadlock_BlockedStdout proves that bubbletea's p.Run() hangs when
// stdout blocks during the final render after tea.Quit. This is the underlying
// bug that the safety net in cleanupAll works around.
// TestCleanupAll_GracefulShutdownSkipsExit: when Wait() returns promptly,
// the safety net must not call exitFunc.
//
// The assertion is only made after the safety-net deadline has elapsed.
// Checking earlier would pass even for an implementation that ignores Wait()
// and unconditionally calls exitFunc once shutdownTimeout expires.
//
// Must NOT use t.Parallel(): it mutates the package globals shutdownTimeout
// and exitFunc.
func TestCleanupAll_GracefulShutdownSkipsExit(t *testing.T) {
	origTimeout := shutdownTimeout
	origExitFunc := exitFunc
	t.Cleanup(func() {
		shutdownTimeout = origTimeout
		exitFunc = origExitFunc
	})
	// Long enough for an in-memory program to quit cleanly, short enough
	// that the test can afford to wait past it before asserting.
	shutdownTimeout = 500 * time.Millisecond

	var exitCalled atomic.Bool
	exitFunc = func(int) { exitCalled.Store(true) }

	var in, out bytes.Buffer
	p := tea.NewProgram(&quitModel{},
		tea.WithContext(t.Context()),
		tea.WithInput(&in),
		tea.WithOutput(&out),
	)

	runDone := make(chan struct{})
	go func() {
		defer close(runDone)
		_, _ = p.Run()
	}()

	// Send blocks until the program is running, which guarantees Run() has
	// initialized p.finished — otherwise Wait() races the assignment.
	p.Send(syncMsg{})

	m, _ := newTestModel()
	m.program = p
	// Timestamp taken just before the safety net is armed, so the wait below
	// can be anchored to (approximately) the moment its timer started.
	armedAt := time.Now()
	m.cleanupAll()

	p.Send(triggerQuitMsg{})

	select {
	case <-runDone:
	case <-time.After(3 * time.Second):
		t.Fatal("p.Run() did not return within deadline")
	}

	// Wait until the safety-net deadline (plus a scheduling margin) has
	// passed. Only then does "exitFunc was not called" prove that the
	// safety net stood down because Wait() returned. time.Sleep returns
	// immediately if the deadline is already behind us.
	time.Sleep(time.Until(armedAt.Add(shutdownTimeout + 100*time.Millisecond)))
	assert.False(t, exitCalled.Load(),
		"exitFunc must not fire on prompt shutdown")
}

// syncMsg pings the program's event loop to confirm Run() has started.
// It is an empty marker type that carries no data; tests deliver it with
// p.Send before handing the program to cleanupAll.
type syncMsg struct{}

// TestCleanupAll_NilProgramIsSafe covers the model that never had a program
// attached: cleanupAll must not panic, and no safety net may call exitFunc
// even after shutdownTimeout has fully elapsed.
//
// Must NOT use t.Parallel(): it mutates the package globals shutdownTimeout
// and exitFunc.
func TestCleanupAll_NilProgramIsSafe(t *testing.T) {
	// Swap in test values for the globals and put the originals back on exit.
	savedTimeout, savedExit := shutdownTimeout, exitFunc
	t.Cleanup(func() {
		shutdownTimeout, exitFunc = savedTimeout, savedExit
	})

	var fired atomic.Bool
	shutdownTimeout = 20 * time.Millisecond
	exitFunc = func(int) { fired.Store(true) }

	model, _ := newTestModel()
	model.program = nil
	assert.NotPanics(t, model.cleanupAll)

	// Give a (wrongly) armed safety net the full timeout plus slack to fire.
	grace := shutdownTimeout + 50*time.Millisecond
	time.Sleep(grace)
	assert.False(t, fired.Load(), "exitFunc must not fire without a program")
}

// TestCleanupAll_WedgedStdoutFiresExit: the realistic case. The renderer is
// stuck on a wedged stdout write, and once tea.Quit fires the final flush
// would itself re-acquire the same mutex — a hard deadlock. Wait() never
// returns and ReleaseTerminal would block too; exitFunc must still fire.
func TestCleanupAll_WedgedStdoutFiresExit(t *testing.T) {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Test doesn't exercise the wedged-renderer scenario it claims to test

TestCleanupAll_WedgedStdoutFiresExit never sends a triggerQuitMsg to the running program. Without a quit signal the program keeps running indefinitely, so program.Wait() blocks simply because the program is still alive — not because the renderer is stuck trying to acquire a mutex held by a blocked stdout flush.

The wedged-renderer deadlock path requires:

  1. The program receives a quit signal.
  2. bubbletea initiates its final render flush.
  3. The renderer tries to re-acquire the mutex already held by the still-blocked write goroutine → deadlock.

None of these steps occur here, so the test is actually exercising: "Wait() blocks when the program has no reason to quit → timeout fires → exitFunc is called" — a trivially weaker property.

The companion test TestExitDeadlock_BlockedStdout does send triggerQuitMsg and correctly demonstrates the real deadlock. TestCleanupAll_WedgedStdoutFiresExit should do the same (send triggerQuitMsg after calling m.cleanupAll(), before waiting for exitDone) to actually validate the safety net against the renderer-mutex deadlock.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[MEDIUM] Test does not exercise the claimed quit-induced renderer deadlock

TestCleanupAll_WedgedStdoutFiresExit never sends triggerQuitMsg{} to the program, so the bubbletea event loop never issues a tea.Quit and the renderer never attempts the final flush that triggers the real deadlock.

What the test actually exercises: program.Wait() blocks because the renderer's periodic write is stalled on the blocked writer — unrelated to the quit path. The program only exits when the test-supplied t.Context() is cancelled (at test cleanup), which may race with the 300 ms shutdownTimeout window.

The comment says "the renderer is stuck on a wedged stdout write, so Wait() never returns AND ReleaseTerminal would itself deadlock on the same mutex" — but without tea.Quit being sent, the renderer is not in the quit-flush code path that actually holds the mutex. The scenario described in TestExitDeadlock_BlockedStdout (which does send triggerQuitMsg{}) is the actual regression; this test does not reproduce it.

Suggested fix: send triggerQuitMsg{} before calling m.cleanupAll(), so the program attempts the quit render against the blocked writer and truly reproduces the wedged-renderer deadlock.

origTimeout := shutdownTimeout
origExitFunc := exitFunc
t.Cleanup(func() {
shutdownTimeout = origTimeout
exitFunc = origExitFunc
})
shutdownTimeout = 300 * time.Millisecond

exitDone := make(chan struct{})
exitFunc = func(int) { close(exitDone) }

p, w, _ := initBlockingBubbletea(t, &quitModel{})
defer w.unblock()

m, _ := newTestModel()
m.program = p
m.cleanupAll()

// Drive the program into the deadlock path: tea.Quit triggers the final
// render flush against the wedged writer, which is the actual upstream
// bug the safety net guards against.
p.Send(triggerQuitMsg{})

select {
case <-exitDone:
case <-time.After(shutdownTimeout + 2*time.Second):
t.Fatal("exitFunc was not called — safety net is blocked by ReleaseTerminal")
}
}

// TestCleanupAll_MultipleCallsFireExitOnce checks that repeated cleanupAll
// calls on one model arm a single safety net. cleanupAll is reachable from
// several message handlers (ExitSessionMsg, ExitConfirmedMsg, …), and every
// safety-net goroutine holds its own snapshot of exitFunc; unguarded, each
// would call exit(0) at the deadline. That is harmless when exit is os.Exit
// but fatal in tests whose exit stub closes a channel.
//
// Must NOT use t.Parallel(): it mutates the package globals shutdownTimeout
// and exitFunc.
func TestCleanupAll_MultipleCallsFireExitOnce(t *testing.T) {
	// Swap in test values for the globals and put the originals back on exit.
	savedTimeout, savedExit := shutdownTimeout, exitFunc
	t.Cleanup(func() {
		shutdownTimeout, exitFunc = savedTimeout, savedExit
	})

	var fires atomic.Int32
	shutdownTimeout = 100 * time.Millisecond
	exitFunc = func(int) { fires.Add(1) }

	// The program is created but never run, so the safety net is expected
	// to hit its timeout branch rather than see a clean shutdown.
	model, _ := newTestModel()
	model.program = tea.NewProgram(&quitModel{})

	for call := 0; call < 3; call++ {
		model.cleanupAll()
	}

	// Let every goroutine that might have been armed reach its deadline.
	settle := shutdownTimeout + 200*time.Millisecond
	time.Sleep(settle)
	assert.Equal(t, int32(1), fires.Load(),
		"only the first cleanupAll should arm a safety net")
}

// TestExitDeadlock_BlockedStdout proves the underlying bubbletea bug: Run()
// hangs when stdout blocks during the final render after tea.Quit.
func TestExitDeadlock_BlockedStdout(t *testing.T) {
t.Parallel()

model := &quitModel{}
p, w, runDone := initBlockingBubbletea(t, model)

// Trigger quit — the event loop will deadlock trying to render.
p.Send(triggerQuitMsg{})

// Verify that p.Run() does NOT return within a reasonable window.
select {
case <-runDone:
t.Skip("bubbletea returned without deadlocking; upstream fix may have landed")
case <-time.After(2 * time.Second):
// Expected: p.Run() is stuck.
}

// Unblock everything to let goroutines drain.
w.unblock()
}

// TestExitSafetyNet_BlockedStdout verifies that when bubbletea's renderer
// is stuck writing to stdout (terminal buffer full), the shutdown safety net
// forces the process to exit.
//
// Background: bubbletea's cursed renderer holds a mutex during io.Copy to
// stdout. If stdout blocks (e.g. full PTY buffer), the event loop's final
// render call after tea.Quit deadlocks on the same mutex. Without the safety
// net the process hangs forever.
// TestExitSafetyNet_BlockedStdout: with a wedged renderer, an external
// safety net (simulated here in onQuit) must force the process to exit.
func TestExitSafetyNet_BlockedStdout(t *testing.T) {
t.Parallel()

Expand All @@ -402,22 +519,21 @@ func TestExitSafetyNet_BlockedStdout(t *testing.T) {
p, w, runDone := initBlockingBubbletea(t, model)
defer w.unblock()

// Trigger quit — the model's onQuit starts the safety net.
p.Send(triggerQuitMsg{})

select {
case code := <-exitDone:
assert.True(t, exitCalled.Load())
assert.Equal(t, 0, code)
case <-runDone:
// p.Run() returned on its own — also acceptable.
// Run() returned on its own — also acceptable.
case <-time.After(safetyNetTimeout + 2*time.Second):
t.Fatal("neither p.Run() returned nor safety-net exitFunc fired within the deadline")
t.Fatal("neither Run() returned nor safety-net exitFunc fired")
}
}

// TestExitSafetyNet_GracefulShutdown verifies that when bubbletea shuts down
// normally (no blocked stdout), p.Run() returns before the safety net fires.
// TestExitSafetyNet_GracefulShutdown: on a clean shutdown, Run() must return
// before the safety net fires.
func TestExitSafetyNet_GracefulShutdown(t *testing.T) {
t.Parallel()

Expand Down Expand Up @@ -456,7 +572,6 @@ func TestExitSafetyNet_GracefulShutdown(t *testing.T) {
runDone <- err
}()

// Give bubbletea time to initialise.
time.Sleep(200 * time.Millisecond)

p.Send(triggerQuitMsg{})
Expand All @@ -465,11 +580,11 @@ func TestExitSafetyNet_GracefulShutdown(t *testing.T) {
case err := <-runDone:
require.NoError(t, err)
case <-time.After(3 * time.Second):
t.Fatal("p.Run() did not return within deadline for graceful shutdown")
t.Fatal("p.Run() did not return")
}

mu.Lock()
assert.True(t, cleanupCalled, "cleanup should have been called")
assert.True(t, cleanupCalled)
mu.Unlock()
assert.False(t, exitCalled.Load(), "exitFunc should NOT fire during graceful shutdown")
assert.False(t, exitCalled.Load(), "exitFunc must not fire on graceful shutdown")
}
Loading