From 793d0c0aee631efa7b6b333c07132060a3bb0f0f Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 16:26:52 +0200 Subject: [PATCH 1/6] fix(server): close connection on malformed WebSocket frames instead of stalling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `frame.parse_frame` previously returned an identical `(nil, 0)` for both "incomplete, need more bytes" and "fatal protocol violation". The receive loop in `client.process_data` just `break`ed on a nil frame, so a malformed frame was never closed with a Close frame and its bytes stayed in `client.buffer`, getting re-parsed on every read forever — a wedged connection whose buffer never drained. - `parse_frame` now disambiguates the two failure modes via a backward- compatible optional third return value: incomplete frames still return `nil, 0` (no third value), while fatal protocol violations return `nil, 0, ` with the RFC 6455 status (1002 protocol error, 1007 invalid payload data, 1009 message too big). - The receive loop captures the close code, drops the offending bytes, reports the error, and tears the connection down via the existing `close_client` (which sends a Close frame), mirroring the existing CONTINUATION / unknown-opcode error paths. - Fixed an adjacent bug in `close_client` where the trailing `client.state = "closing"` clobbered the `"closed"` state set on the not-handshake-complete branch. Adds `tests/unit/server/frame_spec.lua` covering the fatal close codes, the preserved `(nil, 0)` incomplete-frame contract, and the regression: a malformed frame now closes and drains the buffer while an incomplete frame stays buffered with the connection open. Change-Id: I892f0a10bf6c1894cef8118598a15e35378e5f0f Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 18 ++- lua/claudecode/server/frame.lua | 22 ++- tests/unit/server/frame_spec.lua | 251 +++++++++++++++++++++++++++++++ 3 files changed, 279 insertions(+), 12 deletions(-) create mode 100644 tests/unit/server/frame_spec.lua diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index d159b72f..6aafeb14 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -114,10 +114,18 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token end while #client.buffer >= 2 do -- Minimum frame size - local parsed_frame, bytes_consumed = frame.parse_frame(client.buffer) + local parsed_frame, bytes_consumed, close_code = frame.parse_frame(client.buffer) if not parsed_frame then - break + if close_code then + -- Fatal protocol violation: close with the RFC 6455 status code and drop + -- the offending bytes instead of leaving them buffered to be re-parsed + -- forever (which previously wedged the connection). + client.buffer = "" + on_error(client, "WebSocket protocol error in frame") + M.close_client(client, close_code, "Protocol error") + end + break -- No close_code means the frame is incomplete; wait for more bytes. end -- Frame validation is now handled entirely within frame.parse_frame. @@ -218,12 +226,14 @@ function M.close_client(client, code, reason) client.state = "closed" client.tcp_handle:close() end) + -- Mark as "closing" while the Close frame write/teardown is in flight. The + -- write callback transitions to "closed". Do not clobber the "closed" state + -- set synchronously on the not-handshake-complete branch below. + client.state = "closing" else client.state = "closed" client.tcp_handle:close() end - - client.state = "closing" end ---Check if a client connection is alive diff --git a/lua/claudecode/server/frame.lua b/lua/claudecode/server/frame.lua index c60d4b4f..09e4aaa7 100644 --- a/lua/claudecode/server/frame.lua +++ b/lua/claudecode/server/frame.lua @@ -22,9 +22,15 @@ M.OPCODE = { ---@field payload string Frame payload data ---Parse a WebSocket frame from binary data +--- +---Failure is disambiguated via the optional third return value: +--- * incomplete input ("need more bytes") returns `nil, 0` with NO third value +--- * a fatal protocol violation returns `nil, 0, ` where the close +--- code is the RFC 6455 status to send in the Close frame (1002/1007/1009) ---@param data string The binary frame data ---@return WebSocketFrame|nil frame The parsed frame, or nil if incomplete/invalid ---@return number bytes_consumed Number of bytes consumed from input +---@return number|nil close_code WebSocket close code for fatal protocol violations function M.parse_frame(data) if type(data) ~= "string" then return nil, 0 @@ -65,18 +71,18 @@ function M.parse_frame(data) } if not valid_opcodes[opcode] then - return nil, 0 -- Invalid opcode + return nil, 0, 1002 -- Invalid opcode (protocol error) end -- Check for reserved bits (must be 0) if rsv1 or rsv2 or rsv3 then - return nil, 0 -- Protocol error + return nil, 0, 1002 -- Reserved bits set (protocol error) end -- Control frames must have fin=1 and payload ≤ 125 (RFC 6455 Section 5.5) if opcode >= M.OPCODE.CLOSE then if not fin or payload_len > 125 then - return nil, 0 -- Protocol violation + return nil, 0, 1002 -- Control frame fragmented or oversized (protocol error) end end @@ -103,13 +109,13 @@ function M.parse_frame(data) -- Prevent extremely large payloads (DOS protection) if actual_payload_len > 100 * 1024 * 1024 then -- 100MB limit - return nil, 0 + return nil, 0, 1009 -- Message too big end end -- Additional payload length validation if actual_payload_len < 0 then - return nil, 0 -- Invalid negative length + return nil, 0, 1002 -- Invalid negative length (protocol error) end -- Read mask if present @@ -138,19 +144,19 @@ function M.parse_frame(data) -- Validate text frame payload is valid UTF-8 if opcode == M.OPCODE.TEXT and not utils.is_valid_utf8(payload) then - return nil, 0 -- Invalid UTF-8 in text frame + return nil, 0, 1007 -- Invalid UTF-8 in text frame (invalid payload data) end -- Basic validation for close frame payload if opcode == M.OPCODE.CLOSE and actual_payload_len > 0 then if actual_payload_len == 1 then - return nil, 0 -- Close frame with 1 byte payload is invalid + return nil, 0, 1002 -- Close frame with 1 byte payload is invalid (protocol error) end -- Allow most close codes for compatibility, only validate UTF-8 for reason text if actual_payload_len > 2 then local reason = payload:sub(3) if not utils.is_valid_utf8(reason) then - return nil, 0 -- Invalid UTF-8 in close reason + return nil, 0, 1007 -- Invalid UTF-8 in close reason (invalid payload data) end end end diff --git a/tests/unit/server/frame_spec.lua b/tests/unit/server/frame_spec.lua new file mode 100644 index 00000000..40376aea --- /dev/null +++ b/tests/unit/server/frame_spec.lua @@ -0,0 +1,251 @@ +require("tests.busted_setup") + +local client = require("claudecode.server.client") +local frame = require("claudecode.server.frame") +local utils = require("claudecode.server.utils") + +describe("WebSocket frame parsing", function() + describe("parse_frame fatal protocol violations", function() + it("returns close code 1002 for an invalid opcode", function() + -- byte1: fin=1, opcode=0x3 (reserved, invalid); byte2: unmasked, len=0 + local data = string.char(0x83, 0x00) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1002, close_code) + end) + + it("returns close code 1002 when reserved bits are set", function() + -- byte1: fin=1, rsv1=1 (0x40), opcode=TEXT(0x1) => 0xC1; byte2: len=0 + local data = string.char(0xC1, 0x00) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1002, close_code) + end) + + it("returns close code 1002 for an oversized control frame", function() + -- byte1: fin=1, opcode=PING(0x9) => 0x89; byte2: unmasked, len=126 (>125) + local data = string.char(0x89, 0x7E) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1002, close_code) + end) + + it("returns close code 1002 for a fragmented control frame (fin=0)", function() + -- byte1: fin=0, opcode=CLOSE(0x8) => 0x08; byte2: unmasked, len=0 + local data = string.char(0x08, 0x00) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1002, close_code) + end) + + it("returns close code 1002 for a 1-byte close frame payload", function() + -- byte1: fin=1, opcode=CLOSE(0x8) => 0x88; byte2: unmasked, len=1; 1 payload byte + local data = string.char(0x88, 0x01) .. string.char(0x03) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1002, close_code) + end) + + it("returns close code 1009 for a declared payload over the 100MB cap", function() + -- byte1: fin=1, opcode=BINARY(0x2) => 0x82; byte2: unmasked, len=127 (64-bit length) + -- 64-bit length = 200MB, which exceeds the 100MB cap. + local big_len = 200 * 1024 * 1024 + local data = string.char(0x82, 0x7F) .. utils.uint64_to_bytes(big_len) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1009, close_code) + end) + + it("returns close code 1007 for invalid UTF-8 in a text frame", function() + -- byte1: fin=1, opcode=TEXT(0x1) => 0x81; byte2: unmasked, len=2 + -- payload: 0xFF 0xFE is not valid UTF-8. + local data = string.char(0x81, 0x02) .. string.char(0xFF, 0xFE) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1007, close_code) + end) + + it("returns close code 1007 for invalid UTF-8 in a close reason", function() + -- byte1: fin=1, opcode=CLOSE(0x8) => 0x88; byte2: unmasked, len=4 + -- payload: 2-byte close code (1000) + 2 invalid UTF-8 reason bytes. + local payload = utils.uint16_to_bytes(1000) .. string.char(0xFF, 0xFE) + local data = string.char(0x88, 0x04) .. payload + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.equals(1007, close_code) + end) + end) + + describe("parse_frame incomplete frames", function() + it("returns nil, 0 with NO third value when the payload is truncated", function() + -- Header declares a 10-byte unmasked text payload but only 3 bytes follow. + -- byte1: fin=1, opcode=TEXT(0x1) => 0x81; byte2: unmasked, len=10 + local data = string.char(0x81, 0x0A) .. "abc" + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.is_nil(close_code) + end) + + it("returns nil, 0 with NO third value when only one header byte is present", function() + local parsed, consumed, close_code = frame.parse_frame(string.char(0x81)) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.is_nil(close_code) + end) + + it("returns nil, 0 with NO third value when extended length bytes are missing", function() + -- byte2 = 126 means a 16-bit extended length follows, but it is absent. + local data = string.char(0x81, 0x7E) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.is_nil(close_code) + end) + + it("returns nil, 0 with NO third value when mask bytes are missing", function() + -- Masked frame (0x80) declaring a 4-byte payload, but the 4 mask bytes are absent. + local data = string.char(0x81, 0x84) + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_nil(parsed) + assert.equals(0, consumed) + assert.is_nil(close_code) + end) + end) + + describe("parse_frame valid frames remain unaffected", function() + it("parses a valid unmasked text frame without a close code", function() + local data = frame.create_text_frame("hello") + local parsed, consumed, close_code = frame.parse_frame(data) + + assert.is_table(parsed) + assert.equals(frame.OPCODE.TEXT, parsed.opcode) + assert.equals("hello", parsed.payload) + assert.equals(#data, consumed) + assert.is_nil(close_code) + end) + end) +end) + +describe("WebSocket client malformed frame handling", function() + -- Build a client whose tcp_handle records writes so we can assert that a + -- Close frame was sent and the connection torn down. + local function make_client() + local writes = {} + local handle = { + _closed = false, + write = function(_, data, callback) + table.insert(writes, data) + if callback then + callback() + end + return true + end, + close = function(self) + self._closed = true + return true + end, + is_closing = function(self) + return self._closed + end, + } + + local c = { + id = "test_client", + tcp_handle = handle, + state = "connected", + buffer = "", + handshake_complete = true, + last_ping = 0, + last_pong = 0, + } + + return c, writes, handle + end + + local function noop() end + + it("closes the connection and drains the buffer on a malformed frame", function() + local c, writes = make_client() + + local on_error = spy.new(noop) + local on_close = spy.new(noop) + + -- A text frame whose payload is invalid UTF-8 is a fatal (1007) violation. + local malformed = string.char(0x81, 0x02) .. string.char(0xFF, 0xFE) + + client.process_data(c, malformed, noop, on_close, on_error) + + -- Connection must be torn down rather than left wedged. + assert.is_true(c.state == "closing" or c.state == "closed") + + -- A Close frame must have been written. + assert.is_true(#writes >= 1) + local last_frame = frame.parse_frame(writes[#writes]) + assert.is_table(last_frame) + assert.equals(frame.OPCODE.CLOSE, last_frame.opcode) + + -- The error callback must have fired for the protocol violation. + assert.spy(on_error).was_called() + + -- Regression: the malformed bytes must NOT remain buffered for re-parsing. + assert.is_false(c.buffer:find(string.char(0xFF, 0xFE), 1, true) ~= nil) + end) + + it("sends a Close frame carrying the 1002 status for an invalid opcode", function() + local c, writes = make_client() + + -- Invalid opcode 0x3 => fatal 1002 protocol error. + local malformed = string.char(0x83, 0x00) + + client.process_data(c, malformed, noop, noop, noop) + + assert.is_true(#writes >= 1) + local close_frame = frame.parse_frame(writes[#writes]) + assert.is_table(close_frame) + assert.equals(frame.OPCODE.CLOSE, close_frame.opcode) + -- Close payload begins with the 2-byte big-endian status code. + assert.is_true(#close_frame.payload >= 2) + local code = close_frame.payload:byte(1) * 256 + close_frame.payload:byte(2) + assert.equals(1002, code) + end) + + it("keeps an incomplete frame buffered and the connection open", function() + local c, writes = make_client() + + local on_error = spy.new(noop) + + -- Header declares a 10-byte text payload but only 3 bytes are present. + local incomplete = string.char(0x81, 0x0A) .. "abc" + + client.process_data(c, incomplete, noop, noop, on_error) + + -- Connection stays open; nothing written; no error raised. + assert.equals("connected", c.state) + assert.equals(0, #writes) + assert.spy(on_error).was_not_called() + + -- The bytes remain buffered, waiting for the rest of the frame. + assert.equals(incomplete, c.buffer) + end) +end) From aa4a18cb82af166bf8935ce76b46ac84ccde2e78 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 17:33:51 +0200 Subject: [PATCH 2/6] fix(server): send Close frame before on_error tears down connection Review feedback (codex+claude): on_error synchronously closes the TCP handle (tcp.lua _disconnect_client -> _remove_client -> close()) without setting client.state, so writing the Close frame afterward targeted an already-closed handle and the 1002/1007/1009 status never reached the peer. Reorder close_client before on_error in the fatal-frame, CONTINUATION and unknown-opcode paths; guard close_client against writing to an already-closing handle; and strengthen frame_spec to assert the Close frame is delivered (the test now fails against the old ordering). Change-Id: I584cc516081e7589531bcdcec6d7a2291e1dc7db Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 31 ++++++++++++--- tests/unit/server/frame_spec.lua | 66 ++++++++++++++++++++++++-------- 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index 6aafeb14..68f05317 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -121,9 +121,16 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token -- Fatal protocol violation: close with the RFC 6455 status code and drop -- the offending bytes instead of leaving them buffered to be re-parsed -- forever (which previously wedged the connection). + -- + -- close_client MUST run before on_error: on_error tears the connection + -- down synchronously (tcp.lua: _disconnect_client -> _remove_client -> + -- tcp_handle:close()) without setting client.state, so if it ran first + -- the Close frame would be written to an already-closed handle and never + -- reach the peer. Closing first writes the Close frame while the handle + -- is open and sets client.state, after which on_error's teardown no-ops. client.buffer = "" - on_error(client, "WebSocket protocol error in frame") M.close_client(client, close_code, "Protocol error") + on_error(client, "WebSocket protocol error in frame") end break -- No close_code means the frame is incomplete; wait for more bytes. end @@ -169,12 +176,14 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token elseif parsed_frame.opcode == frame.OPCODE.PONG then client.last_pong = vim.loop.now() elseif parsed_frame.opcode == frame.OPCODE.CONTINUATION then - -- Continuation frame - for simplicity, we don't support fragmentation - on_error(client, "Fragmented messages not supported") + -- Continuation frame - for simplicity, we don't support fragmentation. + -- close_client before on_error so the Close frame reaches the peer (see + -- the fatal-frame branch above for why ordering matters). M.close_client(client, 1003, "Unsupported data") + on_error(client, "Fragmented messages not supported") else - on_error(client, "Unknown WebSocket opcode: " .. parsed_frame.opcode) M.close_client(client, 1002, "Protocol error") + on_error(client, "Unknown WebSocket opcode: " .. parsed_frame.opcode) end end end @@ -220,11 +229,23 @@ function M.close_client(client, code, reason) code = code or 1000 reason = reason or "" + -- Defensive guard for callers that don't gate on client.state: if the + -- underlying handle is already closing (e.g. a TCP error path closed it), + -- writing a Close frame would target a dead handle and never reach the peer. + -- Just mark the client closed in that case. Mirrors the is_closing() check + -- in tcp.lua's _remove_client. + if client.tcp_handle:is_closing() then + client.state = "closed" + return + end + if client.handshake_complete then local close_frame = frame.create_close_frame(code, reason) client.tcp_handle:write(close_frame, function() client.state = "closed" - client.tcp_handle:close() + if not client.tcp_handle:is_closing() then + client.tcp_handle:close() + end end) -- Mark as "closing" while the Close frame write/teardown is in flight. The -- write callback transitions to "closed". Do not clobber the "closed" state diff --git a/tests/unit/server/frame_spec.lua b/tests/unit/server/frame_spec.lua index 40376aea..cd2c8c37 100644 --- a/tests/unit/server/frame_spec.lua +++ b/tests/unit/server/frame_spec.lua @@ -150,12 +150,17 @@ end) describe("WebSocket client malformed frame handling", function() -- Build a client whose tcp_handle records writes so we can assert that a -- Close frame was sent and the connection torn down. + -- + -- The handle records, per write, whether it occurred while the handle was + -- already closing. In production a write to a closed handle never reaches the + -- peer, so the Close frame this PR is designed to send would be silently lost. + -- Asserting `closing == false` on the recorded write is what catches that. local function make_client() local writes = {} local handle = { _closed = false, - write = function(_, data, callback) - table.insert(writes, data) + write = function(self, data, callback) + table.insert(writes, { data = data, closing = self._closed }) if callback then callback() end @@ -185,10 +190,44 @@ describe("WebSocket client malformed frame handling", function() local function noop() end + -- Returns an on_error callback that faithfully reproduces the production + -- teardown wired in tcp.lua: on a client error the TCP handle is closed + -- synchronously (via _disconnect_client -> _remove_client -> handle:close()) + -- WITHOUT touching client.state. A noop spy would mask the ordering bug this + -- suite is meant to catch, so the realistic version must close the handle. + local function make_realistic_on_error(c) + return spy.new(function() + if not c.tcp_handle:is_closing() then + c.tcp_handle:close() + end + end) + end + + -- Find the Close frame among recorded writes and assert it was written while + -- the handle was still open (i.e. it actually reached the peer). + local function assert_open_close_frame_written(writes, expected_code) + local found_close + for _, w in ipairs(writes) do + local parsed = frame.parse_frame(w.data) + if parsed and parsed.opcode == frame.OPCODE.CLOSE then + found_close = w + -- A Close frame written to an already-closed handle never reaches the + -- peer. This is the production bug: on_error closed the handle first. + assert.is_false(w.closing, "Close frame was written to an already-closed handle") + if expected_code then + assert.is_true(#parsed.payload >= 2) + local code = parsed.payload:byte(1) * 256 + parsed.payload:byte(2) + assert.equals(expected_code, code) + end + end + end + assert.is_table(found_close, "expected a Close frame to be written") + end + it("closes the connection and drains the buffer on a malformed frame", function() local c, writes = make_client() - local on_error = spy.new(noop) + local on_error = make_realistic_on_error(c) local on_close = spy.new(noop) -- A text frame whose payload is invalid UTF-8 is a fatal (1007) violation. @@ -199,11 +238,8 @@ describe("WebSocket client malformed frame handling", function() -- Connection must be torn down rather than left wedged. assert.is_true(c.state == "closing" or c.state == "closed") - -- A Close frame must have been written. - assert.is_true(#writes >= 1) - local last_frame = frame.parse_frame(writes[#writes]) - assert.is_table(last_frame) - assert.equals(frame.OPCODE.CLOSE, last_frame.opcode) + -- A Close frame must have been delivered (written while the handle was open). + assert_open_close_frame_written(writes, 1007) -- The error callback must have fired for the protocol violation. assert.spy(on_error).was_called() @@ -215,19 +251,15 @@ describe("WebSocket client malformed frame handling", function() it("sends a Close frame carrying the 1002 status for an invalid opcode", function() local c, writes = make_client() + local on_error = make_realistic_on_error(c) + -- Invalid opcode 0x3 => fatal 1002 protocol error. local malformed = string.char(0x83, 0x00) - client.process_data(c, malformed, noop, noop, noop) + client.process_data(c, malformed, noop, noop, on_error) - assert.is_true(#writes >= 1) - local close_frame = frame.parse_frame(writes[#writes]) - assert.is_table(close_frame) - assert.equals(frame.OPCODE.CLOSE, close_frame.opcode) - -- Close payload begins with the 2-byte big-endian status code. - assert.is_true(#close_frame.payload >= 2) - local code = close_frame.payload:byte(1) * 256 + close_frame.payload:byte(2) - assert.equals(1002, code) + -- The 1002 Close frame must reach the peer (written before teardown). + assert_open_close_frame_written(writes, 1002) end) it("keeps an incomplete frame buffered and the connection open", function() From 86979326baed9055f06c01fe109095b001ea8710 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 18:27:40 +0200 Subject: [PATCH 3/6] fix(server): deliver Close frame before teardown on protocol violations Round-2 review (codex+claude): close_client queues the Close frame asynchronously, so on_error closing the handle in the same tick cancelled the write before it flushed. Defer on_error via vim.schedule in the fatal-frame, CONTINUATION and unknown-opcode paths (mirroring the CLOSE opcode handler), and add the missing buffer-clear + break to the CONTINUATION/unknown branches so the receive loop stops iterating over a torn-down connection. Change-Id: Id109192037aa856ab4c1fb7bd1ec0b9785568b31 Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 41 +++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index 68f05317..7e40d608 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -118,19 +118,22 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token if not parsed_frame then if close_code then - -- Fatal protocol violation: close with the RFC 6455 status code and drop - -- the offending bytes instead of leaving them buffered to be re-parsed - -- forever (which previously wedged the connection). + -- Fatal protocol violation: send the RFC 6455 Close frame, drop the + -- offending bytes (so they are not re-parsed forever, which previously + -- wedged the connection), and tear the connection down. -- - -- close_client MUST run before on_error: on_error tears the connection - -- down synchronously (tcp.lua: _disconnect_client -> _remove_client -> - -- tcp_handle:close()) without setting client.state, so if it ran first - -- the Close frame would be written to an already-closed handle and never - -- reach the peer. Closing first writes the Close frame while the handle - -- is open and sets client.state, after which on_error's teardown no-ops. + -- close_client runs first to queue the Close frame while the handle is + -- open; on_error is deferred with vim.schedule so that queued write + -- flushes before on_error's synchronous teardown (tcp.lua + -- _disconnect_client -> _remove_client -> tcp_handle:close()) runs -- + -- otherwise closing the handle in the same tick cancels the pending + -- write and the status code never reaches the peer. This mirrors the + -- CLOSE opcode path below, which schedules on_close after writing. client.buffer = "" M.close_client(client, close_code, "Protocol error") - on_error(client, "WebSocket protocol error in frame") + vim.schedule(function() + on_error(client, "WebSocket protocol error in frame") + end) end break -- No close_code means the frame is incomplete; wait for more bytes. end @@ -177,13 +180,23 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token client.last_pong = vim.loop.now() elseif parsed_frame.opcode == frame.OPCODE.CONTINUATION then -- Continuation frame - for simplicity, we don't support fragmentation. - -- close_client before on_error so the Close frame reaches the peer (see - -- the fatal-frame branch above for why ordering matters). + -- Send the Close frame, defer on_error, then clear the buffer and break so + -- the queued write can flush and we stop iterating over a connection that + -- is being torn down. See the fatal-frame branch above for the rationale. M.close_client(client, 1003, "Unsupported data") - on_error(client, "Fragmented messages not supported") + vim.schedule(function() + on_error(client, "Fragmented messages not supported") + end) + client.buffer = "" + break else + local opcode = parsed_frame.opcode M.close_client(client, 1002, "Protocol error") - on_error(client, "Unknown WebSocket opcode: " .. parsed_frame.opcode) + vim.schedule(function() + on_error(client, "Unknown WebSocket opcode: " .. opcode) + end) + client.buffer = "" + break end end end From cfd0ac9db8f2ffa21a603b1480e7f9c67605001a Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 19:16:44 +0200 Subject: [PATCH 4/6] fix(server): stop processing after an inbound CLOSE frame Round-3 review (claude): the CLOSE branch was the only termination path in process_data not clearing the buffer and breaking, so a frame after a CLOSE (a protocol violation) was still processed -- a trailing PING echoed a PONG after our own Close, and a trailing TEXT/BINARY dispatched on_message after on_close. Clear the buffer and break, matching the other termination branches. Change-Id: Ic905b5eca6c5ec6b1e3c1baaed8288a43863ff6b Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index 7e40d608..b5d3a0f6 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -173,6 +173,12 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token vim.schedule(function() on_close(client, code, reason) end) + -- A CLOSE is terminal: drop any bytes that followed it (a frame after CLOSE + -- is a protocol violation) and stop iterating, so we neither echo a PONG + -- after our own Close nor dispatch on_message after on_close. Matches the + -- other termination branches. + client.buffer = "" + break elseif parsed_frame.opcode == frame.OPCODE.PING then local pong_frame = frame.create_pong_frame(parsed_frame.payload) client.tcp_handle:write(pong_frame) From 7dbb6f976bddd13d3e72dd2866572b1ed2cb544f Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 19:55:31 +0200 Subject: [PATCH 5/6] fix(server): skip frame dispatch once teardown has started Round-4 review (claude): process_data had no client.state guard, so a TCP segment arriving after a branch set state=closing (handle still open, read not stopped) could re-enter and dispatch frames against a closing or closed client. Break out of the frame loop when state is closing/closed. Change-Id: I947156531aed84e9920f129bb5393503bd453b95 Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index b5d3a0f6..752555cf 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -114,6 +114,14 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token end while #client.buffer >= 2 do -- Minimum frame size + -- Stop if a prior frame (or a TCP error) already initiated teardown. The + -- handle can still be open (Close-frame write / scheduled on_close/on_error + -- pending) and read is not stopped, so a later TCP segment could otherwise + -- re-enter process_data and dispatch frames against a closing/closed client. + if client.state == "closing" or client.state == "closed" then + break + end + local parsed_frame, bytes_consumed, close_code = frame.parse_frame(client.buffer) if not parsed_frame then From a5ed12d971657a4c3e343a995219c45b00ce38b4 Mon Sep 17 00:00:00 2001 From: Thomas Kosiewski Date: Wed, 3 Jun 2026 20:05:23 +0200 Subject: [PATCH 6/6] fix(server): run protocol-violation teardown from the Close-frame write callback Round-5 review (codex): the vim.schedule(on_error) deferral still raced the asynchronous Close-frame write under backpressure -- teardown could close the handle before the write callback fired, cancelling the frame. Thread an on_done callback through close_client and invoke it from the write callback, so the 1002/1007/1009 status is flushed before on_error tears the connection down. libuv always invokes the write callback (on success or error), so on_done is never dropped. Change-Id: Ib69a4502d1b43f571c63f5fc5e89506e75fc7d64 Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Thomas Kosiewski --- lua/claudecode/server/client.lua | 49 +++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/lua/claudecode/server/client.lua b/lua/claudecode/server/client.lua index 752555cf..9de3fa25 100644 --- a/lua/claudecode/server/client.lua +++ b/lua/claudecode/server/client.lua @@ -130,16 +130,13 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token -- offending bytes (so they are not re-parsed forever, which previously -- wedged the connection), and tear the connection down. -- - -- close_client runs first to queue the Close frame while the handle is - -- open; on_error is deferred with vim.schedule so that queued write - -- flushes before on_error's synchronous teardown (tcp.lua - -- _disconnect_client -> _remove_client -> tcp_handle:close()) runs -- - -- otherwise closing the handle in the same tick cancels the pending - -- write and the status code never reaches the peer. This mirrors the - -- CLOSE opcode path below, which schedules on_close after writing. + -- on_error runs the synchronous teardown (tcp.lua _disconnect_client -> + -- _remove_client -> tcp_handle:close()), which would cancel the pending + -- Close-frame write if it ran first. Passing it as close_client's on_done + -- runs it from the write callback, i.e. only after the status code has + -- been flushed, so the peer actually receives the 1002/1007/1009 close. client.buffer = "" - M.close_client(client, close_code, "Protocol error") - vim.schedule(function() + M.close_client(client, close_code, "Protocol error", function() on_error(client, "WebSocket protocol error in frame") end) end @@ -194,19 +191,18 @@ function M.process_data(client, data, on_message, on_close, on_error, auth_token client.last_pong = vim.loop.now() elseif parsed_frame.opcode == frame.OPCODE.CONTINUATION then -- Continuation frame - for simplicity, we don't support fragmentation. - -- Send the Close frame, defer on_error, then clear the buffer and break so - -- the queued write can flush and we stop iterating over a connection that - -- is being torn down. See the fatal-frame branch above for the rationale. - M.close_client(client, 1003, "Unsupported data") - vim.schedule(function() + -- Run on_error from close_client's write callback (so the Close frame is + -- flushed before teardown), then clear the buffer and break so we stop + -- iterating over a connection that is being torn down. See the fatal-frame + -- branch above for the rationale. + M.close_client(client, 1003, "Unsupported data", function() on_error(client, "Fragmented messages not supported") end) client.buffer = "" break else local opcode = parsed_frame.opcode - M.close_client(client, 1002, "Protocol error") - vim.schedule(function() + M.close_client(client, 1002, "Protocol error", function() on_error(client, "Unknown WebSocket opcode: " .. opcode) end) client.buffer = "" @@ -248,8 +244,14 @@ end ---@param client WebSocketClient The client object ---@param code number|nil Close code (default: 1000) ---@param reason string|nil Close reason -function M.close_client(client, code, reason) +---@param on_done function|nil Optional callback run after the Close frame has been +--- written (or once the connection is already gone). Protocol-violation paths use +--- it to defer error/teardown until the status code has actually been flushed. +function M.close_client(client, code, reason, on_done) if client.state == "closed" or client.state == "closing" then + if on_done then + vim.schedule(on_done) + end return end @@ -263,6 +265,9 @@ function M.close_client(client, code, reason) -- in tcp.lua's _remove_client. if client.tcp_handle:is_closing() then client.state = "closed" + if on_done then + vim.schedule(on_done) + end return end @@ -273,6 +278,13 @@ function M.close_client(client, code, reason) if not client.tcp_handle:is_closing() then client.tcp_handle:close() end + -- Run any teardown only after the Close frame has been flushed, so a + -- caller's synchronous handle close cannot cancel the pending write. libuv + -- always invokes this callback (on success or error), so on_done is never + -- dropped. + if on_done then + on_done() + end end) -- Mark as "closing" while the Close frame write/teardown is in flight. The -- write callback transitions to "closed". Do not clobber the "closed" state @@ -281,6 +293,9 @@ function M.close_client(client, code, reason) else client.state = "closed" client.tcp_handle:close() + if on_done then + vim.schedule(on_done) + end end end