From 392d70d90dd56fcde22a76a1a85b9edf7a2b75ed Mon Sep 17 00:00:00 2001 From: root Date: Sat, 6 Jun 2026 23:09:34 +0000 Subject: [PATCH] fix(sanitize): preserve angle brackets inside markdown code Protect angle brackets in fenced and inline code before bluemonday HTML sanitization so generic type syntax like mut_raw_ptr is not stripped from issue and PR bodies returned by MCP tools. Closes #2202 --- pkg/github/issues_test.go | 39 ++++++++++ pkg/sanitize/sanitize.go | 143 +++++++++++++++++++++++++++++++++- pkg/sanitize/sanitize_test.go | 83 ++++++++++++++++++++ 3 files changed, 263 insertions(+), 2 deletions(-) diff --git a/pkg/github/issues_test.go b/pkg/github/issues_test.go index 7e47cdb527..73f5968904 100644 --- a/pkg/github/issues_test.go +++ b/pkg/github/issues_test.go @@ -276,6 +276,45 @@ func Test_GetIssue(t *testing.T) { } } +func Test_GetIssue_PreservesAngleBracketsInCodeBlocks(t *testing.T) { + body := "```\nlet ptr: mut_raw_ptr = raw_new int;\n```" + mockIssue := &github.Issue{ + Number: github.Ptr(42), + Title: github.Ptr("Angle brackets in code"), + Body: github.Ptr(body), + State: github.Ptr("open"), + HTMLURL: github.Ptr("https://github.com/owner/repo/issues/42"), + User: &github.User{Login: github.Ptr("testuser")}, + } + + serverTool := IssueRead(translations.NullTranslationHelper) + client := mustNewGHClient(t, MockHTTPClientWithHandlers(map[string]http.HandlerFunc{ + GetReposIssuesByOwnerByRepoByIssueNumber: mockResponse(t, http.StatusOK, mockIssue), + })) + deps := BaseDeps{ + Client: client, + GQLClient: defaultGQLClient, + } + handler := serverTool.Handler(deps) + + request := createMCPRequest(map[string]any{ + "method": "get", + "owner": "owner", + "repo": "repo", + "issue_number": float64(42), + }) + result, err := handler(ContextWithDeps(context.Background(), deps), &request) + require.NoError(t, err) + require.NotNil(t, result) + require.False(t, result.IsError) + + textContent := getTextResult(t, result) + var returnedIssue MinimalIssue + err = json.Unmarshal([]byte(textContent.Text), &returnedIssue) + require.NoError(t, err) + assert.Equal(t, body, returnedIssue.Body) +} + func Test_IssueRead_IFC_InsidersMode(t *testing.T) { t.Parallel() diff --git a/pkg/sanitize/sanitize.go b/pkg/sanitize/sanitize.go index e6401e4fb3..ae1c3e93a0 100644 --- a/pkg/sanitize/sanitize.go +++ b/pkg/sanitize/sanitize.go @@ -12,7 +12,10 @@ var policy *bluemonday.Policy var policyOnce sync.Once func Sanitize(input string) string { - return FilterHTMLTags(FilterCodeFenceMetadata(FilterInvisibleCharacters(input))) + cleaned := FilterCodeFenceMetadata(FilterInvisibleCharacters(input)) + protected := protectCodeAngleBrackets(cleaned) + sanitized := FilterHTMLTags(protected) + return restoreCodeAngleBrackets(sanitized) } // FilterInvisibleCharacters removes invisible or control characters that should not appear @@ -145,6 +148,141 @@ func isSafeCodeFenceToken(token string) bool { return true } +// Sentinels used to protect angle brackets inside code from HTML sanitization. +// NUL bytes are stripped by FilterInvisibleCharacters before protectCodeAngleBrackets +// runs, preventing sentinel collision attacks. +const ( + ltSentinel = "\x00LT\x00" + gtSentinel = "\x00GT\x00" +) + +// protectCodeAngleBrackets replaces < and > inside fenced and inline code with +// sentinels so bluemonday does not strip them as HTML tags. +func protectCodeAngleBrackets(input string) string { + if input == "" { + return input + } + + lines := strings.Split(input, "\n") + insideFence := false + currentFenceLen := 0 + + for i, line := range lines { + if toggled, fenceLen := toggleCodeFence(line, insideFence, currentFenceLen); toggled { + insideFence = !insideFence + if insideFence { + currentFenceLen = fenceLen + } else { + currentFenceLen = 0 + } + continue + } + + if insideFence { + lines[i] = replaceAngleBrackets(line) + continue + } + lines[i] = protectInlineCodeAngleBrackets(line) + } + + return strings.Join(lines, "\n") +} + +func toggleCodeFence(line string, insideFence bool, currentFenceLen int) (bool, int) { + idx := strings.Index(line, "```") + if idx == -1 || hasNonWhitespace(line[:idx]) { + return false, currentFenceLen + } + + fenceEnd := idx + for fenceEnd < len(line) && line[fenceEnd] == '`' { + fenceEnd++ + } + + fenceLen := fenceEnd - idx + if fenceLen < 3 { + return false, currentFenceLen + } + + if insideFence { + if currentFenceLen != 0 && fenceLen < currentFenceLen { + return false, currentFenceLen + } + return true, fenceLen + } + + return true, fenceLen +} + +func protectInlineCodeAngleBrackets(line string) string { + if !strings.Contains(line, "`") { + return line + } + + var out strings.Builder + out.Grow(len(line)) + i := 0 + for i < len(line) { + if line[i] != '`' { + out.WriteByte(line[i]) + i++ + continue + } + + openStart := i + openLen := 0 + for i < len(line) && line[i] == '`' { + openLen++ + i++ + } + + contentStart := i + closeIdx := findInlineCodeClose(line, contentStart, openLen) + if closeIdx == -1 { + out.WriteString(line[openStart:i]) + continue + } + + out.WriteString(line[openStart:contentStart]) + out.WriteString(replaceAngleBrackets(line[contentStart:closeIdx])) + out.WriteString(line[closeIdx : closeIdx+openLen]) + i = closeIdx + openLen + } + + return out.String() +} + +func findInlineCodeClose(line string, contentStart, openLen int) int { + for i := contentStart; i < len(line); i++ { + if line[i] != '`' { + continue + } + + closeLen := 0 + for j := i; j < len(line) && line[j] == '`'; j++ { + closeLen++ + } + if closeLen == openLen { + return i + } + } + + return -1 +} + +func replaceAngleBrackets(s string) string { + if !strings.ContainsAny(s, "<>") { + return s + } + s = strings.ReplaceAll(s, "<", ltSentinel) + return strings.ReplaceAll(s, ">", gtSentinel) +} + +func restoreCodeAngleBrackets(input string) string { + s := strings.ReplaceAll(input, ltSentinel, "<") + return strings.ReplaceAll(s, gtSentinel, ">") +} + func getPolicy() *bluemonday.Policy { policyOnce.Do(func() { p := bluemonday.StrictPolicy() @@ -175,7 +313,8 @@ func getPolicy() *bluemonday.Policy { func shouldRemoveRune(r rune) bool { switch r { - case 0x200B, // ZERO WIDTH SPACE + case 0x0000, // NUL — stripped to prevent sentinel collision in protectCodeAngleBrackets + 0x200B, // ZERO WIDTH SPACE 0x200C, // ZERO WIDTH NON-JOINER 0x200E, // LEFT-TO-RIGHT MARK 0x200F, // RIGHT-TO-LEFT MARK diff --git a/pkg/sanitize/sanitize_test.go b/pkg/sanitize/sanitize_test.go index 35b23e6abe..445f531353 100644 --- a/pkg/sanitize/sanitize_test.go +++ b/pkg/sanitize/sanitize_test.go @@ -129,6 +129,7 @@ func TestShouldRemoveRune(t *testing.T) { expected bool }{ // Individual characters that should be removed + {name: "NUL byte", rune: 0x0000, expected: true}, {name: "zero width space", rune: 0x200B, expected: true}, {name: "zero width non-joiner", rune: 0x200C, expected: true}, {name: "left-to-right mark", rune: 0x200E, expected: true}, @@ -300,3 +301,85 @@ func TestSanitizeRemovesInvisibleCodeFenceMetadata(t *testing.T) { result := Sanitize(input) assert.Equal(t, expected, result) } + +func TestSanitizePreservesAngleBracketsInCodeBlocks(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "fenced code block with angle brackets", + input: "```\nlet ptr: mut_raw_ptr = raw_new int;\n```", + expected: "```\nlet ptr: mut_raw_ptr = raw_new int;\n```", + }, + { + name: "inline code with angle brackets", + input: "Use `Vec` for collections.", + expected: "Use `Vec` for collections.", + }, + { + name: "angle brackets outside code are sanitized", + input: "This has in it.", + expected: "This has in it.", + }, + { + name: "fenced code block with generic types", + input: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.", + expected: "Example:\n```go\nfunc Foo[T comparable](x T) {}\n```\nDone.", + }, + { + name: "multiple inline code spans with angle brackets", + input: "Compare `Map` and `Set`.", + expected: "Compare `Map` and `Set`.", + }, + { + name: "shorter fence inside code does not close block", + input: "````\nline\n```\nstill\n````", + expected: "````\nline\n```\nstill\n````", + }, + { + name: "sentinel collision does not bypass sanitizer", + input: "\x00LT\x00script\x00GT\x00alert(1)\x00LT\x00/script\x00GT\x00", + expected: "LTscriptGTalert(1)LT/scriptGT", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := Sanitize(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +} + +func TestProtectCodeAngleBrackets(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "fenced code block with angle brackets", + input: "```\nvector v;\n```", + expected: "```\nvector" + ltSentinel + "int" + gtSentinel + " v;\n```", + }, + { + name: "inline code with angle brackets", + input: "Use `Map` here.", + expected: "Use `Map" + ltSentinel + "K, V" + gtSentinel + "` here.", + }, + { + name: "angle brackets outside code unchanged", + input: "Use bold\n```\ncode\n```", + expected: "Use bold\n```\ncode" + ltSentinel + "T" + gtSentinel + "\n```", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := protectCodeAngleBrackets(tt.input) + assert.Equal(t, tt.expected, result) + }) + } +}