diff --git a/crates/jp_md/src/ansi.rs b/crates/jp_md/src/ansi.rs index 1fa2c0eb..31d5fdf4 100644 --- a/crates/jp_md/src/ansi.rs +++ b/crates/jp_md/src/ansi.rs @@ -1,8 +1,8 @@ //! Shared ANSI SGR escape constants and state tracking. //! -//! This module provides the escape sequences, state tracking, and visual -//! width computation used by both the terminal renderer (`render.rs`) and -//! the table formatter (`table.rs`). +//! This module provides the escape sequences, state tracking, and visual width +//! computation used by both the terminal renderer (`render.rs`) and the table +//! formatter (`table.rs`). /// SGR: Bold on. pub const BOLD_START: &str = "\x1b[1m"; @@ -39,9 +39,9 @@ pub const RESET: &str = "\x1b[0m"; /// Tracks which ANSI SGR attributes are currently active. /// -/// Used to close formatting at line breaks and re-open it on the next -/// line, both for the terminal renderer's incremental wrapping and the -/// table formatter's batch wrapping. +/// Used to close formatting at line breaks and re-open it on the next line, +/// both for the terminal renderer's incremental wrapping and the table +/// formatter's batch wrapping. #[derive(Debug, Clone, Default)] #[expect(clippy::struct_excessive_bools)] pub struct AnsiState { @@ -81,8 +81,8 @@ impl AnsiState { || self.background.is_some() } - /// Update the tracked state from a complete ANSI escape sequence - /// (e.g. `"\x1b[1m"`). + /// Update the tracked state from a complete ANSI escape sequence (e.g. + /// `"\x1b[1m"`). pub(crate) fn update(&mut self, esc: &str) { match esc { BOLD_START => self.bold = true, @@ -161,10 +161,9 @@ impl AnsiState { /// Calculate the visual width of a string, ignoring ANSI escape sequences. /// -/// Strips ANSI escape sequences, then delegates to -/// `UnicodeWidthStr::width()` which correctly handles multi-codepoint -/// sequences like emoji presentation (VS16), ZWJ sequences, and -/// script-specific ligatures. +/// Strips ANSI escape sequences, then delegates to `UnicodeWidthStr::width()` +/// which correctly handles multi-codepoint sequences like emoji presentation +/// (VS16), ZWJ sequences, and script-specific ligatures. pub fn visual_width(s: &str) -> usize { use unicode_width::UnicodeWidthStr as _; diff --git a/crates/jp_md/src/buffer.rs b/crates/jp_md/src/buffer.rs index c0ce0108..2f89108c 100644 --- a/crates/jp_md/src/buffer.rs +++ b/crates/jp_md/src/buffer.rs @@ -24,10 +24,11 @@ const TYPE5_START_TAG: &str = ", } @@ -184,17 +187,16 @@ impl Buffer { /// Drain all remaining content and emit it as a sequence of events. /// - /// Called at the end of the stream. For a `Buffer` that's mid-list - /// with several complete items still queued (because the buffer - /// can't flush an item until the next line is fully received), - /// this emits each complete item as its own `Block` event with - /// the correct renumbered marker and visual indent. The trailing - /// partial segment becomes the final `Flush` event. + /// Called at the end of the stream. + /// For a `Buffer` that's mid-list with several complete items still queued + /// (because the buffer can't flush an item until the next line is fully + /// received), this emits each complete item as its own `Block` event with + /// the correct renumbered marker and visual indent. + /// The trailing partial segment becomes the final `Flush` event. /// - /// For non-list states the entire remainder is emitted as a - /// single `Flush` event with the active indent (stripping the - /// fence's indent for `InFencedCode`, leaving content as-is - /// otherwise). + /// For non-list states the entire remainder is emitted as a single `Flush` + /// event with the active indent (stripping the fence's indent for + /// `InFencedCode`, leaving content as-is otherwise). pub fn flush_events(&mut self) -> Vec { let raw = std::mem::take(&mut self.data); @@ -240,10 +242,10 @@ impl Buffer { events } - /// Implements [`Self::flush_events`] for `InList` state: scan the - /// remaining buffer for sibling-marker boundaries, emit each - /// complete preceding item as a `Block` (renumbered against the - /// list's start), and emit the final segment as a `Flush`. + /// Implements [`Self::flush_events`] for `InList` state: scan the remaining + /// buffer for sibling-marker boundaries, emit each complete preceding item + /// as a `Block` (renumbered against the list's start), and emit the final + /// segment as a `Flush`. fn flush_list_events( raw: &str, marker_column: usize, @@ -331,8 +333,8 @@ impl Buffer { } } - /// Handles the `AtBoundary` state: we are at a block boundary. We inspect - /// the start of the buffer to decide what block we're in. + /// Handles the `AtBoundary` state: we are at a block boundary. + /// We inspect the start of the buffer to decide what block we're in. fn handle_at_boundary(&mut self) -> (Option, State) { // Trim leading blank lines, as they are just block separators. let trimmed_buffer = self.data.trim_start_matches('\n'); @@ -434,8 +436,8 @@ impl Buffer { (None, State::BufferingParagraph) } - /// Handles `BufferingParagraph`: we're in a paragraph-like block. We need - /// to find its terminator. + /// Handles `BufferingParagraph`: we're in a paragraph-like block. + /// We need to find its terminator. fn handle_buffering_paragraph(&mut self) -> (Option, State) { let mut terminator_pos: Option = None; let mut flush_len: usize = 0; @@ -498,19 +500,20 @@ impl Buffer { /// Handles `InList`: we're inside a list, buffering the current item. /// - /// Walks the buffer line by line, looking for a safe flush point. A - /// flush is safe at: + /// Walks the buffer line by line, looking for a safe flush point. + /// A flush is safe at: /// - /// - A sibling marker at column == `marker_column` (the current item - /// is complete; the new marker starts the next item). Stay in this - /// state. - /// - A line at column ≤ `marker_column` that is not a list marker, when - /// it either is a block starter (header, HR, fenced code, HTML block) - /// or follows a blank line. The list has ended. Transition back to - /// `AtBoundary`. + /// - A sibling marker at column == `marker_column` (the current item is + /// complete; the new marker starts the next item). + /// Stay in this state. + /// - A line at column ≤ `marker_column` that is not a list marker, when it + /// either is a block starter (header, HR, fenced code, HTML block) or + /// follows a blank line. + /// The list has ended. + /// Transition back to `AtBoundary`. /// - /// Blank lines and indented continuations (column > `marker_column`) - /// are buffered, not flushed. + /// Blank lines and indented continuations (column \> `marker_column`) are + /// buffered, not flushed. #[expect(clippy::too_many_lines)] fn handle_in_list( &mut self, @@ -532,10 +535,18 @@ impl Buffer { // Drop a single leading blank line: it belongs to the trailing // separator of whatever block was emitted just before us (e.g. - // a closing fence), not to the next item. Multiple blanks are - // left for the walk's `prev_blank` logic to interpret — two - // blank lines + less-indented content should still terminate - // the list. + // a closing fence, or a nested list that terminated on a blank). + // Multiple blanks are left for the walk's `prev_blank` logic to + // interpret — two blank lines + less-indented content should + // still terminate the list. + // + // The blank itself carries signal: it means the immediately + // preceding scope ended on a blank line. Initialise `prev_blank` + // from that so the walk's terminator check (`prev_blank && + // indent < content_column`) fires on the very first line, which + // matters when a popped child consumed the blank as part of its + // own flush and left us with a less-indented non-marker at the + // head of the buffer. let leading_blank = leading_blank_line_bytes(&self.data); if leading_blank > 0 { self.data.drain(..leading_blank); @@ -552,7 +563,13 @@ impl Buffer { } let mut scan = 0_usize; - let mut prev_blank = false; + // Byte offset just past the last non-blank line we've walked. + // Used by the `Terminator` branch to flush only the item's + // actual content and leave trailing blank lines in the buffer, + // so the popped-to parent state can pick up the same + // `prev_blank=true` signal that triggered the termination here. + let mut last_content_end = 0_usize; + let mut prev_blank = leading_blank > 0; while scan < self.data.len() { // Compute line shape without holding a borrow on the buffer @@ -587,9 +604,11 @@ impl Buffer { if scan == 0 { prev_blank = false; scan += line_len; + last_content_end = scan; continue; } let (event, new_state) = self.flush_list_segment( + scan, scan, marker_column, content_column, @@ -603,6 +622,7 @@ impl Buffer { ListLineKind::NestedContainer => { if scan > 0 { let (event, new_state) = self.flush_list_segment( + scan, scan, marker_column, content_column, @@ -624,10 +644,18 @@ impl Buffer { // Fall through as continuation defensively. prev_blank = false; scan += line_len; + last_content_end = scan; } ListLineKind::Terminator => { let next_state = self.pop_parent_or_boundary(); - if scan == 0 { + // Flush only this scope's actual content; leave any + // trailing blank lines in the buffer so the popped-to + // parent state can see them and apply its own + // termination check. Without this, a paragraph at + // less indent than the parent's `content_column` + // would be misclassified as a lazy continuation of + // the parent item. + if last_content_end == 0 { // Nothing buffered for this list yet (e.g. the // parent's next marker arrived right after we // entered this nested list). Hand control back @@ -636,8 +664,15 @@ impl Buffer { // empty `Block`. return (None, next_state); } + // Capture content up to `scan` (including any trailing + // blank lines we walked past) so the rendered Block + // keeps the visual separator to the next sibling Block; + // drain only up to `last_content_end` so those same + // blank lines stay in the buffer for the parent state + // to see as `prev_blank=true`. let (event, _) = self.flush_list_segment( scan, + last_content_end, marker_column, content_column, is_ordered, @@ -650,6 +685,7 @@ impl Buffer { ListLineKind::Continuation => { prev_blank = false; scan += line_len; + last_content_end = scan; } } } @@ -658,9 +694,10 @@ impl Buffer { } /// If the buffer starts with a nested list marker or a fence at - /// `content_column` or deeper, return the transition that enters - /// that nested container. The caller is responsible for pushing the - /// current `InList` state onto `parents` before returning. + /// `content_column` or deeper, return the transition that enters that + /// nested container. + /// The caller is responsible for pushing the current `InList` state onto + /// `parents` before returning. fn maybe_enter_nested_from_list_head( &mut self, marker_column: usize, @@ -707,16 +744,26 @@ impl Buffer { None } - /// Drain `flush_pos` bytes from the buffer and emit them as a Block. + /// Capture `content_end` bytes as the Block content and drain `drain_end` + /// bytes from the buffer. + /// + /// In the common case (`SiblingMarker`, `NestedContainer`), the two are + /// equal: drain the same bytes that go into the Block. + /// The `Terminator` branch passes `content_end > drain_end` to keep + /// trailing blank lines *both* in the emitted Block (so the renderer + /// preserves the visual separation between this item and whatever follows) + /// *and* in the buffer (so the popped-to parent state can pick up + /// `prev_blank=true`). /// - /// The first line of the segment is inspected to decide whether the - /// segment is an item-style flush (starts with this list's marker) or - /// a paragraph-style flush (continuation content inside the item). - /// Item flushes are renumbered against `start_number + items_flushed` - /// for ordered lists, and the returned `items_flushed` is incremented. + /// The first line of the segment is inspected to decide whether the segment + /// is an item-style flush (starts with this list's marker) or a + /// paragraph-style flush (continuation content inside the item). + /// Item flushes are renumbered against `start_number + items_flushed` for + /// ordered lists, and the returned `items_flushed` is incremented. fn flush_list_segment( &mut self, - flush_pos: usize, + content_end: usize, + drain_end: usize, marker_column: usize, content_column: usize, is_ordered: bool, @@ -724,7 +771,12 @@ impl Buffer { start_number: u32, items_flushed: u32, ) -> (Event, State) { - let raw: String = self.data.drain(..flush_pos).collect(); + debug_assert!( + drain_end <= content_end, + "drain_end ({drain_end}) must not exceed content_end ({content_end})" + ); + let raw: String = self.data[..content_end].to_string(); + self.data.drain(..drain_end); let first_line = raw.lines().next().unwrap_or(""); let (first_indent, first_content) = get_indent(first_line); let is_item = first_indent == marker_column && is_list_marker(first_content); @@ -853,8 +905,8 @@ impl Buffer { /// Handles `InFencedCode`: we process one line at a time. /// /// Tracks nesting depth so that inner fenced code blocks (which LLMs - /// frequently produce inside markdown code blocks) don't prematurely - /// close the outer block. + /// frequently produce inside markdown code blocks) don't prematurely close + /// the outer block. fn handle_in_fenced_code( &mut self, fence_type: FenceType, @@ -1065,7 +1117,8 @@ impl Iterator for Buffer { /// Check if content (after indent stripping) starts with a list marker. /// -/// Matches unordered (`- `, `* `, `+ `) and ordered (`1. `, `2) `) markers. +/// Matches unordered (` - `, ` * `, ` + `) and ordered (` 1. `, ` 2) `) +/// markers. fn is_list_marker(content: &str) -> bool { parse_list_marker(content).is_some() } @@ -1073,19 +1126,21 @@ fn is_list_marker(content: &str) -> bool { /// A parsed list marker. #[derive(Debug, Clone, Copy, PartialEq, Eq)] struct ListMarker { - /// Visual width of the marker including the trailing space, e.g. - /// 2 for `- `, 3 for `1. `, 4 for `10. `. + /// Visual width of the marker including the trailing space, e.g. 2 for `-`, + /// 3 for ` 1. `, 4 for ` 10. `. marker_width: usize, /// Whether the marker is ordered (digits + delimiter). is_ordered: bool, /// The delimiter byte: `.` or `)` for ordered, `-`/`*`/`+` for bullet. delimiter: u8, - /// For ordered markers, the number value. `0` for bullet markers. + /// For ordered markers, the number value. + /// `0` for bullet markers. number: u32, } /// Parse a list marker at the start of `content`, returning its shape if -/// present. `content` should already have leading whitespace stripped. +/// present. +/// `content` should already have leading whitespace stripped. fn parse_list_marker(content: &str) -> Option { let bytes = content.as_bytes(); @@ -1118,14 +1173,14 @@ fn parse_list_marker(content: &str) -> Option { }) } -/// Count the number of leading bytes in `s` that form *a single* blank -/// line (only spaces and tabs, terminated by `\n`). Returns `0` if the -/// content doesn't begin with a blank line. +/// Count the number of leading bytes in `s` that form *a single* blank line +/// (only spaces and tabs, terminated by `\n`). +/// Returns `0` if the content doesn't begin with a blank line. /// -/// Used at the start of `handle_in_list` to consume the trailing -/// separator left behind by a just-closed inner block (e.g. a fenced -/// code block). Stops after one line so two-blank-lines-end-of-list -/// semantics still propagate to the walk. +/// Used at the start of `handle_in_list` to consume the trailing separator left +/// behind by a just-closed inner block (e.g. a fenced code block). +/// Stops after one line so two-blank-lines-end-of-list semantics still +/// propagate to the walk. fn leading_blank_line_bytes(s: &str) -> usize { let bytes = s.as_bytes(); let mut idx = 0; @@ -1144,11 +1199,11 @@ fn leading_blank_line_bytes(s: &str) -> usize { enum ListLineKind { /// A sibling marker at this list's `marker_column`. SiblingMarker, - /// A list marker or fenced code start at `content_column` or deeper - /// — a nested container inside the current item. + /// A list marker or fenced code start at `content_column` or deeper — a + /// nested container inside the current item. NestedContainer, - /// A line that terminates the list: less-indented after a blank, or - /// a block interrupter at <= 3 spaces. + /// A line that terminates the list: less-indented after a blank, or a block + /// interrupter at \<= 3 spaces. Terminator, /// Any other non-blank line: continuation of the current item. Continuation, @@ -1158,8 +1213,8 @@ enum ListLineKind { /// /// `is_ordered` and `delimiter` describe the active list's marker shape, used /// to distinguish sibling markers from markers that start a *new* list at the -/// same column (per CommonMark §5.2: two markers are the same kind only if -/// they share `is_ordered` and their delimiter character). +/// same column (per CommonMark §5.2: two markers are the same kind only if they +/// share `is_ordered` and their delimiter character). fn classify_list_line( indent: usize, content: &str, @@ -1238,9 +1293,10 @@ fn strip_lines_indent(raw: &str, max_strip: usize) -> String { /// Rewrite the leading ordered-list marker number in `content` to `new`. /// -/// `delimiter` is the marker's delimiter byte (`.` or `)`); used to confirm -/// the leading marker shape before rewriting. If the content does not start -/// with a matching marker, it is returned unchanged. +/// `delimiter` is the marker's delimiter byte (`.` or `)`); used to confirm the +/// leading marker shape before rewriting. +/// If the content does not start with a matching marker, it is returned +/// unchanged. fn renumber_first_marker(content: String, new: u32, delimiter: u8) -> String { let bytes = content.as_bytes(); let digit_count = bytes.iter().take_while(|b| b.is_ascii_digit()).count(); diff --git a/crates/jp_md/src/buffer/fixup.rs b/crates/jp_md/src/buffer/fixup.rs index 5798275c..66078404 100644 --- a/crates/jp_md/src/buffer/fixup.rs +++ b/crates/jp_md/src/buffer/fixup.rs @@ -1,8 +1,9 @@ //! Post-processing fixups for buffer events. //! //! Fixups are stateful transformers that sit between the [`Buffer`] iterator -//! and the consumer. They handle LLM-specific quirks that don't belong in the -//! core markdown parsing logic. +//! and the consumer. +//! They handle LLM-specific quirks that don't belong in the core markdown +//! parsing logic. //! //! [`Buffer`]: super::Buffer @@ -10,17 +11,19 @@ use super::Event; /// A stateful event transformer. /// -/// Each fixup inspects events as they pass through and can rewrite, -/// suppress, or pass them unchanged. Fixups may hold state across -/// events (e.g. remembering properties of the previous block). +/// Each fixup inspects events as they pass through and can rewrite, suppress, +/// or pass them unchanged. +/// Fixups may hold state across events (e.g. remembering properties of the +/// previous block). pub trait EventFixup { - /// Process a single event. Returns `None` to suppress the event, - /// or `Some(event)` (possibly modified) to pass it through. + /// Process a single event. + /// Returns `None` to suppress the event, or `Some(event)` (possibly + /// modified) to pass it through. fn process(&mut self, event: Event) -> Option; } -/// Wraps a buffer iterator and applies a chain of [`EventFixup`]s to -/// each emitted event. +/// Wraps a buffer iterator and applies a chain of [`EventFixup`]s to each +/// emitted event. pub struct FixupChain { /// The underlying event source. inner: I, @@ -55,8 +58,9 @@ impl> Iterator for FixupChain { } /// Check if a block contains a fence pattern embedded mid-line (not at the -/// start). This indicates the LLM started a code block at the end of a -/// paragraph line, and a subsequent bare fence is likely the orphaned close. +/// start). +/// This indicates the LLM started a code block at the end of a paragraph line, +/// and a subsequent bare fence is likely the orphaned close. fn has_embedded_fence(block: &str) -> bool { for line in block.lines() { let trimmed = line.trim_start(); @@ -74,17 +78,18 @@ fn has_embedded_fence(block: &str) -> bool { /// Fixes orphaned closing fences from mid-line code fence patterns. /// -/// When an LLM produces backticks mid-line (e.g. `text:```lang`), -/// the bare closing fence on the next line gets misinterpreted as a -/// new code block opening. This fixup detects when a `Block` contains -/// such an embedded fence pattern and converts the following bare -/// `FencedCodeStart` (no language tag) into a `Block` instead. +/// When an LLM produces backticks mid-line (e.g. +/// `text:```lang`), the bare closing fence on the next line gets misinterpreted +/// as a new code block opening. +/// This fixup detects when a `Block` contains such an embedded fence pattern +/// and converts the following bare `FencedCodeStart` (no language tag) into a +/// `Block` instead. pub struct OrphanedFenceFixup { /// Whether the previous block had an embedded fence pattern. prev_had_embedded_fence: bool, /// When true, we're inside a fake code block from an orphaned fence. - /// All `FencedCodeLine` events become `Block` events, and - /// `FencedCodeEnd` is suppressed. + /// All `FencedCodeLine` events become `Block` events, and `FencedCodeEnd` + /// is suppressed. suppressing: bool, } @@ -146,9 +151,9 @@ impl EventFixup for OrphanedFenceFixup { /// Escalates fence lengths so rendered output safely contains inner fences. /// -/// Rewrites `FencedCodeStart` and `FencedCodeEnd` events to use at least -/// 5 backticks/tildes, so 3-backtick inner fences render as literal -/// content in the output. +/// Rewrites `FencedCodeStart` and `FencedCodeEnd` events to use at least 5 +/// backticks/tildes, so 3-backtick inner fences render as literal content in +/// the output. pub struct FenceEscalationFixup; impl EventFixup for FenceEscalationFixup { diff --git a/crates/jp_md/src/buffer/state.rs b/crates/jp_md/src/buffer/state.rs index 8753bc43..508187c3 100644 --- a/crates/jp_md/src/buffer/state.rs +++ b/crates/jp_md/src/buffer/state.rs @@ -19,30 +19,30 @@ pub enum State { /// /// While in this state, indented content at any column greater than /// `marker_column` is treated as continuation of the current item. - /// In particular, content at column 4 is *not* treated as an indented - /// code block — that classification only applies at block boundaries - /// outside a list. + /// In particular, content at column 4 is *not* treated as an indented code + /// block — that classification only applies at block boundaries outside a + /// list. /// - /// When a fence or a deeper list marker appears inside the item, - /// the buffer pushes the current `InList` state onto its parents - /// stack and switches to the inner state. On close, the parent is - /// popped back. + /// When a fence or a deeper list marker appears inside the item, the buffer + /// pushes the current `InList` state onto its parents stack and switches to + /// the inner state. + /// On close, the parent is popped back. InList { - /// Column where the list's markers appear. Sibling items must - /// share this column. + /// Column where the list's markers appear. + /// Sibling items must share this column. marker_column: usize, - /// Column where item content (post-marker) starts. Deeper - /// markers seen at this column or beyond start a nested list. + /// Column where item content (post-marker) starts. + /// Deeper markers seen at this column or beyond start a nested list. content_column: usize, - /// Whether the list is ordered (digit + delim). Bullet otherwise. + /// Whether the list is ordered (digit + delim). + /// Bullet otherwise. is_ordered: bool, - /// The marker delimiter character: `.` or `)` for ordered, `-`/ - /// `*`/`+` for bullet. + /// The marker delimiter character: `.` or `)` for ordered, `-`/ `*`/`+` + /// for bullet. delimiter: u8, - /// For ordered lists, the marker number on the first item. Used - /// to renumber emitted items so the output is consistent with - /// CommonMark's renumbering even though we stream items - /// individually. + /// For ordered lists, the marker number on the first item. + /// Used to renumber emitted items so the output is consistent with + /// CommonMark's renumbering even though we stream items individually. start_number: u32, /// Number of items already flushed from this list. items_flushed: u32, @@ -58,15 +58,15 @@ pub enum State { /// The indentation of the opening fence. /// - /// When the fence is inside a list item, this is the parent - /// list's `content_column`; code lines have this many leading - /// spaces stripped before emission. + /// When the fence is inside a list item, this is the parent list's + /// `content_column`; code lines have this many leading spaces stripped + /// before emission. indent: usize, - /// Nesting depth of inner fenced code blocks. When an inner fence - /// opening (backticks + language tag) is seen, this increments; - /// a bare closing fence decrements. The outer block only closes - /// when depth reaches 0. + /// Nesting depth of inner fenced code blocks. + /// When an inner fence opening (backticks + language tag) is seen, this + /// increments; a bare closing fence decrements. + /// The outer block only closes when depth reaches 0. depth: usize, }, @@ -101,8 +101,8 @@ impl FenceType { /// Represents the 7 types of HTML blocks defined by the CommonMark spec. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum HtmlBlockRule { - /// Type 1: `