From 76f9e9048991d519dd3c25d5549f8d4f9c9d7275 Mon Sep 17 00:00:00 2001 From: Rasmus Kaj Date: Sun, 18 Oct 2020 19:32:00 +0200 Subject: [PATCH] Implement LocatedSpan::get_line_beginning(). (#66) * Implement LocatedSpan::get_line(). Add a function to get the full input line containing the (start point of the) LocatedSpan. As suggested in #53. * Add some tests. * Remove bogus comment. * No need for get_line() to return Option. * The test that uses `format!` requires std. * Some rustfmt. * Refactor two similar unsafe blocks to one. * Add some disclaimer comments / docs. * Rename get_line to get_line_beginning. * Add line_begining_may_ot_be_entire_len test. This test documents how `get_line_beginning()` differs from a hypothetical `get_line()` method. --- src/lib.rs | 56 +++++++++++++++++++++++++-- src/tests.rs | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+), 4 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 46f3672..1e95578 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -267,17 +267,28 @@ impl LocatedSpan { &self.fragment } - fn get_columns_and_bytes_before(&self) -> (usize, &[u8]) { + // Attempt to get the "original" data slice back, by extending + // self.fragment backwards by self.offset. + // Note that any bytes truncated from after self.fragment will not + // be recovered. 
+ fn get_unoffsetted_slice(&self) -> &[u8] { let self_bytes = self.fragment.as_bytes(); let self_ptr = self_bytes.as_ptr(); - let before_self = unsafe { + unsafe { assert!( self.offset <= isize::max_value() as usize, "offset is too big" ); let orig_input_ptr = self_ptr.offset(-(self.offset as isize)); - slice::from_raw_parts(orig_input_ptr, self.offset) - }; + slice::from_raw_parts( + orig_input_ptr, + self.offset + self_bytes.len(), + ) + } + } + + fn get_columns_and_bytes_before(&self) -> (usize, &[u8]) { + let before_self = &self.get_unoffsetted_slice()[..self.offset]; let column = match memchr::memrchr(b'\n', before_self) { None => self.offset + 1, @@ -287,6 +298,43 @@ impl LocatedSpan { (column, &before_self[self.offset - (column - 1)..]) } + /// Return the line that contains this LocatedSpan. + /// + /// The `get_column` and `get_utf8_column` functions return + /// indexes that correspond to the line returned by this function. + /// + /// Note that if this LocatedSpan ends before the end of the + /// original data, the result of calling `get_line_beginning()` + /// will not include any data from after the LocatedSpan. 
+ /// + /// ``` + /// # extern crate nom_locate; + /// # extern crate nom; + /// # use nom_locate::LocatedSpan; + /// # use nom::{Slice, FindSubstring}; + /// # + /// # fn main() { + /// let program = LocatedSpan::new( + /// "Hello World!\ + /// \nThis is a multi-line input\ + /// \nthat ends after this line.\n"); + /// let multi = program.find_substring("multi").unwrap(); + /// + /// assert_eq!( + /// program.slice(multi..).get_line_beginning(), + /// "This is a multi-line input".as_bytes(), + /// ); + /// # } + /// ``` + pub fn get_line_beginning(&self) -> &[u8] { + let column0 = self.get_column() - 1; + let the_line = &self.get_unoffsetted_slice()[self.offset - column0..]; + match memchr::memchr(b'\n', &the_line[column0..]) { + None => the_line, + Some(pos) => &the_line[..column0 + pos], + } + } + /// Return the column index, assuming 1 byte = 1 column. /// /// Use it for ascii text, or use get_utf8_column for UTF8. diff --git a/src/tests.rs b/src/tests.rs index 537b53e..c07b273 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -414,3 +414,109 @@ fn it_should_display_hex() { "00000000\t61 62 63 \tabc\n".to_owned() ); } + +#[test] +fn line_of_empty_span_is_empty() { + assert_eq!(StrSpan::new("").get_line_beginning(), "".as_bytes()); +} + +#[test] +fn line_of_single_line_start_is_whole() { + assert_eq!( + StrSpan::new("A single line").get_line_beginning(), + "A single line".as_bytes(), + ); +} +#[test] +fn line_of_single_line_end_is_whole() { + let data = "A single line"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line_beginning(), + "A single line".as_bytes(), + ); +} + +#[test] +fn line_of_start_is_first() { + assert_eq!( + StrSpan::new( + "One line of text\ + \nFollowed by a second\ + \nand a third\n" + ) + .get_line_beginning(), + "One line of text".as_bytes(), + ); +} + +#[test] +fn line_of_nl_is_before() { + let data = "One line of text\ + \nFollowed by a second\ + \nand a third\n"; + assert_eq!( + StrSpan::new(data) + 
+ .slice(data.find('\n').unwrap()..) + .get_line_beginning(), + "One line of text".as_bytes(), + ); +} + +#[test] +fn line_of_end_after_nl_is_empty() { + let data = "One line of text\ + \nFollowed by a second\ + \nand a third\n"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line_beginning(), + "".as_bytes(), + ); +} + +#[test] +fn line_of_end_no_nl_is_last() { + let data = "One line of text\ + \nFollowed by a second\ + \nand a third"; + assert_eq!( + StrSpan::new(data).slice(data.len()..).get_line_beginning(), + "and a third".as_bytes(), + ); +} + +/// This test documents how `get_line_beginning()` differs from +/// a hypothetical `get_line()` method. +#[test] +fn line_begining_may_ot_be_entire_len() { + let data = "One line of text\ + \nFollowed by a second\ + \nand a third"; + let by = "by"; + let pos = data.find_substring(by).unwrap(); + assert_eq!( + StrSpan::new(data).slice(pos..pos+by.len()).get_line_beginning(), + "Followed by".as_bytes(), + ); +} + +#[cfg(feature = "std")] +#[test] +fn line_for_non_ascii_chars() { + let data = StrSpan::new( + "Några rader text på Svenska.\ + \nFörra raden var först, den här är i mitten\ + \noch här är sista raden.\n", + ); + let s = data.slice(data.find_substring("först").unwrap()..); + assert_eq!( + format!( + "{line_no:3}: {line_text}\n {0:>lpos$}^- The match\n", + "", + line_no = s.location_line(), + line_text = core::str::from_utf8(s.get_line_beginning()).unwrap(), + lpos = s.get_utf8_column(), + ), + " 2: Förra raden var först, den här är i mitten\ + \n ^- The match\n", + ); +}