From 0c786d42a288298228ff448e81991bd1d7289798 Mon Sep 17 00:00:00 2001 From: Daniel Parks Date: Mon, 29 May 2023 09:26:18 -0700 Subject: [PATCH] Implement Shlex with bytes::Shlex. --- src/bytes.rs | 5 +- src/lib.rs | 129 +++++++++------------------------------------------ 2 files changed, 26 insertions(+), 108 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index 5c03d81..e3306f5 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -5,8 +5,9 @@ //! [`Shlex`] and friends for byte strings. //! -//! This may be more convenient if you are working with byte slices (`[u8]`) -//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): +//! This is used internally by the [outer module](crate), and may be more +//! convenient if you are working with byte slices (`[u8]`) or types that are +//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): //! //! ```rust //! #[cfg(unix)] { diff --git a/src/lib.rs b/src/lib.rs index e6bf432..444c1fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,9 @@ //! This implementation also deviates from the Python version in not treating `\r` specially, which //! I believe is more compliant. //! -//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes -//! directly as a micro-optimization. +//! This is a string-friendly wrapper around the [bytes] module that works on the underlying byte +//! slices. The algorithms in this crate are oblivious to UTF-8 high bytes, so working directly +//! with bytes is a safe micro-optimization. //! //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in //! `no_std` environments, where the `alloc` crate, and a global allocator, are available. @@ -33,122 +34,38 @@ pub mod bytes; /// An iterator that takes an input string and splits it into the words using the same syntax as /// the POSIX shell. -pub struct Shlex<'a> { - in_iter: core::str::Bytes<'a>, - /// The number of newlines read so far, plus one. 
- pub line_no: usize, - /// An input string is erroneous if it ends while inside a quotation or right after an - /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that - /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to - /// true; best to check it after you're done iterating. - pub had_error: bool, -} +/// +/// See [`bytes::Shlex`]. +pub struct Shlex<'a>(bytes::Shlex<'a>); impl<'a> Shlex<'a> { pub fn new(in_str: &'a str) -> Self { - Shlex { - in_iter: in_str.bytes(), - line_no: 1, - had_error: false, - } - } - - fn parse_word(&mut self, mut ch: u8) -> Option<String> { - let mut result: Vec<u8> = Vec::new(); - loop { - match ch as char { - '"' => if let Err(()) = self.parse_double(&mut result) { - self.had_error = true; - return None; - }, - '\'' => if let Err(()) = self.parse_single(&mut result) { - self.had_error = true; - return None; - }, - '\\' => if let Some(ch2) = self.next_char() { - if ch2 != '\n' as u8 { result.push(ch2); } - } else { - self.had_error = true; - return None; - }, - ' ' | '\t' | '\n' => { break; }, - _ => { result.push(ch as u8); }, - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } - } - unsafe { Some(String::from_utf8_unchecked(result)) } + Self(bytes::Shlex::new(in_str.as_bytes())) } +} - fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\\' => { - if let Some(ch3) = self.next_char() { - match ch3 as char { - // \$ => $ - '$' | '`' | '"' | '\\' => { result.push(ch3); }, - // \<newline> => nothing - '\n' => {}, - // \x => =x - _ => { result.push('\\' as u8); result.push(ch3); } - } - } else { - return Err(()); - } - }, - '"' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } +impl<'a> Iterator for Shlex<'a> { + type Item = String; + fn next(&mut self) -> Option<String> { + self.0.next().map(|byte_word| { + // Safety: given valid
UTF-8, bytes::Shlex will always return valid UTF-8. + unsafe { String::from_utf8_unchecked(byte_word) } + }) } +} - fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> { - loop { - if let Some(ch2) = self.next_char() { - match ch2 as char { - '\'' => { return Ok(()); }, - _ => { result.push(ch2); }, - } - } else { - return Err(()); - } - } - } +impl<'a> core::ops::Deref for Shlex<'a> { + type Target = bytes::Shlex<'a>; - fn next_char(&mut self) -> Option<u8> { - let res = self.in_iter.next(); - if res == Some('\n' as u8) { self.line_no += 1; } - res + fn deref(&self) -> &Self::Target { + &self.0 } } -impl<'a> Iterator for Shlex<'a> { - type Item = String; - fn next(&mut self) -> Option<String> { - if let Some(mut ch) = self.next_char() { - // skip initial whitespace - loop { - match ch as char { - ' ' | '\t' | '\n' => {}, - '#' => { - while let Some(ch2) = self.next_char() { - if ch2 as char == '\n' { break; } - } - }, - _ => { break; } - } - if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } - } - self.parse_word(ch) - } else { // no initial character - None - } +impl<'a> core::ops::DerefMut for Shlex<'a> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 } - } /// Convenience function that consumes the whole string at once. Returns None if the input was