From 61cf00464e311b3dcf1a47b342c57978b1f40d88 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Sat, 13 Apr 2024 19:34:38 -0700 Subject: [PATCH] Stabilize Utf8Chunks --- library/alloc/src/lib.rs | 1 - library/alloc/src/str.rs | 2 +- library/alloc/src/string.rs | 4 +- library/core/src/str/lossy.rs | 74 +++++++++++++++++++---------- library/core/src/str/mod.rs | 2 +- library/core/tests/lib.rs | 1 - library/core/tests/str_lossy.rs | 6 +-- library/std/src/lib.rs | 1 - library/std/src/sys/os_str/bytes.rs | 6 +-- 9 files changed, 57 insertions(+), 40 deletions(-) diff --git a/library/alloc/src/lib.rs b/library/alloc/src/lib.rs index dec04d7e421e3..7d71f10e876e1 100644 --- a/library/alloc/src/lib.rs +++ b/library/alloc/src/lib.rs @@ -163,7 +163,6 @@ #![feature(tuple_trait)] #![feature(unicode_internals)] #![feature(unsize)] -#![feature(utf8_chunks)] #![feature(vec_pop_if)] // tidy-alphabetical-end // diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index ade114678b7f9..d88639c4092ea 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -53,7 +53,7 @@ pub use core::str::{RSplit, Split}; pub use core::str::{RSplitN, SplitN}; #[stable(feature = "rust1", since = "1.0.0")] pub use core::str::{RSplitTerminator, SplitTerminator}; -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] pub use core::str::{Utf8Chunk, Utf8Chunks}; /// Note: `str` in `Concat` is not meaningful here. 
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs index 5d552c8f15c60..29664fc3dfd51 100644 --- a/library/alloc/src/string.rs +++ b/library/alloc/src/string.rs @@ -58,8 +58,6 @@ use core::ops::{self, Range, RangeBounds}; use core::ptr; use core::slice; use core::str::pattern::Pattern; -#[cfg(not(no_global_oom_handling))] -use core::str::Utf8Chunks; #[cfg(not(no_global_oom_handling))] use crate::borrow::{Cow, ToOwned}; @@ -633,7 +631,7 @@ impl String { #[cfg(not(no_global_oom_handling))] #[stable(feature = "rust1", since = "1.0.0")] pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> { - let mut iter = Utf8Chunks::new(v); + let mut iter = v.utf8_chunks(); let first_valid = if let Some(chunk) = iter.next() { let valid = chunk.valid(); diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 59f873d1268ce..f8ecf1f3a7c52 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -6,6 +6,46 @@ use crate::iter::FusedIterator; use super::from_utf8_unchecked; use super::validations::utf8_char_width; +impl [u8] { + /// Creates an iterator over the contiguous valid UTF-8 ranges of this + /// slice, and the non-UTF-8 fragments in between. + /// + /// # Examples + /// + /// This function formats arbitrary but mostly-UTF-8 bytes into Rust source + /// code in the form of a C-string literal (`c"..."`). + /// + /// ``` + /// use std::fmt::Write as _; + /// + /// pub fn cstr_literal(bytes: &[u8]) -> String { + /// let mut repr = String::new(); + /// repr.push_str("c\""); + /// for chunk in bytes.utf8_chunks() { + /// for ch in chunk.valid().chars() { + /// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters. 
+ /// write!(repr, "{}", ch.escape_debug()).unwrap(); + /// } + /// for byte in chunk.invalid() { + /// write!(repr, "\\x{:02X}", byte).unwrap(); + /// } + /// } + /// repr.push('"'); + /// repr + /// } + /// + /// fn main() { + /// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07"); + /// let expected = stringify!(c"\xFErris the 🦀\u{7}"); + /// assert_eq!(lit, expected); + /// } + /// ``` + #[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] + pub fn utf8_chunks(&self) -> Utf8Chunks<'_> { + Utf8Chunks { source: self } + } +} + /// An item returned by the [`Utf8Chunks`] iterator. /// /// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character @@ -14,15 +54,11 @@ use super::validations::utf8_char_width; /// # Examples /// /// ``` -/// #![feature(utf8_chunks)] -/// -/// use std::str::Utf8Chunks; -/// /// // An invalid UTF-8 string /// let bytes = b"foo\xF1\x80bar"; /// /// // Decode the first `Utf8Chunk` -/// let chunk = Utf8Chunks::new(bytes).next().unwrap(); +/// let chunk = bytes.utf8_chunks().next().unwrap(); /// /// // The first three characters are valid UTF-8 /// assert_eq!("foo", chunk.valid()); @@ -30,7 +66,7 @@ use super::validations::utf8_char_width; /// // The fourth character is broken /// assert_eq!(b"\xF1\x80", chunk.invalid()); /// ``` -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] #[derive(Clone, Debug, PartialEq, Eq)] pub struct Utf8Chunk<'a> { valid: &'a str, @@ -43,7 +79,7 @@ impl<'a> Utf8Chunk<'a> { /// This substring can be empty at the start of the string or between /// broken UTF-8 characters. 
#[must_use] - #[unstable(feature = "utf8_chunks", issue = "99543")] + #[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] pub fn valid(&self) -> &'a str { self.valid } @@ -63,7 +99,7 @@ impl<'a> Utf8Chunk<'a> { /// [`valid`]: Self::valid /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER #[must_use] - #[unstable(feature = "utf8_chunks", issue = "99543")] + #[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] pub fn invalid(&self) -> &'a [u8] { self.invalid } @@ -78,7 +114,7 @@ impl fmt::Debug for Debug<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.write_char('"')?; - for chunk in Utf8Chunks::new(self.0) { + for chunk in self.0.utf8_chunks() { // Valid part. // Here we partially parse UTF-8 again which is suboptimal. { @@ -123,12 +159,8 @@ impl fmt::Debug for Debug<'_> { /// [`String::from_utf8_lossy`] without allocating heap memory: /// /// ``` -/// #![feature(utf8_chunks)] -/// -/// use std::str::Utf8Chunks; -/// /// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) { -/// for chunk in Utf8Chunks::new(input) { +/// for chunk in input.utf8_chunks() { /// push(chunk.valid()); /// /// if !chunk.invalid().is_empty() { @@ -140,19 +172,13 @@ impl fmt::Debug for Debug<'_> { /// /// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy #[must_use = "iterators are lazy and do nothing unless consumed"] -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] #[derive(Clone)] pub struct Utf8Chunks<'a> { source: &'a [u8], } impl<'a> Utf8Chunks<'a> { - /// Creates a new iterator to decode the bytes. 
- #[unstable(feature = "utf8_chunks", issue = "99543")] - pub fn new(bytes: &'a [u8]) -> Self { - Self { source: bytes } - } - #[doc(hidden)] #[unstable(feature = "str_internals", issue = "none")] pub fn debug(&self) -> Debug<'_> { @@ -160,7 +186,7 @@ impl<'a> Utf8Chunks<'a> { } } -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] impl<'a> Iterator for Utf8Chunks<'a> { type Item = Utf8Chunk<'a>; @@ -259,10 +285,10 @@ impl<'a> Iterator for Utf8Chunks<'a> { } } -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] impl FusedIterator for Utf8Chunks<'_> {} -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] impl fmt::Debug for Utf8Chunks<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish() diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index 61a604561458b..3313da9dce78c 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -24,7 +24,7 @@ use crate::slice::{self, SliceIndex}; pub mod pattern; mod lossy; -#[unstable(feature = "utf8_chunks", issue = "99543")] +#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")] pub use lossy::{Utf8Chunk, Utf8Chunks}; #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/core/tests/lib.rs b/library/core/tests/lib.rs index e741149e7ce21..8bbd1901df349 100644 --- a/library/core/tests/lib.rs +++ b/library/core/tests/lib.rs @@ -118,7 +118,6 @@ #![feature(error_generic_member_access)] #![feature(error_in_core)] #![feature(trait_upcasting)] -#![feature(utf8_chunks)] #![feature(is_ascii_octdigit)] #![feature(get_many_mut)] #![feature(iter_map_windows)] diff --git a/library/core/tests/str_lossy.rs b/library/core/tests/str_lossy.rs index 9d3f0b65fdb94..6e70ea3e28574 100644 --- 
a/library/core/tests/str_lossy.rs +++ b/library/core/tests/str_lossy.rs @@ -1,10 +1,8 @@ -use core::str::Utf8Chunks; - #[test] fn chunks() { macro_rules! assert_chunks { ( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{ - let mut iter = Utf8Chunks::new($string); + let mut iter = $string.utf8_chunks(); $( let chunk = iter.next().expect("missing chunk"); assert_eq!($valid, chunk.valid()); @@ -79,7 +77,7 @@ fn debug() { "\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"", &format!( "{:?}", - Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(), + b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa".utf8_chunks().debug(), ), ); } diff --git a/library/std/src/lib.rs b/library/std/src/lib.rs index e9de3b7767044..906b64c0dd281 100644 --- a/library/std/src/lib.rs +++ b/library/std/src/lib.rs @@ -313,7 +313,6 @@ #![feature(thread_local)] #![feature(try_blocks)] #![feature(type_alias_impl_trait)] -#![feature(utf8_chunks)] // tidy-alphabetical-end // // Library features (core): diff --git a/library/std/src/sys/os_str/bytes.rs b/library/std/src/sys/os_str/bytes.rs index 4ca3f1cd1853a..9be02bc191e5e 100644 --- a/library/std/src/sys/os_str/bytes.rs +++ b/library/std/src/sys/os_str/bytes.rs @@ -11,8 +11,6 @@ use crate::str; use crate::sync::Arc; use crate::sys_common::{AsInner, IntoInner}; -use core::str::Utf8Chunks; - #[cfg(test)] mod tests; @@ -29,7 +27,7 @@ pub struct Slice { impl fmt::Debug for Slice { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f) + fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f) } } @@ -41,7 +39,7 @@ impl fmt::Display for Slice { return "".fmt(f); } - for chunk in Utf8Chunks::new(&self.inner) { + for chunk in self.inner.utf8_chunks() { let valid = chunk.valid(); // If we successfully decoded the whole chunk as a valid string then // we can return a direct formatting of the string which will also