Skip to content

Commit

Permalink
Auto merge of rust-lang#123909 - dtolnay:utf8chunks, r=joboet
Browse files Browse the repository at this point in the history
Stabilize `Utf8Chunks`

Pending FCP in rust-lang#99543.

This PR includes the proposed modification in rust-lang/libs-team#190 as agreed in rust-lang#99543 (comment).
  • Loading branch information
bors committed Apr 26, 2024
2 parents 1b3a329 + 61cf004 commit 4d570ee
Show file tree
Hide file tree
Showing 9 changed files with 57 additions and 40 deletions.
1 change: 0 additions & 1 deletion library/alloc/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,6 @@
#![feature(tuple_trait)]
#![feature(unicode_internals)]
#![feature(unsize)]
#![feature(utf8_chunks)]
#![feature(vec_pop_if)]
// tidy-alphabetical-end
//
Expand Down
2 changes: 1 addition & 1 deletion library/alloc/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ pub use core::str::{RSplit, Split};
pub use core::str::{RSplitN, SplitN};
#[stable(feature = "rust1", since = "1.0.0")]
pub use core::str::{RSplitTerminator, SplitTerminator};
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use core::str::{Utf8Chunk, Utf8Chunks};

/// Note: `str` in `Concat<str>` is not meaningful here.
Expand Down
4 changes: 1 addition & 3 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,6 @@ use core::ops::{self, Range, RangeBounds};
use core::ptr;
use core::slice;
use core::str::pattern::Pattern;
#[cfg(not(no_global_oom_handling))]
use core::str::Utf8Chunks;

#[cfg(not(no_global_oom_handling))]
use crate::borrow::{Cow, ToOwned};
Expand Down Expand Up @@ -633,7 +631,7 @@ impl String {
#[cfg(not(no_global_oom_handling))]
#[stable(feature = "rust1", since = "1.0.0")]
pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> {
let mut iter = Utf8Chunks::new(v);
let mut iter = v.utf8_chunks();

let first_valid = if let Some(chunk) = iter.next() {
let valid = chunk.valid();
Expand Down
74 changes: 50 additions & 24 deletions library/core/src/str/lossy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,46 @@ use crate::iter::FusedIterator;
use super::from_utf8_unchecked;
use super::validations::utf8_char_width;

impl [u8] {
/// Creates an iterator over the contiguous valid UTF-8 ranges of this
/// slice, and the non-UTF-8 fragments in between.
///
/// Each item yielded by the returned [`Utf8Chunks`] iterator is a
/// [`Utf8Chunk`]: [`Utf8Chunk::valid`] gives the valid part as a `&str`, and
/// [`Utf8Chunk::invalid`] gives the broken bytes that follow it as a `&[u8]`.
///
/// Calling this method only wraps the slice in the iterator; no bytes are
/// inspected until the iterator is advanced.
///
/// # Examples
///
/// This function formats arbitrary but mostly-UTF-8 bytes into Rust source
/// code in the form of a C-string literal (`c"..."`).
///
/// ```
/// use std::fmt::Write as _;
///
/// pub fn cstr_literal(bytes: &[u8]) -> String {
/// let mut repr = String::new();
/// repr.push_str("c\"");
/// for chunk in bytes.utf8_chunks() {
/// for ch in chunk.valid().chars() {
/// // Escapes \0, \t, \r, \n, \\, \', \", and uses \u{...} for non-printable characters.
/// write!(repr, "{}", ch.escape_debug()).unwrap();
/// }
/// for byte in chunk.invalid() {
/// write!(repr, "\\x{:02X}", byte).unwrap();
/// }
/// }
/// repr.push('"');
/// repr
/// }
///
/// fn main() {
/// let lit = cstr_literal(b"\xferris the \xf0\x9f\xa6\x80\x07");
/// let expected = stringify!(c"\xFErris the 🦀\u{7}");
/// assert_eq!(lit, expected);
/// }
/// ```
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn utf8_chunks(&self) -> Utf8Chunks<'_> {
// The iterator borrows the slice for its whole lifetime; decoding happens in `next`.
Utf8Chunks { source: self }
}
}

/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
Expand All @@ -14,23 +54,19 @@ use super::validations::utf8_char_width;
/// # Examples
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
/// let chunk = bytes.utf8_chunks().next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
///
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
valid: &'a str,
Expand All @@ -43,7 +79,7 @@ impl<'a> Utf8Chunk<'a> {
/// This substring can be empty at the start of the string or between
/// broken UTF-8 characters.
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn valid(&self) -> &'a str {
self.valid
}
Expand All @@ -63,7 +99,7 @@ impl<'a> Utf8Chunk<'a> {
/// [`valid`]: Self::valid
/// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
#[must_use]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub fn invalid(&self) -> &'a [u8] {
self.invalid
}
Expand All @@ -78,7 +114,7 @@ impl fmt::Debug for Debug<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_char('"')?;

for chunk in Utf8Chunks::new(self.0) {
for chunk in self.0.utf8_chunks() {
// Valid part.
// Here we partially parse UTF-8 again which is suboptimal.
{
Expand Down Expand Up @@ -123,12 +159,8 @@ impl fmt::Debug for Debug<'_> {
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
/// for chunk in Utf8Chunks::new(input) {
/// for chunk in input.utf8_chunks() {
/// push(chunk.valid());
///
/// if !chunk.invalid().is_empty() {
Expand All @@ -140,27 +172,21 @@ impl fmt::Debug for Debug<'_> {
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
source: &'a [u8],
}

impl<'a> Utf8Chunks<'a> {
/// Creates a new iterator to decode the bytes.
#[unstable(feature = "utf8_chunks", issue = "99543")]
pub fn new(bytes: &'a [u8]) -> Self {
Self { source: bytes }
}

#[doc(hidden)]
#[unstable(feature = "str_internals", issue = "none")]
pub fn debug(&self) -> Debug<'_> {
Debug(self.source)
}
}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl<'a> Iterator for Utf8Chunks<'a> {
type Item = Utf8Chunk<'a>;

Expand Down Expand Up @@ -259,10 +285,10 @@ impl<'a> Iterator for Utf8Chunks<'a> {
}
}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl FusedIterator for Utf8Chunks<'_> {}

#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
impl fmt::Debug for Utf8Chunks<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
Expand Down
2 changes: 1 addition & 1 deletion library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::slice::{self, SliceIndex};
pub mod pattern;

mod lossy;
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[stable(feature = "utf8_chunks", since = "CURRENT_RUSTC_VERSION")]
pub use lossy::{Utf8Chunk, Utf8Chunks};

#[stable(feature = "rust1", since = "1.0.0")]
Expand Down
1 change: 0 additions & 1 deletion library/core/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@
#![feature(error_generic_member_access)]
#![feature(error_in_core)]
#![feature(trait_upcasting)]
#![feature(utf8_chunks)]
#![feature(is_ascii_octdigit)]
#![feature(get_many_mut)]
#![feature(iter_map_windows)]
Expand Down
6 changes: 2 additions & 4 deletions library/core/tests/str_lossy.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
use core::str::Utf8Chunks;

#[test]
fn chunks() {
macro_rules! assert_chunks {
( $string:expr, $(($valid:expr, $invalid:expr)),* $(,)? ) => {{
let mut iter = Utf8Chunks::new($string);
let mut iter = $string.utf8_chunks();
$(
let chunk = iter.next().expect("missing chunk");
assert_eq!($valid, chunk.valid());
Expand Down Expand Up @@ -79,7 +77,7 @@ fn debug() {
"\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
&format!(
"{:?}",
Utf8Chunks::new(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").debug(),
b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa".utf8_chunks().debug(),
),
);
}
1 change: 0 additions & 1 deletion library/std/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,6 @@
#![feature(thread_local)]
#![feature(try_blocks)]
#![feature(type_alias_impl_trait)]
#![feature(utf8_chunks)]
// tidy-alphabetical-end
//
// Library features (core):
Expand Down
6 changes: 2 additions & 4 deletions library/std/src/sys/os_str/bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ use crate::str;
use crate::sync::Arc;
use crate::sys_common::{AsInner, IntoInner};

use core::str::Utf8Chunks;

#[cfg(test)]
mod tests;

Expand All @@ -29,7 +27,7 @@ pub struct Slice {

impl fmt::Debug for Slice {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&Utf8Chunks::new(&self.inner).debug(), f)
fmt::Debug::fmt(&self.inner.utf8_chunks().debug(), f)
}
}

Expand All @@ -41,7 +39,7 @@ impl fmt::Display for Slice {
return "".fmt(f);
}

for chunk in Utf8Chunks::new(&self.inner) {
for chunk in self.inner.utf8_chunks() {
let valid = chunk.valid();
// If we successfully decoded the whole chunk as a valid string then
// we can return a direct formatting of the string which will also
Expand Down

0 comments on commit 4d570ee

Please sign in to comment.