diff --git a/python3-sys/src/unicodeobject.rs b/python3-sys/src/unicodeobject.rs
index fb604815..3d1fddc5 100644
--- a/python3-sys/src/unicodeobject.rs
+++ b/python3-sys/src/unicodeobject.rs
@@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t};
 
 use crate::object::*;
 use crate::pyport::Py_ssize_t;
+#[cfg(not(Py_LIMITED_API))]
+use crate::pyport::Py_hash_t;
 
 #[cfg(not(Py_LIMITED_API))]
 #[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")]
@@ -123,7 +125,7 @@ extern "C" {
     pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
     #[cfg(not(Py_3_9))]
     pub fn PyUnicode_ClearFreeList() -> c_int;
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(any(not(Py_LIMITED_API), Py_3_10))]
     pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
     #[cfg(not(Py_LIMITED_API))]
     pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
@@ -429,4 +431,127 @@ extern "C" {
     pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
     #[cfg(not(Py_LIMITED_API))]
     pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
+
+    #[cfg(not(Py_LIMITED_API))]
+    fn _PyUnicode_Ready(o: *mut PyObject) -> c_int;
+}
+
+/// Mirror of CPython's `PyASCIIObject`: object header, length, hash, state bits and `wstr`.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyASCIIObject {
+    pub ob_base: PyObject,
+    pub length: Py_ssize_t,
+    pub hash: Py_hash_t,
+    pub state: u32,
+    pub wstr: *mut c_void
+}
+
+/// Mirror of CPython's `PyCompactUnicodeObject`: the ASCII header plus UTF-8 and wstr fields.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyCompactUnicodeObject {
+    _base: PyASCIIObject,
+    utf8_length: Py_ssize_t,
+    utf8: *mut u8,
+    wstr_length: Py_ssize_t
+}
+
+/// Mirror of CPython's `PyUnicodeObject`; read by `PyUnicode_DATA` for non-compact strings.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyUnicodeObject {
+    // CPython declares `PyCompactUnicodeObject _base;` here, so `data` must follow
+    // the UTF-8/wstr fields; a `PyASCIIObject` base would put it at the wrong offset.
+    _base: PyCompactUnicodeObject,
+    data: *mut c_void
+}
+
+/// Returns `true` when bit 6 of the string's `state` word (the ASCII flag) is set.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool {
+    let ascii_bit = 1 << 6;
+    let state = (*(o as *mut PyASCIIObject)).state;
+    (state & ascii_bit) != 0
+}
+
+/// Returns `true` when bit 5 of the string's `state` word (the compact flag) is set.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool {
+    let compact_bit = 1 << 5;
+    let state = (*(o as *mut PyASCIIObject)).state;
+    (state & compact_bit) != 0
+}
+
+// Kind values that the result of `PyUnicode_KIND` is compared against.
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_WCHAR_KIND: u32 = 0;
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_1BYTE_KIND: u32 = 1;
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_2BYTE_KIND: u32 = 2;
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_4BYTE_KIND: u32 = 4;
+
+/// Extracts the three-bit kind field (bits 2..=4 of `state`); `o` must be a ready string.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    let state = (*(o as *mut PyASCIIObject)).state;
+    (state >> 2) & 7
+}
+
+/// Returns the character data pointer: just past the header if compact, else the `data` field.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    if PyUnicode_IS_COMPACT(o) {
+        // fn _PyUnicode_COMPACT_DATA
+        if PyUnicode_IS_ASCII(o) {
+            (o as *mut PyASCIIObject).offset(1) as *mut c_void
+        } else {
+            (o as *mut PyCompactUnicodeObject).offset(1) as *mut c_void
+        }
+    } else {
+        // fn _PyUnicode_NONCOMPACT_DATA
+        let data = (*(o as *mut PyUnicodeObject)).data;
+        debug_assert!(!data.is_null());
+        data
+    }
+}
+
+/// Returns the `length` field of the string header; `o` must be a ready string.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    (*(o as *mut PyASCIIObject)).length
+}
+
+/// Returns `true` when bit 7 of the string's `state` word (the ready flag) is set.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool {
+    let ready_bit = 1 << 7;
+    let state = (*(o as *mut PyASCIIObject)).state;
+    (state & ready_bit) != 0
+}
+
+/// Returns 0 if the ready flag is already set; otherwise forwards to `_PyUnicode_Ready`.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    if PyUnicode_IS_READY(o) {
+        0
+    } else {
+        _PyUnicode_Ready(o)
+    }
 }
diff --git a/src/objects/string.rs b/src/objects/string.rs
index 05711b1f..11e51aed 100644
--- a/src/objects/string.rs
+++ b/src/objects/string.rs
@@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
                 )),
             },
             PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
+                if data.is_ascii() {
                     Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
                 } else {
                     Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
@@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
         match self {
             PyStringData::Utf8(data) => String::from_utf8_lossy(data),
             PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
+                if data.is_ascii() {
                     Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
                 } else {
                     Cow::Owned(data.iter().map(|&b| b as char).collect())
@@ -283,17 +283,24 @@ impl PyString {
     }
 
     #[cfg(feature = "python3-sys")]
-    fn data_impl(&self, py: Python) -> PyStringData {
-        // TODO: return the original representation instead
-        // of forcing the UTF-8 representation to be created.
-        let mut size: ffi::Py_ssize_t = 0;
+    fn data_impl(&self, _py: Python) -> PyStringData {
+        let ptr = self.as_ptr();
         unsafe {
-            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8;
-            if data.is_null() {
-                PyErr::fetch(py).print(py);
-                panic!("PyUnicode_AsUTF8AndSize failed");
+            let ready = ffi::PyUnicode_READY(ptr);
+            if ready < 0 {
+                // should fail only on OOM
+                ffi::PyErr_Print();
+                panic!("PyUnicode_READY failed");
+            }
+            let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
+            let data = ffi::PyUnicode_DATA(ptr);
+            let kind = ffi::PyUnicode_KIND(ptr);
+            match kind {
+                ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)),
+                ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)),
+                ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)),
+                _ => panic!("Unknown PyUnicode_KIND")
             }
-            PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
         }
     }
 
@@ -306,7 +313,26 @@ impl PyString {
     /// (containing unpaired surrogates, or a Python 2.7 byte string that is
     /// not valid UTF-8).
     pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
-        self.data(py).to_string(py)
+        #[cfg(feature = "python3-sys")]
+        unsafe {
+            // On Python 3, we can use the UTF-8 representation stored
+            // inside the Python string.
+            // This should produce identical results to
+            // `self.data(py).to_string(py)` but avoids
+            // re-encoding the string on every to_string call.
+            let mut size: ffi::Py_ssize_t = 0;
+            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
+            if data.is_null() {
+                return Err(PyErr::fetch(py));
+            } else {
+                let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
+                return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)));
+            }
+        }
+        #[cfg(feature = "python27-sys")]
+        {
+            return self.data(py).to_string(py);
+        }
     }
 
     /// Convert the `PyString` into a Rust string.
@@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
 mod test {
     use crate::conversion::{RefFromPyObject, ToPyObject};
     use crate::python::{Python, PythonObject};
+    use super::{PyString, PyStringData};
 
     #[test]
     fn test_non_bmp() {
@@ -583,4 +610,49 @@ mod test {
         let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
         assert_eq!(b"Hello", &v[..]);
     }
+
+    #[allow(unused_variables)] // when compiling for py2.7
+    #[test]
+    fn test_extract_umlaut() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap();
+        let data = py_string.cast_as::<PyString>(py).unwrap().data(py);
+        #[cfg(feature = "python3-sys")]
+        {
+            if let PyStringData::Latin1(s) = data {
+                assert_eq!([b'x', b'=', 0xe4], *s);
+            } else {
+                panic!("Expected PyStringData::Latin1");
+            }
+        }
+        assert_eq!("x=ä", py_string.extract::<String>(py).unwrap());
+    }
+
+    #[allow(unused_variables)] // when compiling for py2.7
+    #[test]
+    fn test_extract_lone_surrogate() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
+        let data = py_string.cast_as::<PyString>(py).unwrap().data(py);
+        #[cfg(feature = "python3-sys")]
+        {
+            if let PyStringData::Utf16(s) = data {
+                assert_eq!(['x' as u16, '=' as u16, 0xd800], *s);
+            } else {
+                panic!("Expected PyStringData::Utf16");
+            }
+        }
+        assert!(py_string.extract::<String>(py).is_err());
+    }
+
+    #[test]
+    fn test_extract_lone_surrogate_lossy() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
+        let result = py_string.cast_as::<PyString>(py).unwrap().to_string_lossy(py);
+        assert_eq!("x=\u{fffd}", result);
+    }
 }