Merge pull request #247 from dgrunwald/unicode-data

`PyString::data()`: return the internal representation of the Python unicode object
dgrunwald · Feb 17, 2021 · 40c815e · 40c815e
2 parents 284c70d + e3976d1
commit 40c815e
Show file tree

Hide file tree

Showing 2 changed files with 196 additions and 13 deletions.
diff --git a/python3-sys/src/unicodeobject.rs b/python3-sys/src/unicodeobject.rs
@@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t};
 
 use crate::object::*;
 use crate::pyport::Py_ssize_t;
+#[cfg(not(Py_LIMITED_API))]
+use crate::pyport::Py_hash_t;
 
 #[cfg(not(Py_LIMITED_API))]
 #[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")]
@@ -123,7 +125,7 @@ extern "C" {
     pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
     #[cfg(not(Py_3_9))]
     pub fn PyUnicode_ClearFreeList() -> c_int;
-    #[cfg(not(Py_LIMITED_API))]
+    #[cfg(any(not(Py_LIMITED_API), Py_3_10))]
     pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
     #[cfg(not(Py_LIMITED_API))]
     pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
@@ -429,4 +431,113 @@ extern "C" {
     pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
     #[cfg(not(Py_LIMITED_API))]
     pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;
+
+    #[cfg(not(Py_LIMITED_API))]
+    fn _PyUnicode_Ready(o: *mut PyObject) -> c_int;
+}
+
+/// C-layout header that the helpers in this file read by casting a
+/// `*mut PyObject` known to be a unicode object to `*mut PyASCIIObject`.
+///
+/// NOTE(review): the field order is meant to mirror CPython's own
+/// `PyASCIIObject` definition — confirm against the headers of every
+/// supported Python version before changing anything here.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyASCIIObject {
+    /// Generic object header.
+    pub ob_base: PyObject,
+    /// Value returned by `PyUnicode_GET_LENGTH`; callers use it as the number
+    /// of elements (of the width selected by `PyUnicode_KIND`) in the buffer.
+    pub length: Py_ssize_t,
+    // presumably the cached string hash — TODO confirm against CPython
+    pub hash: Py_hash_t,
+    /// Packed flags. As decoded by the helpers in this file:
+    /// bits 2..=4 hold the kind (`PyUnicode_KIND`), bit 5 is the "compact"
+    /// flag, bit 6 the "ascii" flag and bit 7 the "ready" flag.
+    pub state: u32,
+    // NOTE(review): not read anywhere in this file; presumably the legacy
+    // wchar_t representation pointer — confirm.
+    pub wstr: *mut c_void
+}
+
+/// C-layout header used for compact strings that are not flagged as ASCII.
+///
+/// `PyUnicode_DATA` treats the memory immediately after this struct as the
+/// character buffer when the "compact" flag is set and the "ascii" flag is
+/// clear, so the size of this struct must match the C definition exactly.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyCompactUnicodeObject {
+    /// Shared header (length, hash, state flags).
+    _base: PyASCIIObject,
+    // NOTE(review): the fields below are never read in this file; they exist
+    // to reproduce the C layout. Presumably they describe the cached UTF-8
+    // and wchar_t representations — confirm against CPython's headers.
+    utf8_length: Py_ssize_t,
+    utf8: *mut u8,
+    wstr_length: Py_ssize_t
+}
+
+/// C-layout view of a non-compact unicode object.
+///
+/// `PyUnicode_DATA` reads the `data` pointer from this struct when the
+/// "compact" flag is not set, so `data` must sit at exactly the same offset
+/// as in CPython's `PyUnicodeObject`. CPython defines that struct as a
+/// `PyCompactUnicodeObject` header followed by the data pointer (a union of
+/// pointers in C, represented here as an untyped pointer). Embedding only
+/// `PyASCIIObject` would place `data` on top of `utf8_length`.
+#[repr(C)]
+#[cfg(not(Py_LIMITED_API))]
+pub struct PyUnicodeObject {
+    /// Full compact header (ASCII header plus the UTF-8/wstr cache fields).
+    _base: PyCompactUnicodeObject,
+    /// Pointer to the separately allocated character buffer.
+    data: *mut c_void
+}
+
+/// Reports whether the "ascii" flag (bit 6 of `state`) is set on `o`.
+///
+/// # Safety
+/// `o` is dereferenced as a `PyASCIIObject`, so it must point to a live
+/// unicode object.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool {
+    const ASCII_MASK: u32 = 1 << 6;
+    let header = o as *mut PyASCIIObject;
+    ((*header).state & ASCII_MASK) != 0
+}
+
+/// Reports whether the "compact" flag (bit 5 of `state`) is set on `o`.
+///
+/// # Safety
+/// `o` is dereferenced as a `PyASCIIObject`, so it must point to a live
+/// unicode object.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool {
+    const COMPACT_MASK: u32 = 1 << 5;
+    let header = o as *mut PyASCIIObject;
+    ((*header).state & COMPACT_MASK) != 0
+}
+
+// Values that `PyUnicode_KIND` can return (the 3-bit kind field of `state`).
+
+// NOTE(review): presumably the legacy "not ready" wchar_t representation;
+// no code in this file matches on it — confirm against CPython.
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_WCHAR_KIND: u32 = 0;
+// One byte per element.
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_1BYTE_KIND: u32 = 1;
+// Two bytes per element.
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_2BYTE_KIND: u32 = 2;
+// Four bytes per element.
+#[cfg(not(Py_LIMITED_API))]
+pub const PyUnicode_4BYTE_KIND: u32 = 4;
+
+/// Extracts the 3-bit kind field (bits 2..=4 of `state`) of a unicode object.
+///
+/// The result is compared against the `PyUnicode_*_KIND` constants.
+///
+/// # Safety
+/// `o` must point to a live unicode object; debug builds additionally assert
+/// that it passes `PyUnicode_Check` and has its "ready" flag set.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 {
+    const KIND_SHIFT: u32 = 2;
+    const KIND_MASK: u32 = 7;
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    let header = o as *mut PyASCIIObject;
+    ((*header).state >> KIND_SHIFT) & KIND_MASK
+}
+
+/// Returns a pointer to the character buffer of a unicode object.
+///
+/// * Non-compact objects store the buffer pointer in `PyUnicodeObject::data`.
+/// * Compact objects keep the buffer directly behind their header: behind
+///   `PyASCIIObject` when the "ascii" flag is set, otherwise behind
+///   `PyCompactUnicodeObject`.
+///
+/// # Safety
+/// `o` must point to a live unicode object; debug builds additionally assert
+/// that it passes `PyUnicode_Check` and has its "ready" flag set.
+#[cfg(not(Py_LIMITED_API))]
+pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    if !PyUnicode_IS_COMPACT(o) {
+        // fn _PyUnicode_NONCOMPACT_DATA
+        let buffer = (*(o as *mut PyUnicodeObject)).data;
+        debug_assert!(!buffer.is_null());
+        return buffer;
+    }
+    // fn _PyUnicode_COMPACT_DATA
+    if PyUnicode_IS_ASCII(o) {
+        let past_header = (o as *mut PyASCIIObject).offset(1);
+        past_header as *mut c_void
+    } else {
+        let past_header = (o as *mut PyCompactUnicodeObject).offset(1);
+        past_header as *mut c_void
+    }
+}
+
+/// Reads the `length` field from the header of a unicode object.
+///
+/// # Safety
+/// `o` must point to a live unicode object; debug builds additionally assert
+/// that it passes `PyUnicode_Check` and has its "ready" flag set.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    debug_assert!(PyUnicode_IS_READY(o));
+    let header = o as *mut PyASCIIObject;
+    (*header).length
+}
+
+/// Reports whether the "ready" flag (bit 7 of `state`) is set on `o`.
+///
+/// # Safety
+/// `o` is dereferenced as a `PyASCIIObject`, so it must point to a live
+/// unicode object.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool {
+    const READY_MASK: u32 = 1 << 7;
+    let header = o as *mut PyASCIIObject;
+    ((*header).state & READY_MASK) != 0
+}
+
+/// Makes sure the unicode object is in its "ready" state.
+///
+/// Returns `0` immediately when the "ready" flag is already set; otherwise
+/// forwards to `_PyUnicode_Ready` and returns its result (callers in this
+/// crate treat a negative value as failure).
+///
+/// # Safety
+/// `o` must point to a live unicode object; debug builds additionally assert
+/// that it passes `PyUnicode_Check`.
+#[cfg(not(Py_LIMITED_API))]
+#[inline]
+pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int {
+    debug_assert!(PyUnicode_Check(o) > 0);
+    if !PyUnicode_IS_READY(o) {
+        return _PyUnicode_Ready(o);
+    }
+    0
 }
diff --git a/src/objects/string.rs b/src/objects/string.rs
@@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
                 )),
             },
             PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
+                if data.is_ascii() {
                     Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
                 } else {
                     Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
@@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
         match self {
             PyStringData::Utf8(data) => String::from_utf8_lossy(data),
             PyStringData::Latin1(data) => {
-                if data.iter().all(|&b| b.is_ascii()) {
+                if data.is_ascii() {
                     Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
                 } else {
                     Cow::Owned(data.iter().map(|&b| b as char).collect())
@@ -283,17 +283,24 @@ impl PyString {
     }
 
     #[cfg(feature = "python3-sys")]
-    fn data_impl(&self, py: Python) -> PyStringData {
-        // TODO: return the original representation instead
-        // of forcing the UTF-8 representation to be created.
-        let mut size: ffi::Py_ssize_t = 0;
+    // Python 3 implementation of `data()`: exposes the string's internal
+    // buffer as a slice of `u8`, `u16` or `u32` depending on its kind,
+    // without creating a UTF-8 copy.
+    //
+    // Panics if `PyUnicode_READY` reports failure or the kind is not one of
+    // the 1/2/4-byte kinds.
+    fn data_impl(&self, _py: Python) -> PyStringData {
+        let ptr = self.as_ptr();
         unsafe {
-            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8;
-            if data.is_null() {
-                PyErr::fetch(py).print(py);
-                panic!("PyUnicode_AsUTF8AndSize failed");
+            // The accessors below debug-assert the "ready" flag, so the
+            // object has to be made ready before they are called.
+            let ready = ffi::PyUnicode_READY(ptr);
+            if ready < 0 {
+                // should fail only on OOM
+                ffi::PyErr_Print();
+                panic!("PyUnicode_READY failed");
+            }
+            // `size` is used as the element count for every kind, i.e. it
+            // counts elements of the kind's width rather than bytes.
+            let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
+            let data = ffi::PyUnicode_DATA(ptr);
+            let kind = ffi::PyUnicode_KIND(ptr);
+            // NOTE(review): the slices borrow memory owned by the Python
+            // object — presumably valid for as long as `self` is alive; confirm.
+            match kind {
+                ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)),
+                ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)),
+                ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)),
+                _ => panic!("Unknown PyUnicode_KIND")
             }
-            PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
         }
     }
 
@@ -306,7 +313,26 @@ impl PyString {
     /// (containing unpaired surrogates, or a Python 2.7 byte string that is
     /// not valid UTF-8).
     pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
-        self.data(py).to_string(py)
+        // Exactly one of the two cfg-gated blocks below is expected to be
+        // compiled in; each one returns from the function.
+        #[cfg(feature = "python3-sys")]
+        unsafe {
+            // On Python 3, we can use the UTF-8 representation stored
+            // inside the Python string.
+            // This should produce identical results to
+            // `self.data(py).to_string(py)` but avoids
+            // re-encoding the string on every to_string call.
+            let mut size: ffi::Py_ssize_t = 0;
+            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
+            if data.is_null() {
+                // A null result is treated as "Python exception pending":
+                // the error is fetched and handed to the caller.
+                return Err(PyErr::fetch(py));
+            } else {
+                // SAFETY (assumption to confirm): relies on CPython returning
+                // `size` bytes of valid UTF-8 that stay alive as long as the
+                // string object does; the bytes are not re-validated here.
+                let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
+                return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)));
+            }
+        }
+        #[cfg(feature = "python27-sys")]
+        {
+            // Python 2.7 keeps the generic path through `PyStringData`.
+            return self.data(py).to_string(py);
+        }
     }
 
     /// Convert the `PyString` into a Rust string.
@@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
 mod test {
     use crate::conversion::{RefFromPyObject, ToPyObject};
     use crate::python::{Python, PythonObject};
+    use super::{PyString, PyStringData};
 
     #[test]
     fn test_non_bmp() {
@@ -583,4 +610,49 @@ mod test {
         let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
         assert_eq!(b"Hello", &v[..]);
     }
+
+    /// A string containing U+00E4 is exposed as Latin-1 data on Python 3 and
+    /// extracts to the expected Rust `String`.
+    #[allow(unused_variables)] // when compiling for py2.7
+    #[test]
+    fn test_extract_umlaut() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let obj = py.eval("u'x=\\u00e4'", None, None).unwrap();
+        let data = obj.cast_as::<PyString>(py).unwrap().data(py);
+        #[cfg(feature = "python3-sys")]
+        {
+            match data {
+                PyStringData::Latin1(bytes) => assert_eq!([b'x', b'=', 0xe4], *bytes),
+                _ => panic!("Expected PyStringData::Latin1"),
+            }
+        }
+        let extracted = obj.extract::<String>(py).unwrap();
+        assert_eq!("x=ä", extracted);
+    }
+
+    /// A string containing the lone surrogate U+D800 is exposed as UTF-16
+    /// data on Python 3, and strict extraction to `String` fails.
+    #[allow(unused_variables)] // when compiling for py2.7
+    #[test]
+    fn test_extract_lone_surrogate() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
+        let data = obj.cast_as::<PyString>(py).unwrap().data(py);
+        #[cfg(feature = "python3-sys")]
+        {
+            match data {
+                PyStringData::Utf16(units) => {
+                    assert_eq!(['x' as u16, '=' as u16, 0xd800], *units)
+                }
+                _ => panic!("Expected PyStringData::Utf16"),
+            }
+        }
+        let extracted = obj.extract::<String>(py);
+        assert!(extracted.is_err());
+    }
+
+    /// Lossy conversion replaces the lone surrogate U+D800 with U+FFFD.
+    #[test]
+    fn test_extract_lone_surrogate_lossy() {
+        let gil = Python::acquire_gil();
+        let py = gil.python();
+        let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
+        let py_str = obj.cast_as::<PyString>(py).unwrap();
+        let lossy = py_str.to_string_lossy(py);
+        assert_eq!("x=\u{fffd}", lossy);
+    }
 }