Skip to content

Commit

Permalink
Merge pull request #247 from dgrunwald/unicode-data
Browse files Browse the repository at this point in the history
`PyString::data()`: return the internal representation of the Python unicode object
  • Loading branch information
dgrunwald committed Feb 17, 2021
2 parents 284c70d + e3976d1 commit 40c815e
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 13 deletions.
113 changes: 112 additions & 1 deletion python3-sys/src/unicodeobject.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ use libc::{c_char, c_int, c_void, wchar_t};

use crate::object::*;
use crate::pyport::Py_ssize_t;
#[cfg(not(Py_LIMITED_API))]
use crate::pyport::Py_hash_t;

#[cfg(not(Py_LIMITED_API))]
#[deprecated(since = "0.2.1", note = "Deprecated since Python 3.3 / PEP 393")]
Expand Down Expand Up @@ -123,7 +125,7 @@ extern "C" {
pub fn PyUnicode_FromOrdinal(ordinal: c_int) -> *mut PyObject;
#[cfg(not(Py_3_9))]
pub fn PyUnicode_ClearFreeList() -> c_int;
#[cfg(not(Py_LIMITED_API))]
#[cfg(any(not(Py_LIMITED_API), Py_3_10))]
pub fn PyUnicode_AsUTF8AndSize(unicode: *mut PyObject, size: *mut Py_ssize_t) -> *const c_char;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUTF8(unicode: *mut PyObject) -> *const c_char;
Expand Down Expand Up @@ -429,4 +431,113 @@ extern "C" {
pub fn PyUnicode_IsIdentifier(s: *mut PyObject) -> c_int;
#[cfg(not(Py_LIMITED_API))]
pub fn PyUnicode_AsUnicodeCopy(unicode: *mut PyObject) -> *mut Py_UNICODE;

#[cfg(not(Py_LIMITED_API))]
fn _PyUnicode_Ready(o: *mut PyObject) -> c_int;
}

/// Header found at the start of every Python `str` object.
///
/// Layout of `state`, as decoded by the accessors in this module:
/// bits 2..=4 hold the "kind" (`PyUnicode_KIND`), bit 5 is the compact flag,
/// bit 6 is the ASCII flag and bit 7 is the ready flag.
///
/// NOTE(review): this mirrors the CPython 3.3+ (PEP 393) struct; later
/// CPython releases are believed to change it (e.g. dropping `wstr`) — confirm
/// before building against a newer interpreter.
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyASCIIObject {
    pub ob_base: PyObject,
    /// Number of elements in the string's buffer (returned by `PyUnicode_GET_LENGTH`).
    pub length: Py_ssize_t,
    /// Presumably the cached hash of the string — confirm against CPython headers.
    pub hash: Py_hash_t,
    /// Packed bit field; see the struct-level documentation for the bits used here.
    pub state: u32,
    /// Presumably the legacy `wchar_t` representation — confirm against CPython headers.
    pub wstr: *mut c_void,
}

/// Header of a compact, non-ASCII string: a `PyASCIIObject` followed by the
/// UTF-8 / wstr bookkeeping fields. For such strings the character data
/// starts directly after this struct (see `PyUnicode_DATA`).
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyCompactUnicodeObject {
    _base: PyASCIIObject,
    utf8_length: Py_ssize_t,
    utf8: *mut u8,
    wstr_length: Py_ssize_t,
}

/// Non-compact ("legacy") string object whose character data lives in a
/// separately allocated buffer referenced by `data`.
///
/// The base must be `PyCompactUnicodeObject`, not `PyASCIIObject`: in CPython
/// the `data` pointer follows the complete compact header. With the shorter
/// base, `data` would alias `utf8_length` and `PyUnicode_DATA` would return a
/// bogus pointer for non-compact strings.
#[repr(C)]
#[cfg(not(Py_LIMITED_API))]
pub struct PyUnicodeObject {
    _base: PyCompactUnicodeObject,
    data: *mut c_void,
}

/// Returns `true` when bit 6 (the ASCII flag) of the object's `state` field is set.
///
/// # Safety
/// `o` is cast to `*mut PyASCIIObject` and dereferenced, so it must point to
/// a live object that starts with that header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_ASCII(o: *mut PyObject) -> bool {
    const ASCII_FLAG: u32 = 1 << 6;
    let header = o as *mut PyASCIIObject;
    ((*header).state & ASCII_FLAG) != 0
}

/// Returns `true` when bit 5 (the compact flag) of the object's `state` field is set.
///
/// # Safety
/// `o` is cast to `*mut PyASCIIObject` and dereferenced, so it must point to
/// a live object that starts with that header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_COMPACT(o: *mut PyObject) -> bool {
    const COMPACT_FLAG: u32 = 1 << 5;
    let header = o as *mut PyASCIIObject;
    ((*header).state & COMPACT_FLAG) != 0
}

// Possible values of the 3-bit "kind" field that `PyUnicode_KIND` extracts
// from `PyASCIIObject::state`.

/// NOTE(review): presumably marks a string that only has the legacy
/// `wchar_t` representation (i.e. is not "ready") — confirm against CPython.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_WCHAR_KIND: u32 = 0;
/// Buffer elements are 1 byte wide; `PyString::data_impl` exposes them as Latin-1 `u8`s.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_1BYTE_KIND: u32 = 1;
/// Buffer elements are 2 bytes wide; `PyString::data_impl` exposes them as `u16`s.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_2BYTE_KIND: u32 = 2;
/// Buffer elements are 4 bytes wide; `PyString::data_impl` exposes them as `u32`s.
#[cfg(not(Py_LIMITED_API))]
pub const PyUnicode_4BYTE_KIND: u32 = 4;

/// Extracts the "kind" of a string: bits 2..=4 of its `state` field.
///
/// The result is expected to be one of the `PyUnicode_*_KIND` constants.
///
/// # Safety
/// `o` is dereferenced as a `PyASCIIObject`. Debug builds additionally assert
/// that it passes `PyUnicode_Check` and has its ready flag set.
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_KIND(o: *mut PyObject) -> u32 {
    const KIND_SHIFT: u32 = 2;
    const KIND_MASK: u32 = 0b111;

    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    let header = o as *mut PyASCIIObject;
    ((*header).state >> KIND_SHIFT) & KIND_MASK
}

/// Returns a pointer to the first element of the string's character buffer.
///
/// * Non-compact strings: the `data` pointer stored in `PyUnicodeObject`.
/// * Compact ASCII strings: the address immediately after the `PyASCIIObject` header.
/// * Other compact strings: the address immediately after the
///   `PyCompactUnicodeObject` header.
///
/// # Safety
/// `o` is dereferenced as one of the unicode object structs. Debug builds
/// additionally assert that it passes `PyUnicode_Check` and is ready.
#[cfg(not(Py_LIMITED_API))]
pub unsafe fn PyUnicode_DATA(o: *mut PyObject) -> *mut c_void {
    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    if !PyUnicode_IS_COMPACT(o) {
        // Equivalent of CPython's _PyUnicode_NONCOMPACT_DATA.
        let buffer = (*(o as *mut PyUnicodeObject)).data;
        debug_assert!(!buffer.is_null());
        return buffer;
    }

    // Equivalent of CPython's _PyUnicode_COMPACT_DATA: the characters are
    // stored inline, right behind whichever header the object uses.
    if PyUnicode_IS_ASCII(o) {
        (o as *mut PyASCIIObject).add(1) as *mut c_void
    } else {
        (o as *mut PyCompactUnicodeObject).add(1) as *mut c_void
    }
}

/// Returns the `length` field stored in the string's `PyASCIIObject` header.
///
/// # Safety
/// `o` is dereferenced as a `PyASCIIObject`. Debug builds additionally assert
/// that it passes `PyUnicode_Check` and has its ready flag set.
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_GET_LENGTH(o: *mut PyObject) -> Py_ssize_t {
    debug_assert!(PyUnicode_Check(o) > 0);
    debug_assert!(PyUnicode_IS_READY(o));

    let header = o as *mut PyASCIIObject;
    (*header).length
}

/// Returns `true` when bit 7 (the ready flag) of the object's `state` field is set.
///
/// # Safety
/// `o` is cast to `*mut PyASCIIObject` and dereferenced, so it must point to
/// a live object that starts with that header.
#[cfg(not(Py_LIMITED_API))]
#[inline]
unsafe fn PyUnicode_IS_READY(o: *mut PyObject) -> bool {
    const READY_FLAG: u32 = 1 << 7;
    let header = o as *mut PyASCIIObject;
    ((*header).state & READY_FLAG) != 0
}

/// Makes sure the string is in its "ready" state.
///
/// Returns `0` immediately when the ready flag is already set; otherwise
/// forwards to `_PyUnicode_Ready` and returns its result. The caller in
/// `PyString::data_impl` treats a negative result as failure.
///
/// # Safety
/// `o` is dereferenced by the ready check and handed to the C API. Debug
/// builds additionally assert that it passes `PyUnicode_Check`.
#[cfg(not(Py_LIMITED_API))]
#[inline]
pub unsafe fn PyUnicode_READY(o: *mut PyObject) -> c_int {
    debug_assert!(PyUnicode_Check(o) > 0);
    if !PyUnicode_IS_READY(o) {
        return _PyUnicode_Ready(o);
    }
    0
}
96 changes: 84 additions & 12 deletions src/objects/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ impl<'a> PyStringData<'a> {
)),
},
PyStringData::Latin1(data) => {
if data.iter().all(|&b| b.is_ascii()) {
if data.is_ascii() {
Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
} else {
Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
Expand Down Expand Up @@ -214,7 +214,7 @@ impl<'a> PyStringData<'a> {
match self {
PyStringData::Utf8(data) => String::from_utf8_lossy(data),
PyStringData::Latin1(data) => {
if data.iter().all(|&b| b.is_ascii()) {
if data.is_ascii() {
Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
} else {
Cow::Owned(data.iter().map(|&b| b as char).collect())
Expand Down Expand Up @@ -283,17 +283,24 @@ impl PyString {
}

#[cfg(feature = "python3-sys")]
fn data_impl(&self, py: Python) -> PyStringData {
// TODO: return the original representation instead
// of forcing the UTF-8 representation to be created.
let mut size: ffi::Py_ssize_t = 0;
fn data_impl(&self, _py: Python) -> PyStringData {
let ptr = self.as_ptr();
unsafe {
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size) as *const u8;
if data.is_null() {
PyErr::fetch(py).print(py);
panic!("PyUnicode_AsUTF8AndSize failed");
let ready = ffi::PyUnicode_READY(ptr);
if ready < 0 {
// should fail only on OOM
ffi::PyErr_Print();
panic!("PyUnicode_READY failed");
}
let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
let data = ffi::PyUnicode_DATA(ptr);
let kind = ffi::PyUnicode_KIND(ptr);
match kind {
ffi::PyUnicode_1BYTE_KIND => PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size)),
ffi::PyUnicode_2BYTE_KIND => PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size)),
ffi::PyUnicode_4BYTE_KIND => PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size)),
_ => panic!("Unknown PyUnicode_KIND")
}
PyStringData::Utf8(std::slice::from_raw_parts(data, size as usize))
}
}

Expand All @@ -306,7 +313,26 @@ impl PyString {
/// (containing unpaired surrogates, or a Python 2.7 byte string that is
/// not valid UTF-8).
pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
self.data(py).to_string(py)
#[cfg(feature = "python3-sys")]
unsafe {
// On Python 3, we can use the UTF-8 representation stored
// inside the Python string.
// This should produce identical results to
// `self.data(py).to_string(py)` but avoids
// re-encoding the string on every to_string call.
let mut size: ffi::Py_ssize_t = 0;
let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
if data.is_null() {
return Err(PyErr::fetch(py));
} else {
let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
return Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)));
}
}
#[cfg(feature = "python27-sys")]
{
return self.data(py).to_string(py);
}
}

/// Convert the `PyString` into a Rust string.
Expand Down Expand Up @@ -535,6 +561,7 @@ impl RefFromPyObject for [u8] {
mod test {
use crate::conversion::{RefFromPyObject, ToPyObject};
use crate::python::{Python, PythonObject};
use super::{PyString, PyStringData};

#[test]
fn test_non_bmp() {
Expand Down Expand Up @@ -583,4 +610,49 @@ mod test {
let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
assert_eq!(b"Hello", &v[..]);
}

#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_umlaut() {
    // A string whose largest code point is U+00E4; on Python 3 the test
    // expects `data()` to hand back the Latin-1 bytes.
    let gil = Python::acquire_gil();
    let py = gil.python();
    let obj = py.eval("u'x=\\u00e4'", None, None).unwrap();
    let data = obj.cast_as::<PyString>(py).unwrap().data(py);
    #[cfg(feature = "python3-sys")]
    {
        match data {
            PyStringData::Latin1(s) => assert_eq!([b'x', b'=', 0xe4], *s),
            _ => panic!("Expected PyStringData::Latin1"),
        }
    }
    // Extraction to a Rust `String` must succeed on both Python versions.
    let extracted = obj.extract::<String>(py).unwrap();
    assert_eq!("x=ä", extracted);
}

#[allow(unused_variables)] // when compiling for py2.7
#[test]
fn test_extract_lone_surrogate() {
    // A string containing the unpaired surrogate U+D800; on Python 3 the test
    // expects `data()` to expose it as 16-bit units.
    let gil = Python::acquire_gil();
    let py = gil.python();
    let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
    let data = obj.cast_as::<PyString>(py).unwrap().data(py);
    #[cfg(feature = "python3-sys")]
    {
        match data {
            PyStringData::Utf16(s) => assert_eq!(['x' as u16, '=' as u16, 0xd800], *s),
            _ => panic!("Expected PyStringData::Utf16"),
        }
    }
    // A lone surrogate cannot be represented in a Rust `String`.
    let extracted = obj.extract::<String>(py);
    assert!(extracted.is_err());
}

#[test]
fn test_extract_lone_surrogate_lossy() {
    // The lossy conversion is expected to replace the unpaired surrogate
    // with U+FFFD instead of failing.
    let gil = Python::acquire_gil();
    let py = gil.python();
    let obj = py.eval("u'x=\\ud800'", None, None).unwrap();
    let string = obj.cast_as::<PyString>(py).unwrap();
    let lossy = string.to_string_lossy(py);
    assert_eq!("x=\u{fffd}", lossy);
}
}

0 comments on commit 40c815e

Please sign in to comment.