Skip to content

Commit

Permalink
Implement grapheme registry
Browse files Browse the repository at this point in the history
  • Loading branch information
crlf0710 committed Oct 17, 2020
1 parent 9fce341 commit 563efcd
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 12 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Expand Up @@ -9,8 +9,9 @@ license = "MIT OR Apache-2.0"
static_assertions = "1.1.0"
typenum = {git = "https://github.com/crlf0710/typenum"}
globals_struct = {git = "https://github.com/crlf0710/globals_struct"}
unicode-normalization = { version = "0.1.13", optional = true }

[features]
initex = []
unicode_support = []
unicode_support = ['unicode-normalization']
default = ['initex', 'unicode_support']
2 changes: 2 additions & 0 deletions src/lib.rs
Expand Up @@ -263,4 +263,6 @@ mod section_1378;
mod section_1379;
mod section_1380;

mod unicode_support;

pub use section_1332::entry;
21 changes: 21 additions & 0 deletions src/pascal.rs
Expand Up @@ -3,6 +3,26 @@ pub type real = f32;
pub type word = u32;
pub type boolean = bool;

/// Stand-in for Pascal's `char` type when the `unicode_support` feature is
/// disabled: a newtype around a single byte.
#[cfg(not(feature = "unicode_support"))]
pub(crate) struct char(u8);

#[cfg(not(feature = "unicode_support"))]
impl char {
    /// Wraps the raw byte `v`. Being a `const fn`, it can initialise constants.
    pub(crate) const fn new(v: u8) -> Self {
        Self(v)
    }
}

/// Stand-in for Pascal's `char` type when the `unicode_support` feature is
/// enabled: a newtype around a `u32` code.
///
/// The `PhantomData<Rc<()>>` marker stores nothing but makes the type neither
/// `Send` nor `Sync`.
// NOTE(review): presumably this is because some codes refer to a thread-local
// grapheme registry and are meaningless on another thread — confirm.
#[cfg(feature = "unicode_support")]
pub(crate) struct char(u32, PhantomData<Rc<()>>);

#[cfg(feature = "unicode_support")]
impl char {
    /// Wraps the raw code `v`. Being a `const fn`, it can initialise constants.
    pub(crate) const fn new(v: u32) -> Self {
        Self(v, PhantomData)
    }
}

macro_rules! define_ranged_unsigned_integer {
($v:vis $name:ident => $base_type:path; $typenum_const:ident) => {
// TODO: Implement this.
Expand Down Expand Up @@ -412,4 +432,5 @@ pub(crate) fn r#break<F: PascalFile>(file: &mut F) {
use core::fmt::{self, Display};
use core::marker::PhantomData;
use std::io::{self, Read, Write};
use std::rc::Rc;
use typenum::Unsigned;
15 changes: 4 additions & 11 deletions src/section_0019.rs
Expand Up @@ -27,29 +27,22 @@
// @d text_char == char {the data type of characters in text files}

/// the data type of characters in text files
#[cfg(not(feature = "unicode_support"))]
pub(crate) struct text_char(u8);

/// the data type of characters in text files
///
/// represents a unicode grapheme cluster
#[cfg(feature = "unicode_support")]
pub(crate) struct text_char(u32);
pub(crate) type text_char = crate::pascal::char;

// @d first_text_char=0 {ordinal number of the smallest element of |text_char|}

/// ordinal number of the smallest element of [text_char]
const first_text_char: text_char = text_char(0);
const first_text_char: text_char = text_char::new(0);

// @d last_text_char=255 {ordinal number of the largest element of |text_char|}

/// ordinal number of the largest element of [text_char]
#[cfg(not(feature = "unicode_support"))]
const last_text_char: text_char = text_char(255);
const last_text_char: text_char = text_char::new(255);

/// ordinal number of the largest element of [text_char]
#[cfg(feature = "unicode_support")]
const last_text_char: text_char = text_char(u32::MAX);
const last_text_char: text_char = text_char::new(u32::MAX);

//
// @<Local variables for init...@>=
Expand Down
80 changes: 80 additions & 0 deletions src/unicode_support.rs
@@ -0,0 +1,80 @@
#![cfg(feature = "unicode_support")]

// One `GraphemeRegistry` per thread, created lazily on first access through
// `GRAPHE_REGISTRY.with(..)`. Because the registry is thread-local, a code
// interned on one thread is not known to the registry of another thread.
thread_local! {
static GRAPHE_REGISTRY: GraphemeRegistry = GraphemeRegistry::new();
}

/// Interning table that hands out `u32` codes for normalized strings.
///
/// Codes are assigned sequentially starting at
/// [`GRAPHEME_REGISTRY_INITIAL_VALUE`], so `code - GRAPHEME_REGISTRY_INITIAL_VALUE`
/// is the index of the string inside
/// `normalized_strings_after_single_scalar_value`.
struct GraphemeRegistry {
    /// Interned strings, in the order their codes were assigned.
    normalized_strings_after_single_scalar_value: RefCell<Vec<&'static str>>,
    /// Reverse lookup from an interned string to the code it received.
    normalized_string_lookup_map: RefCell<BTreeMap<&'static str, u32>>,
    /// Code that the next previously unseen string will receive.
    next_value: Cell<u32>,
}

/// First code handed out by the registry: one past the largest Unicode scalar
/// value, so registry codes never collide with plain scalar values.
const GRAPHEME_REGISTRY_INITIAL_VALUE: u32 = core::char::MAX as u32 + 1;

/// Result of decoding a `u32` code with [`GraphemeRegistry::grapheme_value`].
enum GraphemeRegistryItem {
    /// The code is a valid Unicode scalar value.
    SingleScalarValue(char),
    /// The code refers to a string interned in the registry.
    MultiScalarValue(&'static str),
    /// The code is neither a scalar value nor a known registry entry.
    InvalidValue(u32),
}

impl GraphemeRegistry {
    /// Creates an empty registry whose first assigned code will be
    /// [`GRAPHEME_REGISTRY_INITIAL_VALUE`].
    fn new() -> Self {
        Self {
            normalized_strings_after_single_scalar_value: RefCell::new(Vec::new()),
            normalized_string_lookup_map: RefCell::new(BTreeMap::new()),
            next_value: Cell::new(GRAPHEME_REGISTRY_INITIAL_VALUE),
        }
    }

    /// Returns the code for `s`, assigning a fresh one if `s` has not been
    /// seen before.
    ///
    /// A newly interned string is leaked so that it can be handed out as
    /// `&'static str` for the rest of the program.
    ///
    /// # Panics
    ///
    /// Panics if the code counter would overflow `u32`.
    fn intern_normalized_multi_scalar_value(&self, s: String) -> u32 {
        let known = self
            .normalized_string_lookup_map
            .borrow()
            .get(s.as_str())
            .copied();
        if let Some(code) = known {
            return code;
        }

        let code = self.next_value.get();
        // Leak the allocation: both tables keep a `'static` reference to it.
        let leaked: &'static str = Box::leak(s.into_boxed_str());
        self.normalized_strings_after_single_scalar_value
            .borrow_mut()
            .push(leaked);
        self.normalized_string_lookup_map
            .borrow_mut()
            .insert(leaked, code);
        let following = code.checked_add(1).unwrap();
        self.next_value.set(following);
        code
    }

    /// Decodes `val` into a scalar value, an interned string, or
    /// [`GraphemeRegistryItem::InvalidValue`] when it is neither.
    fn grapheme_value(&self, val: u32) -> GraphemeRegistryItem {
        if val >= GRAPHEME_REGISTRY_INITIAL_VALUE {
            // Registry codes map directly onto positions in the string table.
            let index = (val - GRAPHEME_REGISTRY_INITIAL_VALUE) as usize;
            let entry = self
                .normalized_strings_after_single_scalar_value
                .borrow()
                .get(index)
                .copied();
            return match entry {
                Some(text) => GraphemeRegistryItem::MultiScalarValue(text),
                None => GraphemeRegistryItem::InvalidValue(val),
            };
        }

        // Below the registry range only valid scalar values are meaningful
        // (surrogate code points, for instance, are rejected).
        match core::char::from_u32(val) {
            Some(scalar) => GraphemeRegistryItem::SingleScalarValue(scalar),
            None => GraphemeRegistryItem::InvalidValue(val),
        }
    }
}

/// Module-local name for [`crate::pascal::char`]. In this module it is built
/// either from a Unicode scalar value or from a code assigned by the grapheme
/// registry.
type generalized_char = crate::pascal::char;

/// Converts the string `s` into a [`generalized_char`].
///
/// * If `s` consists of exactly one scalar value, that scalar value is used
///   directly as the code.
/// * Otherwise `s` is normalized to NFC. If the normalized form collapses to a
///   single scalar value (for example `"e\u{301}"` composes to `"é"`), that
///   scalar value is used, so that the decomposed and precomposed spellings
///   map to the same code. Only strings that still contain several scalar
///   values after normalization are interned in the thread-local registry.
///
/// # Panics
///
/// Panics if `s` is empty, or if the registry runs out of `u32` codes.
pub(crate) fn generalized_char_from_str(s: &str) -> generalized_char {
    debug_assert!(!s.is_empty());
    // The last scalar value starts at byte 0 exactly when it is the only one.
    let (pos, ch) = s
        .char_indices()
        .next_back()
        .expect("generalized_char_from_str requires a non-empty string");
    if pos == 0 {
        // single scalar value, fast path
        // NOTE(review): this path does not normalize; a lone scalar whose NFC
        // form differs from itself keeps its original code point — confirm
        // that this is intended.
        return generalized_char::new(ch as u32);
    }

    let normalized = s.nfc().collect::<String>();

    // Normalization may compose the whole string into one scalar value; such a
    // result must not be stored as a multi-scalar registry entry.
    let mut scalars = normalized.chars();
    if let (Some(only), None) = (scalars.next(), scalars.next()) {
        return generalized_char::new(only as u32);
    }

    let result =
        GRAPHE_REGISTRY.with(|reg| reg.intern_normalized_multi_scalar_value(normalized));
    generalized_char::new(result)
}

use core::cell::{Cell, RefCell};
use std::collections::BTreeMap;
use unicode_normalization::UnicodeNormalization;

0 comments on commit 563efcd

Please sign in to comment.