Permalink
9656 lines (8687 sloc) 292 KB
// Written in the D programming language.
/++
$(P The $(D std.uni) module provides an implementation
of fundamental Unicode algorithms and data structures.
This doesn't include UTF encoding and decoding primitives,
see $(REF decode, std,_utf) and $(REF encode, std,_utf) in std.utf
for this functionality. )
$(P All primitives listed operate on Unicode characters and
sets of characters. For functions which operate on ASCII characters
and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
used throughout this module see the $(S_LINK Terminology, terminology) section
below.
)
$(P The focus of this module is the core needs of developing Unicode-aware
applications. To that effect it provides the following optimized primitives:
)
$(UL
$(LI Character classification by category and common properties:
$(LREF isAlpha), $(LREF isWhite) and others.
)
$(LI
Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
)
$(LI
Converting text to any of the four normalization forms via $(LREF normalize).
)
$(LI
Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
by user-perceived characters, that is by $(LREF Grapheme) clusters.
)
$(LI
Decomposing and composing of individual character(s) according to canonical
or compatibility rules, see $(LREF compose) and $(LREF decompose),
including the specific version for Hangul syllables $(LREF composeJamo)
and $(LREF decomposeHangul).
)
)
$(P It's recognized that an application may need further enhancements
and extensions, such as less commonly known algorithms,
or tailoring existing ones for region specific needs. To help users
with building any extra functionality beyond the core primitives,
the module provides:
)
$(UL
$(LI
$(LREF CodepointSet), a type for easy manipulation of sets of characters.
Besides the typical set algebra it provides an unusual feature:
a D source code generator for detection of $(CODEPOINTS) in this set.
This is a boon for meta-programming parser frameworks,
and is used internally to power classification in small
sets like $(LREF isWhite).
)
$(LI
A way to construct optimal packed multi-stage tables also known as a
special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
construct custom tries that map dchar to value.
The end result is a fast and predictable $(BIGOH 1) lookup that powers
functions like $(LREF isAlpha) and $(LREF combiningClass),
but for user-defined data sets.
)
$(LI
A useful technique for Unicode-aware parsers that perform
character classification of encoded $(CODEPOINTS)
is to avoid unnecassary decoding at all costs.
$(LREF utfMatcher) provides an improvement over the usual workflow
of decode-classify-process, combining the decoding and classification
steps. By extracting necessary bits directly from encoded
$(S_LINK Code unit, code units) matchers achieve
significant performance improvements. See $(LREF MatcherConcept) for
the common interface of UTF matchers.
)
$(LI
Generally useful building blocks for customized normalization:
$(LREF combiningClass) for querying combining class
and $(LREF allowedIn) for testing the Quick_Check
property of a given normalization form.
)
$(LI
Access to a large selection of commonly used sets of $(CODEPOINTS).
$(S_LINK Unicode properties, Supported sets) include Script,
Block and General Category. The exact contents of a set can be
observed in the CLDR utility, on the
$(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
of the Unicode website.
See $(LREF unicode) for easy and (optionally) compile-time checked set
queries.
)
)
$(SECTION Synopsis)
---
import std.uni;
void main()
{
// initialize code point sets using script/block or property name
// now 'set' contains code points from both scripts.
auto set = unicode("Cyrillic") | unicode("Armenian");
// same thing but simpler and checked at compile-time
auto ascii = unicode.ASCII;
auto currency = unicode.Currency_Symbol;
// easy set ops
auto a = set & ascii;
assert(a.empty); // as it has no intersection with ascii
a = set | ascii;
auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
// some properties of code point sets
assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
// testing presence of a code point in a set
// is just fine, it is O(logN)
assert(!b['$']);
assert(!b['\u058F']); // Armenian dram sign
assert(b['¥']);
// building fast lookup tables, these guarantee O(1) complexity
// 1-level Trie lookup table essentially a huge bit-set ~262Kb
auto oneTrie = toTrie!1(b);
// 2-level far more compact but typically slightly slower
auto twoTrie = toTrie!2(b);
// 3-level even smaller, and a bit slower yet
auto threeTrie = toTrie!3(b);
assert(oneTrie['£']);
assert(twoTrie['£']);
assert(threeTrie['£']);
// build the trie with the most sensible trie level
// and bind it as a functor
auto cyrillicOrArmenian = toDelegate(set);
auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
assert(balance == "ընկեր!");
// compatible with bool delegate(dchar)
bool delegate(dchar) bindIt = cyrillicOrArmenian;
// Normalization
string s = "Plain ascii (and not only), is always normalized!";
assert(s is normalize(s));// is the same string
string nonS = "A\u0308ffin"; // A ligature
auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
assert(nS == "Äffin");
assert(nS != nonS);
string composed = "Äffin";
assert(normalize!NFD(composed) == "A\u0308ffin");
// to NFKD, compatibility decomposition useful for fuzzy matching/searching
assert(normalize!NFKD("2¹⁰") == "210");
}
---
$(SECTION Terminology
)
$(P The following is a list of important Unicode notions
and definitions. Any conventions used specifically in this
module alone are marked as such. The descriptions are based on the formal
definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
chapter three of The Unicode Standard Core Specification.)
)
$(P $(DEF Abstract character) A unit of information used for the organization,
control, or representation of textual data.
Note that:
$(UL
$(LI When representing data, the nature of that data
is generally symbolic as opposed to some other
kind of data (for example, visual).
)
$(LI An abstract character has no concrete form
and should not be confused with a $(S_LINK Glyph, glyph).
)
$(LI An abstract character does not necessarily
correspond to what a user thinks of as a “character”
and should not be confused with a $(LREF Grapheme).
)
$(LI The abstract characters encoded (see Encoded character)
are known as Unicode abstract characters.
)
$(LI Abstract characters not directly
encoded by the Unicode Standard can often be
represented by the use of combining character sequences.
)
)
)
$(P $(DEF Canonical decomposition)
The decomposition of a character or character sequence
that results from recursively applying the canonical
mappings found in the Unicode Character Database
and these described in Conjoining Jamo Behavior
(section 12 of
$(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
)
$(P $(DEF Canonical composition)
The precise definition of the Canonical composition
is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
Unicode Conformance) section 11.
Informally it's the process that does the reverse of the canonical
decomposition with the addition of certain rules
that e.g. prevent legacy characters from appearing in the composed result.
)
$(P $(DEF Canonical equivalent)
Two character sequences are said to be canonical equivalents if
their full canonical decompositions are identical.
)
$(P $(DEF Character) Typically differs by context.
For the purpose of this documentation the term $(I character)
implies $(I encoded character), that is, a code point having
an assigned abstract character (a symbolic meaning).
)
$(P $(DEF Code point) Any value in the Unicode codespace;
that is, the range of integers from 0 to 10FFFF (hex).
Not all code points are assigned to encoded characters.
)
$(P $(DEF Code unit) The minimal bit combination that can represent
a unit of encoded text for processing or interchange.
Depending on the encoding this could be:
8-bit code units in the UTF-8 ($(D char)),
16-bit code units in the UTF-16 ($(D wchar)),
and 32-bit code units in the UTF-32 ($(D dchar)).
$(I Note that in UTF-32, a code unit is a code point
and is represented by the D $(D dchar) type.)
)
$(P $(DEF Combining character) A character with the General Category
of Combining Mark(M).
$(UL
$(LI All characters with non-zero canonical combining class
are combining characters, but the reverse is not the case:
there are combining characters with a zero combining class.
)
$(LI These characters are not normally used in isolation
unless they are being described. They include such characters
as accents, diacritics, Hebrew points, Arabic vowel signs,
and Indic matras.
)
)
)
$(P $(DEF Combining class)
A numerical value used by the Unicode Canonical Ordering Algorithm
to determine which sequences of combining marks are to be
considered canonically equivalent and which are not.
)
$(P $(DEF Compatibility decomposition)
The decomposition of a character or character sequence that results
from recursively applying both the compatibility mappings and
the canonical mappings found in the Unicode Character Database, and those
described in Conjoining Jamo Behavior no characters
can be further decomposed.
)
$(P $(DEF Compatibility equivalent)
Two character sequences are said to be compatibility
equivalents if their full compatibility decompositions are identical.
)
$(P $(DEF Encoded character) An association (or mapping)
between an abstract character and a code point.
)
$(P $(DEF Glyph) The actual, concrete image of a glyph representation
having been rasterized or otherwise imaged onto some display surface.
)
$(P $(DEF Grapheme base) A character with the property
Grapheme_Base, or any standard Korean syllable block.
)
$(P $(DEF Grapheme cluster) Defined as the text between
grapheme boundaries as specified by Unicode Standard Annex #29,
$(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
Important general properties of a grapheme:
$(UL
$(LI The grapheme cluster represents a horizontally segmentable
unit of text, consisting of some grapheme base (which may
consist of a Korean syllable) together with any number of
nonspacing marks applied to it.
)
$(LI A grapheme cluster typically starts with a grapheme base
and then extends across any subsequent sequence of nonspacing marks.
A grapheme cluster is most directly relevant to text rendering and
processes such as cursor placement and text selection in editing,
but may also be relevant to comparison and searching.
)
$(LI For many processes, a grapheme cluster behaves as if it was a
single character with the same properties as its grapheme base.
Effectively, nonspacing marks apply $(I graphically) to the base,
but do not change its properties.
)
)
$(P This module defines a number of primitives that work with graphemes:
$(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
All of them are using $(I extended grapheme) boundaries
as defined in the aforementioned standard annex.
)
)
$(P $(DEF Nonspacing mark) A combining character with the
General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
)
$(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
)
$(SECTION Normalization
)
$(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
or $(S_LINK Compatibility equivalent, compatibility equivalent)
characters in the Unicode Standard make it necessary to have a full, formal
definition of equivalence for Unicode strings.
String equivalence is determined by a process called normalization,
whereby strings are converted into forms which are compared
directly for identity. This is the primary goal of the normalization process,
see the function $(LREF normalize) to convert into any of
the four defined forms.
)
$(P A very important attribute of the Unicode Normalization Forms
is that they must remain stable between versions of the Unicode Standard.
A Unicode string normalized to a particular Unicode Normalization Form
in one version of the standard is guaranteed to remain in that Normalization
Form for implementations of future versions of the standard.
)
$(P The Unicode Standard specifies four normalization forms.
Informally, two of these forms are defined by maximal decomposition
of equivalent sequences, and two of these forms are defined
by maximal $(I composition) of equivalent sequences.
$(UL
$(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
canonical decomposition) of a character sequence.)
$(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
compatibility decomposition) of a character sequence.)
$(LI Normalization Form C (NFC): The canonical composition of the
$(S_LINK Canonical decomposition, canonical decomposition)
of a coded character sequence.)
$(LI Normalization Form KC (NFKC): The canonical composition
of the $(S_LINK Compatibility decomposition,
compatibility decomposition) of a character sequence)
)
)
$(P The choice of the normalization form depends on the particular use case.
NFC is the best form for general text, since it's more compatible with
strings converted from legacy encodings. NFKC is the preferred form for
identifiers, especially where there are security concerns. NFD and NFKD
are the most useful for internal processing.
)
$(SECTION Construction of lookup tables
)
$(P The Unicode standard describes a set of algorithms that
depend on having the ability to quickly look up various properties
of a code point. Given the the codespace of about 1 million $(CODEPOINTS),
it is not a trivial task to provide a space-efficient solution for
the multitude of properties.
)
$(P Common approaches such as hash-tables or binary search over
sorted code point intervals (as in $(LREF InversionList)) are insufficient.
Hash-tables have enormous memory footprint and binary search
over intervals is not fast enough for some heavy-duty algorithms.
)
$(P The recommended solution (see Unicode Implementation Guidelines)
is using multi-stage tables that are an implementation of the
$(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
keys and a fixed number of stages. For the remainder of the section
this will be called a fixed trie. The following describes a particular
implementation that is aimed for the speed of access at the expense
of ideal size savings.
)
$(P Taking a 2-level Trie as an example the principle of operation is as follows.
Split the number of bits in a key (code point, 21 bits) into 2 components
(e.g. 15 and 8). The first is the number of bits in the index of the trie
and the other is number of bits in each page of the trie.
The layout of the trie is then an array of size 2^^bits-of-index followed
an array of memory chunks of size 2^^bits-of-page/bits-per-element.
)
$(P The number of pages is variable (but not less then 1)
unlike the number of entries in the index. The slots of the index
all have to contain a number of a page that is present. The lookup is then
just a couple of operations - slice the upper bits,
lookup an index for these, take a page at this index and use
the lower bits as an offset within this page.
Assuming that pages are laid out consequently
in one array at $(D pages), the pseudo-code is:
)
---
auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
---
$(P Where if $(D elemsPerPage) is a power of 2 the whole process is
a handful of simple instructions and 2 array reads. Subsequent levels
of the trie are introduced by recursing on this notion - the index array
is treated as values. The number of bits in index is then again
split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
)
$(P For completeness a level 1 trie is simply an array.
The current implementation takes advantage of bit-packing values
when the range is known to be limited in advance (such as $(D bool)).
See also $(LREF BitPacked) for enforcing it manually.
The major size advantage however comes from the fact
that multiple $(B identical pages on every level are merged) by construction.
)
$(P The process of constructing a trie is more involved and is hidden from
the user in a form of the convenience functions $(LREF codepointTrie),
$(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
In general a set or built-in AA with $(D dchar) type
can be turned into a trie. The trie object in this module
is read-only (immutable); it's effectively frozen after construction.
)
$(SECTION Unicode properties
)
$(P This is a full list of Unicode properties accessible through $(LREF unicode)
with specific helpers per category nested within. Consult the
$(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
when in doubt about the contents of a particular set.
)
$(P General category sets listed below are only accessible with the
$(LREF unicode) shorthand accessor.)
$(BOOKTABLE $(B General category ),
$(TR $(TH Abb.) $(TH Long form)
$(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
$(TR $(TD L) $(TD Letter)
$(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation))
$(TR $(TD Ll) $(TD Lowercase_Letter)
$(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
$(TR $(TD Lm) $(TD Modifier_Letter)
$(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol))
$(TR $(TD Lo) $(TD Other_Letter)
$(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol))
$(TR $(TD Lt) $(TD Titlecase_Letter)
$(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol))
$(TR $(TD Lu) $(TD Uppercase_Letter)
$(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol))
$(TR $(TD M) $(TD Mark)
$(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol))
$(TR $(TD Mc) $(TD Spacing_Mark)
$(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
$(TR $(TD Me) $(TD Enclosing_Mark)
$(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator))
$(TR $(TD Mn) $(TD Nonspacing_Mark)
$(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator))
$(TR $(TD C) $(TD Other)
$(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
$(TR $(TD Cc) $(TD Control) $(TD Pf)
$(TD Final_Punctuation) $(TD -) $(TD Any))
$(TR $(TD Cf) $(TD Format)
$(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
)
$(P Sets for other commonly useful properties that are
accessible with $(LREF unicode):)
$(BOOKTABLE $(B Common binary properties),
$(TR $(TH Name) $(TH Name) $(TH Name))
$(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase))
$(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
$(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space))
$(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark))
$(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical))
$(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted))
$(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm))
$(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation))
$(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
$(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase))
$(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector))
$(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space))
$(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue))
$(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start))
$(TR $(TD Hyphen) $(TD Other_Lowercase) )
$(TR $(TD ID_Continue) $(TD Other_Math) )
)
$(P Below is the table with block names accepted by $(LREF unicode.block).
Note that the shorthand version $(LREF unicode) requires "In"
to be prepended to the names of blocks so as to disambiguate
scripts and blocks.
)
$(BOOKTABLE $(B Blocks),
$(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian))
$(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols))
$(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar))
$(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A))
$(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue))
$(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo))
$(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms))
$(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham))
$(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki))
$(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic))
$(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian))
$(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian))
$(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic))
$(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
$(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya))
$(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya))
$(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa))
$(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc))
$(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician))
$(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions))
$(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement))
$(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards))
$(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area))
$(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang))
$(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols))
$(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic))
$(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan))
$(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra))
$(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada))
$(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian))
$(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala))
$(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants))
$(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng))
$(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters))
$(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials))
$(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese))
$(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement))
$(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts))
$(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A))
$(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B))
$(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
$(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation))
$(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A))
$(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B))
$(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri))
$(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac))
$(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog))
$(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa))
$(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags))
$(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le))
$(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
$(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet))
$(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols))
$(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri))
$(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil))
$(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu))
$(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana))
$(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
$(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan))
$(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh))
$(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
$(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic))
$(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics))
$(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended))
$(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai))
$(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors))
$(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement))
$(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions))
$(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms))
$(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
$(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals))
$(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
$(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) )
$(TR $(TD Ethiopic) $(TD Modifier Tone Letters) )
)
$(P Below is the table with script names accepted by $(LREF unicode.script)
and by the shorthand version $(LREF unicode):)
$(BOOKTABLE $(B Scripts),
$(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic))
$(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian))
$(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian))
$(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic))
$(TR $(TD Bamum) $(TD Inherited) $(TD Oriya))
$(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya))
$(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa))
$(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician))
$(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang))
$(TR $(TD Braille) $(TD Kannada) $(TD Runic))
$(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan))
$(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra))
$(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada))
$(TR $(TD Carian) $(TD Khmer) $(TD Shavian))
$(TR $(TD Chakma) $(TD Lao) $(TD Sinhala))
$(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng))
$(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese))
$(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri))
$(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac))
$(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog))
$(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa))
$(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le))
$(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham))
$(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet))
$(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri))
$(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil))
$(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu))
$(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana))
$(TR $(TD Gothic) $(TD Mongolian) $(TD Thai))
$(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan))
$(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh))
$(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic))
$(TR $(TD Han) $(TD Ogham) $(TD Vai))
$(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi))
)
$(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
$(BOOKTABLE $(B Hangul syllable type),
$(TR $(TH Abb.) $(TH Long form))
$(TR $(TD L) $(TD Leading_Jamo))
$(TR $(TD LV) $(TD LV_Syllable))
$(TR $(TD LVT) $(TD LVT_Syllable) )
$(TR $(TD T) $(TD Trailing_Jamo))
$(TR $(TD V) $(TD Vowel_Jamo))
)
References:
$(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
$(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
$(HTTP www.unicode.org, The Unicode Consortium),
$(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
$(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation)
$(HTTP www.unicode.org/uni2book/ch05.pdf,
Unicode Implementation Guidelines)
$(HTTP www.unicode.org/uni2book/ch03.pdf,
Unicode Conformance)
Trademarks:
Unicode(tm) is a trademark of Unicode, Inc.
Copyright: Copyright 2013 -
License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors: Dmitry Olshansky
Source: $(PHOBOSSRC std/_uni.d)
Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
Macros:
SECTION = <h3><a id="$1">$0</a></h3>
DEF = <div><a id="$1"><i>$0</i></a></div>
S_LINK = <a href="#$1">$+</a>
CODEPOINT = $(S_LINK Code point, code point)
CODEPOINTS = $(S_LINK Code point, code points)
CHARACTER = $(S_LINK Character, character)
CHARACTERS = $(S_LINK Character, characters)
CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
+/
module std.uni;
import std.meta; // AliasSeq
import std.range.primitives; // back, ElementEncodingType, ElementType, empty,
// front, isForwardRange, isInputRange, isRandomAccessRange, popFront, put,
// save
import std.traits; // isConvertibleToString, isIntegral, isSomeChar,
// isSomeString, Unqual
// debug = std_uni;
debug(std_uni) import std.stdio; // writefln, writeln
private:
version (unittest)
{
private:
struct TestAliasedString
{
string get() @safe @nogc pure nothrow { return _s; }
alias get this;
@disable this(this);
string _s;
}
bool testAliasedString(alias func, Args...)(string s, Args args)
{
import std.algorithm.comparison : equal;
auto a = func(TestAliasedString(s), args);
auto b = func(s, args);
static if (is(typeof(equal(a, b))))
{
// For ranges, compare contents instead of object identity.
return equal(a, b);
}
else
{
return a == b;
}
}
}
void copyBackwards(T,U)(T[] src, U[] dest)
{
assert(src.length == dest.length);
for (size_t i=src.length; i-- > 0; )
dest[i] = src[i];
}
void copyForward(T,U)(T[] src, U[] dest)
{
assert(src.length == dest.length);
for (size_t i=0; i<src.length; i++)
dest[i] = src[i];
}
// TODO: update to reflect all major CPUs supporting unaligned reads
version(X86)
enum hasUnalignedReads = true;
else version(X86_64)
enum hasUnalignedReads = true;
else
enum hasUnalignedReads = false; // better be safe then sorry
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
// test the intro example
@safe unittest
{
import std.algorithm.searching : find;
// initialize code point sets using script/block or property name
// set contains code points from both scripts.
auto set = unicode("Cyrillic") | unicode("Armenian");
// or simpler and statically-checked look
auto ascii = unicode.ASCII;
auto currency = unicode.Currency_Symbol;
// easy set ops
auto a = set & ascii;
assert(a.empty); // as it has no intersection with ascii
a = set | ascii;
auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
// some properties of code point sets
assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
// testing presence of a code point in a set
// is just fine, it is O(logN)
assert(!b['$']);
assert(!b['\u058F']); // Armenian dram sign
assert(b['¥']);
// building fast lookup tables, these guarantee O(1) complexity
// 1-level Trie lookup table essentially a huge bit-set ~262Kb
auto oneTrie = toTrie!1(b);
// 2-level far more compact but typically slightly slower
auto twoTrie = toTrie!2(b);
// 3-level even smaller, and a bit slower yet
auto threeTrie = toTrie!3(b);
assert(oneTrie['£']);
assert(twoTrie['£']);
assert(threeTrie['£']);
// build the trie with the most sensible trie level
// and bind it as a functor
auto cyrillicOrArmenian = toDelegate(set);
auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
assert(balance == "ընկեր!");
// compatible with bool delegate(dchar)
bool delegate(dchar) bindIt = cyrillicOrArmenian;
// Normalization
string s = "Plain ascii (and not only), is always normalized!";
assert(s is normalize(s));// is the same string
string nonS = "A\u0308ffin"; // A ligature
auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
assert(nS == "Äffin");
assert(nS != nonS);
string composed = "Äffin";
assert(normalize!NFD(composed) == "A\u0308ffin");
// to NFKD, compatibility decomposition useful for fuzzy matching/searching
assert(normalize!NFKD("2¹⁰") == "210");
}
enum lastDchar = 0x10FFFF;
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
assert(from <= T.max && from >= T.min);
return cast(T) from;
}
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
assert(from <= 2^^bitSizeOf!T-1);
return T(cast(TypeOfBitPacked!T) from);
}
auto force(T, F)(F from)
if (is(T == F))
{
return from;
}
// repeat X times the bit-pattern in val assuming it's length is 'bits'
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
static if (times == 1)
return val;
else static if (bits == 1)
{
static if (times == size_t.sizeof*8)
return val ? size_t.max : 0;
else
return val ? (1 << times)-1 : 0;
}
else static if (times % 2)
return (replicateBits!(times-1, bits)(val)<<bits) | val;
else
return replicateBits!(times/2, bits*2)((val << bits) | val);
}
@safe pure nothrow @nogc unittest // for replicate
{
import std.algorithm.iteration : sum, map;
import std.range : iota;
size_t m = 0b111;
size_t m2 = 0b01;
foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
{
assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
}
}
// multiple arrays squashed into one memory block
struct MultiArray(Types...)
{
import std.range.primitives : isOutputRange;
this(size_t[] sizes...) @safe pure nothrow
{
assert(dim == sizes.length);
size_t full_size;
foreach (i, v; Types)
{
full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
sz[i] = sizes[i];
static if (i >= 1)
offsets[i] = offsets[i-1] +
spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
}
storage = new size_t[full_size];
}
this(const(size_t)[] raw_offsets,
const(size_t)[] raw_sizes, const(size_t)[] data)const @safe pure nothrow @nogc
{
offsets[] = raw_offsets[];
sz[] = raw_sizes[];
storage = data;
}
@property auto slice(size_t n)()inout pure nothrow @nogc
{
auto ptr = raw_ptr!n;
return packedArrayView!(Types[n])(ptr, sz[n]);
}
@property auto ptr(size_t n)()inout pure nothrow @nogc
{
auto ptr = raw_ptr!n;
return inout(PackedPtr!(Types[n]))(ptr);
}
template length(size_t n)
{
@property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }
@property void length(size_t new_size)
{
if (new_size > sz[n])
{// extend
size_t delta = (new_size - sz[n]);
sz[n] += delta;
delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
storage.length += delta;// extend space at end
// raw_slice!x must follow resize as it could be moved!
// next stmts move all data past this array, last-one-goes-first
static if (n != dim-1)
{
auto start = raw_ptr!(n+1);
// len includes delta
size_t len = (storage.ptr+storage.length-start);
copyBackwards(start[0 .. len-delta], start[delta .. len]);
start[0 .. delta] = 0;
// offsets are used for raw_slice, ptr etc.
foreach (i; n+1 .. dim)
offsets[i] += delta;
}
}
else if (new_size < sz[n])
{// shrink
size_t delta = (sz[n] - new_size);
sz[n] -= delta;
delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
// move all data past this array, forward direction
static if (n != dim-1)
{
auto start = raw_ptr!(n+1);
size_t len = (storage.ptr+storage.length-start);
copyForward(start[0 .. len-delta], start[delta .. len]);
// adjust offsets last, they affect raw_slice
foreach (i; n+1 .. dim)
offsets[i] -= delta;
}
storage.length -= delta;
}
// else - NOP
}
}
@property size_t bytes(size_t n=size_t.max)() const @safe
{
static if (n == size_t.max)
return storage.length*size_t.sizeof;
else static if (n != Types.length-1)
return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
else
return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
}
void store(OutRange)(scope OutRange sink) const
if (isOutputRange!(OutRange, char))
{
import std.format : formattedWrite;
formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
formattedWrite(sink, ", [%( 0x%x, %)]", storage);
}
private:
import std.meta : staticMap;
@property auto raw_ptr(size_t n)()inout pure nothrow @nogc
{
static if (n == 0)
return storage.ptr;
else
{
return storage.ptr+offsets[n];
}
}
enum dim = Types.length;
size_t[dim] offsets;// offset for level x
size_t[dim] sz;// size of level x
alias bitWidth = staticMap!(bitSizeOf, Types);
size_t[] storage;
}
@system unittest
{
import std.conv : text;
enum dg = (){
// sizes are:
// lvl0: 3, lvl1 : 2, lvl2: 1
auto m = MultiArray!(int, ubyte, int)(3,2,1);
static void check(size_t k, T)(ref T m, int n)
{
foreach (i; 0 .. n)
assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
}
static void checkB(size_t k, T)(ref T m, int n)
{
foreach (i; 0 .. n)
assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
}
static void fill(size_t k, T)(ref T m, int n)
{
foreach (i; 0 .. n)
m.slice!(k)[i] = force!ubyte(i+1);
}
static void fillB(size_t k, T)(ref T m, int n)
{
foreach (i; 0 .. n)
m.slice!(k)[i] = force!ubyte(n-i);
}
m.length!1 = 100;
fill!1(m, 100);
check!1(m, 100);
m.length!0 = 220;
fill!0(m, 220);
check!1(m, 100);
check!0(m, 220);
m.length!2 = 17;
fillB!2(m, 17);
checkB!2(m, 17);
check!0(m, 220);
check!1(m, 100);
m.length!2 = 33;
checkB!2(m, 17);
fillB!2(m, 33);
checkB!2(m, 33);
check!0(m, 220);
check!1(m, 100);
m.length!1 = 195;
fillB!1(m, 195);
checkB!1(m, 195);
checkB!2(m, 33);
check!0(m, 220);
auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
marr.length!0 = 15;
marr.length!1 = 30;
fill!1(marr, 30);
fill!0(marr, 15);
check!1(marr, 30);
check!0(marr, 15);
return 0;
};
enum ct = dg();
auto rt = dg();
}
@system unittest
{// more bitpacking tests
import std.conv : text;
alias Bitty =
MultiArray!(BitPacked!(size_t, 3)
, BitPacked!(size_t, 4)
, BitPacked!(size_t, 3)
, BitPacked!(size_t, 6)
, bool);
alias fn1 = sliceBits!(13, 16);
alias fn2 = sliceBits!( 9, 13);
alias fn3 = sliceBits!( 6, 9);
alias fn4 = sliceBits!( 0, 6);
static void check(size_t lvl, MA)(ref MA arr){
for (size_t i = 0; i< arr.length!lvl; i++)
assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
}
static void fillIdx(size_t lvl, MA)(ref MA arr){
for (size_t i = 0; i< arr.length!lvl; i++)
arr.slice!(lvl)[i] = i;
}
Bitty m1;
m1.length!4 = 10;
m1.length!3 = 2^^6;
m1.length!2 = 2^^3;
m1.length!1 = 2^^4;
m1.length!0 = 2^^3;
m1.length!4 = 2^^16;
for (size_t i = 0; i< m1.length!4; i++)
m1.slice!(4)[i] = i % 2;
fillIdx!1(m1);
check!1(m1);
fillIdx!2(m1);
check!2(m1);
fillIdx!3(m1);
check!3(m1);
fillIdx!0(m1);
check!0(m1);
check!3(m1);
check!2(m1);
check!1(m1);
for (size_t i=0; i < 2^^16; i++)
{
m1.slice!(4)[i] = i % 2;
m1.slice!(0)[fn1(i)] = fn1(i);
m1.slice!(1)[fn2(i)] = fn2(i);
m1.slice!(2)[fn3(i)] = fn3(i);
m1.slice!(3)[fn4(i)] = fn4(i);
}
for (size_t i=0; i < 2^^16; i++)
{
assert(m1.slice!(4)[i] == i % 2);
assert(m1.slice!(0)[fn1(i)] == fn1(i));
assert(m1.slice!(1)[fn2(i)] == fn2(i));
assert(m1.slice!(2)[fn3(i)] == fn3(i));
assert(m1.slice!(3)[fn4(i)] == fn4(i));
}
}
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
import std.math : nextPow2;
enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
static if (bits > 8*size_t.sizeof)
{
static assert(bits % (size_t.sizeof*8) == 0);
return new_len * bits/(8*size_t.sizeof);
}
else
{
enum factor = size_t.sizeof*8/bits;
return (new_len+factor-1)/factor; // rounded up
}
}
template isBitPackableType(T)
{
enum isBitPackableType = isBitPacked!T
|| isIntegral!T || is(T == bool) || isSomeChar!T;
}
//============================================================================
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
&& isBitPackableType!U) || isBitPackableType!T)
{
import std.math : nextPow2;
private enum bits = bitSizeOf!T;
alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
//unsafe and fast access to a chunk of RAM as if it contains packed values
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
&& isBitPackableType!U) || isBitPackableType!T)
{
import std.math : nextPow2;
private enum bits = bitSizeOf!T;
alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
static assert(isPow2OrZero(bits));
this(inout(size_t)* ptr)inout @safe @nogc
{
origin = ptr;
}
private T simpleIndex(size_t n) inout
{
immutable q = n / factor;
immutable r = n % factor;
return cast(T)((origin[q] >> bits*r) & mask);
}
private void simpleWrite(TypeOfBitPacked!T val, size_t n)
in
{
static if (isIntegral!T)
assert(val <= mask);
}
body
{
immutable q = n / factor;
immutable r = n % factor;
immutable tgt_shift = bits*r;
immutable word = origin[q];
origin[q] = (word & ~(mask << tgt_shift))
| (cast(size_t) val << tgt_shift);
}
static if (factor == bytesPerWord// can safely pack by byte
|| factor == 1 // a whole word at a time
|| ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
&& hasUnalignedReads)) // this needs unaligned reads
{
static if (factor == bytesPerWord)
alias U = ubyte;
else static if (factor == bytesPerWord/2)
alias U = ushort;
else static if (factor == bytesPerWord/4)
alias U = uint;
else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
alias U = ulong;
T opIndex(size_t idx) inout
{
return __ctfe ? simpleIndex(idx) :
cast(inout(T))(cast(U*) origin)[idx];
}
static if (isBitPacked!T) // lack of user-defined implicit conversion
{
void opIndexAssign(T val, size_t idx)
{
return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
}
}
void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
{
if (__ctfe)
simpleWrite(val, idx);
else
(cast(U*) origin)[idx] = cast(U) val;
}
}
else
{
T opIndex(size_t n) inout
{
return simpleIndex(n);
}
static if (isBitPacked!T) // lack of user-defined implicit conversion
{
void opIndexAssign(T val, size_t idx)
{
return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
}
}
void opIndexAssign(TypeOfBitPacked!T val, size_t n)
{
return simpleWrite(val, n);
}
}
private:
// factor - number of elements in one machine word
enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
enum bytesPerWord = size_t.sizeof;
size_t* origin;
}
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:
this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
{
ptr = inout(PackedPtr!(T))(origin);
ofs = offset;
limit = items;
}
bool zeros(size_t s, size_t e)
in
{
assert(s <= e);
}
body
{
s += ofs;
e += ofs;
immutable pad_s = roundUp(s);
if ( s >= e)
{
foreach (i; s .. e)
if (ptr[i])
return false;
return true;
}
immutable pad_e = roundDown(e);
size_t i;
for (i=s; i<pad_s; i++)
if (ptr[i])
return false;
// all in between is x*factor elements
for (size_t j=i/factor; i<pad_e; i+=factor, j++)
if (ptr.origin[j])
return false;
for (; i<e; i++)
if (ptr[i])
return false;
return true;
}
T opIndex(size_t idx) inout
in
{
assert(idx < limit);
}
body
{
return ptr[ofs + idx];
}
static if (isBitPacked!T) // lack of user-defined implicit conversion
{
void opIndexAssign(T val, size_t idx)
{
return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
}
}
void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
in
{
assert(idx < limit);
}
body
{
ptr[ofs + idx] = val;
}
static if (isBitPacked!T) // lack of user-defined implicit conversions
{
void opSliceAssign(T val, size_t start, size_t end)
{
opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
}
}
void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
in
{
assert(start <= end);
assert(end <= limit);
}
body
{
// account for ofsetted view
start += ofs;
end += ofs;
// rounded to factor granularity
immutable pad_start = roundUp(start);// rounded up
if (pad_start >= end) //rounded up >= then end of slice
{
//nothing to gain, use per element assignment
foreach (i; start .. end)
ptr[i] = val;
return;
}
immutable pad_end = roundDown(end); // rounded down
size_t i;
for (i=start; i<pad_start; i++)
ptr[i] = val;
// all in between is x*factor elements
if (pad_start != pad_end)
{
immutable repval = replicateBits!(factor, bits)(val);
for (size_t j=i/factor; i<pad_end; i+=factor, j++)
ptr.origin[j] = repval;// so speed it up by factor
}
for (; i<end; i++)
ptr[i] = val;
}
auto opSlice(size_t from, size_t to)inout
in
{
assert(from <= to);
assert(ofs + to <= limit);
}
body
{
return typeof(this)(ptr.origin, ofs + from, to - from);
}
auto opSlice(){ return opSlice(0, length); }
bool opEquals(T)(auto ref T arr) const
{
if (limit != arr.limit)
return false;
size_t s1 = ofs, s2 = arr.ofs;
size_t e1 = s1 + limit, e2 = s2 + limit;
if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
{
return ptr.origin[s1/factor .. e1/factor]
== arr.ptr.origin[s2/factor .. e2/factor];
}
for (size_t i=0;i<limit; i++)
if (this[i] != arr[i])
return false;
return true;
}
@property size_t length()const{ return limit; }
private:
auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
auto roundDown()(size_t val){ return val/factor*factor; }
// factor - number of elements in one machine word
enum factor = size_t.sizeof*8/bits;
PackedPtr!(T) ptr;
size_t ofs, limit;
}
private struct SliceOverIndexed(T)
{
enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));
auto opIndex(size_t idx)const
in
{
assert(idx < to - from);
}
body
{
return (*arr)[from+idx];
}
static if (assignableIndex)
void opIndexAssign(Item val, size_t idx)
in
{
assert(idx < to - from);
}
body
{
(*arr)[from+idx] = val;
}
auto opSlice(size_t a, size_t b)
{
return typeof(this)(from+a, from+b, arr);
}
// static if (assignableSlice)
void opSliceAssign(T)(T val, size_t start, size_t end)
{
(*arr)[start+from .. end+from] = val;
}
auto opSlice()
{
return typeof(this)(from, to, arr);
}
@property size_t length()const { return to-from;}
auto opDollar()const { return length; }
@property bool empty()const { return from == to; }
@property auto front()const { return (*arr)[from]; }
static if (assignableIndex)
@property void front(Item val) { (*arr)[from] = val; }
@property auto back()const { return (*arr)[to-1]; }
static if (assignableIndex)
@property void back(Item val) { (*arr)[to-1] = val; }
@property auto save() inout { return this; }
void popFront() { from++; }
void popBack() { to--; }
bool opEquals(T)(auto ref T arr) const
{
if (arr.length != length)
return false;
for (size_t i=0; i <length; i++)
if (this[i] != arr[i])
return false;
return true;
}
private:
alias Item = typeof(T.init[0]);
size_t from, to;
T* arr;
}
static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
return SliceOverIndexed!(const(T))(a, b, x);
}
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
return SliceOverIndexed!T(a, b, x);
}
@system unittest
{
int[] idxArray = [2, 3, 5, 8, 13];
auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);
assert(!sliced.empty);
assert(sliced.front == 2);
sliced.front = 1;
assert(sliced.front == 1);
assert(sliced.back == 13);
sliced.popFront();
assert(sliced.front == 3);
assert(sliced.back == 13);
sliced.back = 11;
assert(sliced.back == 11);
sliced.popBack();
assert(sliced.front == 3);
assert(sliced[$-1] == 8);
sliced = sliced[];
assert(sliced[0] == 3);
assert(sliced.back == 8);
sliced = sliced[1..$];
assert(sliced.front == 5);
sliced = sliced[0..$-1];
assert(sliced[$-1] == 5);
int[] other = [2, 5];
assert(sliced[] == sliceOverIndexed(1, 2, &other));
sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
assert(idxArray[0 .. 2] == [-1, -1]);
uint[] nullArr = null;
auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
assert(nullSlice.empty);
}
private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow
{
return inout(PackedArrayView!T)(ptr, 0, items);
}
//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
import core.bitop : bsr;
import std.array : replace;
import std.conv : to;
assert(isPow2OrZero(size));
string code = `
import core.bitop : bsr;
auto power = bsr(m)+1;
switch (power){`;
size_t i = bsr(size);
foreach_reverse (val; 0 .. bsr(size))
{
auto v = 2^^val;
code ~= `
case pow:
if (pred(range[idx+m], needle))
idx += m;
goto case;
`.replace("m", to!string(v))
.replace("pow", to!string(i));
i--;
}
code ~= `
case 0:
if (pred(range[idx], needle))
idx += 1;
goto default;
`;
code ~= `
default:
}`;
return code;
}
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
// See also: std.math.isPowerOf2()
return (sz & (sz-1)) == 0;
}
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
assert(isPow2OrZero(range.length));
size_t idx = 0, m = range.length/2;
while (m != 0)
{
if (pred(range[idx+m], needle))
idx += m;
m /= 2;
}
if (pred(range[idx], needle))
idx += 1;
return idx;
}
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
assert(isPow2OrZero(range.length));
size_t idx = 0, m = range.length/2;
enum max = 1 << 10;
while (m >= max)
{
if (pred(range[idx+m], needle))
idx += m;
m /= 2;
}
mixin(genUnrolledSwitchSearch(max));
return idx;
}
template sharMethod(alias uniLowerBound)
{
size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
import std.functional : binaryFun;
import std.math : nextPow2, truncPow2;
alias pred = binaryFun!_pred;
if (range.length == 0)
return 0;
if (isPow2OrZero(range.length))
return uniLowerBound!pred(range, needle);
size_t n = truncPow2(range.length);
if (pred(range[n-1], needle))
{// search in another 2^^k area that fully covers the tail of range
size_t k = nextPow2(range.length - n + 1);
return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
}
else
return uniLowerBound!pred(range[0 .. n], needle);
}
}
alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
@safe unittest
{
import std.array : array;
import std.range : assumeSorted, iota;
auto stdLowerBound(T)(T[] range, T needle)
{
return assumeSorted(range).lowerBound(needle).length;
}
immutable MAX = 5*1173;
auto arr = array(iota(5, MAX, 5));
assert(arr.length == MAX/5-1);
foreach (i; 0 .. MAX+5)
{
auto st = stdLowerBound(arr, i);
assert(st == sharLowerBound(arr, i));
assert(st == sharSwitchLowerBound(arr, i));
}
arr = [];
auto st = stdLowerBound(arr, 33);
assert(st == sharLowerBound(arr, 33));
assert(st == sharSwitchLowerBound(arr, 33));
}
//============================================================================
@safe
{
// hope to see simillar stuff in public interface... once Allocators are out
//@@@BUG moveFront and friends? dunno, for now it's POD-only
@trusted size_t genericReplace(Policy=void, T, Range)
(ref T dest, size_t from, size_t to, Range stuff)
{
import std.algorithm.mutation : copy;
size_t delta = to - from;
size_t stuff_end = from+stuff.length;
if (stuff.length > delta)
{// replace increases length
delta = stuff.length - delta;// now, new is > old by delta
static if (is(Policy == void))
dest.length = dest.length+delta;//@@@BUG lame @property
else
dest = Policy.realloc(dest, dest.length+delta);
copyBackwards(dest[to .. dest.length-delta],
dest[to+delta .. dest.length]);
copyForward(stuff, dest[from .. stuff_end]);
}
else if (stuff.length == delta)
{
copy(stuff, dest[from .. to]);
}
else
{// replace decreases length by delta
delta = delta - stuff.length;
copy(stuff, dest[from .. stuff_end]);
copyForward(dest[to .. dest.length],
dest[stuff_end .. dest.length-delta]);
static if (is(Policy == void))
dest.length = dest.length - delta;//@@@BUG lame @property
else
dest = Policy.realloc(dest, dest.length-delta);
}
return stuff_end;
}
// Simple storage manipulation policy
@trusted private struct GcPolicy
{
import std.traits : isDynamicArray;
static T[] dup(T)(const T[] arr)
{
return arr.dup;
}
static T[] alloc(T)(size_t size)
{
return new T[size];
}
static T[] realloc(T)(T[] arr, size_t sz)
{
arr.length = sz;
return arr;
}
static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
{
replaceInPlace(dest, from, to, stuff);
}
static void append(T, V)(ref T[] arr, V value)
if (!isInputRange!V)
{
arr ~= force!T(value);
}
static void append(T, V)(ref T[] arr, V value)
if (isInputRange!V)
{
insertInPlace(arr, arr.length, value);
}
static void destroy(T)(ref T arr)
if (isDynamicArray!T && is(Unqual!T == T))
{
debug
{
arr[] = cast(typeof(T.init[0]))(0xdead_beef);
}
arr = null;
}
static void destroy(T)(ref T arr)
if (isDynamicArray!T && !is(Unqual!T == T))
{
arr = null;
}
}
// ditto
@trusted struct ReallocPolicy
{
import std.range.primitives : hasLength;
static T[] dup(T)(const T[] arr)
{
auto result = alloc!T(arr.length);
result[] = arr[];
return result;
}
static T[] alloc(T)(size_t size)
{
import core.stdc.stdlib : malloc;
import std.exception : enforce;
import core.checkedint : mulu;
bool overflow;
size_t nbytes = mulu(size, T.sizeof, overflow);
if (overflow) assert(0);
auto ptr = cast(T*) enforce(malloc(nbytes), "out of memory on C heap");
return ptr[0 .. size];
}
static T[] realloc(T)(T[] arr, size_t size)
{
import core.stdc.stdlib : realloc;
import std.exception : enforce;
if (!size)
{
destroy(arr);
return null;
}
import core.checkedint : mulu;
bool overflow;
size_t nbytes = mulu(size, T.sizeof, overflow);
if (overflow) assert(0);
auto ptr = cast(T*) enforce(realloc(arr.ptr, nbytes), "out of memory on C heap");
return ptr[0 .. size];
}
static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
{
genericReplace!(ReallocPolicy)(dest, from, to, stuff);
}
static void append(T, V)(ref T[] arr, V value)
if (!isInputRange!V)
{
if (arr.length == size_t.max) assert(0);
arr = realloc(arr, arr.length+1);
arr[$-1] = force!T(value);
}
@safe unittest
{
int[] arr;
ReallocPolicy.append(arr, 3);
import std.algorithm.comparison : equal;
assert(equal(arr, [3]));
}
static void append(T, V)(ref T[] arr, V value)
if (isInputRange!V && hasLength!V)
{
import core.checkedint : addu;
bool overflow;
size_t nelems = addu(arr.length, value.length, overflow);
if (overflow) assert(0);
arr = realloc(arr, nelems);
import std.algorithm.mutation : copy;
copy(value, arr[$-value.length..$]);
}
@safe unittest
{
int[] arr;
ReallocPolicy.append(arr, [1,2,3]);
import std.algorithm.comparison : equal;
assert(equal(arr, [1,2,3]));
}
static void destroy(T)(ref T[] arr)
{
import core.stdc.stdlib : free;
if (arr.ptr)
free(arr.ptr);
arr = null;
}
}
//build hack
alias _RealArray = CowArray!ReallocPolicy;
@safe unittest
{
import std.algorithm.comparison : equal;
with(ReallocPolicy)
{
bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
string file = __FILE__, size_t line = __LINE__)
{
{
replaceImpl(orig, from, to, toReplace);
scope(exit) destroy(orig);
if (!equal(orig, result))
return false;
}
return true;
}
static T[] arr(T)(T[] args... )
{
return dup(args);
}
assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
}
}
/**
Tests if T is some kind a set of code points. Intended for template constraints.
*/
public template isCodepointSet(T)
{
static if (is(T dummy == InversionList!(Args), Args...))
enum isCodepointSet = true;
else
enum isCodepointSet = false;
}
/**
Tests if $(D T) is a pair of integers that implicitly convert to $(D V).
The following code must compile for any pair $(D T):
---
(T x){ V a = x[0]; V b = x[1];}
---
The following must not compile:
---
(T x){ V c = x[2];}
---
*/
public template isIntegralPair(T, V=uint)
{
enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
&& !is(typeof((T x){ V c = x[2]; }));
}
/**
The recommended default type for set of $(CODEPOINTS).
For details, see the current implementation: $(LREF InversionList).
*/
public alias CodepointSet = InversionList!GcPolicy;
//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
// hence below doesn't seem to work
// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
/**
The recommended type of $(REF Tuple, std,_typecons)
to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
uint[2] _tuple;
alias _tuple this;
@safe pure nothrow @nogc:
this(uint low, uint high)
{
_tuple[0] = low;
_tuple[1] = high;
}
bool opEquals(T)(T val) const
{
return this[0] == val[0] && this[1] == val[1];
}
@property ref inout(uint) a() inout { return _tuple[0]; }
@property ref inout(uint) b() inout { return _tuple[1]; }
}
/**
$(P
$(D InversionList) is a set of $(CODEPOINTS)
represented as an array of open-right [a, b$(RPAREN)
intervals (see $(LREF CodepointInterval) above).
The name comes from the way the representation reads left to right.
For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
plus a singular value 60 looks like this:
)
---
10, 50, 60, 61, 80, 90
---
$(P
The way to read this is: start with negative meaning that all numbers
smaller then the next one are not present in this set (and positive
- the contrary). Then switch positive/negative after each
number passed from left to right.
)
$(P This way negative spans until 10, then positive until 50,
then negative until 60, then positive until 61, and so on.
As seen this provides a space-efficient storage of highly redundant data
that comes in long runs. A description which Unicode $(CHARACTER)
properties fit nicely. The technique itself could be seen as a variation
on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
)
$(P Sets are value types (just like $(D int) is) thus they
are never aliased.
)
Example:
---
auto a = CodepointSet('a', 'z'+1);
auto b = CodepointSet('A', 'Z'+1);
auto c = a;
a = a | b;
assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
assert(a != c);
---
$(P See also $(LREF unicode) for simpler construction of sets
from predefined ones.
)
$(P Memory usage is 8 bytes per each contiguous interval in a set.
The value semantics are achieved by using the
$(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
)
Note:
$(P It's not recommended to rely on the template parameters
or the exact type of a current $(CODEPOINT) set in $(D std.uni).
The type and parameters may change when the standard
allocators design is finalized.
Use $(LREF isCodepointSet) with templates or just stick with the default
alias $(LREF CodepointSet) throughout the whole code base.
)
*/
@trusted public struct InversionList(SP=GcPolicy)
{
import std.range : assumeSorted;
public:
/**
Construct from another code point set of any type.
*/
this(Set)(Set set) pure
if (isCodepointSet!Set)
{
uint[] arr;
foreach (v; set.byInterval)
{
arr ~= v.a;
arr ~= v.b;
}
data = CowArray!(SP).reuse(arr);
}
/**
Construct a set from a forward range of code point intervals.
*/
this(Range)(Range intervals) pure
if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
{
uint[] arr;
foreach (v; intervals)
{
SP.append(arr, v.a);
SP.append(arr, v.b);
}
data = CowArray!(SP).reuse(arr);
sanitize(); //enforce invariant: sort intervals etc.
}
//helper function that avoids sanity check to be CTFE-friendly
private static fromIntervals(Range)(Range intervals) pure
{
import std.algorithm.iteration : map;
import std.range : roundRobin;
auto flattened = roundRobin(intervals.save.map!"a[0]"(),
intervals.save.map!"a[1]"());
InversionList set;
set.data = CowArray!(SP)(flattened);
return set;
}
//ditto untill sort is CTFE-able
private static fromIntervals()(uint[] intervals...) pure
in
{
import std.conv : text;
assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
for (uint i = 0; i < intervals.length; i += 2)
{
auto a = intervals[i], b = intervals[i+1];
assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
}
}
body
{
InversionList set;
set.data = CowArray!(SP)(intervals);
return set;
}
/**
Construct a set from plain values of code point intervals.
*/
this()(uint[] intervals...)
in
{
import std.conv : text;
assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
for (uint i = 0; i < intervals.length; i += 2)
{
auto a = intervals[i], b = intervals[i+1];
assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
}
}
body
{
data = CowArray!(SP)(intervals);
sanitize(); //enforce invariant: sort intervals etc.
}
///
@safe unittest
{
import std.algorithm.comparison : equal;
auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
foreach (v; 'a'..'z'+1)
assert(set[v]);
// Cyrillic lowercase interval
foreach (v; 'а'..'я'+1)
assert(set[v]);
//specific order is not required, intervals may interesect
auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
//the same end result
assert(set2.byInterval.equal(set.byInterval));
}
/**
Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
Example:
-----------
import std.algorithm.comparison : equal;
import std.typecons : tuple;
auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
-----------
*/
@property auto byInterval()
{
return Intervals!(typeof(data))(data);
}
/**
Tests the presence of code point $(D val) in this set.
*/
bool opIndex(uint val) const
{
// the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
// return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
}
///
@safe unittest
{
auto gothic = unicode.Gothic;
// Gothic letter ahsa
assert(gothic['\U00010330']);
// no ascii in Gothic obviously
assert(!gothic['$']);
}
// Linear scan for $(D ch). Useful only for small sets.
// TODO:
// used internally in std.regex
// should be properly exposed in a public API ?
package auto scanFor()(dchar ch) const
{
immutable len = data.length;
for (size_t i = 0; i < len; i++)
if (ch < data[i])
return i & 1;
return 0;
}
/// Number of $(CODEPOINTS) in this set
@property size_t length()
{
size_t sum = 0;
foreach (iv; byInterval)
{
sum += iv.b - iv.a;
}
return sum;
}
// bootstrap full set operations from 4 primitives (suitable as a template mixin):
// addInterval, skipUpTo, dropUpTo & byInterval iteration
//============================================================================
public:
/**
$(P Sets support natural syntax for set algebra, namely: )
$(BOOKTABLE ,
$(TR $(TH Operator) $(TH Math notation) $(TH Description) )
$(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
$(TR $(TD |) $(TD a ∪ b) $(TD union) )
$(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
$(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
)
*/
This opBinary(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
static if (op == "&" || op == "|" || op == "~")
{// symmetric ops thus can swap arguments to reuse r-value
static if (is(U:dchar))
{
auto tmp = this;
mixin("tmp "~op~"= rhs; ");
return tmp;
}
else
{
static if (is(Unqual!U == U))
{
// try hard to reuse r-value
mixin("rhs "~op~"= this;");
return rhs;
}
else
{
auto tmp = this;
mixin("tmp "~op~"= rhs;");
return tmp;
}
}
}
else static if (op == "-") // anti-symmetric
{
auto tmp = this;
tmp -= rhs;
return tmp;
}
else
static assert(0, "no operator "~op~" defined for Set");
}
///
@safe unittest
{
import std.algorithm.comparison : equal;
import std.range : iota;
auto lower = unicode.LowerCase;
auto upper = unicode.UpperCase;
auto ascii = unicode.ASCII;
assert((lower & upper).empty); // no intersection
auto lowerASCII = lower & ascii;
assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
// throw away all of the lowercase ASCII
assert((ascii - lower).length == 128 - 26);
auto onlyOneOf = lower ~ ascii;
assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
assert(onlyOneOf['$']); // ASCII and not lowercase
assert(!onlyOneOf['a']); // ASCII and lowercase
assert(onlyOneOf['я']); // not ASCII but lowercase
// throw away all cased letters from ASCII
auto noLetters = ascii - (lower | upper);
assert(noLetters.length == 128 - 26*2);
}
/// The 'op=' versions of the above overloaded operators.
ref This opOpAssign(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
static if (op == "|") // union
{
static if (is(U:dchar))
{
this.addInterval(rhs, rhs+1);
return this;
}
else
return this.add(rhs);
}
else static if (op == "&") // intersection
return this.intersect(rhs);// overloaded
else static if (op == "-") // set difference
return this.sub(rhs);// overloaded
else static if (op == "~") // symmetric set difference
{
auto copy = this & rhs;
this |= rhs;
this -= copy;
return this;
}
else
static assert(0, "no operator "~op~" defined for Set");
}
/**
Tests the presence of codepoint $(D ch) in this set,
the same as $(LREF opIndex).
*/
bool opBinaryRight(string op: "in", U)(U ch) const
if (is(U : dchar))
{
return this[ch];
}
///
@safe unittest
{
assert('я' in unicode.Cyrillic);
assert(!('z' in unicode.Cyrillic));
}
/**
* Obtains a set that is the inversion of this set.
*
* See_Also: $(LREF inverted)
*/
auto opUnary(string op: "!")()
{
return this.inverted;
}
/**
A range that spans each $(CODEPOINT) in this set.
*/
@property auto byCodepoint()
{
@trusted static struct CodepointRange
{
this(This set)
{
r = set.byInterval;
if (!r.empty)
cur = r.front.a;
}
@property dchar front() const
{
return cast(dchar) cur;
}
@property bool empty() const
{
return r.empty;
}
void popFront()
{
cur++;
while (cur >= r.front.b)
{
r.popFront();
if (r.empty)
break;
cur = r.front.a;
}
}
private:
uint cur;
typeof(This.init.byInterval) r;
}
return CodepointRange(this);
}
///
@safe unittest
{
import std.algorithm.comparison : equal;
import std.range : iota;
auto set = unicode.ASCII;
set.byCodepoint.equal(iota(0, 0x80));
}
/**
$(P Obtain textual representation of this set in from of
open-right intervals and feed it to $(D sink).
)
$(P Used by various standard formatting facilities such as
$(REF formattedWrite, std,_format), $(REF write, std,_stdio),
$(REF writef, std,_stdio), $(REF to, std,_conv) and others.
)
Example:
---
import std.conv;
assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
---
*/
private import std.format : FormatException, FormatSpec;
/***************************************
* Obtain a textual representation of this InversionList
* in form of open-right intervals.
*
* The formatting flag is applied individually to each value, for example:
* $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
* $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
* $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
*/
void toString(Writer)(scope Writer sink,
FormatSpec!char fmt) /* const */
{
import std.format : formatValue;
auto range = byInterval;
if (range.empty)
return;
while (1)
{
auto i = range.front;
range.popFront();
put(sink, "[");
formatValue(sink, i.a, fmt);
put(sink, "..");
formatValue(sink, i.b, fmt);
put(sink, ")");
if (range.empty) return;
put(sink, " ");
}
}
///
@safe unittest
{
import std.conv : to;
import std.format : format;
import std.uni : unicode;
assert(unicode.Cyrillic.to!string ==
"[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
// The specs '%s' and '%d' are equivalent to the to!string call above.
assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
assert(format("%#x", unicode.Cyrillic) ==
"[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
~"[0xa640..0xa698) [0xa69f..0xa6a0)");
assert(format("%#X", unicode.Cyrillic) ==
"[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
~"[0XA640..0XA698) [0XA69F..0XA6A0)");
}
@safe unittest
{
import std.exception : assertThrown;
import std.format : format;
assertThrown!FormatException(format("%a", unicode.ASCII));
}
/**
Add an interval [a, b$(RPAREN) to this set.
*/
ref add()(uint a, uint b)
{
addInterval(a, b);
return this;
}
///
@safe unittest
{
CodepointSet someSet;
someSet.add('0', '5').add('A','Z'+1);
someSet.add('5', '9'+1);
assert(someSet['0']);
assert(someSet['5']);
assert(someSet['9']);
assert(someSet['Z']);
}
private:
package(std) // used from: std.regex.internal.parser
ref intersect(U)(U rhs)
if (isCodepointSet!U)
{
Marker mark;
foreach ( i; rhs.byInterval)
{
mark = this.dropUpTo(i.a, mark);
mark = this.skipUpTo(i.b, mark);
}
this.dropUpTo(uint.max, mark);
return this;
}
ref intersect()(dchar ch)
{
foreach (i; byInterval)
if (i.a <= ch && ch < i.b)
return this = This.init.add(ch, ch+1);
this = This.init;
return this;
}
///
@safe unittest
{
assert(unicode.Cyrillic.intersect('-').byInterval.empty);
}
ref sub()(dchar ch)
{
return subChar(ch);
}
// same as the above except that skip & drop parts are swapped
package(std) // used from: std.regex.internal.parser
ref sub(U)(U rhs)
if (isCodepointSet!U)
{
Marker mark;
foreach (i; rhs.byInterval)
{
mark = this.skipUpTo(i.a, mark);
mark = this.dropUpTo(i.b, mark);
}
return this;
}
package(std) // used from: std.regex.internal.parse
ref add(U)(U rhs)
if (isCodepointSet!U)
{
Marker start;
foreach (i; rhs.byInterval)
{
start = addInterval(i.a, i.b, start);
}
return this;
}
// end of mixin-able part
//============================================================================
public:
/**
Obtains a set that is the inversion of this set.
See the '!' $(LREF opUnary) for the same but using operators.
*/
@property auto inverted()
{
InversionList inversion = this;
if (inversion.data.length == 0)
{
inversion.addInterval(0, lastDchar+1);
return inversion;
}
if (inversion.data[0] != 0)
genericReplace(inversion.data, 0, 0, [0]);
else
genericReplace(inversion.data, 0, 1, cast(uint[]) null);
if (data[data.length-1] != lastDchar+1)
genericReplace(inversion.data,
inversion.data.length, inversion.data.length, [lastDchar+1]);
else
genericReplace(inversion.data,
inversion.data.length-1, inversion.data.length, cast(uint[]) null);
return inversion;
}
///
@safe unittest
{
auto set = unicode.ASCII;
// union with the inverse gets all of the code points in the Unicode
assert((set | set.inverted).length == 0x110000);
// no intersection with the inverse
assert((set & set.inverted).empty);
}
/**
Generates string with D source code of unary function with name of
$(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
the code is adjusted to be a lambda function.
The function generated tests if the $(CODEPOINT) passed
belongs to this set or not. The result is to be used with string mixin.
The intended usage area is aggressive optimization via meta programming
in parser generators and the like.
Note: Use with care for relatively small or regular sets. It
could end up being slower then just using multi-staged tables.
Example:
---
import std.stdio;
// construct set directly from [a, b$RPAREN intervals
auto set = CodepointSet(10, 12, 45, 65, 100, 200);
writeln(set);
writeln(set.toSourceCode("func"));
---
The above outputs something along the lines of:
---
bool func(dchar ch) @safe pure nothrow @nogc
{
if (ch < 45)
{
if (ch == 10 || ch == 11) return true;
return false;
}
else if (ch < 65) return true;
else
{
if (ch < 100) return false;
if (ch < 200) return true;
return false;
}
}
---
*/
string toSourceCode(string funcName="")
{
import std.algorithm.searching : countUntil;
import std.array : array;
import std.format : format;
enum maxBinary = 3;
static string linearScope(R)(R ivals, string indent)
{
string result = indent~"{\n";
string deeper = indent~" ";
foreach (ival; ivals)
{
immutable span = ival[1] - ival[0];
assert(span != 0);
if (span == 1)
{
result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
}
else if (span == 2)
{
result ~= format("%sif (ch == %s || ch == %s) return true;\n",
deeper, ival[0], ival[0]+1);
}
else
{
if (ival[0] != 0) // dchar is unsigned and < 0 is useless
result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
}
}
result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
return result;
}
static string binaryScope(R)(R ivals, string indent)
{
// time to do unrolled comparisons?
if (ivals.length < maxBinary)
return linearScope(ivals, indent);
else
return bisect(ivals, ivals.length/2, indent);
}
// not used yet if/elsebinary search is far better with DMD as of 2.061
// and GDC is doing fine job either way
static string switchScope(R)(R ivals, string indent)
{
string result = indent~"switch (ch){\n";
string deeper = indent~" ";
foreach (ival; ivals)
{
if (ival[0]+1 == ival[1])
{
result ~= format("%scase %s: return true;\n",
deeper, ival[0]);
}
else
{
result ~= format("%scase %s: .. case %s: return true;\n",
deeper, ival[0], ival[1]-1);
}
}
result ~= deeper~"default: return false;\n"~indent~"}\n";
return result;
}
static string bisect(R)(R range, size_t idx, string indent)
{
string deeper = indent ~ " ";
// bisect on one [a, b) interval at idx
string result = indent~"{\n";
// less branch, < a
result ~= format("%sif (ch < %s)\n%s",
deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
// middle point, >= a && < b
result ~= format("%selse if (ch < %s) return true;\n",
deeper, range[idx][1]);
// greater or equal branch, >= b
result ~= format("%selse\n%s",
deeper, binaryScope(range[idx+1..$], deeper));
return result~indent~"}\n";
}
string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
funcName.empty ? "function" : funcName);
auto range = byInterval.array();
// special case first bisection to be on ASCII vs beyond
auto tillAscii = countUntil!"a[0] > 0x80"(range);
if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
code ~= binaryScope(range, "");
else
code ~= bisect(range, tillAscii, "");
return code;
}
/**
True if this set doesn't contain any $(CODEPOINTS).
*/
@property bool empty() const
{
return data.length == 0;
}
///
@safe unittest
{
CodepointSet emptySet;
assert(emptySet.length == 0);
assert(emptySet.empty);
}
private:
alias This = typeof(this);
alias Marker = size_t;
// a random-access range of integral pairs
static struct Intervals(Range)
{
this(Range sp)
{
slice = sp;
start = 0;
end = sp.length;
}
this(Range sp, size_t s, size_t e)
{
slice = sp;
start = s;
end = e;
}
@property auto front()const
{
immutable a = slice[start];
immutable b = slice[start+1];
return CodepointInterval(a, b);
}
//may break sorted property - but we need std.sort to access it
//hence package protection attribute
package @property void front(CodepointInterval val)
{
slice[start] = val.a;
slice[start+1] = val.b;
}
@property auto back()const
{
immutable a = slice[end-2];
immutable b = slice[end-1];
return CodepointInterval(a, b);
}
//ditto about package
package @property void back(CodepointInterval val)
{
slice[end-2] = val.a;
slice[end-1] = val.b;
}
void popFront()
{
start += 2;
}
void popBack()
{
end -= 2;
}
auto opIndex(size_t idx) const
{
immutable a = slice[start+idx*2];
immutable b = slice[start+idx*2+1];
return CodepointInterval(a, b);
}
//ditto about package
package void opIndexAssign(CodepointInterval val, size_t idx)
{
slice[start+idx*2] = val.a;
slice[start+idx*2+1] = val.b;
}
auto opSlice(size_t s, size_t e)
{
return Intervals(slice, s*2+start, e*2+start);
}
@property size_t length()const { return slice.length/2; }
@property bool empty()const { return start == end; }
@property auto save(){ return this; }
private:
size_t start, end;
Range slice;
}
// called after construction from intervals
// to make sure invariants hold
void sanitize()
{
import std.algorithm.comparison : max;
import std.algorithm.mutation : SwapStrategy;
import std.algorithm.sorting : sort;
if (data.length == 0)
return;
alias Ival = CodepointInterval;
//intervals wrapper for a _range_ over packed array
auto ivals = Intervals!(typeof(data[]))(data[]);
//@@@BUG@@@ can't use "a.a < b.a" see issue 12265
sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
// what follows is a variation on stable remove
// differences:
// - predicate is binary, and is tested against
// the last kept element (at 'i').
// - predicate mutates lhs (merges rhs into lhs)
size_t len = ivals.length;
size_t i = 0;
size_t j = 1;
while (j < len)
{
if (ivals[i].b >= ivals[j].a)
{
ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
j++;
}
else //unmergable
{
// check if there is a hole after merges
// (in the best case we do 0 writes to ivals)
if (j != i+1)
ivals[i+1] = ivals[j]; //copy over
i++;
j++;
}
}
len = i + 1;
for (size_t k=0; k + 1 < len; k++)
{
assert(ivals[k].a < ivals[k].b);
assert(ivals[k].b < ivals[k+1].a);
}
data.length = len * 2;
}
// special case for normal InversionList
ref subChar(dchar ch)
{
auto mark = skipUpTo(ch);
if (mark != data.length
&& data[mark] == ch && data[mark-1] == ch)
{
// it has split, meaning that ch happens to be in one of intervals
data[mark] = data[mark]+1;
}
return this;
}
//
Marker addInterval(int a, int b, Marker hint=Marker.init)
in
{
assert(a <= b);
}
body
{
import std.range : assumeSorted, SearchPolicy;
auto range = assumeSorted(data[]);
size_t pos;
size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
if (a_idx == range.length)
{
// [---+++----++++----++++++]
// [ a b]
data.append(a, b);
return data.length-1;
}
size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
uint[3] buf = void;
uint to_insert;
debug(std_uni)
{
writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
}
if (b_idx == range.length)
{
// [-------++++++++----++++++-]
// [ s a b]
if (a_idx & 1)// a in positive
{
buf[0] = b;
to_insert = 1;
}
else// a in negative
{
buf[0] = a;
buf[1] = b;
to_insert = 2;
}
pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
return pos - 1;
}
uint top = data[b_idx];
debug(std_uni)
{
writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
writefln("a=%s; b=%s; top=%s;", a, b, top);
}
if (a_idx & 1)
{// a in positive
if (b_idx & 1)// b in positive
{
// [-------++++++++----++++++-]
// [ s a b ]
buf[0] = top;
to_insert = 1;
}
else // b in negative
{
// [-------++++++++----++++++-]
// [ s a b ]
if (top == b)
{
assert(b_idx+1 < data.length);
buf[0] = data[b_idx+1];
pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
return pos - 1;
}
buf[0] = b;
buf[1] = top;
to_insert = 2;
}
}
else
{ // a in negative
if (b_idx & 1) // b in positive
{
// [----------+++++----++++++-]
// [ a b ]
buf[0] = a;
buf[1] = top;
to_insert = 2;
}
else// b in negative
{
// [----------+++++----++++++-]
// [ a s b ]
if (top == b)
{
assert(b_idx+1 < data.length);
buf[0] = a;
buf[1] = data[b_idx+1];
pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
return pos - 1;
}
buf[0] = a;
buf[1] = b;
buf[2] = top;
to_insert = 3;
}
}
pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
debug(std_uni)
{
writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
writeln("inserting ", buf[0 .. to_insert]);
}
return pos - 1;
}
//
Marker dropUpTo(uint a, Marker pos=Marker.init)
in
{
assert(pos % 2 == 0); // at start of interval
}
body
{
auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
if (range.empty)
return pos;
size_t idx = pos;
idx += range.lowerBound(a).length;
debug(std_uni)
{
writeln("dropUpTo full length=", data.length);
writeln(pos,"~~~", idx);
}
if (idx == data.length)
return genericReplace(data, pos, idx, cast(uint[])[]);
if (idx & 1)
{ // a in positive
//[--+++----++++++----+++++++------...]
// |<---si s a t
genericReplace(data, pos, idx, [a]);
}
else
{ // a in negative
//[--+++----++++++----+++++++-------+++...]
// |<---si s a t
genericReplace(data, pos, idx, cast(uint[])[]);
}
return pos;
}
//
Marker skipUpTo(uint a, Marker pos=Marker.init)
out(result)
{
assert(result % 2 == 0);// always start of interval
//(may be 0-width after-split)
}
body
{
assert(data.length % 2 == 0);
auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
size_t idx = pos+range.lowerBound(a).length;
if (idx >= data.length) // could have Marker point to recently removed stuff
return data.length;
if (idx & 1)// inside of interval, check for split
{
immutable top = data[idx];
if (top == a)// no need to split, it's end
return idx+1;
immutable start = data[idx-1];
if (a == start)
return idx-1;
// split it up
genericReplace(data, idx, idx+1, [a, a, top]);
return idx+1; // avoid odd index
}
return idx;
}
CowArray!SP data;
}
@system unittest
{
import std.conv : to;
assert(unicode.ASCII.to!string() == "[0..128)");
}
// pedantic version for ctfe, and aligned-access only architectures
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
idx *= 3;
version(LittleEndian)
return ptr[idx] + (cast(uint) ptr[idx+1]<<8)
+ (cast(uint) ptr[idx+2]<<16);
else
return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8)
+ ptr[idx+2];
}
// ditto
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
idx *= 3;
version(LittleEndian)
{
ptr[idx] = val & 0xFF;
ptr[idx+1] = (val >> 8) & 0xFF;
ptr[idx+2] = (val >> 16) & 0xFF;
}
else
{
ptr[idx] = (val >> 16) & 0xFF;
ptr[idx+1] = (val >> 8) & 0xFF;
ptr[idx+2] = val & 0xFF;
}
}
// unaligned x86-like read/write functions
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
uint* src = cast(uint*)(ptr+3*idx);
version(LittleEndian)
return *src & 0xFF_FFFF;
else
return *src >> 8;
}
// ditto
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
version(LittleEndian)
*dest = val | (*dest & 0xFF00_0000);
else
*dest = (val << 8) | (*dest & 0xFF);
}
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
static if (hasUnalignedReads)
return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
else
return safeRead24(ptr, idx);
}
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
static if (hasUnalignedReads)
return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
else
return safeWrite24(ptr, val, idx);
}
struct CowArray(SP=GcPolicy)
{
import std.range.primitives : hasLength;
@safe:
static auto reuse(uint[] arr)
{
CowArray cow;
cow.data = arr;
SP.append(cow.data, 1);
assert(cow.refCount == 1);
assert(cow.length == arr.length);
return cow;
}
this(Range)(Range range)
if (isInputRange!Range && hasLength!Range)
{
import std.algorithm.mutation : copy;
length = range.length;
copy(range, data[0..$-1]);
}
this(Range)(Range range)
if (isForwardRange!Range && !hasLength!Range)
{
import std.algorithm.mutation : copy;
import std.range.primitives : walkLength;
immutable len = walkLength(range.save);
length = len;
copy(range, data[0..$-1]);
}
this(this)
{
if (!empty)
{
refCount = refCount + 1;
}
}
~this()
{
if (!empty)
{
immutable cnt = refCount;
if (cnt == 1)
SP.destroy(data);
else
refCount = cnt - 1;
}
}
// no ref-count for empty U24 array
@property bool empty() const { return data.length == 0; }
// report one less then actual size
@property size_t length() const
{
return data.length ? data.length - 1 : 0;
}
//+ an extra slot for ref-count
@property void length(size_t len)
{
import std.algorithm.comparison : min;
import std.algorithm.mutation : copy;
if (len == 0)
{
if (!empty)
freeThisReference();
return;
}
immutable total = len + 1; // including ref-count
if (empty)
{
data = SP.alloc!uint(total);
refCount = 1;
return;
}
immutable cur_cnt = refCount;
if (cur_cnt != 1) // have more references to this memory
{
refCount = cur_cnt - 1;
auto new_data = SP.alloc!uint(total);
// take shrinking into account
auto to_copy = min(total, data.length) - 1;
copy(data[0 .. to_copy], new_data[0 .. to_copy]);
data = new_data; // before setting refCount!
refCount = 1;
}
else // 'this' is the only reference
{
// use the realloc (hopefully in-place operation)
data = SP.realloc(data, total);
refCount = 1; // setup a ref-count in the new end of the array
}
}
alias opDollar = length;
uint opIndex()(size_t idx)const
{
return data[idx];
}
void opIndexAssign(uint val, size_t idx)
{
auto cnt = refCount;
if (cnt != 1)
dupThisReference(cnt);
data[idx] = val;
}
//
auto opSlice(size_t from, size_t to)
{
if (!empty)
{
auto cnt = refCount;
if (cnt != 1)
dupThisReference(cnt);
}
return data[from .. to];
}
//
auto opSlice(size_t from, size_t to) const
{
return data[from .. to];
}
// length slices before the ref count
auto opSlice()
{
return opSlice(0, length);
}
// ditto
auto opSlice() const
{
return opSlice(0, length);
}
void append(Range)(Range range)
if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
{
size_t nl = length + range.length;
length = nl;
copy(range, this[nl-range.length .. nl]);
}
void append()(uint[] val...)
{
length = length + val.length;
data[$-val.length-1 .. $-1] = val[];
}
bool opEquals()(auto const ref CowArray rhs)const
{
if (empty ^ rhs.empty)
return false; // one is empty and the other isn't
return empty || data[0..$-1] == rhs.data[0..$-1];
}
private:
// ref-count is right after the data
@property uint refCount() const
{
return data[$-1];
}
@property void refCount(uint cnt)
{
data[$-1] = cnt;
}
void freeThisReference()
{
immutable count = refCount;
if (count != 1) // have more references to this memory
{
// dec shared ref-count
refCount = count - 1;
data = [];
}
else
SP.destroy(data);
assert(!data.ptr);
}
void dupThisReference(uint count)
in
{
assert(!empty && count != 1 && count == refCount);
}
body
{
import std.algorithm.mutation : copy;
// dec shared ref-count
refCount = count - 1;
// copy to the new chunk of RAM
auto new_data = SP.alloc!uint(data.length);
// bit-blit old stuff except the counter
copy(data[0..$-1], new_data[0..$-1]);
data = new_data; // before setting refCount!
refCount = 1; // so that this updates the right one
}
uint[] data;
}
@safe unittest// Uint24 tests
{
import std.algorithm.comparison : equal;
import std.algorithm.mutation : copy;
import std.conv : text;
import std.range : iota, chain;
import std.range.primitives : isBidirectionalRange, isOutputRange;
void funcRef(T)(ref T u24)
{
u24.length = 2;
u24[1] = 1024;
T u24_c = u24;
assert(u24[1] == 1024);
u24.length = 0;
assert(u24.empty);
u24.append([1, 2]);
assert(equal(u24[], [1, 2]));
u24.append(111);
assert(equal(u24[], [1, 2, 111]));
assert(!u24_c.empty && u24_c[1] == 1024);
u24.length = 3;
copy(iota(0, 3), u24[]);
assert(equal(u24[], iota(0, 3)));
assert(u24_c[1] == 1024);
}
void func2(T)(T u24)
{
T u24_2 = u24;
T u24_3;
u24_3 = u24_2;
assert(u24_2 == u24_3);
assert(equal(u24[], u24_2[]));
assert(equal(u24_2[], u24_3[]));
funcRef(u24_3);
assert(equal(u24_3[], iota(0, 3)));
assert(!equal(u24_2[], u24_3[]));
assert(equal(u24_2[], u24[]));
u24_2 = u24_3;
assert(equal(u24_2[], iota(0, 3)));
// to test that passed arg is intact outside
// plus try out opEquals
u24 = u24_3;
u24 = T.init;
u24_3 = T.init;
assert(u24.empty);
assert(u24 == u24_3);
assert(u24 != u24_2);
}
foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
{
alias Range = typeof(CowArray!Policy.init[]);
alias U24A = CowArray!Policy;
static assert(isForwardRange!Range);
static assert(isBidirectionalRange!Range);
static assert(isOutputRange!(Range, uint));
static assert(isRandomAccessRange!(Range));
auto arr = U24A([42u, 36, 100]);
assert(arr[0] == 42);
assert(arr[1] == 36);
arr[0] = 72;
arr[1] = 0xFE_FEFE;
assert(arr[0] == 72);
assert(arr[1] == 0xFE_FEFE);
assert(arr[2] == 100);
U24A arr2 = arr;
assert(arr2[0] == 72);
arr2[0] = 11;
// test COW-ness
assert(arr[0] == 72);
assert(arr2[0] == 11);
// set this to about 100M to stress-test COW memory management
foreach (v; 0 .. 10_000)
func2(arr);
assert(equal(arr[], [72, 0xFE_FEFE, 100]));
auto r2 = U24A(iota(0, 100));
assert(equal(r2[], iota(0, 100)), text(r2[]));
copy(iota(10, 170, 2), r2[10 .. 90]);
assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
, text(r2[]));
}
}
version(unittest)
{
private alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
}
@safe unittest// core set primitives test
{
import std.conv : text;
foreach (CodeList; AllSets)
{
CodeList a;
//"plug a hole" test
a.add(10, 20).add(25, 30).add(15, 27);
assert(a == CodeList(10, 30), text(a));
auto x = CodeList.init;
x.add(10, 20).add(30, 40).add(50, 60);
a = x;
a.add(20, 49);//[10, 49) [50, 60)
assert(a == CodeList(10, 49, 50 ,60));
a = x;
a.add(20, 50);
assert(a == CodeList(10, 60), text(a));
// simple unions, mostly edge effects
x = CodeList.init;
x.add(10, 20).add(40, 60);
a = x;
a.add(10, 25); //[10, 25) [40, 60)
assert(a == CodeList(10, 25, 40, 60));
a = x;
a.add(5, 15); //[5, 20) [40, 60)
assert(a == CodeList(5, 20, 40, 60));
a = x;
a.add(0, 10); // [0, 20) [40, 60)
assert(a == CodeList(0, 20, 40, 60));
a = x;
a.add(0, 5); // prepand
assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
a = x;
a.add(5, 20);
assert(a == CodeList(5, 20, 40, 60));
a = x;
a.add(3, 37);
assert(a == CodeList(3, 37, 40, 60));
a = x;
a.add(37, 65);
assert(a == CodeList(10, 20, 37, 65));
// some tests on helpers for set intersection
x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
a = x;
auto m = a.skipUpTo(60);
a.dropUpTo(110, m);
assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
a = x;
a.dropUpTo(100);
assert(a == CodeList(100, 120), text(a.data[]));
a = x;
m = a.skipUpTo(50);
a.dropUpTo(140, m);
assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
a = x;
a.dropUpTo(60);
assert(a == CodeList(100, 120), text(a.data[]));
}
}
//test constructor to work with any order of intervals
@safe unittest
{
import std.algorithm.comparison : equal;
import std.conv : text, to;
import std.range : chain, iota;
import std.typecons : tuple;
//ensure constructor handles bad ordering and overlap
auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
assert(ch in c1, to!string(ch));
//contiguos
assert(CodepointSet(1000, 1006, 1006, 1009)
.byInterval.equal([tuple(1000, 1009)]));
//contains
assert(CodepointSet(900, 1200, 1000, 1100)
.byInterval.equal([tuple(900, 1200)]));
//intersect left
assert(CodepointSet(900, 1100, 1000, 1200)
.byInterval.equal([tuple(900, 1200)]));
//intersect right
assert(CodepointSet(1000, 1200, 900, 1100)
.byInterval.equal([tuple(900, 1200)]));
//ditto with extra items at end
assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
.byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
.byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
//"plug a hole" test
auto c2 = CodepointSet(20, 40,
60, 80, 100, 140, 150, 200,
40, 60, 80, 100, 140, 150
);
assert(c2.byInterval.equal([tuple(20, 200)]));
auto c3 = CodepointSet(
20, 40, 60, 80, 100, 140, 150, 200,
0, 10, 15, 100, 10, 20, 200, 220);
assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
@safe unittest
{ // full set operations
import std.conv : text;
foreach (CodeList; AllSets)
{
CodeList a, b, c, d;
//"plug a hole"
a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
b.add(40, 60).add(80, 100).add(140, 150);
c = a | b;
d = b | a;
assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
assert(c == d, text(c," vs ", d));
b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
d = b | a;
assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
assert(c == d, text(c," vs ", d));
b = CodeList.init.add(10, 20).add(30,100).add(145,200);
c = a | b;//[10, 140) [145, 200)
d = b | a;
assert(c == CodeList(10, 140, 145, 200));
assert(c == d, text(c," vs ", d));
b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
c = a | b;//[0, 140) [150, 220)
d = b | a;
assert(c == CodeList(0, 140, 150, 220));
assert(c == d, text(c," vs ", d));
a = CodeList.init.add(20, 40).add(60, 80);
b = CodeList.init.add(25, 35).add(65, 75);
c = a & b;
d = b & a;
assert(c == CodeList(25, 35, 65, 75), text(c));
assert(c == d, text(c," vs ", d));
a = CodeList.init.add(20,