Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
548 lines (433 sloc)
14.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Converts one field of a Unicode data file into a UTF-8 binary.
#
# The field is a space-separated list of hexadecimal codepoints
# (for example "0069 0307"). An empty field returns nil, which lets
# callers distinguish "no mapping" from an actual mapping.
to_binary = fn
  "" ->
    nil
  codepoints ->
    # Each hexadecimal token is encoded as UTF-8 and appended in order.
    for hex <- :binary.split(codepoints, " ", [:global]), into: "" do
      <<String.to_integer(hex, 16)::utf8>>
    end
end
defmodule String.Unicode do
  @moduledoc false

  # Version triple reported by this module.
  # NOTE(review): presumably the Unicode version of the bundled data
  # files; confirm whenever those files are replaced.
  def version, do: {7, 0, 0}

  # Everything below up to "# Downcase" runs at compile time and builds
  # the two tables the function clauses are generated from:
  #
  #   * codes      - {codepoint, upper, lower, title} tuples for every
  #                  entry that has at least one case mapping
  #   * whitespace - codepoints whose bidi class is B, S or WS
  #
  # All entries are UTF-8 binaries produced by `to_binary`, which is
  # defined outside this module (nil means "no mapping").
  data_path = Path.join(__DIR__, "UnicodeData.txt")

  {codes, whitespace} = Enum.reduce File.stream!(data_path), {[], []}, fn(line, {cacc, wacc}) ->
    [codepoint, _name, _category,
     _class, bidi, _decomposition,
     _numeric_1, _numeric_2, _numeric_3,
     _bidi_mirror, _unicode_1, _iso,
     upper, lower, title] = :binary.split(line, ";", [:global])

    # title is the last field of the line; drop its final byte
    # (the line terminator kept by File.stream!).
    title = :binary.part(title, 0, byte_size(title) - 1)

    cond do
      upper != "" or lower != "" or title != "" ->
        {[{to_binary.(codepoint),
           to_binary.(upper),
           to_binary.(lower),
           to_binary.(title)} | cacc],
         wacc}
      bidi in ["B", "S", "WS"] ->
        {cacc, [to_binary.(codepoint) | wacc]}
      true ->
        {cacc, wacc}
    end
  end

  # SpecialCasing entries replace the UnicodeData entry with the same
  # codepoint, or are appended when there is none (:lists.keystore/4).
  # Note the field order in this file: lower, title, upper.
  special_path = Path.join(__DIR__, "SpecialCasing.txt")

  codes = Enum.reduce File.stream!(special_path), codes, fn(line, acc) ->
    [codepoint, lower, title, upper, _] = :binary.split(line, "; ", [:global])
    key = to_binary.(codepoint)
    :lists.keystore(key, 1, acc, {key,
                                  to_binary.(upper),
                                  to_binary.(lower),
                                  to_binary.(title)})
  end

  # Downcase

  # Returns `string` with every codepoint that has a lowercase mapping
  # replaced by that mapping; all other bytes are copied unchanged.
  def downcase(string), do: downcase(string, "")

  # One clause per codepoint whose lowercase form differs from itself.
  for {codepoint, _upper, lower, _title} <- codes, lower && lower != codepoint do
    defp downcase(unquote(codepoint) <> rest, acc) do
      downcase(rest, acc <> unquote(lower))
    end
  end

  # Fallback: copy a single byte.
  defp downcase(<<char, rest::binary>>, acc) do
    downcase(rest, <<acc::binary, char>>)
  end

  defp downcase("", acc), do: acc

  # Upcase

  # Returns `string` with every codepoint that has an uppercase mapping
  # replaced by that mapping; all other bytes are copied unchanged.
  def upcase(string), do: upcase(string, "")

  # One clause per codepoint whose uppercase form differs from itself.
  for {codepoint, upper, _lower, _title} <- codes, upper && upper != codepoint do
    defp upcase(unquote(codepoint) <> rest, acc) do
      upcase(rest, acc <> unquote(upper))
    end
  end

  # Fallback: copy a single byte.
  defp upcase(<<char, rest::binary>>, acc) do
    upcase(rest, <<acc::binary, char>>)
  end

  defp upcase("", acc), do: acc

  # Titlecase once

  # Returns {titlecased_first, rest}. When the leading codepoint has no
  # distinct titlecase mapping, the first *byte* is returned as-is.
  def titlecase_once(""), do: {"", ""}

  for {codepoint, _upper, _lower, title} <- codes, title && title != codepoint do
    def titlecase_once(unquote(codepoint) <> rest) do
      {unquote(title), rest}
    end
  end

  def titlecase_once(<<char, rest::binary>>) do
    {<<char>>, rest}
  end

  # Strip

  # Removes every leading whitespace codepoint (see `whitespace` above).
  def lstrip(string)

  def lstrip(""), do: ""

  for codepoint <- whitespace do
    def lstrip(unquote(codepoint) <> rest) do
      lstrip(rest)
    end
  end

  def lstrip(string) when is_binary(string), do: string

  # rstrip works on the tail of the binary. do_rstrip_l/1 receives the
  # last @whitespace_max_size bytes and returns how many bytes to drop
  # as a negative number (0 means "no trailing whitespace").
  # do_rstrip_s/1 handles binaries shorter than @whitespace_max_size and
  # returns the stripped binary directly.
  @whitespace_max_size 3
  for codepoint <- whitespace do
    # We need to increment @whitespace_max_size as well
    # as the small table (_s) if we add a new entry here.
    case byte_size(codepoint) do
      3 ->
        defp do_rstrip_l(unquote(codepoint)), do: -3
      2 ->
        defp do_rstrip_l(<<_, unquote(codepoint)>>), do: -2

        defp do_rstrip_s(unquote(codepoint)), do: <<>>
      1 ->
        defp do_rstrip_l(<<unquote(codepoint), unquote(codepoint), unquote(codepoint)>>), do: -3
        defp do_rstrip_l(<<_, unquote(codepoint), unquote(codepoint)>>), do: -2
        defp do_rstrip_l(<<_, _, unquote(codepoint)>>), do: -1

        defp do_rstrip_s(<<x, unquote(codepoint)>>), do: do_rstrip_s(<<x>>)
        defp do_rstrip_s(unquote(codepoint)), do: <<>>
    end
  end

  defp do_rstrip_l(_), do: 0
  defp do_rstrip_s(o), do: o

  # Removes every trailing whitespace codepoint.
  def rstrip(string) when is_binary(string) do
    rstrip(string, byte_size(string))
  end

  defp rstrip(string, size) when size < @whitespace_max_size do
    do_rstrip_s(string)
  end

  defp rstrip(string, size) do
    # A negative length makes binary_part/3 read backwards from `size`,
    # i.e. the last @whitespace_max_size bytes.
    trail = binary_part(string, size, -@whitespace_max_size)
    case do_rstrip_l(trail) do
      0 -> string
      x -> rstrip(binary_part(string, 0, size + x), size + x)
    end
  end

  # Split

  # Splits on whitespace codepoints; empty pieces are never emitted.
  def split(""), do: []

  def split(string) when is_binary(string) do
    :lists.reverse do_split(string, "", [])
  end

  for codepoint <- whitespace do
    defp do_split(unquote(codepoint) <> rest, buffer, acc) do
      do_split(rest, "", add_buffer_to_acc(buffer, acc))
    end
  end

  defp do_split(<<char, rest::binary>>, buffer, acc) do
    do_split(rest, <<buffer::binary, char>>, acc)
  end

  defp do_split(<<>>, buffer, acc) do
    add_buffer_to_acc(buffer, acc)
  end

  # Pushes the current piece onto the (reversed) result, skipping "".
  @compile {:inline, add_buffer_to_acc: 2}

  defp add_buffer_to_acc("", acc), do: acc
  defp add_buffer_to_acc(buffer, acc), do: [buffer|acc]

  # Codepoints

  # Returns {codepoint, rest}, or nil for the empty binary. A byte that
  # does not start a valid UTF-8 sequence is returned on its own.
  def next_codepoint(<<cp::utf8, rest::binary>>) do
    {<<cp::utf8>>, rest}
  end

  def next_codepoint(<<cp, rest::binary>>) do
    {<<cp>>, rest}
  end

  def next_codepoint(<<>>) do
    nil
  end

  # Returns the list of codepoints in `binary`, in order.
  def codepoints(binary) when is_binary(binary) do
    do_codepoints(next_codepoint(binary))
  end

  defp do_codepoints({c, rest}) do
    [c|do_codepoints(next_codepoint(rest))]
  end

  defp do_codepoints(nil) do
    []
  end
end
defmodule String.Graphemes do
  @moduledoc false

  # Compile-time setup: read GraphemeBreakProperty.txt into a map of
  # break class => list of codepoints (each a UTF-8 binary). The
  # function clauses below are generated from that map, and their
  # ORDER matters because the first matching clause wins.
  cluster_path = Path.join(__DIR__, "GraphemeBreakProperty.txt")
  regex = ~r/(?:^([0-9A-F]+)(?:\.\.([0-9A-F]+))?)\s+;\s(\w+)/m

  # Expands "first" or "first..last" (hexadecimal) into a list of
  # UTF-8 encoded codepoints. The regex yields "" when there is no
  # upper bound.
  to_range = fn
    first, "" ->
      [<<String.to_integer(first, 16)::utf8>>]
    first, last ->
      range = String.to_integer(first, 16)..String.to_integer(last, 16)
      Enum.map(range, fn(int) -> <<int::utf8>> end)
  end

  cluster = Enum.reduce File.stream!(cluster_path), %{}, fn(line, dict) ->
    [_full, first, last, class] = Regex.run(regex, line)

    # Skip surrogates
    if first == "D800" and last == "DFFF" do
      dict
    else
      list = to_range.(first, last)
      Map.update(dict, class, list, &(&1 ++ list))
    end
  end

  # No codepoint carried the Prepend class when this was written (the
  # note originally cited Unicode 6.3.0), so the Prepend clauses below
  # are commented out and compilation aborts if the data defines one.
  if cluster["Prepend"] do
    raise "it seems this new unicode version has added Prepend items. " <>
          "Please remove this error and uncomment the code below"
  end

  # next_grapheme_size/1 returns {byte_size_of_next_grapheme, rest},
  # or nil for the empty binary.

  # Don't break CRLF
  def next_grapheme_size(<<?\r, ?\n, rest::binary>>) do
    {2, rest}
  end

  # Break on control
  for codepoint <- cluster["CR"] ++ cluster["LF"] ++ cluster["Control"] do
    def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
      {unquote(byte_size(codepoint)), rest}
    end
  end

  # Break on Prepend*
  # for codepoint <- cluster["Prepend"] do
  #   def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
  #     next_prepend_size(rest, unquote(byte_size(codepoint)))
  #   end
  # end

  # Handle Hangul L
  for codepoint <- cluster["L"] do
    def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
      next_hangul_l_size(rest, unquote(byte_size(codepoint)))
    end
  end

  # Handle Hangul T
  for codepoint <- cluster["T"] do
    def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
      next_hangul_t_size(rest, unquote(byte_size(codepoint)))
    end
  end

  # Handle Regional
  for codepoint <- cluster["Regional_Indicator"] do
    def next_grapheme_size(<<unquote(codepoint), rest::binary>>) do
      next_regional_size(rest, unquote(byte_size(codepoint)))
    end
  end

  # Handle extended entries: any other valid codepoint starts a
  # grapheme; its UTF-8 width is derived from the codepoint value.
  def next_grapheme_size(<<cp::utf8, rest::binary>>) do
    case cp do
      x when x <= 0x007F -> next_extend_size(rest, 1)
      x when x <= 0x07FF -> next_extend_size(rest, 2)
      x when x <= 0xFFFF -> next_extend_size(rest, 3)
      _                  -> next_extend_size(rest, 4)
    end
  end

  # A byte that is not valid UTF-8 counts as a grapheme of its own.
  def next_grapheme_size(<<_, rest::binary>>) do
    {1, rest}
  end

  def next_grapheme_size(<<>>) do
    nil
  end

  # Handle Hangul L: after an L, accept more L, or LV / LVT syllables;
  # otherwise fall through to the V stage.
  for codepoint <- cluster["L"] do
    defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
      next_hangul_l_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  for codepoint <- cluster["LV"] do
    defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
      next_hangul_v_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  for codepoint <- cluster["LVT"] do
    defp next_hangul_l_size(<<unquote(codepoint), rest::binary>>, size) do
      next_hangul_t_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  defp next_hangul_l_size(rest, size) do
    next_hangul_v_size(rest, size)
  end

  # Handle Hangul V
  for codepoint <- cluster["V"] do
    defp next_hangul_v_size(<<unquote(codepoint), rest::binary>>, size) do
      next_hangul_v_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  defp next_hangul_v_size(rest, size) do
    next_hangul_t_size(rest, size)
  end

  # Handle Hangul T
  for codepoint <- cluster["T"] do
    defp next_hangul_t_size(<<unquote(codepoint), rest::binary>>, size) do
      next_hangul_t_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  defp next_hangul_t_size(rest, size) do
    next_extend_size(rest, size)
  end

  # Handle regional
  for codepoint <- cluster["Regional_Indicator"] do
    defp next_regional_size(<<unquote(codepoint), rest::binary>>, size) do
      next_regional_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  defp next_regional_size(rest, size) do
    next_extend_size(rest, size)
  end

  # Handle Extend+SpacingMark: keep consuming while the next codepoint
  # extends the current grapheme, then return the accumulated size.
  for codepoint <- cluster["Extend"] ++ cluster["SpacingMark"] do
    defp next_extend_size(<<unquote(codepoint), rest::binary>>, size) do
      next_extend_size(rest, size + unquote(byte_size(codepoint)))
    end
  end

  defp next_extend_size(rest, size) do
    {size, rest}
  end

  # Handle Prepend
  # for codepoint <- cluster["Prepend"] do
  #   defp next_prepend_size(<<unquote(codepoint), rest::binary>>, size) do
  #     next_prepend_size(rest, size + unquote(byte_size(codepoint)))
  #   end
  # end
  #
  # defp next_prepend_size(rest, size) do
  #   {size, rest}
  # end

  ## Tight-loop implementations

  # Returns the list of graphemes in `binary`, in order.
  def graphemes(binary) when is_binary(binary) do
    do_graphemes(next_grapheme_size(binary), binary)
  end

  defp do_graphemes({size, rest}, binary) do
    [:binary.part(binary, 0, size)|do_graphemes(next_grapheme_size(rest), rest)]
  end

  defp do_graphemes(nil, _) do
    []
  end

  # Returns the number of graphemes in `string`.
  def length(string) do
    do_length(next_grapheme_size(string), 0)
  end

  defp do_length({_, rest}, acc) do
    do_length(next_grapheme_size(rest), acc + 1)
  end

  defp do_length(nil, acc), do: acc

  # Returns {byte_offset, rest} where byte_offset is the size in bytes
  # of the first `pos` graphemes and rest is the remaining binary. When
  # the string has fewer than `pos` graphemes, returns
  # {total_byte_size, nil}.
  def split_at(string, pos) do
    do_split_at(string, 0, pos, 0)
  end

  defp do_split_at(string, acc, desired_pos, current_pos) when desired_pos > current_pos do
    case next_grapheme_size(string) do
      {count, rest} -> do_split_at(rest, acc + count, desired_pos, current_pos + 1)
      nil -> {acc, nil}
    end
  end

  defp do_split_at(string, acc, desired_pos, desired_pos) do
    {acc, string}
  end
end
defmodule String.Normalizer do
  @moduledoc false

  # Compile-time setup: three tables are read from data files next to
  # this source file and turned into function clauses further down.
  # `to_binary` is defined outside this module; it returns nil for an
  # empty field, hence the `|| ""` fallbacks.

  # {codepoint, decomposition} pairs; a decomposition has one to four
  # codepoints.
  decomposition_path = Path.join(__DIR__, "Decomposition.txt")

  decompositions = Enum.reduce File.stream!(decomposition_path), [], fn(line, acc) ->
    [key, first, second, third, fourth, _] = :binary.split(line, ";", [:global])
    decomposition = to_binary.(first) <> (to_binary.(second) || "") <>
                                         (to_binary.(third) || "") <>
                                         (to_binary.(fourth) || "")
    [{to_binary.(key), decomposition} | acc]
  end

  # {first <> second, composition} pairs.
  composition_path = Path.join(__DIR__, "Composition.txt")

  compositions = Enum.reduce File.stream!(composition_path), [], fn(line, acc) ->
    [first, second, composition, _] = :binary.split(line, ";", [:global])
    key = to_binary.(first) <> to_binary.(second)
    [{key, to_binary.(composition)} | acc]
  end

  # {codepoint_integer, combining_class_integer} pairs.
  combining_class_path = Path.join(__DIR__, "CombiningClasses.txt")

  combining_classes = Enum.reduce File.stream!(combining_class_path), [], fn(line, acc) ->
    [codepoint, class, _] = :binary.split(line, ";", [:global])
    # The class must be an integer: it is compared with `<`, sorted on,
    # and used in arithmetic (`combining_class(cp) - 1`). Left as the
    # raw string, "9" would sort after "230" and the subtraction would
    # raise ArithmeticError.
    [{String.to_integer(codepoint, 16), String.to_integer(class)} | acc]
  end

  # Normalize

  # Normalizes `string` to the given form, :nfd or :nfc.
  def normalize(string, :nfd) when is_binary(string) do
    normalize_nfd(string, "")
  end

  def normalize(string, :nfc) when is_binary(string) do
    normalize_nfc(string, "")
  end

  defp normalize_nfd("", acc), do: acc

  # Hangul syllables are decomposed arithmetically rather than through
  # the table: lead, vowel and (optional) trail are derived from the
  # syllable index.
  defp normalize_nfd(<<cp::utf8, rest::binary>>, acc) when cp in 0xAC00..0xD7A3 do
    {syllable_index, t_count, n_count} = {cp - 0xAC00, 28, 588}
    lead  = 0x1100 + div(syllable_index, n_count)
    vowel = 0x1161 + div(rem(syllable_index, n_count), t_count)
    trail = 0x11A7 + rem(syllable_index, t_count)
    binary =
      # 0x11A7 is the trail base itself, meaning "no trailing consonant".
      if trail == 0x11A7 do
        <<lead::utf8, vowel::utf8>>
      else
        <<lead::utf8, vowel::utf8, trail::utf8>>
      end

    normalize_nfd(rest, acc <> binary)
  end

  # One clause per codepoint with a table decomposition.
  for {binary, decomposition} <- decompositions do
    defp normalize_nfd(unquote(binary) <> rest, acc) do
      normalize_nfd(rest, acc <> unquote(decomposition))
    end
  end

  # Anything else is consumed one grapheme at a time; multi-byte
  # graphemes are put into canonical order.
  defp normalize_nfd(binary, acc) do
    {n, rest} = String.Graphemes.next_grapheme_size(binary)
    part = :binary.part(binary, 0, n)
    case n do
      1 -> normalize_nfd(rest, acc <> part)
      _ -> normalize_nfd(rest, acc <> canonical_order(part))
    end
  end

  defp normalize_nfc("", acc), do: acc

  # Precomposed Hangul syllables are already in composed form.
  defp normalize_nfc(<<cp::utf8, rest::binary>>, acc) when cp in 0xAC00..0xD7A3 do
    normalize_nfc(rest, acc <> <<cp::utf8>>)
  end

  # Other graphemes are decomposed first and then recomposed.
  defp normalize_nfc(binary, acc) do
    {n, rest} = String.Graphemes.next_grapheme_size(binary)
    part = :binary.part(binary, 0, n)
    case n do
      1 -> normalize_nfc(rest, acc <> part)
      _ -> normalize_nfc(rest, acc <> compose(normalize_nfd(part, "")))
    end
  end

  # Reorders combining marks by combining class. Only runs of
  # non-starters (class != 0) are sorted; characters with class 0 act
  # as barriers and keep their position. Sorting the whole grapheme
  # would move class-0 characters (e.g. ZWJ, variation selectors) in
  # front of any marks that precede them.
  defp canonical_order(binary) do
    binary
    |> :unicode.characters_to_list()
    |> Enum.chunk_by(&(combining_class(&1) == 0))
    |> Enum.flat_map(fn(run) -> Enum.sort_by(run, &combining_class/1) end)
    |> :unicode.characters_to_binary()
  end

  # Combining class of a codepoint (integer); 0 when not listed.
  for {codepoint, class} <- combining_classes do
    defp combining_class(unquote(codepoint)), do: unquote(class)
  end

  defp combining_class(_), do: 0

  # compose/1 recomposes an already decomposed grapheme.
  defp compose(<<_::utf8>> = binary), do: binary

  # Hangul lead + vowel (+ optional trail) is composed arithmetically.
  defp compose(<<lead::utf8, vowel::utf8, rest::binary>>) when lead in 0x1100..0x1112 and vowel in 0x1161..0x1175 do
    codepoint = 0xAC00 + ((lead - 0x1100) * 588) + ((vowel - 0x1161) * 28)
    case rest do
      # Valid trailing consonants start at 0x11A8. 0x11A7 is only the
      # arithmetic base (trail index 0); accepting it would silently
      # drop that character from the output.
      <<trail::utf8, accents::binary>> when trail in 0x11A8..0x11C2 ->
        <<codepoint + trail - 0x11A7::utf8, accents::binary>>
      _ ->
        <<codepoint::utf8, rest::binary>>
    end
  end

  # Exact two-codepoint pairs from the composition table.
  for {binary, composition} <- compositions do
    defp compose(unquote(binary)), do: unquote(composition)
  end

  # General case: fold the following codepoints into the first one
  # where possible. The initial "last class" is one below the first
  # codepoint's own class.
  defp compose(<<cp::utf8, rest::binary>>) do
    compose(rest, <<cp::utf8>>, "", combining_class(cp) - 1)
  end

  # compose/4 walks the remaining codepoints. A codepoint is merged
  # into `base` only when its class is higher than the last class that
  # was left uncomposed and the pair is in the composition table;
  # otherwise it is appended to `accents`.
  defp compose("", base, accents, _), do: base <> accents

  defp compose(<<cp::utf8, rest::binary>>, base, accents, last_class) do
    part_class = combining_class(cp)
    combined = <<base::binary, cp::utf8>>
    if last_class < part_class and composable?(combined) do
      compose(rest, compose(combined), accents, last_class)
    else
      compose(rest, base, <<accents::binary, cp::utf8>>, part_class)
    end
  end

  # True when the binary is a key of the composition table.
  for {binary, _} <- compositions do
    defp composable?(unquote(binary)), do: true
  end

  defp composable?(_), do: false
end