/
folding.ex
128 lines (95 loc) · 3.57 KB
/
folding.ex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
defmodule Unicode.String.Case.Folding do
  @moduledoc """
  Implements the Unicode Case Folding algorithm.

  The intention of case folding is to facilitate
  case-insensitive string comparisons. It is not
  intended to be a general purpose transformation.

  Although case folding does generally use lower
  case as its normal form, it is not true for
  all scripts and codepoints. Therefore case
  folding should not be used as an alternative
  to `String.downcase/1`.

  """

  # Language codes that select the Turkic folding rules.
  @turkic_languages [:tr, :az]

  # Statuses of the case folding data for which clauses are generated.
  # Entries with any other status are skipped.
  @fold_status [:turkic, :common, :full]

  @doc """
  Case fold a string.

  Returns a string after applying the Unicode
  Case Folding algorithm.

  Case folding is intended to support case
  insensitive string comparisons such as that
  implemented by `Unicode.String.equals_ignoring_case?/2` which
  calls this function on its parameters.

  ### Arguments

  * `string` is any `String.t()`

  * `mode or language tag` is either the atoms `:turkic` or `nil`,
    one of the language atoms `:tr` or `:az`,
    or a map that includes the key `:language` with a value that
    is a lowercase atom representing an [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
    language code. The [CLDR language tag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) which is defined
    as part of the [ex_cldr](https://hex.pm/packages/ex_cldr) is one
    such example. See [Cldr.validate_locale/2](https://hexdocs.pm/ex_cldr/Cldr.html#validate_locale/2)
    for further information. Any other value selects the
    non-Turkic rules. The default is `nil`.

  ### Returns

  * The case folded string

  ### Notes

  * No normalization is applied to the
    string on either input or output.

  * Case folding does not apply any transformation
    to accented characters. `"ü"` will not case fold
    to `"u"` for example.

  * A `string` that is not a binary, or a binary that is not
    valid UTF-8, matches no function clause and raises
    `FunctionClauseError`.

  ### Examples

      iex> Unicode.String.Case.Folding.fold("THIS")
      "this"

      iex> Unicode.String.Case.Folding.fold("grüßen")
      "grüssen"

      iex> Unicode.String.Case.Folding.fold("I")
      "i"

  Turkic languages such as Turkish and Azerbaijani have
  a dotless lower case "i":

      iex> Unicode.String.Case.Folding.fold("I", :turkic)
      "ı"

      iex> Unicode.String.Case.Folding.fold("I", %{language: :az})
      "ı"

  """
  @spec fold(String.t()) :: String.t()
  @spec fold(String.t(), term()) :: String.t()
  def fold(string, mode_or_language_tag \\ nil) when is_binary(string) do
    do_fold(string, :full, folding_mode(mode_or_language_tag), "")
  end

  # Resolves the second argument of `fold/2` to the internal folding mode:
  # `:turkic` for the Turkic rules, `nil` for everything else.
  defp folding_mode(:turkic), do: :turkic
  defp folding_mode(language) when language in @turkic_languages, do: :turkic
  defp folding_mode(%{language: language}) when language in @turkic_languages, do: :turkic
  defp folding_mode(_other), do: nil

  # One `do_fold/4` clause is generated at compile time for each case folding
  # entry whose status is listed in @fold_status.
  folding_entries =
    for [status, _from, _to] = entry <- Unicode.Utils.case_folding(),
        status in @fold_status,
        do: entry

  # The :common and :full clause heads below also match when the mode is
  # :turkic, so for a codepoint that has both a Turkic and a general mapping
  # the Turkic clause must be defined first or it can never be selected.
  # Make that ordering explicit rather than relying on the order in which
  # the entries are returned.
  {turkic_entries, general_entries} =
    Enum.split_with(folding_entries, &match?([:turkic, _from, _to], &1))

  for [status, from, to] <- turkic_entries ++ general_entries do
    # `to` is either a single codepoint or a list of codepoints; either way
    # it is embedded in the generated clause as a UTF-8 binary.
    to = if is_list(to), do: List.to_string(to), else: List.to_string([to])

    # The folded output is appended to an accumulator and the recursion is a
    # tail call, so the already-folded part is not re-copied for every
    # codepoint as it would be when prepending to the result of the
    # recursive call.
    case status do
      :turkic ->
        defp do_fold(<<unquote(from)::utf8, rest::binary>>, status, :turkic, acc) do
          do_fold(rest, status, :turkic, <<acc::binary, unquote(to)::binary>>)
        end

      :common ->
        defp do_fold(<<unquote(from)::utf8, rest::binary>>, status, mode, acc) do
          do_fold(rest, status, mode, <<acc::binary, unquote(to)::binary>>)
        end

      :full ->
        defp do_fold(<<unquote(from)::utf8, rest::binary>>, unquote(status), mode, acc) do
          do_fold(rest, unquote(status), mode, <<acc::binary, unquote(to)::binary>>)
        end
    end
  end

  # Codepoints without a generated clause are copied through unchanged.
  defp do_fold(<<codepoint::utf8, rest::binary>>, status, mode, acc) do
    do_fold(rest, status, mode, <<acc::binary, codepoint::utf8>>)
  end

  # End of input: the accumulator holds the folded string.
  defp do_fold("", _status, _mode, acc) do
    acc
  end
end