%% otp/lib/stdlib/src/string.erl
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1996-2023. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
%% A string library that works on grapheme clusters, with the exception | |
%% of codepoints of class 'prepend' and non modern (or decomposed) Hangul. | |
%% If these codepoints appear, functions like 'find/2' may return a string | |
%% which starts inside a grapheme cluster. | |
%% These exceptions are made because these codepoint classes are
%% seldom used and require that we are able to look at previous codepoints
%% in the stream, which is hard to implement efficiently.
%% | |
%% GC (grapheme cluster) implies that the length of string 'ß↑e̊' is 3 though | |
%% it is represented by the codepoints [223,8593,101,778] or the | |
%% utf8 binary <<195,159,226,134,145,101,204,138>> | |
%% | |
%% And that searching for strings or graphemes finds the correct positions: | |
%% | |
%% find("eeeee̊eee", "e̊") -> "e̊ee"
%% find("1£4e̊abcdef", "e") -> "ef" | |
%% | |
%% Most functions expect all input to be normalized to one form, | |
%% see unicode:characters_to_nfc and unicode:characters_to_nfd functions. | |
%% When appending strings no checking is done to verify that the | |
%% result is valid unicode strings. | |
%% | |
%% The functions may crash for invalid utf-8 input. | |
%% | |
%% Return value should be kept consistent when return type is | |
%% unicode:chardata() i.e. binary input => binary output, | |
%% list input => list output, mixed input => mixed output
%% | |
-module(string).
%% Current API: works on unicode:chardata() and counts grapheme clusters.
-export([is_empty/1, length/1, to_graphemes/1,
reverse/1,
equal/2, equal/3, equal/4,
slice/2, slice/3,
pad/2, pad/3, pad/4, trim/1, trim/2, trim/3, chomp/1,
take/2, take/3, take/4,
lexemes/2, nth_lexeme/3,
uppercase/1, lowercase/1, titlecase/1,casefold/1,
prefix/2,
split/2,split/3,replace/3,replace/4,
find/2,find/3,
next_codepoint/1, next_grapheme/1
]).
%% Number parsing on chardata.
-export([to_float/1, to_integer/1]).
%% Old (will be deprecated) lists/string API kept for backwards compatibility
-export([len/1, concat/2, % equal/2, (extended in the new api)
chr/2,rchr/2,str/2,rstr/2,
span/2,cspan/2,substr/2,substr/3, tokens/2,
chars/2,chars/3]).
-export([copies/2,words/1,words/2,strip/1,strip/2,strip/3,
sub_word/2,sub_word/3,left/2,left/3,right/2,right/3,
sub_string/2,sub_string/3,centre/2,centre/3, join/2]).
-export([to_upper/1, to_lower/1]).
%%
%% member/2 may be called unqualified in this module.
-import(lists,[member/2]).
%% This module defines its own length/1 (grapheme cluster count), so the
%% auto-imported BIF is disabled; the BIF is called as erlang:length/1.
-compile({no_auto_import,[length/1]}).
%% Small internal helpers that are inlined at their call sites.
-compile({inline, [btoken/2, rev/1, append/2, stack/2, search_compile/1]}).
%% Guard test used by the fast paths: CP1 and the codepoint following it
%% (CP2) are both integers in 0..255 and CP1 is not a carriage return.
%% When it holds, the callers consume CP1 as a complete grapheme cluster
%% without calling unicode_util:gc/1.
%% NOTE(review): CR is presumably excluded because "\r\n" is handled as a
%% single grapheme cluster (see chomp/1) - confirm.
-define(ASCII_LIST(CP1,CP2),
is_integer(CP1), 0 =< CP1, CP1 < 256,
is_integer(CP2), 0 =< CP2, CP2 < 256, CP1 =/= $\r).
-export_type([grapheme_cluster/0]).
%% A grapheme cluster is a single codepoint or a list of codepoints.
-type grapheme_cluster() :: char() | [char()].
%% Which end of the string an operation works from.
-type direction() :: 'leading' | 'trailing'.
%% Silences dialyzer's improper-list warning for the listed functions;
%% length_b/3, for example, builds [CP1|Binary] cells on purpose.
-dialyzer({no_improper_lists, [stack/2, length_b/3]}).
%%% Internal (undocumented) BIFs; they should not be used outside of this
%%% module.
%%% May be removed
-export([list_to_float/1, list_to_integer/1]).
%% Uses bifs: string:list_to_float/1 and string:list_to_integer/1
%%
%% list_to_float/1: parse a float from the beginning of String.
%% Per the spec the result is {Float, Rest}, where Rest is the part of the
%% string that was not consumed, or {error, no_float | not_a_list}.
%% The Erlang body is only a stub that raises through
%% erlang:nif_error(undef); this section is marked as BIFs, so the real
%% implementation is presumably supplied by the runtime system.
-spec list_to_float(String) -> {Float, Rest} | {'error', Reason} when
String :: string(),
Float :: float(),
Rest :: string(),
Reason :: 'no_float' | 'not_a_list'.
list_to_float(_) ->
erlang:nif_error(undef).
%% list_to_integer/1: parse an integer from the beginning of String.
%% Per the spec the result is {Int, Rest}, where Rest is the part of the
%% string that was not consumed, or {error, no_integer | not_a_list}.
%% The Erlang body is only a stub that raises through
%% erlang:nif_error(undef); the real implementation is presumably supplied
%% by the runtime system (this section is marked as BIFs).
-spec list_to_integer(String) -> {Int, Rest} | {'error', Reason} when
String :: string(),
Int :: integer(),
Rest :: string(),
Reason :: 'no_integer' | 'not_a_list'.
list_to_integer(_) ->
erlang:nif_error(undef).
%%% End of BIFs | |
%% True when String holds no characters at all, i.e. it is [], <<>> or a
%% (possibly nested) list made up only of those.
-spec is_empty(String::unicode:chardata()) -> boolean().
is_empty(Chardata) ->
    case Chardata of
        [] -> true;
        <<>> -> true;
        [Head|Tail] -> is_empty(Head) andalso is_empty(Tail);
        _ -> false
    end.
%% Number of grapheme clusters (not codepoints or bytes) in String.
-spec length(String::unicode:chardata()) -> non_neg_integer().
length(String) ->
    case String of
        <<First/utf8, Tail/binary>> ->
            %% Binary starting with a valid UTF-8 codepoint.
            length_b(Tail, First, 0);
        _ ->
            %% Everything else, including <<>>, uses the generic counter.
            length_1(String, 0)
    end.
%% Split String into a flat list with one element per grapheme cluster.
%% Raises {badarg, Rest} when unicode_util:gc/1 reports an error.
-spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()].
to_graphemes(String) ->
    case unicode_util:gc(String) of
        [] ->
            [];
        {error, Reason} ->
            error({badarg, Reason});
        [Cluster|Rest] ->
            [Cluster|to_graphemes(Rest)]
    end.
%% equal/2: true when A and B contain the same characters. Nothing is
%% folded or normalized, so both inputs are assumed to be in the same
%% normalization form already, see unicode:characters_to_nfX_xxx(..)
-spec equal(A, B) -> boolean() when
      A::unicode:chardata(),
      B::unicode:chardata().
equal(A, B) ->
    case is_binary(A) andalso is_binary(B) of
        true -> A =:= B;            % two binaries: plain term comparison
        false -> equal_1(A, B)
    end.

%% equal/3: like equal/2, but when IgnoreCase is true both strings are
%% case folded on the fly while they are compared.
-spec equal(A, B, IgnoreCase) -> boolean() when
      A::unicode:chardata(),
      B::unicode:chardata(),
      IgnoreCase :: boolean().
equal(A, B, IgnoreCase) when is_boolean(IgnoreCase) ->
    case IgnoreCase of
        true -> equal_nocase(A, B);
        false -> equal(A, B)
    end.

%% equal/4: additionally normalizes both strings to Norm on the fly;
%% Norm = none falls back to equal/3.
-spec equal(A, B, IgnoreCase, Norm) -> boolean() when
      A :: unicode:chardata(),
      B :: unicode:chardata(),
      IgnoreCase :: boolean(),
      Norm :: 'none' | 'nfc' | 'nfd' | 'nfkc' | 'nfkd'.
equal(A, B, IgnoreCase, none) ->
    equal(A, B, IgnoreCase);
equal(A, B, IgnoreCase, Norm) when is_boolean(IgnoreCase) ->
    case IgnoreCase of
        true -> equal_norm_nocase(A, B, Norm);
        false -> equal_norm(A, B, Norm)
    end.
%% Reverse the order of the grapheme clusters in String. The result is
%% always a list of grapheme clusters, also for binary input.
-spec reverse(String::unicode:chardata()) -> [grapheme_cluster()].
reverse(String) ->
    case String of
        <<First/utf8, Tail/binary>> -> reverse_b(Tail, First, []);
        _ -> reverse_1(String, [])
    end.
%% slice/2: drop the first Start grapheme clusters and return what is left.
%% Binary input whose remainder is empty yields <<>> rather than [].
-spec slice(String, Start) -> Slice when
      String::unicode:chardata(),
      Start :: non_neg_integer(),
      Slice :: unicode:chardata().
slice(CD, N) when is_integer(N), N >= 0 ->
    Remaining = slice_l0(CD, N),
    if
        Remaining =:= [], is_binary(CD) -> <<>>;
        true -> Remaining
    end.

%% slice/3: skip Start grapheme clusters, then keep at most Length of
%% them. Length = infinity keeps everything after Start and Length = 0
%% gives an empty string of the same kind (binary or list) as the input.
-spec slice(String, Start, Length) -> Slice when
      String::unicode:chardata(),
      Start :: non_neg_integer(),
      Length :: 'infinity' | non_neg_integer(),
      Slice :: unicode:chardata().
slice(CD, N, Length)
  when is_integer(N), N >= 0, is_integer(Length), Length > 0 ->
    Remaining = slice_l0(CD, N),
    if
        Remaining =:= [], is_binary(CD) -> <<>>;
        true -> slice_trail(Remaining, Length)
    end;
slice(CD, N, infinity) when is_integer(N), N >= 0 ->
    slice(CD, N);
slice(CD, _, 0) ->
    if
        is_binary(CD) -> <<>>;
        true -> []
    end.
%% Pad String with Char (a space unless given) until it is Length grapheme
%% clusters long. The padding goes after the string (trailing, the
%% default), before it (leading) or is divided over both sides (both).
%% Strings that are already long enough get no padding.
-spec pad(String, Length) -> unicode:charlist() when
      String ::unicode:chardata(),
      Length :: integer().
pad(CD, Length) ->
    pad(CD, Length, trailing).

-spec pad(String, Length, Dir) -> unicode:charlist() when
      String ::unicode:chardata(),
      Length :: integer(),
      Dir :: direction() | 'both'.
pad(CD, Length, Dir) ->
    pad(CD, Length, Dir, $\s).

-spec pad(String, Length, Dir, Char) -> unicode:charlist() when
      String ::unicode:chardata(),
      Length :: integer(),
      Dir :: direction() | 'both',
      Char :: grapheme_cluster().
pad(CD, Length, leading, Char) when is_integer(Length) ->
    Missing = max(0, Length - length(CD)),
    [lists:duplicate(Missing, Char), CD];
pad(CD, Length, trailing, Char) when is_integer(Length) ->
    Missing = max(0, Length - length(CD)),
    [CD|lists:duplicate(Missing, Char)];
pad(CD, Length, both, Char) when is_integer(Length) ->
    Missing = max(0, Length - length(CD)),
    Half = lists:duplicate(Missing div 2, Char),
    %% With an odd amount of padding the extra Char goes last.
    case Missing rem 2 of
        1 -> [Half, CD, Half, Char];
        _ -> [Half, CD, Half]
    end.
%% Remove separators from the start and/or end of String. The separators
%% default to unicode_util:whitespace() and the direction to both.
-spec trim(String) -> unicode:chardata() when
      String :: unicode:chardata().
trim(Str) ->
    trim(Str, both).

-spec trim(String, Dir) -> unicode:chardata() when
      String :: unicode:chardata(),
      Dir :: direction() | 'both'.
trim(Str, Dir) ->
    trim(Str, Dir, unicode_util:whitespace()).

-spec trim(String, Dir, Characters) -> unicode:chardata() when
      String :: unicode:chardata(),
      Dir :: direction() | 'both',
      Characters :: [grapheme_cluster()].
%% Nothing to strip.
trim(Str, _, []) ->
    Str;
%% List input and a single separator below 256: use the fast path.
trim(Str, leading, [Sep])
  when is_list(Str), is_integer(Sep), 0 =< Sep, Sep < 256 ->
    trim_ls(Str, Sep);
trim(Str, leading, Seps) when is_list(Seps) ->
    trim_l(Str, Seps);
trim(Str, trailing, [Sep])
  when is_list(Str), is_integer(Sep), 0 =< Sep, Sep < 256 ->
    trim_ts(Str, Sep);
trim(Str, trailing, Seps) when is_list(Seps) ->
    trim_t(Str, 0, search_pattern(Seps));
trim(Str, both, Seps) when is_list(Seps) ->
    LeadingTrimmed = trim(Str, leading, Seps),
    trim(LeadingTrimmed, trailing, Seps).
%% Strip every trailing "\n" and "\r\n" from String.
-spec chomp(String::unicode:chardata()) -> unicode:chardata().
chomp(Str) ->
    Newlines = ["\r\n", $\n],
    trim(Str, trailing, Newlines).
%% Split String in two: the longest run of grapheme clusters taken from
%% Characters and the rest. With Complement = true the run consists of
%% clusters NOT in Characters instead. Dir decides whether the run is
%% taken from the start (leading, the default) or the end (trailing).
-spec take(String, Characters) -> {Leading, Trailing} when
      String::unicode:chardata(),
      Characters::[grapheme_cluster()],
      Leading::unicode:chardata(),
      Trailing::unicode:chardata().
take(Str, Sep) ->
    take(Str, Sep, false).

-spec take(String, Characters, Complement) -> {Leading, Trailing} when
      String::unicode:chardata(),
      Characters::[grapheme_cluster()],
      Complement::boolean(),
      Leading::unicode:chardata(),
      Trailing::unicode:chardata().
take(Str, Sep, Complement) ->
    take(Str, Sep, Complement, leading).

-spec take(String, Characters, Complement, Dir) -> {Leading, Trailing} when
      String::unicode:chardata(),
      Characters::[grapheme_cluster()],
      Complement::boolean(),
      Dir::direction(),
      Leading::unicode:chardata(),
      Trailing::unicode:chardata().
%% No characters given: one side is the whole string and the other one is
%% an empty string of the same kind as the input.
take(Str, [], Complement, Dir) ->
    Empty = if is_binary(Str) -> <<>>; true -> [] end,
    case {Complement, Dir} of
        {false, leading} -> {Empty, Str};
        {true, trailing} -> {Empty, Str};
        {false, trailing} -> {Str, Empty};
        {true, leading} -> {Str, Empty}
    end;
take(Str, Seps, false, leading) ->
    take_l(Str, Seps, []);
take(Str, Seps, true, leading) ->
    take_lc(Str, search_pattern(Seps), []);
take(Str, Seps, false, trailing) ->
    take_t(Str, 0, search_pattern(Seps));
take(Str, Seps, true, trailing) ->
    take_tc(Str, 0, search_pattern(Seps)).
%% Convert String to uppercase. The original term is returned untouched
%% when no character changes (the workers signal that by throwing
%% 'unchanged'). Binary input gives binary output; a binary that does not
%% start with a valid UTF-8 codepoint raises {badarg, Bin}.
-spec uppercase(String::unicode:chardata()) -> unicode:chardata().
uppercase(CD) when is_list(CD) ->
    try
        uppercase_list(CD, false)
    catch
        throw:unchanged -> CD
    end;
uppercase(<<>>) ->
    <<>>;
uppercase(<<First/utf8, Tail/binary>> = Original) ->
    try uppercase_bin(First, Tail, false) of
        Chars -> unicode:characters_to_binary(Chars)
    catch
        throw:unchanged -> Original
    end;
uppercase(Other) ->
    error({badarg, Other}).
%% Convert String to lowercase. The original term is returned untouched
%% when no character changes (the workers signal that by throwing
%% 'unchanged'). Binary input gives binary output; a binary that does not
%% start with a valid UTF-8 codepoint raises {badarg, Bin}.
-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
lowercase(CD) when is_list(CD) ->
    try
        lowercase_list(CD, false)
    catch
        throw:unchanged -> CD
    end;
lowercase(<<>>) ->
    <<>>;
lowercase(<<First/utf8, Tail/binary>> = Original) ->
    try lowercase_bin(First, Tail, false) of
        Chars -> unicode:characters_to_binary(Chars)
    catch
        throw:unchanged -> Original
    end;
lowercase(Other) ->
    error({badarg, Other}).
%% Titlecase the first character of String; the rest is left as it is.
%% Binary input gives binary output.
-spec titlecase(String::unicode:chardata()) -> unicode:chardata().
titlecase(CD) when is_list(CD) ->
    case unicode_util:titlecase(CD) of
        [First|Rest] -> append(First, Rest);
        Empty -> Empty
    end;
titlecase(CD) when is_binary(CD) ->
    case unicode_util:titlecase(CD) of
        [] ->
            <<>>;
        [CP|Rest] when is_integer(CP) ->
            <<CP/utf8, Rest/binary>>;
        [CPs|Rest] ->
            %% The first character mapped to several codepoints.
            Encoded = << <<CP/utf8>> || CP <- CPs >>,
            <<Encoded/binary, Rest/binary>>
    end.
%% Case fold String so that it can be compared regardless of case; the
%% result is meant for equality tests only. The original term is returned
%% untouched when nothing changes. Binary input gives binary output; a
%% binary that does not start with a valid UTF-8 codepoint raises
%% {badarg, Bin}.
-spec casefold(String::unicode:chardata()) -> unicode:chardata().
casefold(CD) when is_list(CD) ->
    try
        casefold_list(CD, false)
    catch
        throw:unchanged -> CD
    end;
casefold(<<>>) ->
    <<>>;
casefold(<<First/utf8, Tail/binary>> = Original) ->
    try casefold_bin(First, Tail, false) of
        Chars -> unicode:characters_to_binary(Chars)
    catch
        throw:unchanged -> Original
    end;
casefold(Other) ->
    error({badarg, Other}).
%% Parse an integer from the beginning of String and return it together
%% with the unparsed rest. Gives {error, no_integer} when String does not
%% start with sign or digit characters, and {error, badarg} when scanning
%% the input with take/2 raises.
-spec to_integer(String) -> {Int, Rest} | {'error', Reason} when
      String :: unicode:chardata(),
      Int :: integer(),
      Rest :: unicode:chardata(),
      Reason :: 'no_integer' | badarg.
to_integer(String) ->
    try take(String, "+-0123456789") of
        {Candidate, Tail} ->
            NothingTaken = is_empty(Candidate),
            if
                NothingTaken ->
                    {error, no_integer};
                true ->
                    Chars = unicode:characters_to_list(Candidate),
                    case string:list_to_integer(Chars) of
                        {error, _} = Error ->
                            Error;
                        {Int, Unparsed} ->
                            to_number(String, Int, Unparsed, Chars, Tail)
                    end
            end
    catch
        _:_ -> {error, badarg}
    end.
%% Parse a float from the beginning of String and return it together with
%% the unparsed rest. Gives {error, no_float} when String does not start
%% with characters that can be part of a float, and {error, badarg} when
%% scanning the input with take/2 raises.
-spec to_float(String) -> {Float, Rest} | {'error', Reason} when
      String :: unicode:chardata(),
      Float :: float(),
      Rest :: unicode:chardata(),
      Reason :: 'no_float' | 'badarg'.
to_float(String) ->
    try take(String, "+-0123456789eE.,") of
        {Candidate, Tail} ->
            NothingTaken = is_empty(Candidate),
            if
                NothingTaken ->
                    {error, no_float};
                true ->
                    Chars = unicode:characters_to_list(Candidate),
                    case string:list_to_float(Chars) of
                        {error, _} = Error ->
                            Error;
                        {Float, Unparsed} ->
                            to_number(String, Float, Unparsed, Chars, Tail)
                    end
            end
    catch
        _:_ -> {error, badarg}
    end.
%% Build the {Number, Rest} result for to_integer/1 and to_float/1.
%% List is the candidate text handed to the parser and Rest is what the
%% parser left over. For binary input the remainder is cut straight out of
%% String, using the number of consumed characters as the byte offset;
%% otherwise the parser's leftover is put in front of Tail.
to_number(String, Number, Rest, List, _Tail) when is_binary(String) ->
    Consumed = erlang:length(List) - erlang:length(Rest),
    <<_:Consumed/binary, Remaining/binary>> = String,
    {Number, Remaining};
to_number(_, Number, Rest, _, Tail) ->
    Remaining = concat(Rest, Tail),
    {Number, Remaining}.
%% If String starts with Prefix, return what follows the prefix, otherwise
%% 'nomatch'. An empty Prefix returns String itself, and an empty
%% remainder of a binary String is returned as <<>>.
-spec prefix(String::unicode:chardata(), Prefix::unicode:chardata()) ->
                    'nomatch' | unicode:chardata().
prefix(Str, Prefix0) ->
    case unicode:characters_to_list(Prefix0) of
        [] ->
            Str;
        PrefixCPs ->
            case prefix_1(Str, PrefixCPs) of
                [] when is_binary(Str) -> <<>>;
                Outcome -> Outcome
            end
    end.
%% split/2: split String around the first occurrence of SearchPattern.
-spec split(String, SearchPattern) -> [unicode:chardata()] when
      String :: unicode:chardata(),
      SearchPattern :: unicode:chardata().
split(String, SearchPattern) ->
    split(String, SearchPattern, leading).

%% split/3: split String around the first (leading), the last (trailing)
%% or every (all) occurrence of SearchPattern. The result is [String] when
%% the pattern is empty or does not occur.
-spec split(String, SearchPattern, Where) -> [unicode:chardata()] when
      String :: unicode:chardata(),
      SearchPattern :: unicode:chardata(),
      Where :: direction() | 'all'.
split(String, SearchPattern, Where) ->
    case is_empty(SearchPattern) of
        true ->
            [String];
        false ->
            PatternCPs = unicode:characters_to_list(SearchPattern),
            case split_1(String, PatternCPs, 0, Where, [], []) of
                {_Current, []} ->
                    [String];
                {_Current, Parts} when Where =:= trailing ->
                    Parts;
                {Current, Parts} when Where =:= all ->
                    %% Parts was accumulated in reverse; Current goes last.
                    lists:reverse(Parts, [Current]);
                Parts when is_list(Parts) ->
                    Parts
            end
    end.
%% replace/3: replace the first occurrence of SearchPattern in String with
%% Replacement. The result is the list of split parts with Replacement
%% placed between them.
-spec replace(String, SearchPattern, Replacement) ->
                     [unicode:chardata()] when
      String :: unicode:chardata(),
      SearchPattern :: unicode:chardata(),
      Replacement :: unicode:chardata().
replace(String, SearchPattern, Replacement) ->
    replace(String, SearchPattern, Replacement, leading).

%% replace/4: like replace/3, with Where selecting the first (leading),
%% the last (trailing) or every (all) occurrence.
-spec replace(String, SearchPattern, Replacement, Where) ->
                     [unicode:chardata()] when
      String :: unicode:chardata(),
      SearchPattern :: unicode:chardata(),
      Replacement :: unicode:chardata(),
      Where :: direction() | 'all'.
replace(String, SearchPattern, Replacement, Where) ->
    Parts = split(String, SearchPattern, Where),
    lists:join(Replacement, Parts).
%% Return the lexemes of String, i.e. the parts that are separated by any
%% of the grapheme clusters in SeparatorList. Without separators the whole
%% string is the only lexeme.
-spec lexemes(String::unicode:chardata(),
              SeparatorList::[grapheme_cluster()]) ->
                     [unicode:chardata()].
lexemes([], _) ->
    [];
lexemes(Str, []) ->
    [Str];
lexemes(Str, Separators) when is_list(Separators) ->
    lexemes_m(Str, search_pattern(Separators), []).
%% Return lexeme number N (counted from 1) of String, using the grapheme
%% clusters in SeparatorList as separators. Without separators the whole
%% string is lexeme 1.
-spec nth_lexeme(String, N, SeparatorList) -> unicode:chardata() when
      String::unicode:chardata(),
      N::non_neg_integer(),
      SeparatorList::[grapheme_cluster()].
nth_lexeme(Str, 1, []) ->
    Str;
nth_lexeme(Str, N, Separators)
  when is_list(Separators), is_integer(N), N > 0 ->
    nth_lexeme_m(Str, search_pattern(Separators), N).
%% find/2: return String from the first occurrence of SearchPattern and
%% onwards, or 'nomatch'.
-spec find(String, SearchPattern) -> unicode:chardata() | 'nomatch' when
      String::unicode:chardata(),
      SearchPattern::unicode:chardata().
find(String, SearchPattern) ->
    find(String, SearchPattern, leading).

%% find/3: like find/2, with Dir selecting the first (leading) or the last
%% (trailing) occurrence. An empty pattern ([] or <<>>) returns String.
-spec find(String, SearchPattern, Dir) -> unicode:chardata() | 'nomatch' when
      String::unicode:chardata(),
      SearchPattern::unicode:chardata(),
      Dir::direction().
find(String, Empty, _) when Empty =:= []; Empty =:= <<>> ->
    String;
find(String, SearchPattern, leading) ->
    PatternCPs = unicode:characters_to_list(SearchPattern),
    find_l(String, PatternCPs);
find(String, SearchPattern, trailing) ->
    PatternCPs = unicode:characters_to_list(SearchPattern),
    find_r(String, PatternCPs, nomatch).
%% Return the first grapheme cluster of String as the head of a list
%% whose tail is the rest of the string (delegates to unicode_util:gc/1).
-spec next_grapheme(String::unicode:chardata()) ->
                           maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
                           {error,unicode:chardata()}.
next_grapheme(String) ->
    unicode_util:gc(String).
%% Return the first codepoint of String as the head of a list whose tail
%% is the rest of the string (delegates to unicode_util:cp/1).
-spec next_codepoint(String::unicode:chardata()) ->
                            maybe_improper_list(char(),unicode:chardata()) |
                            {error,unicode:chardata()}.
next_codepoint(String) ->
    unicode_util:cp(String).
%% Internals | |
%% Count grapheme clusters in chardata, adding them to Count.
%% Fast path: CP1 is counted as one cluster without calling
%% unicode_util:gc/1 when ?ASCII_LIST(CP1,CP2) holds.
length_1([CP1|[CP2|_]=Rest], Count) when ?ASCII_LIST(CP1,CP2) ->
    length_1(Rest, Count+1);
length_1(Chardata, Count) ->
    case unicode_util:gc(Chardata) of
        {error, Reason} -> error({badarg, Reason});
        [] -> Count;
        [_Cluster|Rest] -> length_1(Rest, Count+1)
    end.
%% Count grapheme clusters in a binary. CP1 is the codepoint that has
%% already been decoded and the first argument is what follows it.
%% Fast path: CP1 is counted as one cluster when ?ASCII_LIST(CP1,CP2)
%% holds for it and the next codepoint.
length_b(<<CP2/utf8, Tail/binary>>, CP1, Count) when ?ASCII_LIST(CP1,CP2) ->
    length_b(Tail, CP2, Count+1);
length_b(Bin, CP1, Count) ->
    %% Put CP1 back in front so gc/1 sees the complete cluster.
    [_Cluster|AfterCluster] = unicode_util:gc([CP1|Bin]),
    case unicode_util:cp(AfterCluster) of
        {error, Reason} -> error({badarg, Reason});
        [] -> Count+1;
        [Next|Tail] -> length_b(Tail, Next, Count+1)
    end.
%% Codepoint by codepoint comparison of two chardata terms.
%% Two lists that both start with an integer are compared directly; any
%% other shape is stepped through with unicode_util:cp/1.
equal_1([A|As], [B|Bs]) when is_integer(A), is_integer(B) ->
    case A of
        B -> equal_1(As, Bs);
        _ -> false
    end;
equal_1([], Other) ->
    is_empty(Other);
equal_1(A0, B0) ->
    NextA = unicode_util:cp(A0),
    NextB = unicode_util:cp(B0),
    case {NextA, NextB} of
        {[], []} -> true;
        {[CP|A], [CP|B]} -> equal_1(A, B);
        {L1, L2} when is_list(L1), is_list(L2) -> false
    end.
%% Case-insensitive comparison: the next character of each string is case
%% folded before the codepoints are compared. Identical terms are equal
%% without being inspected.
equal_nocase(Same, Same) ->
    true;
equal_nocase(A0, B0) ->
    NextA = unicode_util:cp(unicode_util:casefold(A0)),
    NextB = unicode_util:cp(unicode_util:casefold(B0)),
    case {NextA, NextB} of
        {[], []} -> true;
        {[CP|A], [CP|B]} -> equal_nocase(A, B);
        {L1, L2} when is_list(L1), is_list(L2) -> false
    end.
%% Comparison under normalization: each step normalizes the strings with
%% unicode_util:Norm/1 before the codepoints are compared. Identical
%% terms are equal without being inspected.
equal_norm(Same, Same, _Norm) ->
    true;
equal_norm(A0, B0, Norm) ->
    NextA = unicode_util:cp(unicode_util:Norm(A0)),
    NextB = unicode_util:cp(unicode_util:Norm(B0)),
    case {NextA, NextB} of
        {[], []} -> true;
        {[CP|A], [CP|B]} -> equal_norm(A, B, Norm);
        {L1, L2} when is_list(L1), is_list(L2) -> false
    end.
%% Case-insensitive comparison under normalization: each step normalizes
%% with unicode_util:Norm/1 and then case folds before the codepoints are
%% compared. Identical terms are equal without being inspected.
equal_norm_nocase(Same, Same, _Norm) ->
    true;
equal_norm_nocase(A0, B0, Norm) ->
    NextA = unicode_util:cp(unicode_util:casefold(unicode_util:Norm(A0))),
    NextB = unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0))),
    case {NextA, NextB} of
        {[], []} -> true;
        {[CP|A], [CP|B]} -> equal_norm_nocase(A, B, Norm);
        {L1, L2} when is_list(L1), is_list(L2) -> false
    end.
%% Reverse chardata by pushing one grapheme cluster at a time onto
%% Reversed. Fast path: CP1 is pushed as a cluster of its own when
%% ?ASCII_LIST(CP1,CP2) holds.
reverse_1([CP1|[CP2|_]=Rest], Reversed) when ?ASCII_LIST(CP1,CP2) ->
    reverse_1(Rest, [CP1|Reversed]);
reverse_1(Chardata, Reversed) ->
    case unicode_util:gc(Chardata) of
        {error, Reason} -> error({badarg, Reason});
        [] -> Reversed;
        [Cluster|Rest] -> reverse_1(Rest, [Cluster|Reversed])
    end.
%% Reverse a binary into a list of grapheme clusters. CP1 is the
%% codepoint that has already been decoded and the first argument is what
%% follows it. Fast path: CP1 is pushed as a cluster of its own when
%% ?ASCII_LIST(CP1,CP2) holds.
reverse_b(<<CP2/utf8, Tail/binary>>, CP1, Reversed) when ?ASCII_LIST(CP1,CP2) ->
    reverse_b(Tail, CP2, [CP1|Reversed]);
reverse_b(Bin, CP1, Reversed) ->
    %% Put CP1 back in front so gc/1 sees the complete cluster.
    [Cluster|AfterCluster] = unicode_util:gc([CP1|Bin]),
    case unicode_util:cp(AfterCluster) of
        {error, Reason} -> error({badarg, Reason});
        [] -> [Cluster|Reversed];
        [Next|Tail] -> reverse_b(Tail, Next, [Cluster|Reversed])
    end.
%% Drop the first N grapheme clusters. A binary that starts with a valid
%% UTF-8 codepoint is handled by the binary worker when N > 0; everything
%% else goes to the generic worker.
slice_l0(<<First/utf8, Tail/binary>>, N) when N > 0 ->
    slice_lb(Tail, First, N);
slice_l0(Chardata, N) ->
    slice_l(Chardata, N).
%% Drop the first N grapheme clusters of chardata and return the rest
%% ([] when the input runs out first). Fast path: CP1 is dropped as one
%% cluster when ?ASCII_LIST(CP1,CP2) holds.
slice_l([CP1|[CP2|_]=Rest], N)
  when ?ASCII_LIST(CP1,CP2), is_integer(N), N > 0 ->
    slice_l(Rest, N-1);
slice_l(Chardata, N) when is_integer(N), N > 0 ->
    case unicode_util:gc(Chardata) of
        {error, Reason} -> error({badarg, Reason});
        [] -> [];
        [_Cluster|Rest] -> slice_l(Rest, N-1)
    end;
slice_l(Remaining, 0) ->
    Remaining.
%% Drop the first N (N >= 1) grapheme clusters of a binary. CP1 is the
%% codepoint that has already been decoded and the first argument is what
%% follows it. Returns the remainder, or <<>> when the binary runs out
%% before N clusters have been dropped.
slice_lb(<<CP2/utf8, Tail/binary>>, CP1, N)
  when ?ASCII_LIST(CP1,CP2), is_integer(N), N > 1 ->
    slice_lb(Tail, CP2, N-1);
slice_lb(Bin, CP1, N) ->
    %% Put CP1 back in front so gc/1 sees the complete cluster.
    [_Cluster|AfterCluster] = unicode_util:gc([CP1|Bin]),
    if
        N > 1 ->
            case unicode_util:cp(AfterCluster) of
                {error, Reason} -> error({badarg, Reason});
                [] -> <<>>;
                [Next|Tail] -> slice_lb(Tail, Next, N-1)
            end;
        N =:= 1 ->
            %% That was the last cluster to drop.
            AfterCluster
    end.
%% Keep at most the first N grapheme clusters of the string.
%% For a binary, slice_bin/3 gives the number of bytes that come after
%% those clusters, and the kept part is cut out of the original binary.
%% A non-empty binary that does not start with a valid UTF-8 codepoint
%% raises {badarg, Binary}.
slice_trail(Original, N) when is_binary(Original) ->
    case Original of
        <<First/utf8, Tail/binary>> when N > 0 ->
            Dropped = slice_bin(Tail, First, N),
            KeepSz = byte_size(Original) - Dropped,
            <<Kept:KeepSz/binary, _/binary>> = Original,
            Kept;
        <<_, _/binary>> when N > 0 ->
            error({badarg, Original});
        _ ->
            <<>>
    end;
slice_trail(Chardata, N) when is_list(Chardata) ->
    slice_list(Chardata, N).
%% Return at most the first N grapheme clusters of list chardata.
%% Fast path: CP1 is kept as one cluster when ?ASCII_LIST(CP1,CP2) holds.
slice_list([CP1|[CP2|_]=Rest], N) when ?ASCII_LIST(CP1,CP2), N > 0 ->
    [CP1|slice_list(Rest, N-1)];
slice_list(Chardata, N) when N > 0 ->
    case unicode_util:gc(Chardata) of
        {error, Reason} -> error({badarg, Reason});
        [] -> [];
        [Cluster|Rest] -> append(Cluster, slice_list(Rest, N-1))
    end;
slice_list(_, 0) ->
    [].
%% Step over N grapheme clusters of a binary and return how many bytes
%% are left after them. CP1 is the codepoint that has already been decoded
%% and the first argument is what follows it. The result is 0 when the
%% binary ends within the first N clusters.
slice_bin(<<CP2/utf8, Tail/binary>>, CP1, N) when ?ASCII_LIST(CP1,CP2), N > 0 ->
    slice_bin(Tail, CP2, N-1);
slice_bin(Bin, CP1, N) when N > 0 ->
    %% Put CP1 back in front so gc/1 sees the complete cluster.
    [_Cluster|AfterCluster] = unicode_util:gc([CP1|Bin]),
    case unicode_util:cp(AfterCluster) of
        {error, Reason} -> error({badarg, Reason});
        [] -> 0;
        [Next|Tail] -> slice_bin(Tail, Next, N-1)
    end;
slice_bin(Bin, CP1, 0) ->
    %% CP1 was decoded but is not kept, so its bytes count as left over.
    byte_size(Bin)+byte_size(<<CP1/utf8>>).
%% Uppercase list chardata. Changed records whether any character has
%% been modified so far; reaching the end with Changed = false throws
%% 'unchanged' so that the caller can hand back the original term.
%% $a..$z followed by a codepoint below 256 is converted directly.
uppercase_list([CP1|[CP2|_]=Rest], _Changed)
  when is_integer(CP1), $a =< CP1, CP1 =< $z,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1-32|uppercase_list(Rest, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
uppercase_list([CP1|[CP2|_]=Rest], Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1|uppercase_list(Rest, Changed)];
uppercase_list([], true) ->
    [];
uppercase_list([], false) ->
    throw(unchanged);
uppercase_list(Chardata, Changed) ->
    case unicode_util:uppercase(Chardata) of
        [Mapped|Rest] when Mapped =:= hd(Chardata) ->
            [Mapped|uppercase_list(Rest, Changed)];
        [Mapped|Rest] ->
            append(Mapped, uppercase_list(Rest, true));
        [] ->
            uppercase_list([], Changed)
    end.
%% Uppercase a binary into a list. CP1 is the codepoint that has already
%% been decoded and the second argument is the binary that follows it.
%% Changed records whether any character has been modified so far;
%% reaching the end with nothing modified throws 'unchanged'.
%% $a..$z followed by a codepoint below 256 is converted directly.
uppercase_bin(CP1, <<CP2/utf8, Tail/binary>>, _Changed)
  when is_integer(CP1), $a =< CP1, CP1 =< $z, CP2 < 256 ->
    [CP1-32|uppercase_bin(CP2, Tail, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
uppercase_bin(CP1, <<CP2/utf8, Tail/binary>>, Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128, CP2 < 256 ->
    [CP1|uppercase_bin(CP2, Tail, Changed)];
uppercase_bin(CP1, Bin, Changed) ->
    case unicode_util:uppercase([CP1|Bin]) of
        [Mapped|After] ->
            %% A mapping that differs from CP1 counts as a modification.
            NowChanged = case Mapped of
                             CP1 -> Changed;
                             _ -> true
                         end,
            case unicode_util:cp(After) of
                [Next|Tail] when is_integer(Next), Next >= 0 ->
                    [Mapped|uppercase_bin(Next, Tail, NowChanged)];
                [] when NowChanged ->
                    [Mapped];
                [] ->
                    throw(unchanged);
                {error, Reason} ->
                    error({badarg, Reason})
            end
    end.
%% Lowercase list chardata. Changed records whether any character has
%% been modified so far; reaching the end with Changed = false throws
%% 'unchanged' so that the caller can hand back the original term.
%% $A..$Z followed by a codepoint below 256 is converted directly.
lowercase_list([CP1|[CP2|_]=Rest], _Changed)
  when is_integer(CP1), $A =< CP1, CP1 =< $Z,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1+32|lowercase_list(Rest, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
lowercase_list([CP1|[CP2|_]=Rest], Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1|lowercase_list(Rest, Changed)];
lowercase_list([], true) ->
    [];
lowercase_list([], false) ->
    throw(unchanged);
lowercase_list(Chardata, Changed) ->
    case unicode_util:lowercase(Chardata) of
        [Mapped|Rest] when Mapped =:= hd(Chardata) ->
            [Mapped|lowercase_list(Rest, Changed)];
        [Mapped|Rest] ->
            append(Mapped, lowercase_list(Rest, true));
        [] ->
            lowercase_list([], Changed)
    end.
%% Lowercase a binary into a list. CP1 is the codepoint that has already
%% been decoded and the second argument is the binary that follows it.
%% Changed records whether any character has been modified so far;
%% reaching the end with nothing modified throws 'unchanged'.
%% $A..$Z followed by a codepoint below 256 is converted directly.
lowercase_bin(CP1, <<CP2/utf8, Tail/binary>>, _Changed)
  when is_integer(CP1), $A =< CP1, CP1 =< $Z, CP2 < 256 ->
    [CP1+32|lowercase_bin(CP2, Tail, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
lowercase_bin(CP1, <<CP2/utf8, Tail/binary>>, Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128, CP2 < 256 ->
    [CP1|lowercase_bin(CP2, Tail, Changed)];
lowercase_bin(CP1, Bin, Changed) ->
    case unicode_util:lowercase([CP1|Bin]) of
        [Mapped|After] ->
            %% A mapping that differs from CP1 counts as a modification.
            NowChanged = case Mapped of
                             CP1 -> Changed;
                             _ -> true
                         end,
            case unicode_util:cp(After) of
                [Next|Tail] when is_integer(Next), Next >= 0 ->
                    [Mapped|lowercase_bin(Next, Tail, NowChanged)];
                [] when NowChanged ->
                    [Mapped];
                [] ->
                    throw(unchanged);
                {error, Reason} ->
                    error({badarg, Reason})
            end
    end.
%% Case fold list chardata. Changed records whether any character has
%% been modified so far; reaching the end with Changed = false throws
%% 'unchanged' so that the caller can hand back the original term.
%% $A..$Z followed by a codepoint below 256 is converted directly.
casefold_list([CP1|[CP2|_]=Rest], _Changed)
  when is_integer(CP1), $A =< CP1, CP1 =< $Z,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1+32|casefold_list(Rest, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
casefold_list([CP1|[CP2|_]=Rest], Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128,
       is_integer(CP2), 0 =< CP2, CP2 < 256 ->
    [CP1|casefold_list(Rest, Changed)];
casefold_list([], true) ->
    [];
casefold_list([], false) ->
    throw(unchanged);
casefold_list(Chardata, Changed) ->
    case unicode_util:casefold(Chardata) of
        [Mapped|Rest] when Mapped =:= hd(Chardata) ->
            [Mapped|casefold_list(Rest, Changed)];
        [Mapped|Rest] ->
            append(Mapped, casefold_list(Rest, true));
        [] ->
            casefold_list([], Changed)
    end.
%% Case fold a binary into a list. CP1 is the codepoint that has already
%% been decoded and the second argument is the binary that follows it.
%% Changed records whether any character has been modified so far;
%% reaching the end with nothing modified throws 'unchanged'.
%% $A..$Z followed by a codepoint below 256 is converted directly.
casefold_bin(CP1, <<CP2/utf8, Tail/binary>>, _Changed)
  when is_integer(CP1), $A =< CP1, CP1 =< $Z, CP2 < 256 ->
    [CP1+32|casefold_bin(CP2, Tail, true)];
%% Any other 7-bit codepoint followed by one below 256 is kept as it is.
casefold_bin(CP1, <<CP2/utf8, Tail/binary>>, Changed)
  when is_integer(CP1), 0 =< CP1, CP1 < 128, CP2 < 256 ->
    [CP1|casefold_bin(CP2, Tail, Changed)];
casefold_bin(CP1, Bin, Changed) ->
    case unicode_util:casefold([CP1|Bin]) of
        [Mapped|After] ->
            %% A mapping that differs from CP1 counts as a modification.
            NowChanged = case Mapped of
                             CP1 -> Changed;
                             _ -> true
                         end,
            case unicode_util:cp(After) of
                [Next|Tail] when is_integer(Next), Next >= 0 ->
                    [Mapped|casefold_bin(Next, Tail, NowChanged)];
                [] when NowChanged ->
                    [Mapped];
                [] ->
                    throw(unchanged);
                {error, Reason} ->
                    error({badarg, Reason})
            end
    end.
%% Leading trim, fast path for list input and one separator below 256.
%% Separators are dropped for as long as ?ASCII_LIST holds for the current
%% and the next codepoint; any other input is handed to trim_l/2.
trim_ls([CP1|[CP2|_]=Rest]=Str, Sep) when ?ASCII_LIST(CP1,CP2) ->
    if
        Sep =:= CP1 -> trim_ls(Rest, Sep);
        true -> Str
    end;
trim_ls(Str, Sep) ->
    trim_l(Str, [Sep]).
%% Leading trim: drop grapheme clusters that are members of Seps from the
%% start of the string and return what is left.
trim_l([CP1|[CP2|_]=Rest]=Str, Seps) when ?ASCII_LIST(CP1,CP2) ->
    case lists:member(CP1, Seps) of
        false -> Str;
        true -> trim_l(Rest, Seps)
    end;
%% A binary at the head of a list: bin_search_inv/3 either reports that it
%% found nothing to keep, giving the continuation to go on with, or gives
%% the part to keep.
trim_l([Bin|Cont0], Seps) when is_binary(Bin) ->
    case bin_search_inv(Bin, Cont0, Seps) of
        {nomatch, Cont} -> trim_l(Cont, Seps);
        Keep -> Keep
    end;
trim_l(Str, Seps) when is_list(Str) ->
    case unicode_util:gc(Str) of
        [] ->
            [];
        [Cluster|Rest] ->
            case lists:member(Cluster, Seps) of
                false -> Str;
                true -> trim_l(Rest, Seps)
            end
    end;
trim_l(Bin, Seps) when is_binary(Bin) ->
    case bin_search_inv(Bin, [], Seps) of
        {nomatch, _} -> <<>>;
        [Keep] -> Keep
    end.
%% Trailing trim, fast path for list input and one separator below 256.
%% A separator is dropped only when everything after it trims down to
%% nothing. Input that the fast path cannot handle goes to trim_t/3.
trim_ts([Sep|Rest]=Str, Sep) ->
    case Rest of
        [] ->
            [];
        [CP2|_] when ?ASCII_LIST(Sep,CP2) ->
            Trimmed = trim_ts(Rest, Sep),
            case is_empty(Trimmed) of
                true -> [];
                false -> [Sep|Trimmed]
            end;
        _ ->
            trim_t(Str, 0, search_pattern([Sep]))
    end;
trim_ts([CP|Rest], Sep) when is_integer(CP) ->
    [CP|trim_ts(Rest, Sep)];
trim_ts(Str, Sep) ->
    trim_t(Str, 0, search_pattern([Sep])).
%% Trailing trim: return the string without the separators at its end.
%% Seps is the 3-tuple built by search_pattern/1 (optionally processed by
%% search_compile/1): its first element is matched against whole grapheme
%% clusters and its second against single codepoints; the third element
%% is not used here.
%% The middle argument is a byte offset into the binary being scanned;
%% scanning resumes after that many bytes. It is 0 for list input.
%%
%% List starting with a codepoint.
trim_t([CP1|Cont]=Cs0, _, {GCs,CPs,_}=Seps) when is_integer(CP1) ->
case lists:member(CP1, CPs) of
true ->
%% CP1 may begin a separator: fetch the whole grapheme cluster.
[GC|Cs1] = unicode_util:gc(Cs0),
case lists:member(GC, GCs) of
true ->
%% A separator is dropped only if nothing remains after it.
Tail = trim_t(Cs1, 0, Seps),
case is_empty(Tail) of
true -> [];
false -> append(GC,Tail)
end;
false ->
append(GC,trim_t(Cs1, 0, Seps))
end;
false ->
[CP1|trim_t(Cont, 0, Seps)]
end;
%% List starting with a binary.
trim_t([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) ->
%% Skip the first N bytes of Bin.
<<_:N/binary, Rest/binary>> = Bin,
Seps = search_compile(Seps0),
%% NOTE(review): bin_search/3 appears to locate the next separator
%% candidate and bin_search_inv/3 the next non-separator; both are
%% defined outside this view - confirm.
case bin_search(Rest, Cont0, Seps) of
{nomatch,_} ->
stack(Bin, trim_t(Cont0, 0, Seps));
[SepStart|Cont1] ->
case bin_search_inv(SepStart, Cont1, GCs) of
{nomatch, Cont} ->
Tail = trim_t(Cont, 0, Seps),
case is_empty(Tail) of
true ->
%% Nothing is kept after SepStart: cut Bin off there.
KeepSz = byte_size(Bin) - byte_size(SepStart),
<<Keep:KeepSz/binary, _/binary>> = Bin,
Keep;
false ->
%% Something is kept further on: keep all of Bin and the
%% part of Cont0 in front of Cont as well.
Used = cp_prefix(Cont0, Cont),
stack(Bin, stack(Used, Tail))
end;
[NonSep|Cont] when is_binary(NonSep) ->
%% Scan the same binary again from the byte offset of NonSep.
KeepSz = byte_size(Bin) - byte_size(NonSep),
trim_t([Bin|Cont], KeepSz, Seps)
end
end;
%% Any other list, e.g. [] or a list starting with a nested list.
trim_t(Str, 0, {GCs,_,_}=Seps) when is_list(Str) ->
case unicode_util:gc(Str) of
[GC|Cs1] ->
case lists:member(GC, GCs) of
true ->
Tail = trim_t(Cs1, 0, Seps),
case is_empty(Tail) of
true -> [];
false -> append(GC,Tail)
end;
false ->
append(GC,trim_t(Cs1, 0, Seps))
end;
[] -> []
end;
%% Plain binary.
trim_t(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) ->
<<_:N/binary, Rest/binary>> = Bin,
Seps = search_compile(Seps0),
case bin_search(Rest, [], Seps) of
{nomatch,_} -> Bin;
[SepStart] ->
case bin_search_inv(SepStart, [], GCs) of
{nomatch,_} ->
%% Nothing is kept after SepStart: cut Bin off there.
KeepSz = byte_size(Bin) - byte_size(SepStart),
<<Keep:KeepSz/binary, _/binary>> = Bin,
Keep;
[NonSep] ->
%% Scan again from the byte offset of NonSep.
KeepSz = byte_size(Bin) - byte_size(NonSep),
trim_t(Bin, KeepSz, Seps)
end
end.
%% take_l(Str, Seps, Acc) -> {Taken, Rest}
%% Split off the leading run of Str made up only of grapheme clusters
%% listed in Seps. Acc collects what has been taken so far.
take_l([C1|[C2|_]=More]=Chars, Seps, Taken)
  when ?ASCII_LIST(C1,C2) ->
    case lists:member(C1, Seps) of
        false -> {rev(Taken), Chars};
        true -> take_l(More, Seps, [C1|Taken])
    end;
take_l([Head|More0], Seps, Taken) when is_binary(Head) ->
    case bin_search_inv(Head, More0, Seps) of
        [Stop|_]=Remaining when is_binary(Stop) ->
            %% Stop begins at the first non-separator (assumed to be a
            %% tail of Head); the bytes before it are the taken part.
            TakenSz = byte_size(Head) - byte_size(Stop),
            <<Prefix:TakenSz/binary, _/binary>> = Head,
            {btoken(Prefix, Taken), Remaining};
        {nomatch, More} ->
            %% Head and the codepoints between More0 and More were all
            %% separators; fold them into one binary and continue.
            Consumed = cp_prefix(More0, More),
            Chunk = unicode:characters_to_binary([Head|Consumed]),
            take_l(More, Seps, [Chunk|Taken])
    end;
take_l(Chars, Seps, Taken) when is_list(Chars) ->
    case unicode_util:gc(Chars) of
        [] ->
            {rev(Taken), []};
        [GC|More] ->
            case lists:member(GC, Seps) of
                false -> {rev(Taken), Chars};
                true -> take_l(More, Seps, append(rev(GC), Taken))
            end
    end;
take_l(Whole, Seps, Taken) when is_binary(Whole) ->
    case bin_search_inv(Whole, [], Seps) of
        [Remaining] ->
            TakenSz = byte_size(Whole) - byte_size(Remaining),
            <<Prefix:TakenSz/binary, _/binary>> = Whole,
            {btoken(Prefix, Taken), Remaining};
        {nomatch,_} ->
            {btoken(Whole, Taken), <<>>}
    end.
%% take_lc(Str, Seps, Acc) -> {Taken, Rest}
%% Complement of take_l/3: split off the leading run of Str that contains
%% no separator grapheme cluster. Seps is a {GCs, CPs, Pattern} search
%% pattern; Rest starts at the first separator found.
take_lc([C1|More]=Chars, {GCs,CPs,_}=Seps, Taken) when is_integer(C1) ->
    case lists:member(C1, CPs) of
        false ->
            take_lc(More, Seps, append(C1, Taken));
        true ->
            %% C1 may start a separator: decide on the whole cluster.
            [GC|AfterGC] = unicode_util:gc(Chars),
            case lists:member(GC, GCs) of
                true -> {rev(Taken), Chars};
                false -> take_lc(AfterGC, Seps, append(rev(GC), Taken))
            end
    end;
take_lc([Head|More0], Seps0, Taken) when is_binary(Head) ->
    Seps = search_compile(Seps0),
    case bin_search(Head, More0, Seps) of
        [Stop|_]=Remaining when is_binary(Stop) ->
            %% Stop begins at the separator (assumed to be a tail of Head).
            TakenSz = byte_size(Head) - byte_size(Stop),
            <<Prefix:TakenSz/binary, _/binary>> = Head,
            {btoken(Prefix, Taken), Remaining};
        {nomatch, More} ->
            Consumed = cp_prefix(More0, More),
            Chunk = unicode:characters_to_binary([Head|Consumed]),
            take_lc(More, Seps, [Chunk|Taken])
    end;
take_lc(Chars, {GCs,_,_}=Seps, Taken) when is_list(Chars) ->
    case unicode_util:gc(Chars) of
        [] ->
            {rev(Taken), []};
        [GC|More] ->
            case lists:member(GC, GCs) of
                true -> {rev(Taken), Chars};
                false -> take_lc(More, Seps, append(rev(GC), Taken))
            end
    end;
take_lc(Whole, Seps0, Taken) when is_binary(Whole) ->
    Seps = search_compile(Seps0),
    case bin_search(Whole, [], Seps) of
        [Remaining] ->
            TakenSz = byte_size(Whole) - byte_size(Remaining),
            <<Prefix:TakenSz/binary, _/binary>> = Whole,
            {btoken(Prefix, Taken), Remaining};
        {nomatch,_} ->
            {btoken(Whole, Taken), <<>>}
    end.
%% take_t(Str, N, Seps) -> {Head, Tail}
%% Split Str so that Tail is the trailing run of separator grapheme
%% clusters and Head is everything before it.
%%   N    - byte offset into the leading binary at which the search
%%          resumes; the list clauses ignore it or require it to be 0.
%%   Seps - {GCs, CPs, Pattern} search pattern (see search_pattern/1).
%% A separator is moved to Tail only when the Head computed from the
%% remainder is empty; otherwise it stays in Head.
take_t([CP1|Cont]=Str0, _, {GCs,CPs,_}=Seps) when is_integer(CP1) -> | |
case lists:member(CP1, CPs) of | |
true -> | |
%% CP1 may start a separator: compare the whole grapheme cluster.
[GC|Str] = unicode_util:gc(Str0), | |
case lists:member(GC, GCs) of | |
true -> | |
{Head, Tail} = take_t(Str, 0, Seps), | |
case is_empty(Head) of | |
true -> {Head, append(GC,Tail)}; | |
false -> {append(GC,Head), Tail} | |
end; | |
false -> | |
{Head, Tail} = take_t(Str, 0, Seps), | |
{append(GC,Head), Tail} | |
end; | |
false -> | |
{Head, Tail} = take_t(Cont, 0, Seps), | |
{[CP1|Head], Tail} | |
end; | |
%% List whose head is a binary: search inside the binary from byte N.
take_t([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) -> | |
<<_:N/binary, Rest/binary>> = Bin, | |
Seps = search_compile(Seps0), | |
case bin_search(Rest, Cont0, Seps) of | |
{nomatch,Cont} -> | |
%% No separator: Bin and the codepoints consumed up to Cont go to Head.
Used = cp_prefix(Cont0, Cont), | |
{Head, Tail} = take_t(Cont, 0, Seps), | |
{stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; | |
[SepStart|Cont1] -> | |
%% A separator starts at SepStart; skip the run of separators.
case bin_search_inv(SepStart, Cont1, GCs) of | |
{nomatch, Cont} -> | |
{Head, Tail} = take_t(Cont, 0, Seps), | |
Used = cp_prefix(Cont0, Cont), | |
case is_empty(Head) of | |
true -> | |
%% Only separators follow: split Bin at SepStart (assumed to be a
%% tail of Bin) and put the separator part in front of Tail.
KeepSz = byte_size(Bin) - byte_size(SepStart), | |
<<Keep:KeepSz/binary, End/binary>> = Bin, | |
{Keep, stack(stack(End,Used),Tail)}; | |
false -> | |
{stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} | |
end; | |
[NonSep|Cont] when is_binary(NonSep) -> | |
%% The run ended at NonSep; resume the search at its byte offset.
KeepSz = byte_size(Bin) - byte_size(NonSep), | |
take_t([Bin|Cont], KeepSz, Seps) | |
end | |
end; | |
%% Any other list (e.g. nested chardata): go cluster by cluster.
take_t(Str, 0, {GCs,_,_}=Seps) when is_list(Str) -> | |
case unicode_util:gc(Str) of | |
[GC|Cs1] -> | |
case lists:member(GC, GCs) of | |
true -> | |
{Head, Tail} = take_t(Cs1, 0, Seps), | |
case is_empty(Head) of | |
true -> {Head, append(GC,Tail)}; | |
false -> {append(GC,Head), Tail} | |
end; | |
false -> | |
{Head, Tail} = take_t(Cs1, 0, Seps), | |
{append(GC,Head), Tail} | |
end; | |
[] -> {[],[]} | |
end; | |
%% Plain binary: same search as the [Bin|Cont0] clause, with no continuation.
take_t(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) -> | |
<<_:N/binary, Rest/binary>> = Bin, | |
Seps = search_compile(Seps0), | |
case bin_search(Rest, [], Seps) of | |
{nomatch,_} -> {Bin, <<>>}; | |
[SepStart] -> | |
case bin_search_inv(SepStart, [], GCs) of | |
{nomatch,_} -> | |
%% Separators run to the end of Bin: split at SepStart.
KeepSz = byte_size(Bin) - byte_size(SepStart), | |
<<Before:KeepSz/binary, End/binary>> = Bin, | |
{Before, End}; | |
[NonSep] -> | |
KeepSz = byte_size(Bin) - byte_size(NonSep), | |
take_t(Bin, KeepSz, Seps) | |
end | |
end. | |
%% take_tc(Str, N, Seps) -> {Head, Tail}
%% Complement of take_t/3: Tail is the trailing run of grapheme clusters
%% that are NOT in the separator set, Head is everything before it.
%%   N    - byte offset into the leading binary at which the search
%%          resumes; the list clauses ignore it or require it to be 0.
%%   Seps - {GCs, CPs, Pattern} search pattern (see search_pattern/1).
%% A non-separator is moved to Tail only when the Head computed from the
%% remainder is empty; otherwise it stays in Head.
take_tc([CP1|[CP2|_]=Cont], _, {GCs,_,_}=Seps) when ?ASCII_LIST(CP1,CP2) -> | |
%% Fast path: CP1 passes the ?ASCII_LIST guard, so it is compared as-is.
case lists:member(CP1, GCs) of | |
false -> | |
{Head, Tail} = take_tc(Cont, 0, Seps), | |
case is_empty(Head) of | |
true -> {Head, append(CP1,Tail)}; | |
false -> {append(CP1,Head), Tail} | |
end; | |
true -> | |
{Head, Tail} = take_tc(Cont, 0, Seps), | |
{append(CP1,Head), Tail} | |
end; | |
%% List whose head is a binary: search inside the binary from byte N.
take_tc([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) -> | |
<<_:N/binary, Rest/binary>> = Bin, | |
case bin_search_inv(Rest, Cont0, GCs) of | |
{nomatch,Cont} -> | |
%% Only separators up to Cont: all of it belongs to Head.
Used = cp_prefix(Cont0, Cont), | |
{Head, Tail} = take_tc(Cont, 0, Seps0), | |
{stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; | |
[SepStart|Cont1] -> | |
%% Despite its name, SepStart is where a NON-separator starts here;
%% look for the next separator after it.
Seps = search_compile(Seps0), | |
case bin_search(SepStart, Cont1, Seps) of | |
{nomatch, Cont} -> | |
{Head, Tail} = take_tc(Cont, 0, Seps), | |
Used = cp_prefix(Cont0, Cont), | |
case is_empty(Head) of | |
true -> | |
%% No separator follows: split Bin at SepStart (assumed to be a
%% tail of Bin) and put the non-separator part in front of Tail.
KeepSz = byte_size(Bin) - byte_size(SepStart), | |
<<Keep:KeepSz/binary, End/binary>> = Bin, | |
{Keep, stack(stack(End,Used),Tail)}; | |
false -> | |
{stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} | |
end; | |
[NonSep|Cont] when is_binary(NonSep) -> | |
%% A separator was found again (bound to NonSep); resume from its offset.
KeepSz = byte_size(Bin) - byte_size(NonSep), | |
take_tc([Bin|Cont], KeepSz, Seps) | |
end | |
end; | |
%% Any other list (e.g. nested chardata): go cluster by cluster.
take_tc(Str, 0, {GCs,_,_}=Seps) when is_list(Str) -> | |
case unicode_util:gc(Str) of | |
[GC|Cs1] -> | |
case lists:member(GC, GCs) of | |
false -> | |
{Head, Tail} = take_tc(Cs1, 0, Seps), | |
case is_empty(Head) of | |
true -> {Head, append(GC,Tail)}; | |
false -> {append(GC,Head), Tail} | |
end; | |
true -> | |
{Head, Tail} = take_tc(Cs1, 0, Seps), | |
{append(GC,Head), Tail} | |
end; | |
[] -> {[],[]} | |
end; | |
%% Plain binary: same search as the [Bin|Cont0] clause, with no continuation.
take_tc(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) -> | |
<<_:N/binary, Rest/binary>> = Bin, | |
case bin_search_inv(Rest, [], GCs) of | |
{nomatch,_} -> {Bin, <<>>}; | |
[SepStart] -> | |
Seps = search_compile(Seps0), | |
case bin_search(SepStart, [], Seps) of | |
{nomatch,_} -> | |
%% No separator after SepStart: split Bin there.
KeepSz = byte_size(Bin) - byte_size(SepStart), | |
<<Before:KeepSz/binary, End/binary>> = Bin, | |
{Before, End}; | |
[NonSep] -> | |
KeepSz = byte_size(Bin) - byte_size(NonSep), | |
take_tc(Bin, KeepSz, Seps) | |
end | |
end. | |
%% prefix_1(Str, Prefix) -> Rest | nomatch
%% Check that Str starts with the codepoints in Prefix (a non-empty list)
%% and return what follows. All but the last element are compared
%% codepoint by codepoint; the last one must equal the whole next
%% grapheme cluster of Str.
prefix_1(Str, [Last]) ->
    case unicode_util:gc(Str) of
        [Last|Rest] -> Rest;
        _ -> nomatch
    end;
prefix_1([CP|Rest], [CP|More]) when is_integer(CP) ->
    prefix_1(Rest, More);
prefix_1([CP|_], [_|_]) when is_integer(CP) ->
    nomatch;
prefix_1(<<CP/utf8, Rest/binary>>, [Wanted|More]) ->
    if
        CP =:= Wanted -> prefix_1(Rest, More);
        true -> nomatch
    end;
prefix_1(Str, [Wanted|More]) ->
    %% Remaining shapes (e.g. nested chardata): let unicode_util pick
    %% out the next codepoint.
    case unicode_util:cp(Str) of
        [Wanted|Rest] -> prefix_1(Rest, More);
        _ -> nomatch
    end.
%% split_1(Str, Needle, Start, Where, Curr, Acc)
%% Split Str at occurrences of Needle (a non-empty list of codepoints).
%%   Start - byte offset into the leading binary at which the search
%%           resumes; the list clauses ignore it.
%%   Where - leading | trailing | all.
%%   Curr  - the segment collected so far (reversed for list data).
%%   Acc   - result collected so far.
%% Return shapes visible in this function:
%%   * leading, on the first match: the list [Before, After];
%%   * otherwise, when the input is exhausted: the tuple {rev(Curr), Acc}.
%%     With trailing, Acc is replaced by [Before, After] at every match,
%%     so it ends up holding the last match; with all, each finished
%%     segment is pushed onto Acc.
split_1([CP1|Cs]=Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_integer(CP1) -> | |
case CP1=:=C of | |
true -> | |
%% First codepoint matches: test the whole needle.
case prefix_1(Cs0, Needle) of | |
nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc); | |
Rest when Where =:= leading -> | |
[rev(Curr), Rest]; | |
Rest when Where =:= trailing -> | |
%% Remember this match and keep scanning for a later one.
split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]); | |
Rest when Where =:= all -> | |
split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc]) | |
end; | |
false -> | |
split_1(Cs, Needle, 0, Where, append(CP1,Curr), Acc) | |
end; | |
%% List whose head is a binary: search inside the binary from byte Start.
split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc) | |
when is_binary(Bin) -> | |
case bin_search_str(Bin, Start, Cont0, Needle) of | |
{nomatch,Sz,Cont} -> | |
%% No match: the first Sz bytes of Bin join the current segment.
<<Keep:Sz/binary, _/binary>> = Bin, | |
split_1(Cont, Needle, 0, Where, [Keep|Curr0], Acc); | |
{Before, [Cs0|Cont], After} -> | |
Curr = add_non_empty(Before,Curr0), | |
case Where of | |
leading -> | |
[rev(Curr),After]; | |
trailing -> | |
%% Step one codepoint past the match start and search again.
<<_/utf8, Cs/binary>> = Cs0, | |
Next = byte_size(Bin) - byte_size(Cs), | |
split_1([Bin|Cont], Needle, Next, Where, | |
Curr0, [rev(Curr),After]); | |
all -> | |
split_1(After, Needle, 0, Where, [], [rev(Curr)|Acc]) | |
end | |
end; | |
%% Any other list (e.g. nested chardata): go codepoint by codepoint.
split_1(Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_list(Cs0) -> | |
case unicode_util:cp(Cs0) of | |
[C|Cs] -> | |
case prefix_1(Cs0, Needle) of | |
nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc); | |
Rest when Where =:= leading -> | |
[rev(Curr), Rest]; | |
Rest when Where =:= trailing -> | |
split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]); | |
Rest when Where =:= all -> | |
split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc]) | |
end; | |
[Other|Cs] -> | |
split_1(Cs, Needle, 0, Where, append(Other,Curr), Acc); | |
[] -> | |
{rev(Curr), Acc} | |
end; | |
%% Everything else (in practice a plain binary): search from byte Start.
split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) -> | |
case bin_search_str(Bin, Start, [], Needle) of | |
{nomatch,_,_} -> | |
%% No further match: the bytes from Start on form the last segment.
<<_:Start/binary, Keep/binary>> = Bin, | |
{rev([Keep|Curr0]), Acc}; | |
{Before, [Cs0], After} -> | |
case Where of | |
leading -> | |
[rev([Before|Curr0]),After]; | |
trailing -> | |
<<_/utf8, Cs/binary>> = Cs0, | |
Next = byte_size(Bin) - byte_size(Cs), | |
split_1(Bin, Needle, Next, Where, Curr0, | |
[btoken(Before,Curr0),After]); | |
all -> | |
%% Before spans from the start of Bin, so drop the first Start
%% bytes that earlier segments already cover.
Next = byte_size(Bin) - byte_size(After), | |
<<_:Start/binary, Keep/binary>> = Before, | |
Curr = [Keep|Curr0], | |
split_1(Bin, Needle, Next, Where, [], [rev(Curr)|Acc]) | |
end | |
end. | |
%% lexemes_m(Str, Seps, Acc) -> [Lexeme]
%% Collect the lexemes of Str, i.e. the pieces between separator grapheme
%% clusters. Separators in front of a lexeme are skipped here; the lexeme
%% itself is read by lexeme_pick/3. Acc holds the lexemes found so far in
%% reverse order.
lexemes_m([CP|_]=Str, {GCs,CPs,_}=Seps, Lexemes) when is_integer(CP) ->
    case lists:member(CP, CPs) of
        false ->
            lexemes_m_pick(Str, Seps, Lexemes);
        true ->
            %% CP may start a separator: decide on the whole cluster.
            [GC|AfterGC] = unicode_util:gc(Str),
            case lists:member(GC, GCs) of
                false -> lexemes_m_pick(Str, Seps, Lexemes);
                true -> lexemes_m(AfterGC, Seps, Lexemes)
            end
    end;
lexemes_m([Head|More0], {GCs,_,_}=Seps, Lexemes) when is_binary(Head) ->
    case bin_search_inv(Head, More0, GCs) of
        {nomatch, More} ->
            lexemes_m(More, Seps, Lexemes);
        LexemeStart ->
            lexemes_m_pick(LexemeStart, Seps, Lexemes)
    end;
lexemes_m(Str, {GCs,_,_}=Seps, Lexemes) when is_list(Str) ->
    case unicode_util:gc(Str) of
        [] ->
            lists:reverse(Lexemes);
        [GC|AfterGC] ->
            case lists:member(GC, GCs) of
                false -> lexemes_m_pick(Str, Seps, Lexemes);
                true -> lexemes_m(AfterGC, Seps, Lexemes)
            end
    end;
lexemes_m(Whole, {GCs,_,_}=Seps0, Lexemes) when is_binary(Whole) ->
    case bin_search_inv(Whole, [], GCs) of
        [LexemeStart] ->
            Seps = search_compile(Seps0),
            {Lexeme, Rest} = lexeme_pick(LexemeStart, Seps, []),
            lexemes_m(Rest, Seps, add_non_empty(Lexeme, Lexemes));
        {nomatch,_} ->
            lists:reverse(Lexemes)
    end.

%% Read one lexeme starting at Str, push it onto Lexemes and carry on
%% with whatever follows it.
lexemes_m_pick(Str, Seps0, Lexemes) ->
    Seps = search_compile(Seps0),
    {Lexeme, Rest} = lexeme_pick(Str, Seps, []),
    lexemes_m(Rest, Seps, [Lexeme|Lexemes]).
%% lexeme_pick(Str, Seps, Acc) -> {Lexeme, Rest}
%% Read characters from Str until a separator grapheme cluster or the end
%% of the input. Seps is a {GCs, CPs, Pattern} search pattern; the binary
%% clauses hand it to bin_search/3, which uses the Pattern element. Acc is
%% the part of the lexeme read so far (reversed for list data).
lexeme_pick([CP|More]=Str, {_,CPs,_}=Seps, Token) when is_integer(CP) ->
    case lists:member(CP, CPs) of
        false -> lexeme_pick(More, Seps, [CP|Token]);
        true -> lexeme_pick_gc(Str, Seps, Token)
    end;
lexeme_pick([Head|More0], Seps, Token) when is_binary(Head) ->
    case bin_search(Head, More0, Seps) of
        [SepStart|_]=Rest ->
            %% The lexeme ends where the separator starts inside Head.
            LexemeSz = byte_size(Head) - byte_size(SepStart),
            <<Lexeme:LexemeSz/binary, _/binary>> = Head,
            {btoken(Lexeme, Token), Rest};
        {nomatch,_} ->
            lexeme_pick(More0, Seps, [Head|Token])
    end;
lexeme_pick(Str, {_,CPs,_}=Seps, Token) when is_list(Str) ->
    case unicode_util:cp(Str) of
        [] ->
            {rev(Token), []};
        [CP|More] ->
            case lists:member(CP, CPs) of
                false -> lexeme_pick(More, Seps, append(CP, Token));
                true -> lexeme_pick_gc(Str, Seps, Token)
            end
    end;
lexeme_pick(Whole, Seps, Token) when is_binary(Whole) ->
    case bin_search(Whole, [], Seps) of
        [SepStart] ->
            LexemeSz = byte_size(Whole) - byte_size(SepStart),
            <<Lexeme:LexemeSz/binary, _/binary>> = Whole,
            {btoken(Lexeme, Token), SepStart};
        {nomatch,_} ->
            {btoken(Whole, Token), []}
    end.

%% The next codepoint of Str may start a separator: compare the whole
%% grapheme cluster, then either finish the lexeme or keep reading.
lexeme_pick_gc(Str, {GCs,_,_}=Seps, Token) ->
    [GC|AfterGC] = unicode_util:gc(Str),
    case lists:member(GC, GCs) of
        false -> lexeme_pick(AfterGC, Seps, append(rev(GC), Token));
        true -> {rev(Token), AfterGC}
    end.
%% nth_lexeme_m(Str, Seps, N) -> Lexeme
%% Return the N:th lexeme of Str. Separators in front of a lexeme are
%% skipped; while N > 1 the lexeme itself is skipped with lexeme_skip/2,
%% otherwise it is read with lexeme_pick/3. When the input runs out the
%% result is [] for list input and <<>> for a plain binary.
nth_lexeme_m([Head|More0], {GCs,_,_}=Seps0, N) when is_binary(Head) ->
    case bin_search_inv(Head, More0, GCs) of
        {nomatch, More} ->
            nth_lexeme_m(More, Seps0, N);
        LexemeStart when N > 1 ->
            AfterLexeme = lexeme_skip(LexemeStart, Seps0),
            nth_lexeme_m(AfterLexeme, Seps0, N-1);
        LexemeStart ->
            Seps = search_compile(Seps0),
            {Lexeme,_} = lexeme_pick(LexemeStart, Seps, []),
            Lexeme
    end;
nth_lexeme_m(Str, {GCs,_,_}=Seps0, N) when is_list(Str) ->
    case unicode_util:gc(Str) of
        [] ->
            [];
        [GC|AfterGC] ->
            case lists:member(GC, GCs) of
                false when N > 1 ->
                    AfterLexeme = lexeme_skip(AfterGC, Seps0),
                    nth_lexeme_m(AfterLexeme, Seps0, N-1);
                false ->
                    Seps = search_compile(Seps0),
                    {Lexeme,_} = lexeme_pick(Str, Seps, []),
                    Lexeme;
                true ->
                    nth_lexeme_m(AfterGC, Seps0, N)
            end
    end;
nth_lexeme_m(Whole, {GCs,_,_}=Seps0, N) when is_binary(Whole) ->
    Seps = search_compile(Seps0),
    case bin_search_inv(Whole, [], GCs) of
        {nomatch,_} ->
            <<>>;
        [LexemeStart] when N > 1 ->
            AfterLexeme = lexeme_skip(LexemeStart, Seps),
            nth_lexeme_m(AfterLexeme, Seps, N-1);
        [LexemeStart] ->
            {Lexeme,_} = lexeme_pick(LexemeStart, Seps, []),
            Lexeme
    end.
%% lexeme_skip(Str, Seps) -> Rest
%% Skip forward in Str past the next separator grapheme cluster and
%% return what follows it; returns [] (or <<>> for a plain binary) when
%% no separator is left.
lexeme_skip([CP|More]=Str, {_,CPs,_}=Seps) when is_integer(CP) ->
    case lists:member(CP, CPs) of
        false -> lexeme_skip(More, Seps);
        true -> lexeme_skip_gc(Str, Seps)
    end;
lexeme_skip([Head|More0], Seps0) when is_binary(Head) ->
    Seps = search_compile(Seps0),
    case bin_search(Head, More0, Seps) of
        {nomatch,_} -> lexeme_skip(More0, Seps);
        %% SepStart begins with the separator: drop that one cluster.
        SepStart -> tl(unicode_util:gc(SepStart))
    end;
lexeme_skip(Str, {_,CPs,_}=Seps) when is_list(Str) ->
    case unicode_util:cp(Str) of
        [] ->
            [];
        [CP|More] ->
            case lists:member(CP, CPs) of
                false -> lexeme_skip(More, Seps);
                true -> lexeme_skip_gc(Str, Seps)
            end
    end;
lexeme_skip(Whole, Seps0) when is_binary(Whole) ->
    Seps = search_compile(Seps0),
    case bin_search(Whole, [], Seps) of
        [SepStart] -> tl(unicode_util:gc(SepStart));
        {nomatch,_} -> <<>>
    end.

%% The next codepoint of Str may start a separator: compare the whole
%% grapheme cluster, then either stop right after it or keep skipping.
lexeme_skip_gc(Str, {GCs,_,_}=Seps) ->
    [GC|AfterGC] = unicode_util:gc(Str),
    case lists:member(GC, GCs) of
        false -> lexeme_skip(AfterGC, Seps);
        true -> AfterGC
    end.
%% find_l(Str, Needle) -> Match | nomatch
%% Find the first occurrence of Needle (a non-empty list of codepoints)
%% in Str and return the remainder of Str starting at the match.
find_l([C|More]=Str, [C|_]=Needle) when is_integer(C) ->
    %% First codepoint matches: test the whole needle.
    case prefix_1(Str, Needle) of
        nomatch -> find_l(More, Needle);
        _ -> Str
    end;
find_l([C|More], [_|_]=Needle) when is_integer(C) ->
    find_l(More, Needle);
find_l([Head|More0], Needle) when is_binary(Head) ->
    case bin_search_str(Head, 0, More0, Needle) of
        {_Before, Match, _After} -> Match;
        {nomatch, _, More} -> find_l(More, Needle)
    end;
find_l(Str, [C|_]=Needle) when is_list(Str) ->
    case unicode_util:cp(Str) of
        [] ->
            nomatch;
        [C|More] ->
            case prefix_1(Str, Needle) of
                nomatch -> find_l(More, Needle);
                _ -> Str
            end;
        [_Other|More] ->
            find_l(More, Needle)
    end;
find_l(Whole, Needle) ->
    case bin_search_str(Whole, 0, [], Needle) of
        {_Before, [Match], _After} -> Match;
        {nomatch,_,_} -> nomatch
    end.
%% find_r(Str, Needle, Res) -> Match | Res
%% Find the last occurrence of Needle (a non-empty list of codepoints) in
%% Str. Res is the best match so far: it is replaced at every match and
%% returned unchanged when the input is exhausted.
find_r([C|More]=Str, [C|_]=Needle, Best) when is_integer(C) ->
    %% First codepoint matches: test the whole needle.
    case prefix_1(Str, Needle) of
        nomatch -> find_r(More, Needle, Best);
        _ -> find_r(More, Needle, Str)
    end;
find_r([C|More], [_|_]=Needle, Best) when is_integer(C) ->
    find_r(More, Needle, Best);
find_r([Head|More0], Needle, Best) when is_binary(Head) ->
    case bin_search_str(Head, 0, More0, Needle) of
        {nomatch,_,More} ->
            find_r(More, Needle, Best);
        {_, Match, _} ->
            %% Continue one grapheme cluster past the start of the match.
            [_|AfterGC] = unicode_util:gc(Match),
            find_r(AfterGC, Needle, Match)
    end;
find_r(Str, [C|_]=Needle, Best) when is_list(Str) ->
    case unicode_util:cp(Str) of
        [] ->
            Best;
        [C|More] ->
            case prefix_1(Str, Needle) of
                nomatch -> find_r(More, Needle, Best);
                _ -> find_r(More, Needle, Str)
            end;
        [_Other|More] ->
            find_r(More, Needle, Best)
    end;
find_r(Whole, Needle, Best) ->
    case bin_search_str(Whole, 0, [], Needle) of
        {_Before, [Match], _After} ->
            %% Continue one codepoint past the start of the match.
            <<_/utf8, AfterCP/binary>> = Match,
            find_r(AfterCP, Needle, Match);
        {nomatch,_,_} ->
            Best
    end.
%% The helpers below are used to avoid creating lists around binaries;
%% they might be unnecessary, is there a better solution?

%% btoken(BinPart, Acc): combine a binary piece with the reversed
%% accumulator Acc into one token, without wrapping a lone binary in a
%% list.
btoken(BinPart, Acc) ->
    case {BinPart, Acc} of
        {_, []} -> BinPart;
        {_, [C]} when is_integer(C) -> <<C/utf8, BinPart/binary>>;
        {<<>>, _} -> lists:reverse(Acc);
        _ -> [lists:reverse(Acc), BinPart]
    end.
%% rev(Acc): turn a reversed accumulator back into readable order.
%% A list holding a single binary is unwrapped, any other list is
%% reversed, and a lone codepoint is returned as it is.
rev([Bin]) when is_binary(Bin) -> Bin;
rev(Acc) when is_list(Acc) -> lists:reverse(Acc);
rev(CP) when is_integer(CP) -> CP.
%% append(Char, Str): put Char (a codepoint, or a grapheme cluster given
%% as a list) in front of Str. An empty binary is treated as an empty
%% tail; a non-empty binary tail is kept as a separate list element.
append(CP, <<>>) when is_integer(CP) -> [CP];
append(GC, <<>>) when is_list(GC) -> GC;
append(Char, Tail) when is_binary(Tail) -> [Char, Tail];
append(CP, Tail) when is_integer(CP) -> [CP|Tail];
append(GC, Tail) when is_list(GC) -> GC ++ Tail.
%% stack(Top, Rest): put Top in front of Rest without producing empty
%% elements: an empty Rest yields Top alone, an empty Top (<<>> or [])
%% yields Rest alone.
stack(Top, Rest) ->
    case {Top, Rest} of
        {_, []} -> Top;
        {<<>>, _} -> Rest;
        {[], _} -> Rest;
        _ -> [Top|Rest]
    end.
%% add_non_empty(Token, Acc): push Token onto Acc unless it is the empty
%% binary.
add_non_empty(Token, Acc) ->
    case Token of
        <<>> -> Acc;
        _ -> [Token|Acc]
    end.
%% cp_prefix(Orig, Cont): return the codepoints of Orig that come before
%% Cont, or Orig itself when Cont is empty.
%% NOTE(review): presumably Cont is always a tail of Orig - confirm
%% against the callers.
cp_prefix(Orig, Cont) ->
    case unicode_util:cp(Cont) of
        [Until|AfterUntil] -> cp_prefix_1(Orig, Until, AfterUntil);
        [] -> Orig
    end.
%% cp_prefix_1(Orig, Until, Cont): copy codepoints from Orig until the
%% point where the next codepoint is Until and what follows it equals
%% Cont (compared with equal/2).
cp_prefix_1(Orig, Until, Cont) ->
    case unicode_util:cp(Orig) of
        [Until|After] ->
            %% Candidate stop: only stop when the remainders agree too.
            case equal(After, Cont) of
                false -> [Until|cp_prefix_1(After, Until, Cont)];
                true -> []
            end;
        [Other|After] ->
            [Other|cp_prefix_1(After, Until, Cont)]
    end.
%% Binary special
%% bin_search(Bin, Cont, Pattern): search Bin, from its first byte, for a
%% separator grapheme cluster; Cont is the chardata that follows Bin.
%% Pattern is a {GCs, CPs, BinPattern} tuple; the CPs element is not used
%% here.
bin_search(Bin, Cont, {GCs,_,BinPattern}) ->
    bin_search_loop(Bin, 0, BinPattern, Cont, GCs).
%% Need to work with [<<$a>>, <<778/utf8>>],
%% i.e. å in nfd form  $a "COMBINING RING ABOVE"
%% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>>
%% combined with other characters are currently ignored.
%%
%% search_pattern(Seps): build a {Seps, CPs, undefined} search pattern,
%% where CPs comes from search_cp/1 and the third element is left for
%% search_compile/1 to fill in. A 3-tuple is returned unchanged.
search_pattern({_,_,_}=Pattern) ->
    Pattern;
search_pattern(Seps) ->
    {Seps, search_cp(Seps), undefined}.
%% search_compile(Pattern): make sure the third element of a search
%% pattern holds a compiled binary pattern. It is built from the CPs
%% element when still undefined; any other pattern is returned as it is.
search_compile({Seps, CPs, undefined}) ->
    BinPattern = binary:compile_pattern(bin_pattern(CPs)),
    {Seps, CPs, BinPattern};
search_compile({_,_,_}=Ready) ->
    Ready.
%% search_cp(Seps): return the first codepoint of every separator. A
%% separator that is already a codepoint is used as it is; otherwise its
%% first codepoint is taken with unicode_util:cp/1.
search_cp([]) ->
    [];
search_cp([Sep|Seps]) ->
    First = case is_integer(Sep) of
                true ->
                    Sep;
                false ->
                    [CP|_] = unicode_util:cp(Sep),
                    CP
            end,
    [First|search_cp(Seps)].
%% bin_pattern(CPs): UTF-8 encode every codepoint as a binary of its own.
bin_pattern(CPs) ->
    [<<CP/utf8>> || CP <- CPs].
%% bin_search_loop(Bin0, Start, BinSeps, Cont, Seps)
%% Search Bin0, from byte offset Start, for a position where one of the
%% grapheme clusters in Seps begins.
%%   BinSeps - pattern given to binary:match/2 to find candidate positions.
%%   Cont    - the chardata that follows Bin0 ([] when there is none).
%% Returns {nomatch, Rest} when no separator is found, otherwise the data
%% starting at the separator: [Tail] when Cont is [], else [Tail|Cont].
bin_search_loop(Bin0, Start, _, Cont, _Seps) | |
when byte_size(Bin0) =< Start; Start < 0 -> | |
%% Start is outside Bin0: nothing left to search.
{nomatch, Cont}; | |
bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) -> | |
<<_:Start/binary, Bin/binary>> = Bin0, | |
case binary:match(Bin, BinSeps) of | |
nomatch -> | |
{nomatch,Cont}; | |
{Where, _CL} when Cont =:= [] -> | |
%% Candidate found and no continuation: check the whole cluster.
<<_:Where/binary, Cont1/binary>> = Bin, | |
[GC|Cont2] = unicode_util:gc(Cont1), | |
case lists:member(GC, Seps) of | |
false when Cont2 =:= [] -> | |
{nomatch, []}; | |
false -> | |
%% Not a separator after all: resume after this cluster.
Next = byte_size(Bin0) - byte_size(Cont2), | |
bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); | |
true -> | |
[Cont1] | |
end; | |
{Where, _CL} -> | |
%% Candidate found: include Cont when forming the cluster, so that a
%% cluster can extend past the end of the binary.
<<_:Where/binary, Cont0/binary>> = Bin, | |
Cont1 = [Cont0|Cont], | |
[GC|Cont2] = unicode_util:gc(Cont1), | |
case lists:member(GC, Seps) of | |
false -> | |
case Cont2 of | |
[BinR|Cont] when is_binary(BinR) -> | |
%% The cluster ended inside the binary and the continuation is
%% unchanged (Cont is already bound): resume at BinR's offset.
Next = byte_size(Bin0) - byte_size(BinR), | |
bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); | |
_ -> | |
{nomatch, Cont2} | |
end; | |
true -> | |
Cont1 | |
end | |
end. | |
%% bin_search_inv(Bin, Cont, Seps): inverse search. An empty binary gives
%% {nomatch, Cont} straight away; a single separator is handed to
%% bin_search_inv_1/3 and anything else to bin_search_inv_n/3.
bin_search_inv(Bin, Cont, Seps) ->
    case {Bin, Seps} of
        {<<>>, _} -> {nomatch, Cont};
        {_, [Only]} -> bin_search_inv_1(Bin, Cont, Only);
        _ -> bin_search_inv_n(Bin, Cont, Seps)
    end.
%% bin_search_inv_1(Bin, Cont, Sep)
%% Skip leading occurrences of the single separator Sep in Bin, where
%% Cont is the chardata that follows Bin.
%% Returns [Rest|Cont], with Rest starting at the first non-separator, or
%% {nomatch, Rest} when the binary is used up. Input that is neither
%% decodable UTF-8, an empty binary nor [] raises error({badarg, Bin}).
bin_search_inv_1(<<CP1/utf8, BinRest/binary>>=Bin0, Cont, Sep) -> | |
case BinRest of | |
<<CP2/utf8, _/binary>> when ?ASCII_LIST(CP1, CP2) -> | |
%% Fast path: CP1 passes the ?ASCII_LIST guard, so it is compared as-is.
case CP1 of | |
Sep -> bin_search_inv_1(BinRest, Cont, Sep); | |
_ -> [Bin0|Cont] | |
end; | |
_ when Cont =:= [] -> | |
%% No continuation: form the cluster from the binary alone.
case unicode_util:gc(Bin0) of | |
[Sep|Bin] -> bin_search_inv_1(Bin, Cont, Sep); | |
_ -> [Bin0|Cont] | |
end; | |
_ -> | |
%% Include Cont so that a cluster can extend past the end of the binary.
case unicode_util:gc([Bin0|Cont]) of | |
[Sep|[Bin|Cont]] when is_binary(Bin) -> | |
bin_search_inv_1(Bin, Cont, Sep); | |
[Sep|Cs] -> | |
{nomatch, Cs}; | |
_ -> [Bin0|Cont] | |
end | |
end; | |
bin_search_inv_1(<<>>, Cont, _Sep) -> | |
{nomatch, Cont}; | |
bin_search_inv_1([], Cont, _Sep) -> | |
{nomatch, Cont}; | |
bin_search_inv_1(Bin, _, _) -> | |
error({badarg, Bin}). | |
%% bin_search_inv_n(Bin, Cont, Seps)
%% Skip leading grapheme clusters of Bin that are members of Seps, where
%% Cont is the chardata that follows Bin.
%% Returns [Rest|Cont], with Rest starting at the first non-separator, or
%% {nomatch, Rest} when the binary is used up. Input that is neither
%% decodable UTF-8, an empty binary nor [] raises error({badarg, Bin}).
bin_search_inv_n(<<CP1/utf8, BinRest/binary>>=Bin0, Cont, Seps) -> | |
case BinRest of | |
<<CP2/utf8, _/binary>> when ?ASCII_LIST(CP1, CP2) -> | |
%% Fast path: CP1 passes the ?ASCII_LIST guard, so it is compared as-is.
case lists:member(CP1,Seps) of | |
true -> bin_search_inv_n(BinRest, Cont, Seps); | |
false -> [Bin0|Cont] | |
end; | |
_ when Cont =:= [] -> | |
%% No continuation: form the cluster from the binary alone.
[GC|Bin] = unicode_util:gc(Bin0), | |
case lists:member(GC, Seps) of | |
true -> bin_search_inv_n(Bin, Cont, Seps); | |
false -> [Bin0|Cont] | |
end; | |
_ -> | |
%% Include Cont so that a cluster can extend past the end of the binary.
[GC|Cs0] = unicode_util:gc([Bin0|Cont]), | |
case lists:member(GC, Seps) of | |
false -> [Bin0|Cont]; | |
true -> | |
case Cs0 of | |
[Bin|Cont] when is_binary(Bin) -> | |
bin_search_inv_n(Bin, Cont, Seps); | |
_ -> | |
{nomatch, Cs0} | |
end | |
end | |
end; | |
bin_search_inv_n(<<>>, Cont, _Sep) -> | |
{nomatch, Cont}; | |
bin_search_inv_n([], Cont, _Sep) -> | |
{nomatch, Cont}; | |
bin_search_inv_n(Bin, _, _) -> | |
error({badarg, Bin}). | |
%% bin_search_str(Bin, Start, Cont, Needle)
%% Search Bin, from byte offset Start, for Needle (a non-empty list of
%% codepoints). With no continuation the whole needle is compiled into
%% the match pattern; with a continuation only its first codepoint is,
%% and bin_search_str_2/5 verifies the rest.
bin_search_str(Bin, Start, [], Needle) ->
    NeedleBin = unicode:characters_to_binary(Needle),
    Pattern = binary:compile_pattern(NeedleBin),
    bin_search_str_1(Bin, Start, Pattern, Needle);
bin_search_str(Bin, Start, Cont, [FirstCP|_]=Needle) ->
    Pattern = binary:compile_pattern(<<FirstCP/utf8>>),
    bin_search_str_2(Bin, Start, Cont, Pattern, Needle).
%% bin_search_str_1(Bin, Start, Pattern, Needle)
%% Search a plain binary, from byte offset Start, for Needle.
%% Returns {nomatch, byte_size(Bin), []} or {Before, [Match], After},
%% where Before is the part of Bin in front of the match, Match is Bin
%% from the match on and After is what follows the needle.
bin_search_str_1(Whole, Start, Pattern, Needle) ->
    <<_:Start/binary, Unsearched/binary>> = Whole,
    case binary:match(Unsearched, Pattern) of
        {RelPos, _} ->
            Pos = Start+RelPos,
            <<Before:Pos/binary, Match/binary>> = Whole,
            case prefix_1(Match, Needle) of
                nomatch ->
                    %% The bytes matched, but prefix_1/2 rejected the
                    %% candidate: resume one codepoint further on.
                    <<_/utf8, AfterCP/binary>> = Match,
                    Next = byte_size(Whole) - byte_size(AfterCP),
                    bin_search_str_1(Whole, Next, Pattern, Needle);
                [] ->
                    {Before, [Match], <<>>};
                After ->
                    {Before, [Match], After}
            end;
        nomatch ->
            {nomatch, byte_size(Whole), []}
    end.
%% bin_search_str_2(Bin, Start, Cont, Pattern, Needle)
%% Search Bin, from byte offset Start, for Needle when Bin is followed by
%% the chardata Cont. Pattern matches the first codepoint of the needle
%% only; prefix_1/2 checks the rest, reading on into Cont if needed.
%% Returns {nomatch, Size, Rest} or {Before, [Match|Cont], After}.
bin_search_str_2(Whole, Start, Cont, Pattern, Needle) ->
    <<_:Start/binary, Unsearched/binary>> = Whole,
    case binary:match(Unsearched, Pattern) of
        {RelPos, _} when is_integer(RelPos) ->
            Pos = Start+RelPos,
            <<Before:Pos/binary, Match/binary>> = Whole,
            [GC|AfterGC] = unicode_util:gc(Match),
            case prefix_1(stack(Match, Cont), Needle) of
                nomatch when is_binary(AfterGC) ->
                    %% Not a match here: resume after this cluster.
                    Next = byte_size(Whole) - byte_size(AfterGC),
                    bin_search_str_2(Whole, Next, Cont, Pattern, Needle);
                nomatch ->
                    {nomatch, Pos, stack([GC|AfterGC], Cont)};
                [] ->
                    {Before, [Match|Cont], <<>>};
                After ->
                    {Before, [Match|Cont], After}
            end;
        nomatch ->
            {nomatch, byte_size(Whole), Cont}
    end.
%%--------------------------------------------------------------------------- | |
%% OLD lists API kept for backwards compatibility
%%--------------------------------------------------------------------------- | |
%% Robert's bit | |
%% len(String)
%%  Return the number of elements in the list String.
-spec len(String) -> Length when
      String :: string(),
      Length :: non_neg_integer().

%% The call is module-qualified on purpose, so that it always reaches the
%% erlang BIF.
len(String) -> erlang:length(String).
%% equal(String1, String2) | |
%% Test if 2 strings are equal. | |
%% -spec equal(String1, String2) -> boolean() when | |
%% String1 :: string(), | |
%% String2 :: string(). | |
%% equal(S, S) -> true; | |
%% equal(_, _) -> false. | |
%% concat(String1, String2)
%%  Return String1 followed by String2.
-spec concat(String1, String2) -> String3 when
      String1 :: string(),
      String2 :: string(),
      String3 :: string().

concat(String1, String2) -> lists:append(String1, String2).
%% chr(String, Char)
%% rchr(String, Char)
%%  Return the 1-based index of the first (chr) or last (rchr)
%%  occurrence of the character in a string, or 0 when it is absent.
-spec chr(String, Character) -> Index when
      String :: string(),
      Character :: char(),
      Index :: non_neg_integer().

chr(String, Character) when is_integer(Character) ->
    chr(String, Character, 1).

%% Walk the string, counting positions from 1.
chr([], _Char, _Pos) -> 0;
chr([Char|_], Char, Pos) -> Pos;
chr([_|Rest], Char, Pos) -> chr(Rest, Char, Pos+1).
%% rchr(String, Char)
%%  Return the 1-based index of the last occurrence of the character in
%%  a string, or 0 when it is absent.
-spec rchr(String, Character) -> Index when
      String :: string(),
      Character :: char(),
      Index :: non_neg_integer().

rchr(String, Character) when is_integer(Character) ->
    rchr(String, Character, 1, 0).

%% Last holds the position of the latest occurrence seen so far.
rchr([], _Char, _Pos, Last) ->
    Last;
rchr([Char|Rest], Char, Pos, _Last) ->
    rchr(Rest, Char, Pos+1, Pos);
rchr([_|Rest], Char, Pos, Last) ->
    rchr(Rest, Char, Pos+1, Last).
%% str(String, SubString)
%% rstr(String, SubString)
%% index(String, SubString)
%%  Return the 1-based index of the first (str) or last (rstr)
%%  occurrence of the sub-string in a string, or 0 when it is absent.
%%  index/2 is kept for backwards compatibility.
-spec str(String, SubString) -> Index when
      String :: string(),
      SubString :: string(),
      Index :: non_neg_integer().

str(String, SubString) when is_list(SubString) ->
    str(String, SubString, 1).

str([C|Rest], [C|SubRest]=SubString, Pos) ->
    %% First character matches: compare the remaining characters.
    case l_prefix(SubRest, Rest) of
        false -> str(Rest, SubString, Pos+1);
        true -> Pos
    end;
str([_|Rest], SubString, Pos) ->
    str(Rest, SubString, Pos+1);
str([], _SubString, _Pos) ->
    0.
%% rstr(String, SubString)
%%  Return the 1-based index of the last occurrence of the sub-string in
%%  a string, or 0 when it is absent.
-spec rstr(String, SubString) -> Index when
      String :: string(),
      SubString :: string(),
      Index :: non_neg_integer().

rstr(String, SubString) when is_list(SubString) ->
    rstr(String, SubString, 1, 0).

%% Last holds the position of the latest match seen so far.
rstr([C|Rest], [C|SubRest]=SubString, Pos, Last) ->
    case l_prefix(SubRest, Rest) of
        false -> rstr(Rest, SubString, Pos+1, Last);
        true -> rstr(Rest, SubString, Pos+1, Pos)
    end;
rstr([_|Rest], SubString, Pos, Last) ->
    rstr(Rest, SubString, Pos+1, Last);
rstr([], _SubString, _Pos, Last) ->
    Last.
%% l_prefix(Prefix, String): true when the list Prefix is a prefix of the
%% list String, false otherwise.
l_prefix([], String) when is_list(String) ->
    true;
l_prefix([C|PrefixRest], [C|StringRest]) ->
    l_prefix(PrefixRest, StringRest);
l_prefix(Prefix, String) when is_list(Prefix), is_list(String) ->
    false.
%% span(String, Chars) -> Length.
%% cspan(String, Chars) -> Length.
%%  span returns the length of the leading part of String made up only
%%  of characters that are in Chars; cspan, of characters not in Chars.
-spec span(String, Chars) -> Length when
      String :: string(),
      Chars :: string(),
      Length :: non_neg_integer().

span(String, Chars) when is_list(Chars) ->
    span(String, Chars, 0).

span([], _Chars, Len) ->
    Len;
span([C|Rest], Chars, Len) ->
    case lists:member(C, Chars) of
        false -> Len;
        true -> span(Rest, Chars, Len+1)
    end.
%% cspan(String, Chars) -> Length.
%%  Return the length of the leading part of String that contains no
%%  character from Chars.
-spec cspan(String, Chars) -> Length when
      String :: string(),
      Chars :: string(),
      Length :: non_neg_integer().

cspan(String, Chars) when is_list(Chars) ->
    cspan(String, Chars, 0).

cspan([], _Chars, Len) ->
    Len;
cspan([C|Rest], Chars, Len) ->
    case lists:member(C, Chars) of
        false -> cspan(Rest, Chars, Len+1);
        true -> Len
    end.
%% substr(String, Start)
%% substr(String, Start, Length)
%%  Extract a sub-string from String, beginning at the 1-based position
%%  Start. Without Length the rest of the string is returned; with
%%  Length at most that many characters are returned.
-spec substr(String, Start) -> SubString when
      String :: string(),
      SubString :: string(),
      Start :: pos_integer().

substr(String, 1) when is_list(String) ->
    String;
substr(String, Start) when is_integer(Start), Start > 1 ->
    substr2(String, Start).

-spec substr(String, Start, Length) -> SubString when
      String :: string(),
      SubString :: string(),
      Start :: pos_integer(),
      Length :: non_neg_integer().

substr(String, Start, Length)
  when is_integer(Start), Start >= 1, is_integer(Length), Length >= 0 ->
    FromStart = substr2(String, Start),
    substr1(FromStart, Length).

%% Take at most Left characters from the front of the string. A string
%% shorter than requested is not an error (be nice).
substr1([C|Rest], Left) when Left > 0 ->
    [C|substr1(Rest, Left-1)];
substr1(String, _Left) when is_list(String) ->
    [].

%% Drop characters until the 1-based position Pos is at the front.
substr2(String, 1) when is_list(String) ->
    String;
substr2([_|Rest], Pos) ->
    substr2(Rest, Pos-1).
%% tokens(String, Separators).
%%  Return a list of tokens separated by characters in Separators.
%%  Empty tokens are never returned. The string is scanned in reverse,
%%  so both the token list and each token come out in the right order
%%  without a final reversal.
-spec tokens(String, SeparatorList) -> Tokens when
      String :: string(),
      SeparatorList :: string(),
      Tokens :: [Token :: nonempty_string()].

tokens(String, SeparatorList) ->
    case SeparatorList of
        [] ->
            %% No separators: the whole string is the only token.
            case String of
                [] -> [];
                [_|_] -> [String]
            end;
        [Sep] ->
            tokens_single_1(lists:reverse(String), Sep, []);
        [_|_] ->
            tokens_multiple_1(lists:reverse(String), SeparatorList, [])
    end.

%% Single separator, between tokens: skip separators.
tokens_single_1([], _Sep, Tokens) ->
    Tokens;
tokens_single_1([Sep|Rest], Sep, Tokens) ->
    tokens_single_1(Rest, Sep, Tokens);
tokens_single_1([C|Rest], Sep, Tokens) ->
    tokens_single_2(Rest, Sep, Tokens, [C]).

%% Single separator, inside a token: collect until the next separator.
tokens_single_2([], _Sep, Tokens, Token) ->
    [Token|Tokens];
tokens_single_2([Sep|Rest], Sep, Tokens, Token) ->
    tokens_single_1(Rest, Sep, [Token|Tokens]);
tokens_single_2([C|Rest], Sep, Tokens, Token) ->
    tokens_single_2(Rest, Sep, Tokens, [C|Token]).

%% Several separators, between tokens: skip separators.
tokens_multiple_1([], _Seps, Tokens) ->
    Tokens;
tokens_multiple_1([C|Rest], Seps, Tokens) ->
    case lists:member(C, Seps) of
        false -> tokens_multiple_2(Rest, Seps, Tokens, [C]);
        true -> tokens_multiple_1(Rest, Seps, Tokens)
    end.

%% Several separators, inside a token: collect until the next separator.
tokens_multiple_2([], _Seps, Tokens, Token) ->
    [Token|Tokens];
tokens_multiple_2([C|Rest], Seps, Tokens, Token) ->
    case lists:member(C, Seps) of
        false -> tokens_multiple_2(Rest, Seps, Tokens, [C|Token]);
        true -> tokens_multiple_1(Rest, Seps, [Token|Tokens])
    end.
%% chars(Character, Number)
%% chars(Character, Number, Tail)
%%  Return a string of Number copies of Character, optionally followed
%%  by Tail.
-spec chars(Character, Number) -> String when
      Character :: char(),
      Number :: non_neg_integer(),
      String :: string().

chars(Character, Number) -> chars(Character, Number, []).

-spec chars(Character, Number, Tail) -> String when
      Character :: char(),
      Number :: non_neg_integer(),
      Tail :: string(),
      String :: string().

chars(Character, Left, Acc) when is_integer(Left), Left > 0 ->
    chars(Character, Left-1, [Character|Acc]);
chars(Character, 0, Acc) when is_integer(Character) ->
    Acc.
%% Torbjörn's bit. | |
%%% COPIES %%% | |
%% copies(String, Number)
%%  Return String repeated Number times.
-spec copies(String, Number) -> Copies when
      String :: string(),
      Copies :: string(),
      Number :: non_neg_integer().

copies(String, Number)
  when is_list(String), is_integer(Number), Number >= 0 ->
    copies(String, Number, []).

%% Prepend one copy per step until none are left.
copies(_String, 0, Acc) ->
    Acc;
copies(String, Left, Acc) ->
    copies(String, Left-1, String++Acc).
%%% WORDS %%% | |
%% words(String)
%% words(String, Character)
%%  Count the words in String, where words are separated by Character
%%  (a blank by default). Leading and trailing separators are stripped
%%  first; an empty string still counts as one word.
-spec words(String) -> Count when
      String :: string(),
      Count :: pos_integer().

words(String) -> words(String, $\s).

-spec words(String, Character) -> Count when
      String :: string(),
      Character :: char(),
      Count :: pos_integer().

words(String, Character) when is_integer(Character) ->
    Stripped = strip(String, both, Character),
    w_count(Stripped, Character, 0).

%% Count one word per run of separators, plus one for the last word.
w_count([], _Sep, Count) ->
    Count+1;
w_count([Sep|Rest], Sep, Count) ->
    w_count(strip(Rest, left, Sep), Sep, Count+1);
w_count([_|Rest], Sep, Count) ->
    w_count(Rest, Sep, Count).
%%% SUB_WORDS %%% | |
%% sub_word(String, Number)
%% sub_word(String, Number, Character)
%%  Return word number Number of String, where words are separated by
%%  Character (a blank by default). Returns [] when String has fewer
%%  than Number words.
-spec sub_word(String, Number) -> Word when
      String :: string(),
      Word :: string(),
      Number :: integer().

sub_word(String, Number) -> sub_word(String, Number, $\s).

-spec sub_word(String, Number, Character) -> Word when
      String :: string(),
      Word :: string(),
      Number :: integer(),
      Character :: char().

sub_word(String, Number, Character)
  when is_integer(Number), is_integer(Character) ->
    case words(String, Character) < Number of
        true ->
            [];
        false ->
            Stripped = strip(String, left, Character),
            s_word(Stripped, Number, Character, 1, [])
    end.

%% s_word(String, Wanted, Sep, Current, Acc): Current is the number of
%% the word being scanned; the characters of word number Wanted are
%% collected (reversed) in Acc.
s_word([], _, _, _, Acc) ->
    lists:reverse(Acc);
s_word([Sep|_], Wanted, Sep, Wanted, Acc) ->
    lists:reverse(Acc);
s_word([C|Rest], Wanted, Sep, Wanted, Acc) ->
    s_word(Rest, Wanted, Sep, Wanted, [C|Acc]);
s_word([Sep|Rest], Wanted, Sep, Current, Acc) when Current < Wanted ->
    s_word(strip(Rest, left, Sep), Wanted, Sep, Current+1, Acc);
s_word([_|Rest], Wanted, Sep, Current, Acc) when Current < Wanted ->
    s_word(Rest, Wanted, Sep, Current, Acc).
%%% STRIP %%% | |
%% Removes leading and trailing blanks from the string
%% (delegates to strip/2 with direction 'both').
-spec strip(string()) -> string().

strip(Str) ->
    strip(Str, both).
%% Removes space characters from the start ('left'), the end ('right')
%% or both ends ('both') of String.
-spec strip(String, Direction) -> Stripped when
      String :: string(),
      Stripped :: string(),
      Direction :: 'left' | 'right' | 'both'.

strip(String, both) ->
    LeftTrimmed = strip_left(String, $\s),
    strip_right(LeftTrimmed, $\s);
strip(String, left) ->
    strip_left(String, $\s);
strip(String, right) ->
    strip_right(String, $\s).
%% Removes occurrences of Character from the start ('left'), the end
%% ('right') or both ends ('both') of String.
-spec strip(String, Direction, Character) -> Stripped when
      String :: string(),
      Stripped :: string(),
      Direction :: 'left' | 'right' | 'both',
      Character :: char().

strip(String, both, Char) ->
    LeftTrimmed = strip_left(String, Char),
    strip_right(LeftTrimmed, Char);
strip(String, left, Char) ->
    strip_left(String, Char);
strip(String, right, Char) ->
    strip_right(String, Char).
%% Drops every leading occurrence of Char from the list and returns
%% the remainder unchanged. Char is checked to be an integer only at
%% the point where the scan stops.
strip_left([], Char) when is_integer(Char) ->
    [];
strip_left([Char | Rest], Char) ->
    strip_left(Rest, Char);
strip_left([_ | _] = Str, Char) when is_integer(Char) ->
    Str.
%% Drops every trailing occurrence of Char from the list.
%% The list is rebuilt on the way back from the recursion: an
%% occurrence of Char is kept only if something non-empty survives to
%% its right, i.e. only if it is not part of the trailing run.
strip_right([], Char) when is_integer(Char) ->
    [];
strip_right([Char | Rest], Char) ->
    case strip_right(Rest, Char) of
        [] -> [];
        Kept -> [Char | Kept]
    end;
strip_right([Other | Rest], Char) ->
    [Other | strip_right(Rest, Char)].
%%% LEFT %%% | |
%% Adjusts String to exactly Number characters, keeping its left part;
%% the space character is used for padding (delegates to left/3).
-spec left(String, Number) -> Left when
      String :: string(),
      Left :: string(),
      Number :: non_neg_integer().

left(Str, Width) when is_integer(Width) ->
    left(Str, Width, $\s).
%% Adjusts String to exactly Number characters, keeping its left part.
%% A string of the right length is returned as is; a longer one is cut
%% via substr/3 starting at position 1; a shorter one gets the missing
%% number of Character appended on the right (see l_pad/3).
-spec left(String, Number, Character) -> Left when
      String :: string(),
      Left :: string(),
      Number :: non_neg_integer(),
      Character :: char().

left(String, Len, Char) when is_integer(Len), is_integer(Char) ->
    %% erlang:length/1 is called fully qualified, as in the original.
    case erlang:length(String) of
        Len ->
            String;
        Slen when Slen > Len ->
            substr(String, 1, Len);
        Slen ->
            l_pad(String, Len - Slen, Char)
    end.
%% Appends Count copies of Char after Str (used to left-justify).
l_pad(Str, Count, Char) ->
    Str ++ chars(Char, Count).
%%% RIGHT %%% | |
%% Adjusts String to exactly Number characters, keeping its right part;
%% the space character is used for padding (delegates to right/3).
-spec right(String, Number) -> Right when
      String :: string(),
      Right :: string(),
      Number :: non_neg_integer().

right(Str, Width) when is_integer(Width) ->
    right(Str, Width, $\s).
%% Adjusts String to exactly Number characters, keeping its right part.
%% A string of the right length is returned as is; a longer one is
%% handed to substr/2 with the start position Slen-Len+1; a shorter one
%% gets the missing number of Character put in front (see r_pad/3).
-spec right(String, Number, Character) -> Right when
      String :: string(),
      Right :: string(),
      Number :: non_neg_integer(),
      Character :: char().

right(String, Len, Char) when is_integer(Len), is_integer(Char) ->
    %% erlang:length/1 is called fully qualified, as in the original.
    case erlang:length(String) of
        Len ->
            String;
        Slen when Slen > Len ->
            substr(String, Slen - Len + 1);
        Slen ->
            r_pad(String, Len - Slen, Char)
    end.
%% Puts Count copies of Char in front of Str (used to right-justify).
r_pad(Str, Count, Char) ->
    chars(Char, Count, Str).
%%% CENTRE %%% | |
%% Centres String in a field of Number characters, using the space
%% character for padding (delegates to centre/3).
-spec centre(String, Number) -> Centered when
      String :: string(),
      Centered :: string(),
      Number :: non_neg_integer().

centre(Str, Width) when is_integer(Width) ->
    centre(Str, Width, $\s).
%% Centres String in a field of Number characters.
%%  - Width 0 with a list argument always gives [].
%%  - A string of the right length is returned as is.
%%  - A longer string is cut via substr/3, skipping
%%    (Slen-Len) div 2 characters at the front.
%%  - A shorter string is padded with Character on both sides; when
%%    the padding cannot be split evenly the extra character goes on
%%    the right-hand side.
-spec centre(String, Number, Character) -> Centered when
      String :: string(),
      Centered :: string(),
      Number :: non_neg_integer(),
      Character :: char().

centre(String, 0, Char) when is_list(String), is_integer(Char) ->
    [];                                  % Strange cases to centre string
centre(String, Len, Char) when is_integer(Len), is_integer(Char) ->
    case erlang:length(String) of
        Len ->
            String;
        Slen when Slen > Len ->
            Skip = (Slen - Len) div 2,
            substr(String, Skip + 1, Len);
        Slen ->
            Front = (Len - Slen) div 2,
            Back = Len - (Slen + Front),
            %% l_pad/3 appends after the string, r_pad/3 prepends.
            r_pad(l_pad(String, Back, Char), Front, Char)
    end.
%%% SUB_STRING %%% | |
%% Returns the part of String that begins at position Start; this is
%% a plain alias for substr/2.
-spec sub_string(String, Start) -> SubString when
      String :: string(),
      SubString :: string(),
      Start :: pos_integer().

sub_string(Str, From) ->
    substr(Str, From).
%% Returns the part of String between positions Start and Stop.
%% The inclusive Stop position is converted into the length argument
%% expected by substr/3.
-spec sub_string(String, Start, Stop) -> SubString when
      String :: string(),
      SubString :: string(),
      Start :: pos_integer(),
      Stop :: pos_integer().

sub_string(Str, From, To) when is_integer(From), is_integer(To) ->
    Length = To - From + 1,
    substr(Str, From, Length).
%% ISO/IEC 8859-1 (latin1) letters are converted, others are ignored
%%
%% Lower-cases a single character. The converted ranges are A-Z,
%% 16#C0-16#D6 and 16#D8-16#DE (16#D7 is deliberately left out);
%% in all of them the lower-case code is the upper-case code plus 32.
%% Every other term is returned unchanged.
to_lower_char(Ch) when is_integer(Ch), $A =< Ch, Ch =< $Z;
                       is_integer(Ch), 16#C0 =< Ch, Ch =< 16#D6;
                       is_integer(Ch), 16#D8 =< Ch, Ch =< 16#DE ->
    Ch + 32;
to_lower_char(Other) ->
    Other.
%% Upper-cases a single ISO/IEC 8859-1 (latin1) character.
%% The converted ranges are a-z, 16#E0-16#F6 and 16#F8-16#FE (16#F7 is
%% deliberately left out); in all of them the upper-case code is the
%% lower-case code minus 32. Every other term is returned unchanged.
to_upper_char(Ch) when is_integer(Ch), $a =< Ch, Ch =< $z;
                       is_integer(Ch), 16#E0 =< Ch, Ch =< 16#F6;
                       is_integer(Ch), 16#F8 =< Ch, Ch =< 16#FE ->
    Ch - 32;
to_upper_char(Other) ->
    Other.
%% Lower-cases either a whole latin1 string (list argument) or a
%% single character (integer argument) by applying to_lower_char/1.
-spec to_lower(String) -> Result when
                  String :: io_lib:latin1_string(),
                  Result :: io_lib:latin1_string()
            ; (Char) -> CharResult when
                  Char :: char(),
                  CharResult :: char().

to_lower(Char) when is_integer(Char) ->
    to_lower_char(Char);
to_lower(Str) when is_list(Str) ->
    [to_lower_char(Ch) || Ch <- Str].
%% Upper-cases either a whole latin1 string (list argument) or a
%% single character (integer argument) by applying to_upper_char/1.
-spec to_upper(String) -> Result when
                  String :: io_lib:latin1_string(),
                  Result :: io_lib:latin1_string()
            ; (Char) -> CharResult when
                  Char :: char(),
                  CharResult :: char().

to_upper(Char) when is_integer(Char) ->
    to_upper_char(Char);
to_upper(Str) when is_list(Str) ->
    [to_upper_char(Ch) || Ch <- Str].
%% Concatenates the strings in StringList, inserting Separator between
%% consecutive elements. An empty list gives an empty string.
-spec join(StringList, Separator) -> String when
      StringList :: [string()],
      Separator :: string(),
      String :: string().

join([], Sep) when is_list(Sep) ->
    [];
join([First | Rest], Sep) ->
    %% Every element after the first gets the separator glued in front
    %% of it; the pieces are then flattened one level and appended to
    %% the first element.
    Prefixed = [Sep ++ Elem || Elem <- Rest],
    First ++ lists:append(Prefixed).