Permalink
Browse files

Allow noncharacter code points in unicode encoding and decoding

The two noncharacter code points 16#FFFE and 16#FFFF were not
allowed to be encoded or decoded using the unicode module or
bit syntax. That causes an inconsistency, since the noncharacters
16#FDD0 to 16#FDEF could be encoded/decoded.

There is two ways to fix that inconsistency.

We have chosen to allow 16#FFFE and 16#FFFF to be encoded and
decoded, because the noncharacters could be useful internally
within an application and it will make encoding and decoding
slightly faster.

Reported-by: Alisdair Sullivan
  • Loading branch information...
1 parent 6ca6dd3 commit 34db76765561487e526fe66d3d19ecf3b3fb9dc8 @bjorng bjorng committed Aug 30, 2011
@@ -3967,8 +3967,7 @@ void process_main(void)
* too big numbers).
*/
if (is_not_small(val) || val > make_small(0x10FFFFUL) ||
- (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL)) ||
- val == make_small(0xFFFEUL) || val == make_small(0xFFFFUL)) {
+ (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL))) {
goto badarg;
}
Next(2);
@@ -3987,8 +3986,8 @@ void process_main(void)
* the valid range).
*/
if (is_not_small(tmp_arg1) || tmp_arg1 > make_small(0x10FFFFUL) ||
- (make_small(0xD800UL) <= tmp_arg1 && tmp_arg1 <= make_small(0xDFFFUL)) ||
- tmp_arg1 == make_small(0xFFFEUL) || tmp_arg1 == make_small(0xFFFFUL)) {
+ (make_small(0xD800UL) <= tmp_arg1 &&
+ tmp_arg1 <= make_small(0xDFFFUL))) {
ErlBinMatchBuffer *mb = ms_matchbuffer(tmp_arg2);
mb->offset -= 32;
@@ -845,8 +845,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
dst[1] = 0x80 | (val & 0x3F);
num_bits = 16;
} else if (val < 0x10000UL) {
- if ((0xD800 <= val && val <= 0xDFFF) ||
- val == 0xFFFE || val == 0xFFFF) {
+ if (0xD800 <= val && val <= 0xDFFF) {
return 0;
}
dst[0] = 0xE0 | (val >> 12);
@@ -886,8 +885,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
return 0;
}
val = unsigned_val(arg);
- if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF) ||
- val == 0xFFFE || val == 0xFFFF) {
+ if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF)) {
return 0;
}
@@ -1652,8 +1650,7 @@ erts_bs_get_utf8(ErlBinMatchBuffer* mb)
return THE_NON_VALUE;
}
result = (((result << 6) + a) << 6) + b - (Eterm) 0x000E2080UL;
- if ((0xD800 <= result && result <= 0xDFFF) ||
- result == 0xFFFE || result == 0xFFFF) {
+ if (0xD800 <= result && result <= 0xDFFF) {
return THE_NON_VALUE;
}
mb->offset += 24;
@@ -1723,9 +1720,6 @@ erts_bs_get_utf16(ErlBinMatchBuffer* mb, Uint flags)
w1 = (src[0] << 8) | src[1];
}
if (w1 < 0xD800 || w1 > 0xDFFF) {
- if (w1 == 0xFFFE || w1 == 0xFFFF) {
- return THE_NON_VALUE;
- }
mb->offset += 16;
return make_small(w1);
} else if (w1 > 0xDBFF) {
@@ -348,12 +348,6 @@ static int copy_utf8_bin(byte *target, byte *source, Uint size,
return copied;
}
- if (((*source) == 0xEF) && (source[1] == 0xBF) &&
- ((source[2] == 0xBE) || (source[2] == 0xBF))) {
- *err_pos = source;
- return copied;
- }
-
*(target++) = *(source++);
*(target++) = *(source++);
*(target++) = *(source++);
@@ -714,9 +708,8 @@ static Eterm do_build_utf8(Process *p, Eterm ioterm, int *left, int latin1,
target[(*pos)++] = (((byte) (x & 0x3F)) |
((byte) 0x80));
} else if (x < 0x10000) {
- if ((x >= 0xD800 && x <= 0xDFFF) ||
- (x == 0xFFFE) ||
- (x == 0xFFFF)) { /* Invalid unicode range */
+ if (x >= 0xD800 && x <= 0xDFFF) {
+ /* Invalid unicode range */
*err = 1;
goto done;
}
@@ -1230,10 +1223,6 @@ int erts_analyze_utf8(byte *source, Uint size,
((source[1] & 0x20) != 0)) {
return ERTS_UTF8_ERROR;
}
- if (((*source) == 0xEF) && (source[1] == 0xBF) &&
- ((source[2] == 0xBE) || (source[2] == 0xBF))) {
- return ERTS_UTF8_ERROR;
- }
source += 3;
size -= 3;
} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
@@ -2166,9 +2155,8 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
} else if (x < 0x800) {
need += 2;
} else if (x < 0x10000) {
- if ((x >= 0xD800 && x <= 0xDFFF) ||
- (x == 0xFFFE) ||
- (x == 0xFFFF)) { /* Invalid unicode range */
+ if (x >= 0xD800 && x <= 0xDFFF) {
+ /* Invalid unicode range */
DESTROY_ESTACK(stack);
return ((Sint) -1);
}
@@ -2314,9 +2302,7 @@ void erts_native_filename_put(Eterm ioterm, int encoding, byte *p)
*p++ = (((byte) (x & 0x3F)) |
((byte) 0x80));
} else if (x < 0x10000) {
- ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
- (x == 0xFFFE) ||
- (x == 0xFFFF)));
+ ASSERT(!(x >= 0xD800 && x <= 0xDFFF));
*p++ = (((byte) (x >> 12)) |
((byte) 0xE0));
*p++ = ((((byte) (x >> 6)) & 0x3F) |
@@ -64,8 +64,7 @@ end_per_group(_GroupName, Config) ->
utf8_roundtrip(Config) when is_list(Config) ->
?line utf8_roundtrip(0, 16#D7FF),
- ?line utf8_roundtrip(16#E000, 16#FFFD),
- ?line utf8_roundtrip(16#10000, 16#10FFFF),
+ ?line utf8_roundtrip(16#E000, 16#10FFFF),
ok.
utf8_roundtrip(First, Last) when First =< Last ->
@@ -91,8 +90,7 @@ utf16_roundtrip(Config) when is_list(Config) ->
do_utf16_roundtrip(Fun) ->
do_utf16_roundtrip(0, 16#D7FF, Fun),
- do_utf16_roundtrip(16#E000, 16#FFFD, Fun),
- do_utf16_roundtrip(16#10000, 16#10FFFF, Fun).
+ do_utf16_roundtrip(16#E000, 16#10FFFF, Fun).
do_utf16_roundtrip(First, Last, Fun) when First =< Last ->
Fun(First),
@@ -129,8 +127,7 @@ utf32_roundtrip(Config) when is_list(Config) ->
do_utf32_roundtrip(Fun) ->
do_utf32_roundtrip(0, 16#D7FF, Fun),
- do_utf32_roundtrip(16#E000, 16#FFFD, Fun),
- do_utf32_roundtrip(16#10000, 16#10FFFF, Fun).
+ do_utf32_roundtrip(16#E000, 16#10FFFF, Fun).
do_utf32_roundtrip(First, Last, Fun) when First =< Last ->
Fun(First),
@@ -158,7 +155,6 @@ utf32_little_roundtrip(Char) ->
utf8_illegal_sequences(Config) when is_list(Config) ->
?line fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
?line fail_range(16#D800, 16#DFFF), %Reserved for UTF-16.
- ?line fail_range(16#FFFE, 16#FFFF), %Non-characters.
%% Illegal first character.
?line [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)],
@@ -251,7 +247,6 @@ fail_1(_) -> ok.
utf16_illegal_sequences(Config) when is_list(Config) ->
?line utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
?line utf16_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16.
- ?line utf16_fail_range(16#FFFE, 16#FFFF), %Non-characters.
?line lonely_hi_surrogate(16#D800, 16#DFFF),
?line leading_lo_surrogate(16#DC00, 16#DFFF),
@@ -300,7 +295,6 @@ leading_lo_surrogate(_, _, _) -> ok.
utf32_illegal_sequences(Config) when is_list(Config) ->
?line utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
?line utf32_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16.
- ?line utf32_fail_range(16#FFFE, 16#FFFF), %Non-characters.
?line utf32_fail_range(-100, -1),
ok.
@@ -264,18 +264,10 @@ literals(Config) when is_list(Config) ->
?line {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf8,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf8,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf16,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/little-utf16,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf16,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf16,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,I/utf8>>),
?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf32,I/utf8>>),
- ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf32,I/utf8>>),
B = 16#10FFFF+1,
?line {'EXIT',{badarg,_}} = (catch <<B/utf8>>),
@@ -286,20 +278,11 @@ literals(Config) when is_list(Config) ->
%% Matching of bad literals.
?line error = bad_literal_match(<<237,160,128>>), %16#D800 in UTF-8
- ?line error = bad_literal_match(<<239,191,190>>), %16#FFFE in UTF-8
- ?line error = bad_literal_match(<<239,191,191>>), %16#FFFF in UTF-8
?line error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8
- ?line error = bad_literal_match(<<255,254>>), %16#FFFE in UTF-16
- ?line error = bad_literal_match(<<255,255>>), %16#FFFF in UTF-16
-
?line error = bad_literal_match(<<16#D800:32>>),
- ?line error = bad_literal_match(<<16#FFFE:32>>),
- ?line error = bad_literal_match(<<16#FFFF:32>>),
?line error = bad_literal_match(<<16#110000:32>>),
?line error = bad_literal_match(<<16#D800:32/little>>),
- ?line error = bad_literal_match(<<16#FFFE:32/little>>),
- ?line error = bad_literal_match(<<16#FFFF:32/little>>),
?line error = bad_literal_match(<<16#110000:32/little>>),
ok.
@@ -314,11 +297,7 @@ match_literal(<<"bj\366rn"/big-utf16>>) -> bjorn_utf16be;
match_literal(<<"bj\366rn"/little-utf16>>) -> bjorn_utf16le.
bad_literal_match(<<16#D800/utf8>>) -> ok;
-bad_literal_match(<<16#FFFE/utf8>>) -> ok;
-bad_literal_match(<<16#FFFF/utf8>>) -> ok;
bad_literal_match(<<16#110000/utf8>>) -> ok;
-bad_literal_match(<<16#FFFE/utf16>>) -> ok;
-bad_literal_match(<<16#FFFF/utf16>>) -> ok;
bad_literal_match(<<16#D800/utf32>>) -> ok;
bad_literal_match(<<16#110000/utf32>>) -> ok;
bad_literal_match(<<16#D800/little-utf32>>) -> ok;
@@ -166,7 +166,7 @@ protected int doHashCode() {
/**
* Validate a code point according to Erlang definition; Unicode 3.0.
* That is; valid in the range U+0..U+10FFFF, but not in the range
- * U+D800..U+DFFF (surrogat pairs), nor U+FFFE..U+FFFF (non-characters).
+ * U+D800..U+DFFF (surrogat pairs).
*
* @param cp
* the code point value to validate
@@ -179,8 +179,7 @@ public static boolean isValidCodePoint(final int cp) {
// Erlang definition of valid Unicode code points;
// Unicode 3.0, XML, et.al.
return (cp>>>16) <= 0x10 // in 0..10FFFF; Unicode range
- && (cp & ~0x7FF) != 0xD800 // not in D800..DFFF; surrogate range
- && (cp & ~1) != 0xFFFE; // not in FFFE..FFFF; non-characters
+ && (cp & ~0x7FF) != 0xD800; // not in D800..DFFF; surrogate range
}
/**
@@ -203,8 +203,7 @@
<item>greater than <c>16#10FFFF</c>
(the maximum unicode character),</item>
<item>in the range <c>16#D800</c> to <c>16#DFFF</c>
- (invalid unicode range)</item>
- <item>or equal to 16#FFFE or 16#FFFF (non characters)</item>
+ (invalid range reserved for UTF-16 surrogate pairs)</item>
</list>
is found.
</item>
@@ -322,7 +322,7 @@ roundtrips(Config) when is_list(Config) ->
ex_roundtrips(Config) when is_list(Config) ->
?line L1 = ranges(0, 16#D800 - 1,
erlang:system_info(context_reductions) * 11),
- ?line L2 = ranges(16#DFFF + 1, 16#FFFE - 1,
+ ?line L2 = ranges(16#DFFF + 1, 16#10000 - 1,
erlang:system_info(context_reductions) * 11),
%?line L3 = ranges(16#FFFF + 1, 16#10FFFF,
% erlang:system_info(context_reductions) * 11),
@@ -569,7 +569,6 @@ utf16_illegal_sequences_bif(Config) when is_list(Config) ->
ex_utf16_illegal_sequences_bif(Config) when is_list(Config) ->
?line utf16_fail_range_bif_simple(16#10FFFF+1, 16#10FFFF+512), %Too large.
?line utf16_fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16.
- ?line utf16_fail_range_bif(16#FFFE, 16#FFFF), %Non-characters.
?line lonely_hi_surrogate_bif(16#D800, 16#DBFF,incomplete),
?line lonely_hi_surrogate_bif(16#DC00, 16#DFFF,error),
@@ -644,7 +643,6 @@ utf8_illegal_sequences_bif(Config) when is_list(Config) ->
ex_utf8_illegal_sequences_bif(Config) when is_list(Config) ->
?line fail_range_bif(16#10FFFF+1, 16#10FFFF+512), %Too large.
?line fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16.
- ?line fail_range_bif(16#FFFE, 16#FFFF), %Reserved (BOM).
%% Illegal first character.
?line [fail_bif(<<I,16#8F,16#8F,16#8F>>,unicode) || I <- lists:seq(16#80, 16#BF)],
@@ -879,9 +879,8 @@ Ei = Value |
and UTF-32, respectively.</p>
<p>When constructing a segment of a <c>utf</c> type, <c>Value</c>
- must be an integer in one of the ranges 0..16#D7FF,
- 16#E000..16#FFFD, or 16#10000..16#10FFFF
- (i.e. a valid Unicode code point). Construction
+ must be an integer in the range 0..16#D7FF or
+ 16#E000....16#10FFFF. Construction
will fail with a <c>badarg</c> exception if <c>Value</c> is
outside the allowed ranges. The size of the resulting binary
segment depends on the type and/or <c>Value</c>. For <c>utf8</c>,
@@ -896,14 +895,13 @@ Ei = Value |
<c><![CDATA[<<$a/utf8,$b/utf8,$c/utf8>>]]></c>.</p>
<p>A successful match of a segment of a <c>utf</c> type results
- in an integer in one of the ranges 0..16#D7FF, 16#E000..16#FFFD,
- or 16#10000..16#10FFFF
- (i.e. a valid Unicode code point). The match will fail if returned value
+ in an integer in the range 0..16#D7FF or 16#E000..16#10FFFF.
+ The match will fail if returned value
would fall outside those ranges.</p>
<p>A segment of type <c>utf8</c> will match 1 to 4 bytes in the binary,
if the binary at the match position contains a valid UTF-8 sequence.
- (See RFC-2279 or the Unicode standard.)</p>
+ (See RFC-3629 or the Unicode standard.)</p>
<p>A segment of type <c>utf16</c> may match 2 or 4 bytes in the binary.
The match will fail if the binary at the match position does not contain

0 comments on commit 34db767

Please sign in to comment.