/
Utf8Test.java
348 lines (313 loc) · 12.7 KB
/
Utf8Test.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/*
* Copyright (C) 2013 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.common.base;
import static com.google.common.truth.Truth.assertThat;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.MAX_HIGH_SURROGATE;
import static java.lang.Character.MAX_LOW_SURROGATE;
import static java.lang.Character.MIN_HIGH_SURROGATE;
import static java.lang.Character.MIN_LOW_SURROGATE;
import static java.lang.Character.MIN_SUPPLEMENTARY_CODE_POINT;
import com.google.common.annotations.GwtCompatible;
import com.google.common.annotations.GwtIncompatible;
import com.google.common.collect.ImmutableList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Random;
import junit.framework.TestCase;
/**
* Unit tests for {@link Utf8}.
*
* @author Jon Perlow
* @author Martin Buchholz
* @author Clément Roux
*/
@GwtCompatible(emulated = true)
public class Utf8Test extends TestCase {
private static final ImmutableList<String> ILL_FORMED_STRINGS;
static {
ImmutableList.Builder<String> builder = ImmutableList.builder();
char[] surrogates = {
MAX_LOW_SURROGATE, MAX_HIGH_SURROGATE, MIN_LOW_SURROGATE, MIN_HIGH_SURROGATE,
};
for (char surrogate : surrogates) {
builder.add(newString(surrogate));
builder.add(newString(surrogate, 'n'));
builder.add(newString('n', surrogate));
builder.add(newString(surrogate, surrogate));
}
builder.add(newString(MIN_LOW_SURROGATE, MAX_HIGH_SURROGATE));
ILL_FORMED_STRINGS = builder.build();
}
public void testEncodedLength_validStrings() {
assertEquals(0, Utf8.encodedLength(""));
assertEquals(11, Utf8.encodedLength("Hello world"));
assertEquals(8, Utf8.encodedLength("Résumé"));
assertEquals(
461,
Utf8.encodedLength(
"威廉·莎士比亞(William Shakespeare,"
+ "1564年4月26號—1616年4月23號[1])係隻英國嗰演員、劇作家同詩人,"
+ "有時間佢簡稱莎翁;中國清末民初哈拕翻譯做舌克斯毕、沙斯皮耳、筛斯比耳、"
+ "莎基斯庇尔、索士比尔、夏克思芘尔、希哀苦皮阿、叶斯壁、沙克皮尔、"
+ "狹斯丕爾。[2]莎士比亞編寫過好多作品,佢嗰劇作響西洋文學好有影響,"
+ "哈都拕人翻譯做好多話。"));
// A surrogate pair
assertEquals(4, Utf8.encodedLength(newString(MIN_HIGH_SURROGATE, MIN_LOW_SURROGATE)));
}
public void testEncodedLength_validStrings2() {
HashMap<Integer, Integer> utf8Lengths = new HashMap<>();
utf8Lengths.put(0x00, 1);
utf8Lengths.put(0x7f, 1);
utf8Lengths.put(0x80, 2);
utf8Lengths.put(0x7ff, 2);
utf8Lengths.put(0x800, 3);
utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT - 1, 3);
utf8Lengths.put(MIN_SUPPLEMENTARY_CODE_POINT, 4);
utf8Lengths.put(MAX_CODE_POINT, 4);
Integer[] codePoints = utf8Lengths.keySet().toArray(new Integer[] {});
StringBuilder sb = new StringBuilder();
Random rnd = new Random();
for (int trial = 0; trial < 100; trial++) {
sb.setLength(0);
int utf8Length = 0;
for (int i = 0; i < 6; i++) {
Integer randomCodePoint = codePoints[rnd.nextInt(codePoints.length)];
sb.appendCodePoint(randomCodePoint);
utf8Length += utf8Lengths.get(randomCodePoint);
if (utf8Length != Utf8.encodedLength(sb)) {
StringBuilder repro = new StringBuilder();
for (int j = 0; j < sb.length(); j++) {
repro.append(" ").append((int) sb.charAt(j)); // GWT compatible
}
assertEquals(repro.toString(), utf8Length, Utf8.encodedLength(sb));
}
}
}
}
public void testEncodedLength_invalidStrings() {
testEncodedLengthFails(newString(MIN_HIGH_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(MIN_HIGH_SURROGATE), 6);
testEncodedLengthFails(newString(MIN_LOW_SURROGATE), 0);
testEncodedLengthFails("foobar" + newString(MIN_LOW_SURROGATE), 6);
testEncodedLengthFails(newString(MIN_HIGH_SURROGATE, MIN_HIGH_SURROGATE), 0);
}
private static void testEncodedLengthFails(String invalidString, int invalidCodePointIndex) {
try {
Utf8.encodedLength(invalidString);
fail();
} catch (IllegalArgumentException expected) {
assertThat(expected)
.hasMessageThat()
.isEqualTo("Unpaired surrogate at index " + invalidCodePointIndex);
}
}
// 128 - [chars 0x0000 to 0x007f]
private static final long ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x007f - 0x0000 + 1;
// 128
private static final long EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT =
ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1920 [chars 0x0080 to 0x07FF]
private static final long TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x07FF - 0x0080 + 1;
// 18,304
private static final long EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT =
// Both bytes are one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 2)
+
// The possible number of two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 2048
private static final long THREE_BYTE_SURROGATES = 2 * 1024;
// 61,440 [chars 0x0800 to 0xFFFF, minus surrogates]
private static final long THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS =
0xFFFF - 0x0800 + 1 - THREE_BYTE_SURROGATES;
// 2,650,112
private static final long EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 3)
+
// One two byte character and a one byte character
2 * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Three byte characters
THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS;
// 1,048,576 [chars 0x10000L to 0x10FFFF]
private static final long FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS = 0x10FFFF - 0x10000L + 1;
// 289,571,839
private static final long EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT =
// All one byte characters
(long) Math.pow(EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT, 4)
+
// One and three byte characters
2 * THREE_BYTE_ROUNDTRIPPABLE_CHARACTERS * ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Two two byte characters
TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS * TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Permutations of one and two byte characters
3
* TWO_BYTE_ROUNDTRIPPABLE_CHARACTERS
* ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
* ONE_BYTE_ROUNDTRIPPABLE_CHARACTERS
+
// Four byte characters
FOUR_BYTE_ROUNDTRIPPABLE_CHARACTERS;
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_1Byte() {
testBytes(1, EXPECTED_ONE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all two byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_2Bytes() {
testBytes(2, EXPECTED_TWO_BYTE_ROUNDTRIPPABLE_COUNT);
}
/** Tests that round tripping of all three byte permutations work. */
@GwtIncompatible // java.nio.charset.Charset
public void testIsWellFormed_3Bytes() {
testBytes(3, EXPECTED_THREE_BYTE_ROUNDTRIPPABLE_COUNT);
}
/**
* Tests that round tripping of a sample of four byte permutations work. All permutations are
* prohibitively expensive to test for automated runs. This method tests specific four-byte cases.
*/
public void testIsWellFormed_4BytesSamples() {
// Valid 4 byte.
assertWellFormed(0xF0, 0xA4, 0xAD, 0xA2);
// Bad trailing bytes
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0x7F);
assertNotWellFormed(0xF0, 0xA4, 0xAD, 0xC0);
// Special cases for byte2
assertNotWellFormed(0xF0, 0x8F, 0xAD, 0xA2);
assertNotWellFormed(0xF4, 0x90, 0xAD, 0xA2);
}
/** Tests some hard-coded test cases. */
public void testSomeSequences() {
// Empty
assertWellFormed();
// One-byte characters, including control characters
assertWellFormed(0x00, 0x61, 0x62, 0x63, 0x7F); // "\u0000abc\u007f"
// Two-byte characters
assertWellFormed(0xC2, 0xA2, 0xC2, 0xA2); // "\u00a2\u00a2"
// Three-byte characters
assertWellFormed(0xc8, 0x8a, 0x63, 0xc8, 0x8a, 0x63); // "\u020ac\u020ac"
// Four-byte characters
// "\u024B62\u024B62"
assertWellFormed(0xc9, 0x8b, 0x36, 0x32, 0xc9, 0x8b, 0x36, 0x32);
// Mixed string
// "a\u020ac\u00a2b\\u024B62u020acc\u00a2de\u024B62"
assertWellFormed(
0x61, 0xc8, 0x8a, 0x63, 0xc2, 0xa2, 0x62, 0x5c, 0x75, 0x30, 0x32, 0x34, 0x42, 0x36, 0x32,
0x75, 0x30, 0x32, 0x30, 0x61, 0x63, 0x63, 0xc2, 0xa2, 0x64, 0x65, 0xc9, 0x8b, 0x36, 0x32);
// Not a valid string
assertNotWellFormed(-1, 0, -1, 0);
}
public void testShardsHaveExpectedRoundTrippables() {
// A sanity check.
long actual = 0;
for (long expected : generateFourByteShardsExpectedRunnables()) {
actual += expected;
}
assertEquals(EXPECTED_FOUR_BYTE_ROUNDTRIPPABLE_COUNT, actual);
}
private static String newString(char... chars) {
return new String(chars);
}
private static byte[] toByteArray(int... bytes) {
byte[] realBytes = new byte[bytes.length];
for (int i = 0; i < bytes.length; i++) {
realBytes[i] = (byte) bytes[i];
}
return realBytes;
}
private static void assertWellFormed(int... bytes) {
assertTrue(Utf8.isWellFormed(toByteArray(bytes)));
}
private static void assertNotWellFormed(int... bytes) {
assertFalse(Utf8.isWellFormed(toByteArray(bytes)));
}
private static long[] generateFourByteShardsExpectedRunnables() {
long[] expected = new long[128];
// 0-63 are all 5300224
for (int i = 0; i <= 63; i++) {
expected[i] = 5300224;
}
// 97-111 are all 2342912
for (int i = 97; i <= 111; i++) {
expected[i] = 2342912;
}
// 113-117 are all 1048576
for (int i = 113; i <= 117; i++) {
expected[i] = 1048576;
}
// One offs
expected[112] = 786432;
expected[118] = 786432;
expected[119] = 1048576;
expected[120] = 458752;
expected[121] = 524288;
expected[122] = 65536;
// Anything not assigned was the default 0.
return expected;
}
/**
* Helper to run the loop to test all the permutations for the number of bytes specified.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
*/
@GwtIncompatible // java.nio.charset.Charset
private static void testBytes(int numBytes, long expectedCount) {
testBytes(numBytes, expectedCount, 0, -1);
}
/**
* Helper to run the loop to test all the permutations for the number of bytes specified. This
* overload is useful for debugging to get the loop to start at a certain character.
*
* @param numBytes the number of bytes in the byte array
* @param expectedCount the expected number of roundtrippable permutations
* @param start the starting bytes encoded as a long as big-endian
* @param lim the limit of bytes to process encoded as a long as big-endian, or -1 to mean the max
* limit for numBytes
*/
@GwtIncompatible // java.nio.charset.Charset
private static void testBytes(int numBytes, long expectedCount, long start, long lim) {
byte[] bytes = new byte[numBytes];
if (lim == -1) {
lim = 1L << (numBytes * 8);
}
long countRoundTripped = 0;
for (long byteChar = start; byteChar < lim; byteChar++) {
long tmpByteChar = byteChar;
for (int i = 0; i < numBytes; i++) {
bytes[bytes.length - i - 1] = (byte) tmpByteChar;
tmpByteChar = tmpByteChar >> 8;
}
boolean isRoundTrippable = Utf8.isWellFormed(bytes);
assertEquals(isRoundTrippable, Utf8.isWellFormed(bytes, 0, numBytes));
String s = new String(bytes, Charsets.UTF_8);
byte[] bytesReencoded = s.getBytes(Charsets.UTF_8);
boolean bytesEqual = Arrays.equals(bytes, bytesReencoded);
if (bytesEqual != isRoundTrippable) {
fail();
}
if (isRoundTrippable) {
countRoundTripped++;
}
}
assertEquals(expectedCount, countRoundTripped);
}
}