Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Convert filenames read on MacOSX to canonical form

  • Loading branch information...
commit 9622ab2132e2501ee5769357a914dcc6635e515c 1 parent 4161de1
Patrik Nyblom bufflig authored
1  erts/emulator/beam/bif.tab
@@ -799,6 +799,7 @@ bif erlang:nif_error/2
799 799 #
800 800 bif prim_file:internal_name2native/1
801 801 bif prim_file:internal_native2name/1
  802 +bif prim_file:internal_normalize_utf8/1
802 803 bif file:native_name_encoding/0
803 804 #
804 805 # Obsolete
416 erts/emulator/beam/erl_unicode.c
@@ -30,6 +30,8 @@
30 30 #include "big.h"
31 31
32 32 #include "erl_unicode.h"
  33 +#include "erl_unicode_normalize.h"
  34 +
33 35
34 36 typedef struct _restart_context {
35 37 byte *bytes;
@@ -54,13 +56,6 @@ static BIF_RETTYPE finalize_list_to_list(Process *p,
54 56 Uint num_resulting_chars,
55 57 int state, int left,
56 58 Eterm tail);
57   -static int analyze_utf8(byte *source, Uint size,
58   - byte **err_pos, Uint *num_chars, int *left);
59   -#define UTF8_OK 0
60   -#define UTF8_INCOMPLETE 1
61   -#define UTF8_ERROR 2
62   -#define UTF8_ANALYZE_MORE 3
63   -
64 59 static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3);
65 60 static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3);
66 61 static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3);
@@ -970,11 +965,11 @@ static int is_valid_utf8(Eterm orig_bin)
970 965 bytes = erts_get_aligned_binary_bytes(orig_bin, &temp_alloc);
971 966 }
972 967 size = binary_size(orig_bin);
973   - ret = analyze_utf8(bytes,
  968 + ret = erts_analyze_utf8(bytes,
974 969 size,
975 970 &endpos,&numchar,NULL);
976 971 erts_free_aligned_binary_bytes(temp_alloc);
977   - return (ret == UTF8_OK);
  972 + return (ret == ERTS_UTF8_OK);
978 973 }
979 974
980 975 BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
@@ -1084,14 +1079,14 @@ static BIF_RETTYPE build_list_return(Process *p, byte *bytes, int pos, Uint char
1084 1079 hp += 2;
1085 1080 rest_term = CONS(hp,leftover_bin,rest_term);
1086 1081 }
1087   - BIF_RET(finalize_list_to_list(p, bytes, rest_term, 0U, pos, characters, UTF8_ERROR, left, NIL));
  1082 + BIF_RET(finalize_list_to_list(p, bytes, rest_term, 0U, pos, characters, ERTS_UTF8_ERROR, left, NIL));
1088 1083 } else if (rest_term == NIL && num_leftovers != 0) {
1089 1084 Eterm leftover_bin = new_binary(p, leftover, num_leftovers);
1090 1085 if (check_leftovers(leftover,num_leftovers) != 0) {
1091   - BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, UTF8_ERROR,
  1086 + BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, ERTS_UTF8_ERROR,
1092 1087 left, NIL));
1093 1088 } else {
1094   - BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, UTF8_INCOMPLETE,
  1089 + BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, ERTS_UTF8_INCOMPLETE,
1095 1090 left, NIL));
1096 1091 }
1097 1092 } else { /* All OK */
@@ -1107,11 +1102,11 @@ static BIF_RETTYPE build_list_return(Process *p, byte *bytes, int pos, Uint char
1107 1102 rc.num_processed_bytes = 0; /* not used */
1108 1103 rc.num_bytes_to_process = pos;
1109 1104 rc.num_resulting_chars = characters;
1110   - rc.state = UTF8_OK; /* not used */
  1105 + rc.state = ERTS_UTF8_OK; /* not used */
1111 1106 BIF_TRAP3(&characters_to_list_trap_1_exp, p, make_magic_bin_for_restart(p,&rc),
1112 1107 rest_term, latin1);
1113 1108 } else { /* Success */
1114   - BIF_RET(finalize_list_to_list(p, bytes, NIL, 0U, pos, characters, UTF8_OK, left, NIL));
  1109 + BIF_RET(finalize_list_to_list(p, bytes, NIL, 0U, pos, characters, ERTS_UTF8_OK, left, NIL));
1115 1110 }
1116 1111 }
1117 1112 }
@@ -1205,7 +1200,7 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
1205 1200 * When input to characters_to_list is a plain binary and the format is 'unicode', we do
1206 1201 * a faster analyze and size count with this function.
1207 1202 */
1208   -static int analyze_utf8(byte *source, Uint size,
  1203 +int erts_analyze_utf8(byte *source, Uint size,
1209 1204 byte **err_pos, Uint *num_chars, int *left)
1210 1205 {
1211 1206 *err_pos = source;
@@ -1216,60 +1211,60 @@ static int analyze_utf8(byte *source, Uint size,
1216 1211 --size;
1217 1212 } else if (((*source) & ((byte) 0xE0)) == 0xC0) {
1218 1213 if (size < 2) {
1219   - return UTF8_INCOMPLETE;
  1214 + return ERTS_UTF8_INCOMPLETE;
1220 1215 }
1221 1216 if (((source[1] & ((byte) 0xC0)) != 0x80) ||
1222 1217 ((*source) < 0xC2) /* overlong */) {
1223   - return UTF8_ERROR;
  1218 + return ERTS_UTF8_ERROR;
1224 1219 }
1225 1220 source += 2;
1226 1221 size -= 2;
1227 1222 } else if (((*source) & ((byte) 0xF0)) == 0xE0) {
1228 1223 if (size < 3) {
1229   - return UTF8_INCOMPLETE;
  1224 + return ERTS_UTF8_INCOMPLETE;
1230 1225 }
1231 1226 if (((source[1] & ((byte) 0xC0)) != 0x80) ||
1232 1227 ((source[2] & ((byte) 0xC0)) != 0x80) ||
1233 1228 (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) {
1234   - return UTF8_ERROR;
  1229 + return ERTS_UTF8_ERROR;
1235 1230 }
1236 1231 if ((((*source) & ((byte) 0xF)) == 0xD) &&
1237 1232 ((source[1] & 0x20) != 0)) {
1238   - return UTF8_ERROR;
  1233 + return ERTS_UTF8_ERROR;
1239 1234 }
1240 1235 if (((*source) == 0xEF) && (source[1] == 0xBF) &&
1241 1236 ((source[2] == 0xBE) || (source[2] == 0xBF))) {
1242   - return UTF8_ERROR;
  1237 + return ERTS_UTF8_ERROR;
1243 1238 }
1244 1239 source += 3;
1245 1240 size -= 3;
1246 1241 } else if (((*source) & ((byte) 0xF8)) == 0xF0) {
1247 1242 if (size < 4) {
1248   - return UTF8_INCOMPLETE;
  1243 + return ERTS_UTF8_INCOMPLETE;
1249 1244 }
1250 1245 if (((source[1] & ((byte) 0xC0)) != 0x80) ||
1251 1246 ((source[2] & ((byte) 0xC0)) != 0x80) ||
1252 1247 ((source[3] & ((byte) 0xC0)) != 0x80) ||
1253 1248 (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) {
1254   - return UTF8_ERROR;
  1249 + return ERTS_UTF8_ERROR;
1255 1250 }
1256 1251 if ((((*source) & ((byte)0x7)) > 0x4U) ||
1257 1252 ((((*source) & ((byte)0x7)) == 0x4U) &&
1258 1253 ((source[1] & ((byte)0x3F)) > 0xFU))) {
1259   - return UTF8_ERROR;
  1254 + return ERTS_UTF8_ERROR;
1260 1255 }
1261 1256 source += 4;
1262 1257 size -= 4;
1263 1258 } else {
1264   - return UTF8_ERROR;
  1259 + return ERTS_UTF8_ERROR;
1265 1260 }
1266 1261 ++(*num_chars);
1267 1262 *err_pos = source;
1268 1263 if (left && --(*left) <= 0) {
1269   - return UTF8_ANALYZE_MORE;
  1264 + return ERTS_UTF8_ANALYZE_MORE;
1270 1265 }
1271 1266 }
1272   - return UTF8_OK;
  1267 + return ERTS_UTF8_OK;
1273 1268 }
1274 1269
1275 1270 /*
@@ -1304,7 +1299,7 @@ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
1304 1299 } else if (((*source) & ((byte) 0xE0)) == 0xC0) {
1305 1300 unipoint =
1306 1301 (((Uint) ((*source) & ((byte) 0x1F))) << 6) |
1307   - ((Uint) (source[1] & ((byte) 0x3F)));
  1302 + ((Uint) (source[1] & ((byte) 0x3F)));
1308 1303 } else if (((*source) & ((byte) 0xF0)) == 0xE0) {
1309 1304 unipoint =
1310 1305 (((Uint) ((*source) & ((byte) 0xF))) << 12) |
@@ -1330,6 +1325,216 @@ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
1330 1325 return ret;
1331 1326 }
1332 1327
  /*
   * Tell whether code point cp is flagged in the composition candidate
   * bitmap. Returns 1 if it is, 0 otherwise.
   *
   * Code points below 768 (0x300) are never candidates. Between 768 and
   * 4023 (0xFB7) the answer is looked up in comp_candidate_map, one bit
   * per code point. Above 4023 only 12441 and 12442 (0x3099, 0x309A)
   * are accepted.
   * NOTE(review): presumably these ranges cover the combining marks that
   * the composition tables know about - confirm against the generator of
   * erl_unicode_normalize.h.
   */
  1328 +static int is_candidate(Uint cp)
  1329 +{
  1330 + int index,pos;
  1331 + if (cp < 768) return 0;
  1332 + if (cp > 4023) {
  1333 + if (cp == 12441 || cp == 12442) return 1;
  1334 + return 0;
  1335 + }
  1336 + index = cp / 32 - COMP_CANDIDATE_MAP_OFFSET; /* 32 code points per map word */
  1337 + pos = cp % 32; /* bit number inside that word */
  1338 + return !!(comp_candidate_map[index] & (1UL << pos));
  1339 +}
  1340 +
  /*
   * Look up character c in an open-addressing hash table.
   *
   * htab holds indices into cv (or -1 for an empty slot) and has
   * htab_size slots. Probing starts at c % htab_size and moves one slot
   * at a time, wrapping around, until an empty slot or an entry whose
   * cv[..].c equals c is found.
   *
   * Returns the index into cv of the matching entry, or -1 if the probe
   * hit an empty slot first.
   * NOTE(review): the loop only terminates on a miss if the table has at
   * least one empty slot - presumably guaranteed by how the tables are
   * generated; confirm.
   */
  1341 +static int hashsearch(int *htab, int htab_size, CompEntry *cv, Uint16 c)
  1342 +{
  1343 + int bucket = c % htab_size;
  1344 + while (htab[bucket] != -1 && cv[htab[bucket]].c != c)
  1345 + bucket = (bucket + 1) % htab_size;
  1346 + return htab[bucket];
  1347 +}
  1348 +
  /* Non-positive results of translate(); see the description below. */
  1349 +#define TRANSLATE_NO 0 /* no table entry matches the sequence */
  1350 +#define TRANSLATE_MAYBE -1 /* input ran out while entries still matched */
  1351 +
  1352 +/* The s array is reversed */
  /*
   * Match the first characters of s (slen entries) against the
   * composition tables, descending one table level per character.
   *
   * The top level is compose_tab/hash_compose_tab; each entry either
   * carries a result (res != 0) or points to a sub-table (subs, hash,
   * num_subs) to be searched with the next character. Each level's hash
   * table is searched with size num_entries * HASH_SIZE_FACTOR.
   *
   * Returns:
   *   TRANSLATE_NO    - some character had no entry at its level.
   *   TRANSLATE_MAYBE - all slen characters matched but no entry with a
   *                     result was reached.
   *   pos             - index in s of the character whose entry carried
   *                     a result; the result is stored in *res.
   * NOTE(review): a result found at pos == 0 would be returned as 0,
   * which is indistinguishable from TRANSLATE_NO - presumably top-level
   * entries never carry a result; confirm against the generated tables.
   */
  1353 +static int translate(Uint16 *s, int slen, Uint16 *res)
  1354 +{
  1355 + /* Go backwards through buffer and match against tree */
  1356 + int pos = 0;
  1357 + CompEntry *cv = compose_tab;
  1358 + int *hc = hash_compose_tab;
  1359 + int cvs = compose_tab_size;
  1360 + int x;
  1361 + while (pos < slen) {
  1362 + x = hashsearch(hc,cvs*HASH_SIZE_FACTOR,cv,s[pos]);
  1363 + if (x < 0) {
  1364 + return TRANSLATE_NO;
  1365 + }
  1366 + if (cv[x].res) {
  1367 + *res = cv[x].res;
  1368 + return pos;
  1369 + }
  1370 + cvs = cv[x].num_subs; /* descend into the sub-table of this entry */
  1371 + hc = cv[x].hash;
  1372 + cv = cv[x].subs;
  1373 + ++pos;
  1374 + }
  1375 + return TRANSLATE_MAYBE;
  1376 +}
  1377 +
  /*
   * Start a new pending sequence: store unipoint as the only saved code
   * point and set the count to 1.
   *
   * unipoint is truncated to 16 bits. The visible caller only gets here
   * after is_candidate(unipoint) returned true, and is_candidate accepts
   * nothing above 12442, so no information is lost on that path.
   */
  1378 +static void handle_first_norm(Uint16 *savepoints, int *numpointsp, Uint unipoint)
  1379 +{
  1380 + /*erts_fprintf(stderr,"CP = %d, numpoints = %d\n",(int) unipoint,(int) *numpointsp);*/
  1381 + *numpointsp = 1;
  1382 + savepoints[0] = (Uint16) unipoint;
  1383 +}
  1384 +
  1385 +static void cleanup_norm(Eterm **hpp, Uint16 *savepoints, int numpoints, Eterm *retp)
  1386 +{
  1387 + Eterm *hp = *hpp;
  1388 + int res,i;
  1389 + Uint16 newpoint;
  1390 + Eterm ret = *retp;
  1391 +
  1392 + ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
  1393 + hp += 2;
  1394 +
  1395 + for (i = 1;i < numpoints;) {
  1396 + if(!is_candidate(savepoints[i]) ||
  1397 + ((res = translate(savepoints+i,numpoints - i, &newpoint)) <= 0)) {
  1398 + ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
  1399 + hp += 2;
  1400 + ++i;
  1401 + } else {
  1402 + ret = CONS(hp,make_small((Uint) newpoint),ret);
  1403 + hp += 2;
  1404 + i += res;
  1405 + }
  1406 + }
  1407 + *retp = ret;
  1408 +}
  1409 +
  /*
   * Feed one more code point into a pending sequence.
   *
   * Called while savepoints already holds *numpointsp (> 0) pending code
   * points. hpp, numpointsp and retp are all in/out and are written back
   * at the end.
   *
   * If unipoint fits in 16 bits it is appended to savepoints and the
   * whole buffer is run through translate():
   *   - TRANSLATE_NO: savepoints[0] is emitted unchanged, then the rest
   *     of the buffer is re-examined position by position. A remaining
   *     suffix that yields TRANSLATE_MAYBE is moved to the front of
   *     savepoints and stays pending; otherwise the buffer ends empty.
   *   - positive: the composed code point is emitted and the buffer is
   *     emptied.
   *   - negative (TRANSLATE_MAYBE): nothing is emitted; the sequence
   *     stays pending with the new code point included.
   *
   * If unipoint does not fit in 16 bits, the pending code points are
   * flushed (composing where translate() gives a positive result),
   * unipoint itself is emitted, and the buffer is emptied.
   *
   * NOTE(review): savepoints[numpoints++] is not bounds-checked here; the
   * visible caller provides a 4-entry buffer. Presumably the depth of
   * the composition tables keeps the pending count below that - confirm.
   */
  1410 +static void handle_potential_norm(Eterm **hpp, Uint16 *savepoints, int *numpointsp, Uint unipoint, Eterm *retp)
  1411 +{
  1412 + Eterm *hp = *hpp;
  1413 + int numpoints = *numpointsp;
  1414 + int res,i;
  1415 + Uint16 newpoint;
  1416 + Eterm ret = *retp;
  1417 +
  1418 + /* erts_fprintf(stderr,"CP = %d, numpoints = %d\n",(int) unipoint,(int) numpoints);*/
  1419 + if ((unipoint >> 16) == 0) { /* otherwise we're done here */
  1420 + savepoints[numpoints++] = (Uint16) unipoint;
  1421 + res = translate(savepoints,numpoints,&newpoint);
  1422 + if (res == TRANSLATE_NO) {
  1423 + ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
  1424 + hp += 2;
  1425 + for (i = 1;i < numpoints;) {
  1426 + if(!is_candidate(savepoints[i]) ||
  1427 + ((res = translate(savepoints+i,numpoints - i, &newpoint)) == 0)) {
  1428 + ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
  1429 + hp += 2;
  1430 + ++i;
  1431 + } else if (res > 0) {
  1432 + ret = CONS(hp,make_small((Uint) newpoint),ret);
  1433 + hp += 2;
  1434 + i += res;
  1435 + } else { /* res < 0 */
  1436 + /* A "maybe", means we are not done yet */
  1437 + int j = 0;
  1438 + while (i < numpoints) { /* shift the undecided suffix to the front */
  1439 + savepoints[j++] = savepoints[i++];
  1440 + }
  1441 + numpoints = j;
  1442 + goto breakaway;
  1443 + }
  1444 + }
  1445 + numpoints = 0; /* everything was emitted, nothing pending */
  1446 + breakaway:
  1447 + ;
  1448 + } else if (res > 0) {
  1449 + numpoints = 0;
  1450 + ret = CONS(hp,make_small((Uint) newpoint),ret);
  1451 + hp += 2;
  1452 + } /* < 0 means go on */
  1453 + } else {
  1454 + /* Unconditional rollup, this character is larger than 16 bit */
  1455 + ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
  1456 + hp += 2;
  1457 +
  1458 + for (i = 1;i < numpoints;) {
  1459 + if(!is_candidate(savepoints[i]) ||
  1460 + ((res = translate(savepoints+i,numpoints - i, &newpoint)) <= 0)) {
  1461 + ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
  1462 + hp += 2;
  1463 + ++i;
  1464 + } else {
  1465 + ret = CONS(hp,make_small((Uint) newpoint),ret);
  1466 + hp += 2;
  1467 + i += res;
  1468 + }
  1469 + }
  1470 + ret = CONS(hp,make_small(unipoint),ret);
  1471 + hp += 2;
  1472 + numpoints = 0;
  1473 + }
  1474 + *hpp = hp;
  1475 + *numpointsp = numpoints;
  1476 + *retp = ret;
  1477 +}
  1478 +
  /*
   * Build a list of code points from the UTF-8 buffer bytes[0..sz),
   * composing sequences via the is_candidate()/translate() helpers.
   *
   * p   - process on whose heap the list is built.
   * num - number of characters in the buffer (must be > 0); used to size
   *       the heap allocation at two words per character.
   *
   * The buffer is scanned from the last byte towards the first. Each
   * lead byte is decoded together with the continuation bytes that
   * follow it; continuation bytes themselves are skipped. Since cells
   * are consed onto the front while scanning backwards, the resulting
   * list is in forward order. No UTF-8 validation is done here; the
   * visible callers run erts_analyze_utf8() on the buffer first.
   *
   * Returns the list. Heap words not used (fewer list cells than num
   * when sequences were composed) are released before returning.
   */
  1479 +static Eterm do_utf8_to_list_normalize(Process *p, Uint num, byte *bytes, Uint sz)
  1480 +{
  1481 + Eterm *hp,*hp_end;
  1482 + Eterm ret;
  1483 + byte *source;
  1484 + Uint unipoint;
  1485 + Uint16 savepoints[4]; /* pending code points, in scan (reverse text) order */
  1486 + int numpoints = 0; /* number of valid entries in savepoints */
  1487 +
  1488 + ASSERT(num > 0);
  1489 +
  1490 + hp = HAlloc(p,num * 2); /* May be too much */
  1491 + hp_end = hp + num * 2;
  1492 + ret = NIL;
  1493 + source = bytes + sz;
  1494 + while(--source >= bytes) {
  1495 + if (((*source) & ((byte) 0x80)) == 0) {
  1496 + unipoint = (Uint) *source;
  1497 + } else if (((*source) & ((byte) 0xE0)) == 0xC0) {
  1498 + unipoint =
  1499 + (((Uint) ((*source) & ((byte) 0x1F))) << 6) |
  1500 + ((Uint) (source[1] & ((byte) 0x3F)));
  1501 + } else if (((*source) & ((byte) 0xF0)) == 0xE0) {
  1502 + unipoint =
  1503 + (((Uint) ((*source) & ((byte) 0xF))) << 12) |
  1504 + (((Uint) (source[1] & ((byte) 0x3F))) << 6) |
  1505 + ((Uint) (source[2] & ((byte) 0x3F)));
  1506 + } else if (((*source) & ((byte) 0xF8)) == 0xF0) {
  1507 + unipoint =
  1508 + (((Uint) ((*source) & ((byte) 0x7))) << 18) |
  1509 + (((Uint) (source[1] & ((byte) 0x3F))) << 12) |
  1510 + (((Uint) (source[2] & ((byte) 0x3F))) << 6) |
  1511 + ((Uint) (source[3] & ((byte) 0x3F)));
  1512 + } else {
  1513 + /* ignore 2#10XXXXXX */
  1514 + continue;
  1515 + }
  1516 + if (numpoints) {
  1517 + handle_potential_norm(&hp,savepoints,&numpoints,unipoint,&ret);
  1518 + continue;
  1519 + }
  1520 + /* We are not building up any normalization yet; check whether we should start... */
  1521 + if (is_candidate(unipoint)) {
  1522 + handle_first_norm(savepoints,&numpoints,unipoint);
  1523 + continue;
  1524 + }
  1525 + ret = CONS(hp,make_small(unipoint),ret);
  1526 + hp += 2;
  1527 + }
  1528 + /* so, we've looped to the beginning, do we have anything saved? */
  1529 + if (numpoints) {
  1530 + cleanup_norm(&hp,savepoints,numpoints,&ret);
  1531 + }
  /* Hand back the unused tail; only correct if hp accounts for every
     cell consed by the helpers above. */
  1532 + if (hp_end != hp) {
  1533 + HRelease(p,hp_end,hp);
  1534 + }
  1535 + return ret;
  1536 +}
  1537 +
1333 1538 /*
1334 1539 * The last step of characters_to_list, build a list from the buffer 'bytes' (created in the same way
1335 1540 * as for characters_to_utf8). All sizes are known in advance and most data will be held in a
@@ -1378,10 +1583,10 @@ static BIF_RETTYPE finalize_list_to_list(Process *p,
1378 1583 */
1379 1584
1380 1585 free_restart(bytes);
1381   - if (state == UTF8_INCOMPLETE) {
  1586 + if (state == ERTS_UTF8_INCOMPLETE) {
1382 1587 hp = HAlloc(p,4);
1383 1588 ret = TUPLE3(hp,am_incomplete,converted,rest);
1384   - } else if (state == UTF8_ERROR) {
  1589 + } else if (state == ERTS_UTF8_ERROR) {
1385 1590 hp = HAlloc(p,4);
1386 1591 ret = TUPLE3(hp,am_error,converted,rest);
1387 1592 } else {
@@ -1408,7 +1613,7 @@ static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3)
1408 1613
1409 1614 /*
1410 1615 * Hooks into the process of decoding a binary depending on state.
1411   - * If last_state is UTF8_ANALYZE_MORE, num_bytes_to_process
  1616 + * If last_state is ERTS_UTF8_ANALYZE_MORE, num_bytes_to_process
1412 1617 * and num_resulting_chars will grow
1413 1618 * until we're done analyzing the binary. Then we'll eat
1414 1619 * the bytes to process, lowering num_bytes_to_process and num_resulting_chars,
@@ -1465,14 +1670,14 @@ static BIF_RETTYPE do_bif_utf8_to_list(Process *p,
1465 1670
1466 1671 left = allowed_iterations(p);
1467 1672
1468   - if (state == UTF8_ANALYZE_MORE) {
1469   - state = analyze_utf8(bytes + num_bytes_to_process,
  1673 + if (state == ERTS_UTF8_ANALYZE_MORE) {
  1674 + state = erts_analyze_utf8(bytes + num_bytes_to_process,
1470 1675 size - num_bytes_to_process,
1471 1676 &endpos,&numchar,&left);
1472 1677 cost_to_proc(p,numchar);
1473 1678 num_resulting_chars += numchar;
1474 1679 num_bytes_to_process = endpos - bytes;
1475   - if (state == UTF8_ANALYZE_MORE) {
  1680 + if (state == ERTS_UTF8_ANALYZE_MORE) {
1476 1681 Eterm epos = erts_make_integer(num_bytes_to_process,p);
1477 1682 Eterm enumchar = erts_make_integer(num_resulting_chars,p);
1478 1683 erts_free_aligned_binary_bytes(temp_alloc);
@@ -1528,7 +1733,7 @@ static BIF_RETTYPE do_bif_utf8_to_list(Process *p,
1528 1733 ErlSubBin *sb;
1529 1734 Eterm orig;
1530 1735 Uint offset;
1531   - ASSERT(state != UTF8_OK);
  1736 + ASSERT(state != ERTS_UTF8_OK);
1532 1737 hp = HAlloc(p, ERL_SUB_BIN_SIZE);
1533 1738 sb = (ErlSubBin *) hp;
1534 1739 ERTS_GET_REAL_BIN(orig_bin, orig, offset, bitoffs, bitsize);
@@ -1544,14 +1749,14 @@ static BIF_RETTYPE do_bif_utf8_to_list(Process *p,
1544 1749
1545 1750 /* Done */
1546 1751
1547   - if (state == UTF8_INCOMPLETE) {
  1752 + if (state == ERTS_UTF8_INCOMPLETE) {
1548 1753 if (check_leftovers(bytes + num_bytes_to_process + num_processed_bytes,
1549 1754 b_sz) != 0) {
1550 1755 goto error_return;
1551 1756 }
1552 1757 hp = HAlloc(p,4);
1553 1758 ret = TUPLE3(hp,am_incomplete,converted,rest);
1554   - } else if (state == UTF8_ERROR) {
  1759 + } else if (state == ERTS_UTF8_ERROR) {
1555 1760 error_return:
1556 1761 hp = HAlloc(p,4);
1557 1762 ret = TUPLE3(hp,am_error,converted,rest);
@@ -1589,7 +1794,7 @@ static BIF_RETTYPE characters_to_list_trap_3(BIF_ALIST_3)
1589 1794 0U, /* nothing processed yet */
1590 1795 num_bytes_to_process,
1591 1796 num_resulting_chars,
1592   - UTF8_ANALYZE_MORE, /* always this state here */
  1797 + ERTS_UTF8_ANALYZE_MORE, /* always this state here */
1593 1798 NIL); /* Nothing built -> no tail yet */
1594 1799
1595 1800 }
@@ -1642,7 +1847,7 @@ static BIF_RETTYPE utf8_to_list(BIF_ALIST_1)
1642 1847 BIF_ERROR(BIF_P,BADARG);
1643 1848 }
1644 1849 return do_bif_utf8_to_list(BIF_P, BIF_ARG_1, 0U, 0U, 0U,
1645   - UTF8_ANALYZE_MORE,NIL);
  1850 + ERTS_UTF8_ANALYZE_MORE,NIL);
1646 1851 }
1647 1852
1648 1853
@@ -1728,8 +1933,8 @@ binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
1728 1933 Uint n;
1729 1934 int reds_left = bin_size+1; /* Number of reductions left. */
1730 1935
1731   - if (analyze_utf8(bytes, bin_size, &err_pos,
1732   - &n, &reds_left) == UTF8_OK) {
  1936 + if (erts_analyze_utf8(bytes, bin_size, &err_pos,
  1937 + &n, &reds_left) == ERTS_UTF8_OK) {
1733 1938 /*
1734 1939 * Correct UTF-8 encoding, but too many characters to
1735 1940 * fit in an atom.
@@ -1818,7 +2023,7 @@ BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2)
1818 2023 * Simpler non-interruptable routines for UTF-8 and
1819 2024 * Windowish UTF-16 (restricted)
1820 2025 **********************************************************/
1821   -static Sint simple_char_need(Eterm ioterm, int encoding)
  2026 +Sint erts_native_filename_need(Eterm ioterm, int encoding)
1822 2027 {
1823 2028 Eterm *objp;
1824 2029 Eterm obj;
@@ -1833,6 +2038,7 @@ static Sint simple_char_need(Eterm ioterm, int encoding)
1833 2038 case ERL_FILENAME_LATIN1:
1834 2039 need = ap->len;
1835 2040 break;
  2041 + case ERL_FILENAME_UTF8_MAC:
1836 2042 case ERL_FILENAME_UTF8:
1837 2043 for (i = 0; i < ap->len; i++) {
1838 2044 need += (ap->name[i] >= 0x80) ? 2 : 1;
@@ -1882,6 +2088,7 @@ static Sint simple_char_need(Eterm ioterm, int encoding)
1882 2088 }
1883 2089 need += 1;
1884 2090 break;
  2091 + case ERL_FILENAME_UTF8_MAC:
1885 2092 case ERL_FILENAME_UTF8:
1886 2093 if (x < 0x80) {
1887 2094 need +=1;
@@ -1956,7 +2163,7 @@ static Sint simple_char_need(Eterm ioterm, int encoding)
1956 2163 return need;
1957 2164 }
1958 2165
1959   -static void simple_put_chars(Eterm ioterm, int encoding, byte *p)
  2166 +void erts_native_filename_put(Eterm ioterm, int encoding, byte *p)
1960 2167 {
1961 2168 Eterm *objp;
1962 2169 Eterm obj;
@@ -1972,6 +2179,7 @@ static void simple_put_chars(Eterm ioterm, int encoding, byte *p)
1972 2179 *p++ = ap->name[i];
1973 2180 }
1974 2181 break;
  2182 + case ERL_FILENAME_UTF8_MAC:
1975 2183 case ERL_FILENAME_UTF8:
1976 2184 for (i = 0; i < ap->len; i++) {
1977 2185 if(ap->name[i] < 0x80) {
@@ -2024,6 +2232,7 @@ static void simple_put_chars(Eterm ioterm, int encoding, byte *p)
2024 2232 ASSERT( x < 256);
2025 2233 *p++ = (byte) x;
2026 2234 break;
  2235 + case ERL_FILENAME_UTF8_MAC:
2027 2236 case ERL_FILENAME_UTF8:
2028 2237 if (x < 0x80) {
2029 2238 *p++ = (byte) x;
@@ -2102,7 +2311,39 @@ static void simple_put_chars(Eterm ioterm, int encoding, byte *p)
2102 2311 DESTROY_ESTACK(stack);
2103 2312 return;
2104 2313 }
2105   -
  /*
   * Decode num_chars UTF-8 encoded characters from bytes and write each
   * one to target as two bytes, low byte first.
   *
   * target must have room for 2 * num_chars bytes; no terminator is
   * written here. The input is not validated beyond the lead byte: a
   * byte that is not a valid lead byte terminates the emulator through
   * erl_exit().
   *
   * Only the low 16 bits of each decoded code point are stored; code
   * points above 0xFFFF are not turned into surrogate pairs.
   */
  2314 +void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars)
  2315 +{
  2316 + Uint unipoint;
  2317 +
  2318 + while (num_chars--) {
  2319 + if (((*bytes) & ((byte) 0x80)) == 0) {
  2320 + unipoint = (Uint) *bytes;
  2321 + ++bytes;
  2322 + } else if (((*bytes) & ((byte) 0xE0)) == 0xC0) {
  2323 + unipoint =
  2324 + (((Uint) ((*bytes) & ((byte) 0x1F))) << 6) |
  2325 + ((Uint) (bytes[1] & ((byte) 0x3F)));
  2326 + bytes += 2;
  2327 + } else if (((*bytes) & ((byte) 0xF0)) == 0xE0) {
  2328 + unipoint =
  2329 + (((Uint) ((*bytes) & ((byte) 0xF))) << 12) |
  2330 + (((Uint) (bytes[1] & ((byte) 0x3F))) << 6) |
  2331 + ((Uint) (bytes[2] & ((byte) 0x3F)));
  2332 + bytes +=3;
  2333 + } else if (((*bytes) & ((byte) 0xF8)) == 0xF0) {
  2334 + unipoint =
  2335 + (((Uint) ((*bytes) & ((byte) 0x7))) << 18) |
  2336 + (((Uint) (bytes[1] & ((byte) 0x3F))) << 12) |
  2337 + (((Uint) (bytes[2] & ((byte) 0x3F))) << 6) |
  2338 + ((Uint) (bytes[3] & ((byte) 0x3F)));
  2339 + bytes += 4;
  2340 + } else {
  2341 + erl_exit(1,"Internal unicode error in prim_file:internal_name2native/1");
  2342 + }
  2343 + *target++ = (byte) (unipoint & 0xFF); /* low byte first */
  2344 + *target++ = (byte) ((unipoint >> 8) & 0xFF);
  2345 + }
  2346 +}
2106 2347
2107 2348 /*
2108 2349 * This internal bif converts a filename to whatever format is suitable for the file driver
@@ -2120,7 +2361,6 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
2120 2361 byte *bytes;
2121 2362 byte *err_pos;
2122 2363 Uint size,num_chars;
2123   - Uint unipoint;
2124 2364 /* Uninterpreted encoding except if windows widechar, in case we convert from
2125 2365 utf8 to win_wchar */
2126 2366 size = binary_size(BIF_ARG_1);
@@ -2137,7 +2377,7 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
2137 2377 /* In a wchar world, the emulator flags only affect how
2138 2378 binaries are interpreted when sent from the user. */
2139 2379 /* Determine real length and create a new binary */
2140   - if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK ||
  2380 + if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK ||
2141 2381 erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) {
2142 2382 /* What to do now? Maybe latin1, so just take byte for byte instead */
2143 2383 bin_term = new_binary(BIF_P, 0, (size+1)*2);
@@ -2154,43 +2394,16 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
2154 2394 /* OK, UTF8 ok, number of characters is in num_chars */
2155 2395 bin_term = new_binary(BIF_P, 0, (num_chars+1)*2);
2156 2396 bin_p = binary_bytes(bin_term);
2157   - while (num_chars--) {
2158   - if (((*bytes) & ((byte) 0x80)) == 0) {
2159   - unipoint = (Uint) *bytes;
2160   - ++bytes;
2161   - } else if (((*bytes) & ((byte) 0xE0)) == 0xC0) {
2162   - unipoint =
2163   - (((Uint) ((*bytes) & ((byte) 0x1F))) << 6) |
2164   - ((Uint) (bytes[1] & ((byte) 0x3F)));
2165   - bytes += 2;
2166   - } else if (((*bytes) & ((byte) 0xF0)) == 0xE0) {
2167   - unipoint =
2168   - (((Uint) ((*bytes) & ((byte) 0xF))) << 12) |
2169   - (((Uint) (bytes[1] & ((byte) 0x3F))) << 6) |
2170   - ((Uint) (bytes[2] & ((byte) 0x3F)));
2171   - bytes +=3;
2172   - } else if (((*bytes) & ((byte) 0xF8)) == 0xF0) {
2173   - unipoint =
2174   - (((Uint) ((*bytes) & ((byte) 0x7))) << 18) |
2175   - (((Uint) (bytes[1] & ((byte) 0x3F))) << 12) |
2176   - (((Uint) (bytes[2] & ((byte) 0x3F))) << 6) |
2177   - ((Uint) (bytes[3] & ((byte) 0x3F)));
2178   - bytes += 4;
2179   - } else {
2180   - erl_exit(1,"Internal unicode error in file:name2native/1");
2181   - }
2182   - *bin_p++ = (byte) (unipoint & 0xFF);
2183   - *bin_p++ = (byte) ((unipoint >> 8) & 0xFF);
2184   - }
  2397 + erts_copy_utf8_to_utf16_little(bin_p, bytes, num_chars);
2185 2398 /* zero termination */
2186   - *bin_p++ = 0;
2187   - *bin_p++ = 0;
  2399 + bin_p[num_chars*2] = 0;
  2400 + bin_p[num_chars*2+1] = 0;
2188 2401 erts_free_aligned_binary_bytes(temp_alloc);
2189 2402 BIF_RET(bin_term);
2190 2403 } /* binary */
2191 2404
2192 2405
2193   - if ((need = simple_char_need(BIF_ARG_1,encoding)) < 0) {
  2406 + if ((need = erts_native_filename_need(BIF_ARG_1,encoding)) < 0) {
2194 2407 BIF_ERROR(BIF_P,BADARG);
2195 2408 }
2196 2409 if (encoding == ERL_FILENAME_WIN_WCHAR) {
@@ -2201,7 +2414,7 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
2201 2414
2202 2415 bin_term = new_binary(BIF_P, 0, need);
2203 2416 bin_p = binary_bytes(bin_term);
2204   - simple_put_chars(BIF_ARG_1,encoding,bin_p);
  2417 + erts_native_filename_put(BIF_ARG_1,encoding,bin_p);
2205 2418 bin_p[need-1] = 0;
2206 2419 if (encoding == ERL_FILENAME_WIN_WCHAR) {
2207 2420 bin_p[need-2] = 0;
@@ -2223,6 +2436,7 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
2223 2436 Uint num_built; /* characters */
2224 2437 Uint num_eaten; /* bytes */
2225 2438 Eterm ret;
  2439 + int mac = 0;
2226 2440
2227 2441 if (is_not_binary(BIF_ARG_1)) {
2228 2442 BIF_ERROR(BIF_P,BADARG);
@@ -2241,15 +2455,21 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
2241 2455 bytes = binary_bytes(real_bin)+offset;
2242 2456
2243 2457 BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs));
  2458 + case ERL_FILENAME_UTF8_MAC:
  2459 + mac = 1;
2244 2460 case ERL_FILENAME_UTF8:
2245 2461 bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
2246   - if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK) {
  2462 + if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) {
2247 2463 erts_free_aligned_binary_bytes(temp_alloc);
2248 2464 goto noconvert;
2249 2465 }
2250 2466 num_built = 0;
2251 2467 num_eaten = 0;
2252   - ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL);
  2468 + if (mac) {
  2469 + ret = do_utf8_to_list_normalize(BIF_P, num_chars, bytes, size);
  2470 + } else {
  2471 + ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL);
  2472 + }
2253 2473 erts_free_aligned_binary_bytes(temp_alloc);
2254 2474 BIF_RET(ret);
2255 2475 case ERL_FILENAME_WIN_WCHAR:
@@ -2267,9 +2487,9 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
2267 2487 while (size > 0) {
2268 2488 Uint x = ((Uint) *bytes--) << 8;
2269 2489 x |= ((Uint) *bytes--);
  2490 + size -= 2;
2270 2491 ret = CONS(hp,make_small(x),ret);
2271 2492 hp += 2;
2272   - size -= 2;
2273 2493 }
2274 2494 erts_free_aligned_binary_bytes(temp_alloc);
2275 2495 BIF_RET(ret);
@@ -2280,11 +2500,45 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
2280 2500 BIF_RET(BIF_ARG_1);
2281 2501 }
2282 2502
  2503 +BIF_RETTYPE prim_file_internal_normalize_utf8_1(BIF_ALIST_1) /* BIF: checks that BIF_ARG_1 is a byte-aligned binary of well-formed UTF-8 and returns do_utf8_to_list_normalize() of its bytes; BADARG otherwise */
  2504 +{
  2505 + Eterm real_bin; /* filled in by ERTS_GET_REAL_BIN; not otherwise read here */
  2506 + Uint offset; /* filled in by ERTS_GET_REAL_BIN; not otherwise read here */
  2507 + Uint size,num_chars; /* size: byte size of the argument; num_chars: set by erts_analyze_utf8 */
  2508 + Uint bitsize; /* non-zero is rejected below */
  2509 + Uint bitoffs; /* filled in by ERTS_GET_REAL_BIN; not otherwise read here */
  2510 + Eterm ret;
  2511 + byte *temp_alloc = NULL; /* handed to erts_get_aligned_binary_bytes and released with erts_free_aligned_binary_bytes on both exits below */
  2512 + byte *bytes;
  2513 + byte *err_pos; /* required by erts_analyze_utf8; the value is not used here */
  2514 +
  2515 + if (is_not_binary(BIF_ARG_1)) { /* argument must be a binary */
  2516 + BIF_ERROR(BIF_P,BADARG);
  2517 + }
  2518 + size = binary_size(BIF_ARG_1);
  2519 + ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize);
  2520 + if (bitsize != 0) { /* reject binaries with a non-zero bit size */
  2521 + BIF_ERROR(BIF_P,BADARG);
  2522 + }
  2523 + if (size == 0) { /* empty binary: return the empty list without touching the bytes */
  2524 + BIF_RET(NIL);
  2525 + }
  2526 + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
  2527 + if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) { /* anything but ERTS_UTF8_OK is treated as a bad argument */
  2528 + erts_free_aligned_binary_bytes(temp_alloc); /* release before raising the error */
  2529 + BIF_ERROR(BIF_P,BADARG);
  2530 + }
  2531 + ret = do_utf8_to_list_normalize(BIF_P, num_chars, bytes, size); /* NOTE(review): presumably builds the normalized character list on BIF_P's heap; helper is defined outside this view, confirm */
  2532 + erts_free_aligned_binary_bytes(temp_alloc);
  2533 + BIF_RET(ret);
  2534 +}
  2535 +
2283 2536 BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
2284 2537 {
2285 2538 switch (erts_get_native_filename_encoding()) {
2286 2539 case ERL_FILENAME_LATIN1:
2287 2540 BIF_RET(am_latin1);
  2541 + case ERL_FILENAME_UTF8_MAC:
2288 2542 case ERL_FILENAME_UTF8:
2289 2543 BIF_RET(am_utf8);
2290 2544 case ERL_FILENAME_WIN_WCHAR:
1,687 erts/emulator/beam/erl_unicode_normalize.h
... ... @@ -0,0 +1,1687 @@
  1 +/*
  2 +* %CopyrightBegin%
  3 +*
  4 +* Copyright Ericsson AB 1999-2010. All Rights Reserved.
  5 +*
  6 +* The contents of this file are subject to the Erlang Public License,
  7 +* Version 1.1, (the "License"); you may not use this file except in
  8 +* compliance with the License. You should have received a copy of the
  9 +* Erlang Public License along with this software. If not, it can be
  10 +* retrieved online at http://www.erlang.org/.
  11 +*
  12 +* Software distributed under the License is distributed on an "AS IS"
  13 +* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
  14 +* the License for the specific language governing rights and limitations
  15 +* under the License.
  16 +*
  17 +* %CopyrightEnd%
  18 +*/
  19 +/*
  20 +* This file is automatically generated by dec.erl, do not edit manually
  21 +*/
  22 +#define HASH_SIZE_FACTOR 2
  23 +typedef struct _compose_entry { /* one node of the generated composition lookup tables below */
  24 + Uint16 c; /* key the entry is matched on; NOTE(review): presumably a Unicode code point, confirm against the lookup code */
  25 + Uint16 res; /* result value; the tables below use 0 here whenever subs is non-NULL */
  26 + Uint16 num_subs; /* number of entries in subs (0 when subs is NULL in the tables below) */
  27 + struct _compose_entry *subs; /* nested table for longer sequences, or NULL */
  28 + int *hash; /* index array paired with subs (entries are indices into subs, -1 for unused slots in the tables below), or NULL */
  29 +} CompEntry;
  30 +
  31 +static int compose_tab_size = 61;
  32 +static int hash_compose_tab_0_15[12] =
  33 +{-1,3,-1,5,-1,0,4,2,-1,1,-1,-1}; /* hash_compose_tab_0_15 */
  34 +static CompEntry compose_tab_0_15[] = {
  35 +{65, 7846, 0, NULL, NULL},
  36 +{69, 7872, 0, NULL, NULL},
  37 +{79, 7890, 0, NULL, NULL},
  38 +{97, 7847, 0, NULL, NULL},
  39 +{101, 7873, 0, NULL, NULL},
  40 +{111, 7891, 0, NULL, NULL}
  41 +}; /* compose_tab_0_15 */
  42 +static int hash_compose_tab_0_16[8] =
  43 +{3,-1,-1,-1,-1,0,2,1}; /* hash_compose_tab_0_16 */
  44 +static CompEntry compose_tab_0_16[] = {
  45 +{69, 7700, 0, NULL, NULL},
  46 +{79, 7760, 0, NULL, NULL},
  47 +{101, 7701, 0, NULL, NULL},
  48 +{111, 7761, 0, NULL, NULL}
  49 +}; /* compose_tab_0_16 */
  50 +static int hash_compose_tab_0_17[4] =
  51 +{-1,0,1,-1}; /* hash_compose_tab_0_17 */
  52 +static CompEntry compose_tab_0_17[] = {
  53 +{65, 7856, 0, NULL, NULL},
  54 +{97, 7857, 0, NULL, NULL}
  55 +}; /* compose_tab_0_17 */
  56 +static int hash_compose_tab_0_18[8] =
  57 +{-1,2,-1,-1,-1,0,1,3}; /* hash_compose_tab_0_18 */
  58 +static CompEntry compose_tab_0_18[] = {
  59 +{85, 475, 0, NULL, NULL},
  60 +{117, 476, 0, NULL, NULL},
  61 +{953, 8146, 0, NULL, NULL},
  62 +{965, 8162, 0, NULL, NULL}
  63 +}; /* compose_tab_0_18 */
  64 +static int hash_compose_tab_0_19_0[12] =
  65 +{-1,0,2,4,-1,-1,-1,1,-1,3,5,-1}; /* hash_compose_tab_0_19_0 */
  66 +static CompEntry compose_tab_0_19_0[] = {
  67 +{913, 8074, 0, NULL, NULL},
  68 +{919, 8090, 0, NULL, NULL},
  69 +{937, 8106, 0, NULL, NULL},
  70 +{945, 8066, 0, NULL, NULL},
  71 +{951, 8082, 0, NULL, NULL},
  72 +{969, 8098, 0, NULL, NULL}
  73 +}; /* compose_tab_0_19_0 */
  74 +static int hash_compose_tab_0_19[28] =
  75 +{9,10,-1,5,-1,-1,-1,11,-1,-1,-1,-1,-1,6,12,-1,-1,1,13,-1,-1,2,7,3,-1,0,4,8}; /* hash_compose_tab_0_19 */
  76 +static CompEntry compose_tab_0_19[] = {
  77 +{837, 0, 6, compose_tab_0_19_0, hash_compose_tab_0_19_0},
  78 +{913, 7946, 0, NULL, NULL},
  79 +{917, 7962, 0, NULL, NULL},
  80 +{919, 7978, 0, NULL, NULL},
  81 +{921, 7994, 0, NULL, NULL},
  82 +{927, 8010, 0, NULL, NULL},
  83 +{937, 8042, 0, NULL, NULL},
  84 +{945, 7938, 0, NULL, NULL},
  85 +{949, 7954, 0, NULL, NULL},
  86 +{951, 7970, 0, NULL, NULL},
  87 +{953, 7986, 0, NULL, NULL},
  88 +{959, 8002, 0, NULL, NULL},
  89 +{965, 8018, 0, NULL, NULL},
  90 +{969, 8034, 0, NULL, NULL}
  91 +}; /* compose_tab_0_19 */
  92 +static int hash_compose_tab_0_20_0[12] =
  93 +{-1,0,2,4,-1,-1,-1,1,-1,3,5,-1}; /* hash_compose_tab_0_20_0 */
  94 +static CompEntry compose_tab_0_20_0[] = {
  95 +{913, 8075, 0, NULL, NULL},
  96 +{919, 8091, 0, NULL, NULL},
  97 +{937, 8107, 0, NULL, NULL},
  98 +{945, 8067, 0, NULL, NULL},
  99 +{951, 8083, 0, NULL, NULL},
  100 +{969, 8099, 0, NULL, NULL}
  101 +}; /* compose_tab_0_20_0 */
  102 +static int hash_compose_tab_0_20[30] =
  103 +{-1,-1,-1,6,-1,13,-1,7,-1,14,-1,-1,-1,1,-1,8,-1,2,-1,3,9,4,10,11,-1,-1,-1,0,5,
  104 + 12}; /* hash_compose_tab_0_20 */
  105 +static CompEntry compose_tab_0_20[] = {
  106 +{837, 0, 6, compose_tab_0_20_0, hash_compose_tab_0_20_0},
  107 +{913, 7947, 0, NULL, NULL},
  108 +{917, 7963, 0, NULL, NULL},
  109 +{919, 7979, 0, NULL, NULL},
  110 +{921, 7995, 0, NULL, NULL},
  111 +{927, 8011, 0, NULL, NULL},
  112 +{933, 8027, 0, NULL, NULL},
  113 +{937, 8043, 0, NULL, NULL},
  114 +{945, 7939, 0, NULL, NULL},
  115 +{949, 7955, 0, NULL, NULL},
  116 +{951, 7971, 0, NULL, NULL},
  117 +{953, 7987, 0, NULL, NULL},
  118 +{959, 8003, 0, NULL, NULL},
  119 +{965, 8019, 0, NULL, NULL},
  120 +{969, 8035, 0, NULL, NULL}
  121 +}; /* compose_tab_0_20 */
  122 +static int hash_compose_tab_0_21[8] =
  123 +{2,-1,-1,-1,-1,1,3,0}; /* hash_compose_tab_0_21 */
  124 +static CompEntry compose_tab_0_21[] = {
  125 +{79, 7900, 0, NULL, NULL},
  126 +{85, 7914, 0, NULL, NULL},
  127 +{111, 7901, 0, NULL, NULL},
  128 +{117, 7915, 0, NULL, NULL}
  129 +}; /* compose_tab_0_21 */
  130 +static int hash_compose_tab_0_22[6] =
  131 +{-1,-1,-1,0,1,2}; /* hash_compose_tab_0_22 */
  132 +static CompEntry compose_tab_0_22[] = {
  133 +{945, 8114, 0, NULL, NULL},
  134 +{951, 8130, 0, NULL, NULL},
  135 +{969, 8178, 0, NULL, NULL}
  136 +}; /* compose_tab_0_22 */
  137 +static int hash_compose_tab_0[78] =
  138 +{38,3,29,-1,-1,-1,-1,4,19,5,20,6,14,30,31,21,32,33,37,7,-1,-1,-1,8,34,-1,-1,9,
  139 + -1,35,-1,-1,-1,10,36,-1,-1,-1,-1,11,-1,12,-1,13,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  140 + -1,-1,23,-1,22,-1,24,-1,25,-1,26,-1,0,-1,-1,15,1,16,27,17,2,18,28,-1,-1}; /* hash_compose_tab_0 */
  141 +static CompEntry compose_tab_0[] = {
  142 +{65, 192, 0, NULL, NULL},
  143 +{69, 200, 0, NULL, NULL},
  144 +{73, 204, 0, NULL, NULL},
  145 +{79, 210, 0, NULL, NULL},
  146 +{85, 217, 0, NULL, NULL},
  147 +{87, 7808, 0, NULL, NULL},
  148 +{89, 7922, 0, NULL, NULL},
  149 +{97, 224, 0, NULL, NULL},
  150 +{101, 232, 0, NULL, NULL},
  151 +{105, 236, 0, NULL, NULL},
  152 +{111, 242, 0, NULL, NULL},
  153 +{117, 249, 0, NULL, NULL},
  154 +{119, 7809, 0, NULL, NULL},
  155 +{121, 7923, 0, NULL, NULL},
  156 +{168, 8173, 0, NULL, NULL},
  157 +{770, 0, 6, compose_tab_0_15, hash_compose_tab_0_15},
  158 +{772, 0, 4, compose_tab_0_16, hash_compose_tab_0_16},
  159 +{774, 0, 2, compose_tab_0_17, hash_compose_tab_0_17},
  160 +{776, 0, 4, compose_tab_0_18, hash_compose_tab_0_18},
  161 +{787, 0, 14, compose_tab_0_19, hash_compose_tab_0_19},
  162 +{788, 0, 15, compose_tab_0_20, hash_compose_tab_0_20},
  163 +{795, 0, 4, compose_tab_0_21, hash_compose_tab_0_21},
  164 +{837, 0, 3, compose_tab_0_22, hash_compose_tab_0_22},
  165 +{913, 8122, 0, NULL, NULL},
  166 +{917, 8136, 0, NULL, NULL},
  167 +{919, 8138, 0, NULL, NULL},
  168 +{921, 8154, 0, NULL, NULL},
  169 +{927, 8184, 0, NULL, NULL},
  170 +{933, 8170, 0, NULL, NULL},
  171 +{937, 8186, 0, NULL, NULL},
  172 +{945, 8048, 0, NULL, NULL},
  173 +{949, 8050, 0, NULL, NULL},
  174 +{951, 8052, 0, NULL, NULL},
  175 +{953, 8054, 0, NULL, NULL},
  176 +{959, 8056, 0, NULL, NULL},
  177 +{965, 8058, 0, NULL, NULL},
  178 +{969, 8060, 0, NULL, NULL},
  179 +{8127, 8141, 0, NULL, NULL},
  180 +{8190, 8157, 0, NULL, NULL}
  181 +}; /* compose_tab_0 */
  182 +static int hash_compose_tab_1_39[12] =
  183 +{-1,3,-1,5,-1,0,4,2,-1,1,-1,-1}; /* hash_compose_tab_1_39 */
  184 +static CompEntry compose_tab_1_39[] = {
  185 +{65, 7844, 0, NULL, NULL},
  186 +{69, 7870, 0, NULL, NULL},
  187 +{79, 7888, 0, NULL, NULL},
  188 +{97, 7845, 0, NULL, NULL},
  189 +{101, 7871, 0, NULL, NULL},
  190 +{111, 7889, 0, NULL, NULL}
  191 +}; /* compose_tab_1_39 */
  192 +static int hash_compose_tab_1_40[8] =
  193 +{2,-1,-1,-1,-1,1,3,0}; /* hash_compose_tab_1_40 */
  194 +static CompEntry compose_tab_1_40[] = {
  195 +{79, 7756, 0, NULL, NULL},
  196 +{85, 7800, 0, NULL, NULL},
  197 +{111, 7757, 0, NULL, NULL},
  198 +{117, 7801, 0, NULL, NULL}
  199 +}; /* compose_tab_1_40 */
  200 +static int hash_compose_tab_1_41[8] =
  201 +{3,-1,-1,-1,-1,0,2,1}; /* hash_compose_tab_1_41 */
  202 +static CompEntry compose_tab_1_41[] = {
  203 +{69, 7702, 0, NULL, NULL},
  204 +{79, 7762, 0, NULL, NULL},
  205 +{101, 7703, 0, NULL, NULL},
  206 +{111, 7763, 0, NULL, NULL}
  207 +}; /* compose_tab_1_41 */
  208 +static int hash_compose_tab_1_42[4] =
  209 +{-1,0,1,-1}; /* hash_compose_tab_1_42 */
  210 +static CompEntry compose_tab_1_42[] = {
  211 +{65, 7854, 0, NULL, NULL},
  212 +{97, 7855, 0, NULL, NULL}
  213 +}; /* compose_tab_1_42 */
  214 +static int hash_compose_tab_1_43[12] =
  215 +{-1,0,1,-1,-1,4,5,-1,-1,2,3,-1}; /* hash_compose_tab_1_43 */
  216 +static CompEntry compose_tab_1_43[] = {
  217 +{73, 7726, 0, NULL, NULL},
  218 +{85, 471, 0, NULL, NULL},
  219 +{105, 7727, 0, NULL, NULL},
  220 +{117, 472, 0, NULL, NULL},
  221 +{953, 8147, 0, NULL, NULL},
  222 +{965, 8163, 0, NULL, NULL}
  223 +}; /* compose_tab_1_43 */
  224 +static int hash_compose_tab_1_44[4] =
  225 +{-1,0,1,-1}; /* hash_compose_tab_1_44 */
  226 +static CompEntry compose_tab_1_44[] = {
  227 +{65, 506, 0, NULL, NULL},
  228 +{97, 507, 0, NULL, NULL}
  229 +}; /* compose_tab_1_44 */
  230 +static int hash_compose_tab_1_45_0[12] =
  231 +{-1,0,2,4,-1,-1,-1,1,-1,3,5,-1}; /* hash_compose_tab_1_45_0 */
  232 +static CompEntry compose_tab_1_45_0[] = {
  233 +{913, 8076, 0, NULL, NULL},
  234 +{919, 8092, 0, NULL, NULL},
  235 +{937, 8108, 0, NULL, NULL},
  236 +{945, 8068, 0, NULL, NULL},
  237 +{951, 8084, 0, NULL, NULL},
  238 +{969, 8100, 0, NULL, NULL}
  239 +}; /* compose_tab_1_45_0 */
  240 +static int hash_compose_tab_1_45[28] =
  241 +{9,10,-1,5,-1,-1,-1,11,-1,-1,-1,-1,-1,6,12,-1,-1,1,13,-1,-1,2,7,3,-1,0,4,8}; /* hash_compose_tab_1_45 */
  242 +static CompEntry compose_tab_1_45[] = {
  243 +{837, 0, 6, compose_tab_1_45_0, hash_compose_tab_1_45_0},
  244 +{913, 7948, 0, NULL, NULL},
  245 +{917, 7964, 0, NULL, NULL},
  246 +{919, 7980, 0, NULL, NULL},
  247 +{921, 7996, 0, NULL, NULL},
  248 +{927, 8012, 0, NULL, NULL},
  249 +{937, 8044, 0, NULL, NULL},
  250 +{945, 7940, 0, NULL, NULL},
  251 +{949, 7956, 0, NULL, NULL},
  252 +{951, 7972, 0, NULL, NULL},
  253 +{953, 7988, 0, NULL, NULL},
  254 +{959, 8004, 0, NULL, NULL},
  255 +{965, 8020, 0, NULL, NULL},
  256 +{969, 8036, 0, NULL, NULL}
  257 +}; /* compose_tab_1_45 */
  258 +static int hash_compose_tab_1_46_0[12] =
  259 +{-1,0,2,4,-1,-1,-1,1,-1,3,5,-1}; /* hash_compose_tab_1_46_0 */
  260 +static CompEntry compose_tab_1_46_0[] = {
  261 +{913, 8077, 0, NULL, NULL},
  262 +{919, 8093, 0, NULL, NULL},
  263 +{937, 8109, 0, NULL, NULL},
  264 +{945, 8069, 0, NULL, NULL},
  265 +{951, 8085, 0, NULL, NULL},
  266 +{969, 8101, 0, NULL, NULL}
  267 +}; /* compose_tab_1_46_0 */
  268 +static int hash_compose_tab_1_46[30] =
  269 +{-1,-1,-1,6,-1,13,-1,7,-1,14,-1,-1,-1,1,-1,8,-1,2,-1,3,9,4,10,11,-1,-1,-1,0,5,
  270 + 12}; /* hash_compose_tab_1_46 */
  271 +static CompEntry compose_tab_1_46[] = {
  272 +{837, 0, 6, compose_tab_1_46_0, hash_compose_tab_1_46_0},
  273 +{913, 7949, 0, NULL, NULL},
  274 +{917, 7965, 0, NULL, NULL},
  275 +{919, 7981, 0, NULL, NULL},
  276 +{921, 7997, 0, NULL, NULL},
  277 +{927, 8013, 0, NULL, NULL},
  278 +{933, 8029, 0, NULL, NULL},
  279 +{937, 8045, 0, NULL, NULL},
  280 +{945, 7941, 0, NULL, NULL},
  281 +{949, 7957, 0, NULL, NULL},
  282 +{951, 7973, 0, NULL, NULL},
  283 +{953, 7989, 0, NULL, NULL},
  284 +{959, 8005, 0, NULL, NULL},
  285 +{965, 8021, 0, NULL, NULL},
  286 +{969, 8037, 0, NULL, NULL}
  287 +}; /* compose_tab_1_46 */
  288 +static int hash_compose_tab_1_47[8] =
  289 +{2,-1,-1,-1,-1,1,3,0}; /* hash_compose_tab_1_47 */
  290 +static CompEntry compose_tab_1_47[] = {
  291 +{79, 7898, 0, NULL, NULL},
  292 +{85, 7912, 0, NULL, NULL},
  293 +{111, 7899, 0, NULL, NULL},
  294 +{117, 7913, 0, NULL, NULL}
  295 +}; /* compose_tab_1_47 */
  296 +static int hash_compose_tab_1_48[4] =
  297 +{1,-1,-1,0}; /* hash_compose_tab_1_48 */
  298 +static CompEntry compose_tab_1_48[] = {
  299 +{67, 7688, 0, NULL, NULL},
  300 +{99, 7689, 0, NULL, NULL}
  301 +}; /* compose_tab_1_48 */
  302 +static int hash_compose_tab_1_49[6] =
  303 +{-1,-1,-1,0,1,2}; /* hash_compose_tab_1_49 */
  304 +static CompEntry compose_tab_1_49[] = {
  305 +{945, 8116, 0, NULL, NULL},
  306 +{951, 8132, 0, NULL, NULL},
  307 +{959, 8180, 0, NULL, NULL}
  308 +}; /* compose_tab_1_49 */
  309 +static int hash_compose_tab_1[140] =
  310 +{-1,-1,-1,-1,-1,-1,-1,68,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  311 + -1,-1,-1,34,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  312 + -1,-1,-1,-1,-1,-1,-1,-1,35,-1,-1,-1,-1,64,-1,0,-1,1,-1,2,39,3,40,4,41,5,6,7,
  313 + 8,9,10,36,11,12,42,13,43,14,44,15,16,37,45,46,50,47,51,17,52,18,53,19,54,20,
  314 + 55,21,56,22,23,24,25,26,27,38,28,29,48,30,57,31,58,32,33,59,60,61,62,65,66,
  315 + 63,67,69,-1,-1,-1,-1,-1,49,-1,-1}; /* hash_compose_tab_1 */
  316 +static CompEntry compose_tab_1[] = {
  317 +{65, 193, 0, NULL, NULL},
  318 +{67, 262, 0, NULL, NULL},
  319 +{69, 201, 0, NULL, NULL},
  320 +{71, 500, 0, NULL, NULL},
  321 +{73, 205, 0, NULL, NULL},
  322 +{75, 7728, 0, NULL, NULL},
  323 +{76, 313, 0, NULL, NULL},
  324 +{77, 7742, 0, NULL, NULL},
  325 +{78, 323, 0, NULL, NULL},
  326 +{79, 211, 0, NULL, NULL},
  327 +{80, 7764, 0, NULL, NULL},
  328 +{82, 340, 0, NULL, NULL},
  329 +{83, 346, 0, NULL, NULL},
  330 +{85, 218, 0, NULL, NULL},
  331 +{87, 7810, 0, NULL, NULL},
  332 +{89, 221, 0, NULL, NULL},
  333 +{90, 377, 0, NULL, NULL},
  334 +{97, 225, 0, NULL, NULL},
  335 +{99, 263, 0, NULL, NULL},
  336 +{101, 233, 0, NULL, NULL},
  337 +{103, 501, 0, NULL, NULL},
  338 +{105, 237, 0, NULL, NULL},
  339 +{107, 7729, 0, NULL, NULL},
  340 +{108, 314, 0, NULL, NULL},
  341 +{109, 7743, 0, NULL, NULL},
  342 +{110, 324, 0, NULL, NULL},
  343 +{111, 243, 0, NULL, NULL},
  344 +{112, 7765, 0, NULL, NULL},
  345 +{114, 341, 0, NULL, NULL},
  346 +{115, 347, 0, NULL, NULL},
  347 +{117, 250, 0, NULL, NULL},
  348 +{119, 7811, 0, NULL, NULL},
  349 +{121, 253, 0, NULL, NULL},
  350 +{122, 378, 0, NULL, NULL},
  351 +{168, 8174, 0, NULL, NULL},
  352 +{198, 508, 0, NULL, NULL},
  353 +{216, 510, 0, NULL, NULL},
  354 +{230, 509, 0, NULL, NULL},
  355 +{248, 511, 0, NULL, NULL},
  356 +{770, 0, 6, compose_tab_1_39, hash_compose_tab_1_39},
  357 +{771, 0, 4, compose_tab_1_40, hash_compose_tab_1_40},
  358 +{772, 0, 4, compose_tab_1_41, hash_compose_tab_1_41},
  359 +{774, 0, 2, compose_tab_1_42, hash_compose_tab_1_42},
  360 +{776, 0, 6, compose_tab_1_43, hash_compose_tab_1_43},
  361 +{778, 0, 2, compose_tab_1_44, hash_compose_tab_1_44},
  362 +{787, 0, 14, compose_tab_1_45, hash_compose_tab_1_45},
  363 +{788, 0, 15, compose_tab_1_46, hash_compose_tab_1_46},
  364 +{795, 0, 4, compose_tab_1_47, hash_compose_tab_1_47},
  365 +{807, 0, 2, compose_tab_1_48, hash_compose_tab_1_48},
  366 +{837, 0, 3, compose_tab_1_49, hash_compose_tab_1_49},
  367 +{913, 8123, 0, NULL, NULL},
  368 +{917, 8137, 0, NULL, NULL},
  369 +{919, 8139, 0, NULL, NULL},
  370 +{921, 8155, 0, NULL, NULL},
  371 +{927, 8185, 0, NULL, NULL},
  372 +{933, 8171, 0, NULL, NULL},
  373 +{937, 8187, 0, NULL, NULL},
  374 +{945, 8049, 0, NULL, NULL},
  375 +{949, 8051, 0, NULL, NULL},
  376 +{951, 8053, 0, NULL, NULL},
  377 +{953, 8055, 0, NULL, NULL},
  378 +{959, 8057, 0, NULL, NULL},
  379 +{965, 8059, 0, NULL, NULL},
  380 +{969, 8061, 0, NULL, NULL},
  381 +{1043, 1027, 0, NULL, NULL},
  382 +{1050, 1036, 0, NULL, NULL},
  383 +{1075, 1107, 0, NULL, NULL},
  384 +{1082, 1116, 0, NULL, NULL},
  385 +{8127, 8142, 0, NULL, NULL},
  386 +{8190, 8158, 0, NULL, NULL}
  387 +}; /* compose_tab_1 */
  388 +static int hash_compose_tab_2_26[12] =
  389 +{-1,3,-1,5,-1,0,4,2,-1,1,-1,-1}; /* hash_compose_tab_2_26 */
  390 +static CompEntry compose_tab_2_26[] = {
  391 +{65, 7852, 0, NULL, NULL},
  392 +{69, 7878, 0, NULL, NULL},
  393 +{79, 7896, 0, NULL, NULL},
  394 +{97, 7853, 0, NULL, NULL},
  395 +{101, 7879, 0, NULL, NULL},
  396 +{111, 7897, 0, NULL, NULL}
  397 +}; /* compose_tab_2_26 */
  398 +static int hash_compose_tab_2[54] =
  399 +{-1,-1,-1,20,-1,-1,-1,21,-1,22,-1,0,23,1,24,2,25,3,4,5,6,-1,-1,-1,-1,7,-1,-1,
  400 + -1,8,-1,9,-1,10,-1,11,12,-1,-1,-1,-1,-1,-1,13,-1,14,-1,15,26,16,17,18,19,-1}; /* hash_compose_tab_2 */
  401 +static CompEntry compose_tab_2[] = {
  402 +{65, 194, 0, NULL, NULL},
  403 +{67, 264, 0, NULL, NULL},
  404 +{69, 202, 0, NULL, NULL},
  405 +{71, 284, 0, NULL, NULL},
  406 +{72, 292, 0, NULL, NULL},
  407 +{73, 206, 0, NULL, NULL},
  408 +{74, 308, 0, NULL, NULL},
  409 +{79, 212, 0, NULL, NULL},
  410 +{83, 348, 0, NULL, NULL},