Skip to content

Commit 2d99fff

Browse files
committed
simplify utf-8 ident
1 parent 6cc5aa0 commit 2d99fff

File tree

1 file changed

+23
-164
lines changed

1 file changed

+23
-164
lines changed

parse.c

Lines changed: 23 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -833,10 +833,6 @@ static uint8_t keyword_tag(Token *tok)
833833
// Forward declaration for hex digit conversion
834834
static int from_hex(char c);
835835

836-
// UTF-8 helpers for identifier parsing
837-
// Check if byte is a UTF-8 continuation byte (10xxxxxx)
838-
// Report whether `c` is a UTF-8 continuation byte, i.e. its two most
// significant bits are exactly `10` (bit pattern 10xxxxxx).
static inline bool is_utf8_cont(unsigned char c)
{
    // Dropping the low six payload bits leaves only the two tag bits.
    return (c >> 6) == 0x02;
}
839-
840836
// Get the number of bytes in a UTF-8 sequence from the leading byte
841837
static int utf8_char_len(unsigned char c)
842838
{
@@ -851,154 +847,9 @@ static int utf8_char_len(unsigned char c)
851847
return 0; // Invalid
852848
}
853849

854-
// Decode a UTF-8 character and return its Unicode codepoint
855-
static uint32_t decode_utf8(char *p, int *len)
856-
{
857-
unsigned char *s = (unsigned char *)p;
858-
int n = utf8_char_len(s[0]);
859-
if (n == 0)
860-
{
861-
*len = 1;
862-
return 0;
863-
}
864-
865-
// Validate continuation bytes
866-
for (int i = 1; i < n; i++)
867-
if (!is_utf8_cont(s[i]))
868-
{
869-
*len = 1;
870-
return 0;
871-
}
872-
873-
*len = n;
874-
switch (n)
875-
{
876-
case 1:
877-
return s[0];
878-
case 2:
879-
return ((s[0] & 0x1F) << 6) | (s[1] & 0x3F);
880-
case 3:
881-
return ((s[0] & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
882-
case 4:
883-
return ((s[0] & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
884-
default:
885-
return 0;
886-
}
887-
}
888-
889-
// Unicode XID_Start ranges for identifier start characters (sorted by start)
890-
// See: Unicode Standard Annex #31, C11/C23 compatible
891-
static const struct
892-
{
893-
uint32_t start, end;
894-
} xid_start_ranges[] = {
895-
{0x00C0, 0x00FF}, // Latin-1 Supplement
896-
{0x0100, 0x017F}, // Latin Extended-A
897-
{0x0180, 0x024F}, // Latin Extended-B
898-
{0x0250, 0x02AF}, // IPA Extensions
899-
{0x0370, 0x03FF}, // Greek and Coptic
900-
{0x0400, 0x04FF}, // Cyrillic
901-
{0x0500, 0x052F}, // Cyrillic Supplement
902-
{0x0530, 0x058F}, // Armenian
903-
{0x0590, 0x05FF}, // Hebrew
904-
{0x0600, 0x06FF}, // Arabic
905-
{0x0750, 0x077F}, // Arabic Supplement
906-
{0x0900, 0x097F}, // Devanagari
907-
{0x0980, 0x09FF}, // Bengali
908-
{0x0A00, 0x0A7F}, // Gurmukhi
909-
{0x0A80, 0x0AFF}, // Gujarati
910-
{0x0B00, 0x0B7F}, // Oriya
911-
{0x0B80, 0x0BFF}, // Tamil
912-
{0x0C00, 0x0C7F}, // Telugu
913-
{0x0C80, 0x0CFF}, // Kannada
914-
{0x0D00, 0x0D7F}, // Malayalam
915-
{0x0D80, 0x0DFF}, // Sinhala
916-
{0x0E00, 0x0E7F}, // Thai
917-
{0x0E80, 0x0EFF}, // Lao
918-
{0x0F00, 0x0FFF}, // Tibetan
919-
{0x10A0, 0x10FF}, // Georgian
920-
{0x1100, 0x11FF}, // Hangul Jamo
921-
{0x1200, 0x137F}, // Ethiopian
922-
{0x13A0, 0x13FF}, // Cherokee
923-
{0x1400, 0x167F}, // Canadian Aboriginal Syllabics
924-
{0x1780, 0x17FF}, // Khmer
925-
{0x1800, 0x18AF}, // Mongolian
926-
{0x1E00, 0x1EFF}, // Latin Extended Additional
927-
{0x1F00, 0x1FFF}, // Greek Extended
928-
{0x2100, 0x214F}, // Letterlike Symbols
929-
{0x3040, 0x309F}, // Hiragana
930-
{0x30A0, 0x30FF}, // Katakana
931-
{0x3100, 0x312F}, // Bopomofo
932-
{0x31A0, 0x31BF}, // Bopomofo Extended
933-
{0x31F0, 0x31FF}, // Katakana Phonetic Extensions
934-
{0x3400, 0x4DBF}, // CJK Extension A
935-
{0x4E00, 0x9FFF}, // CJK Unified
936-
{0xAC00, 0xD7AF}, // Hangul Syllables
937-
{0xF900, 0xFAFF}, // CJK Compatibility Ideographs
938-
{0x1D400, 0x1D7FF}, // Mathematical Alphanumeric Symbols
939-
{0x20000, 0x2A6DF}, // CJK Extension B
940-
{0x2A700, 0x2B73F}, // CJK Extension C
941-
{0x2B740, 0x2B81F}, // CJK Extension D
942-
{0x2B820, 0x2CEAF}, // CJK Extension E
943-
{0x2CEB0, 0x2EBEF}, // CJK Extension F
944-
{0x30000, 0x3134F}, // CJK Extension G
945-
};
946-
947-
#define XID_START_RANGE_COUNT (sizeof(xid_start_ranges) / sizeof(xid_start_ranges[0]))
948-
949-
// Check if a Unicode codepoint is valid for identifier start (XID_Start + _ + $)
950-
// Uses binary search over sorted ranges for O(log N) lookup
951-
static bool is_ident_start_unicode(uint32_t cp)
952-
{
953-
if (cp < 0x80)
954-
return isalpha(cp) || cp == '_' || cp == '$';
955-
956-
// Binary search over XID_Start ranges
957-
int lo = 0, hi = XID_START_RANGE_COUNT - 1;
958-
while (lo <= hi)
959-
{
960-
int mid = lo + (hi - lo) / 2;
961-
if (cp < xid_start_ranges[mid].start)
962-
hi = mid - 1;
963-
else if (cp > xid_start_ranges[mid].end)
964-
lo = mid + 1;
965-
else
966-
return true; // cp is within range [start, end]
967-
}
968-
return false;
969-
}
970-
971-
// Check if a Unicode codepoint is valid for identifier continuation (XID_Continue)
972-
static bool is_ident_cont_unicode(uint32_t cp)
973-
{
974-
if (cp < 0x80)
975-
return isalnum(cp) || cp == '_' || cp == '$';
976-
if (is_ident_start_unicode(cp))
977-
return true;
978-
// Combining marks, modifiers, and other continuation characters
979-
if (cp >= 0x0300 && cp <= 0x036F)
980-
return true; // Combining Diacritical Marks
981-
if (cp >= 0x1DC0 && cp <= 0x1DFF)
982-
return true; // Combining Diacritical Marks Supplement
983-
if (cp >= 0x20D0 && cp <= 0x20FF)
984-
return true; // Combining Diacritical Marks for Symbols
985-
if (cp >= 0xFE20 && cp <= 0xFE2F)
986-
return true; // Combining Half Marks
987-
// Numeric characters (for continuation only)
988-
if (cp >= 0x0660 && cp <= 0x0669)
989-
return true; // Arabic-Indic Digits
990-
if (cp >= 0x06F0 && cp <= 0x06F9)
991-
return true; // Extended Arabic-Indic Digits
992-
if (cp >= 0x0966 && cp <= 0x096F)
993-
return true; // Devanagari Digits
994-
if (cp >= 0x09E6 && cp <= 0x09EF)
995-
return true; // Bengali Digits
996-
if (cp >= 0x0E50 && cp <= 0x0E59)
997-
return true; // Thai Digits
998-
if (cp >= 0xFF10 && cp <= 0xFF19)
999-
return true; // Fullwidth Digits
1000-
return false;
1001-
}
850+
// Any non-ASCII byte or UCN that survived preprocessing is accepted as a valid identifier character.
851+
// Return true if `c` may begin an identifier: an ASCII letter, '_' or '$'.
// The argument is converted to unsigned char before calling isalpha(),
// because passing a negative value other than EOF to a <ctype.h> function is
// undefined behaviour, and plain `char` may be signed.
static inline bool is_ident_start_ascii(char c)
{
    unsigned char uc = (unsigned char)c;
    return isalpha(uc) || uc == '_' || uc == '$';
}
852+
// Return true if `c` may continue an identifier: an ASCII letter, an ASCII
// digit, '_' or '$'.
// The argument is converted to unsigned char before calling isalnum(),
// because passing a negative value other than EOF to a <ctype.h> function is
// undefined behaviour, and plain `char` may be signed.
static inline bool is_ident_cont_ascii(char c)
{
    unsigned char uc = (unsigned char)c;
    return isalnum(uc) || uc == '_' || uc == '$';
}
1002853

1003854
// Read a UCN (Universal Character Name) \uXXXX or \UXXXXXXXX
1004855
// Returns the number of bytes consumed, or 0 if not a valid UCN
@@ -1033,21 +884,24 @@ static int read_ident(char *start)
1033884
uint32_t cp;
1034885
int len;
1035886

1036-
// Check for UCN at start
887+
// Check for UCN at start (\uXXXX or \UXXXXXXXX)
1037888
len = read_ucn(p, &cp);
1038889
if (len > 0)
1039890
{
1040-
if (!is_ident_start_unicode(cp))
1041-
return 0;
891+
p += len;
892+
}
893+
else if ((unsigned char)*p >= 0x80)
894+
{
895+
// Non-ASCII UTF-8 start byte
896+
len = utf8_char_len((unsigned char)*p);
897+
if (len == 0) return 0;
1042898
p += len;
1043899
}
1044900
else
1045901
{
1046-
// Check for UTF-8 or ASCII start
1047-
cp = decode_utf8(p, &len);
1048-
if (!is_ident_start_unicode(cp))
902+
if (!is_ident_start_ascii(*p))
1049903
return 0;
1050-
p += len;
904+
p++;
1051905
}
1052906

1053907
// Continue reading identifier characters
@@ -1056,16 +910,21 @@ static int read_ident(char *start)
1056910
len = read_ucn(p, &cp);
1057911
if (len > 0)
1058912
{
1059-
if (!is_ident_cont_unicode(cp))
1060-
break;
1061913
p += len;
1062914
continue;
1063915
}
1064916

1065-
cp = decode_utf8(p, &len);
1066-
if (!is_ident_cont_unicode(cp))
917+
if ((unsigned char)*p >= 0x80)
918+
{
919+
len = utf8_char_len((unsigned char)*p);
920+
if (len == 0) break;
921+
p += len;
922+
continue;
923+
}
924+
925+
if (!is_ident_cont_ascii(*p))
1067926
break;
1068-
p += len;
927+
p++;
1069928
}
1070929

1071930
return p - start;

0 commit comments

Comments
 (0)