@@ -833,10 +833,6 @@ static uint8_t keyword_tag(Token *tok)
833833// Forward declaration for hex digit conversion
834834static int from_hex (char c );
835835
836- // UTF-8 helpers for identifier parsing
837- // Check if byte is a UTF-8 continuation byte (10xxxxxx)
838- static inline bool is_utf8_cont (unsigned char c ) { return (c & 0xC0 ) == 0x80 ; }
839-
840836// Get the number of bytes in a UTF-8 sequence from the leading byte
841837static int utf8_char_len (unsigned char c )
842838{
@@ -851,154 +847,9 @@ static int utf8_char_len(unsigned char c)
851847 return 0 ; // Invalid
852848}
853849
854- // Decode a UTF-8 character and return its Unicode codepoint
855- static uint32_t decode_utf8 (char * p , int * len )
856- {
857- unsigned char * s = (unsigned char * )p ;
858- int n = utf8_char_len (s [0 ]);
859- if (n == 0 )
860- {
861- * len = 1 ;
862- return 0 ;
863- }
864-
865- // Validate continuation bytes
866- for (int i = 1 ; i < n ; i ++ )
867- if (!is_utf8_cont (s [i ]))
868- {
869- * len = 1 ;
870- return 0 ;
871- }
872-
873- * len = n ;
874- switch (n )
875- {
876- case 1 :
877- return s [0 ];
878- case 2 :
879- return ((s [0 ] & 0x1F ) << 6 ) | (s [1 ] & 0x3F );
880- case 3 :
881- return ((s [0 ] & 0x0F ) << 12 ) | ((s [1 ] & 0x3F ) << 6 ) | (s [2 ] & 0x3F );
882- case 4 :
883- return ((s [0 ] & 0x07 ) << 18 ) | ((s [1 ] & 0x3F ) << 12 ) | ((s [2 ] & 0x3F ) << 6 ) | (s [3 ] & 0x3F );
884- default :
885- return 0 ;
886- }
887- }
888-
889- // Unicode XID_Start ranges for identifier start characters (sorted by start)
890- // See: Unicode Standard Annex #31, C11/C23 compatible
891- static const struct
892- {
893- uint32_t start , end ;
894- } xid_start_ranges [] = {
895- {0x00C0 , 0x00FF }, // Latin-1 Supplement
896- {0x0100 , 0x017F }, // Latin Extended-A
897- {0x0180 , 0x024F }, // Latin Extended-B
898- {0x0250 , 0x02AF }, // IPA Extensions
899- {0x0370 , 0x03FF }, // Greek and Coptic
900- {0x0400 , 0x04FF }, // Cyrillic
901- {0x0500 , 0x052F }, // Cyrillic Supplement
902- {0x0530 , 0x058F }, // Armenian
903- {0x0590 , 0x05FF }, // Hebrew
904- {0x0600 , 0x06FF }, // Arabic
905- {0x0750 , 0x077F }, // Arabic Supplement
906- {0x0900 , 0x097F }, // Devanagari
907- {0x0980 , 0x09FF }, // Bengali
908- {0x0A00 , 0x0A7F }, // Gurmukhi
909- {0x0A80 , 0x0AFF }, // Gujarati
910- {0x0B00 , 0x0B7F }, // Oriya
911- {0x0B80 , 0x0BFF }, // Tamil
912- {0x0C00 , 0x0C7F }, // Telugu
913- {0x0C80 , 0x0CFF }, // Kannada
914- {0x0D00 , 0x0D7F }, // Malayalam
915- {0x0D80 , 0x0DFF }, // Sinhala
916- {0x0E00 , 0x0E7F }, // Thai
917- {0x0E80 , 0x0EFF }, // Lao
918- {0x0F00 , 0x0FFF }, // Tibetan
919- {0x10A0 , 0x10FF }, // Georgian
920- {0x1100 , 0x11FF }, // Hangul Jamo
921- {0x1200 , 0x137F }, // Ethiopian
922- {0x13A0 , 0x13FF }, // Cherokee
923- {0x1400 , 0x167F }, // Canadian Aboriginal Syllabics
924- {0x1780 , 0x17FF }, // Khmer
925- {0x1800 , 0x18AF }, // Mongolian
926- {0x1E00 , 0x1EFF }, // Latin Extended Additional
927- {0x1F00 , 0x1FFF }, // Greek Extended
928- {0x2100 , 0x214F }, // Letterlike Symbols
929- {0x3040 , 0x309F }, // Hiragana
930- {0x30A0 , 0x30FF }, // Katakana
931- {0x3100 , 0x312F }, // Bopomofo
932- {0x31A0 , 0x31BF }, // Bopomofo Extended
933- {0x31F0 , 0x31FF }, // Katakana Phonetic Extensions
934- {0x3400 , 0x4DBF }, // CJK Extension A
935- {0x4E00 , 0x9FFF }, // CJK Unified
936- {0xAC00 , 0xD7AF }, // Hangul Syllables
937- {0xF900 , 0xFAFF }, // CJK Compatibility Ideographs
938- {0x1D400 , 0x1D7FF }, // Mathematical Alphanumeric Symbols
939- {0x20000 , 0x2A6DF }, // CJK Extension B
940- {0x2A700 , 0x2B73F }, // CJK Extension C
941- {0x2B740 , 0x2B81F }, // CJK Extension D
942- {0x2B820 , 0x2CEAF }, // CJK Extension E
943- {0x2CEB0 , 0x2EBEF }, // CJK Extension F
944- {0x30000 , 0x3134F }, // CJK Extension G
945- };
946-
947- #define XID_START_RANGE_COUNT (sizeof(xid_start_ranges) / sizeof(xid_start_ranges[0]))
948-
949- // Check if a Unicode codepoint is valid for identifier start (XID_Start + _ + $)
950- // Uses binary search over sorted ranges for O(log N) lookup
951- static bool is_ident_start_unicode (uint32_t cp )
952- {
953- if (cp < 0x80 )
954- return isalpha (cp ) || cp == '_' || cp == '$' ;
955-
956- // Binary search over XID_Start ranges
957- int lo = 0 , hi = XID_START_RANGE_COUNT - 1 ;
958- while (lo <= hi )
959- {
960- int mid = lo + (hi - lo ) / 2 ;
961- if (cp < xid_start_ranges [mid ].start )
962- hi = mid - 1 ;
963- else if (cp > xid_start_ranges [mid ].end )
964- lo = mid + 1 ;
965- else
966- return true; // cp is within range [start, end]
967- }
968- return false;
969- }
970-
971- // Check if a Unicode codepoint is valid for identifier continuation (XID_Continue)
972- static bool is_ident_cont_unicode (uint32_t cp )
973- {
974- if (cp < 0x80 )
975- return isalnum (cp ) || cp == '_' || cp == '$' ;
976- if (is_ident_start_unicode (cp ))
977- return true;
978- // Combining marks, modifiers, and other continuation characters
979- if (cp >= 0x0300 && cp <= 0x036F )
980- return true; // Combining Diacritical Marks
981- if (cp >= 0x1DC0 && cp <= 0x1DFF )
982- return true; // Combining Diacritical Marks Supplement
983- if (cp >= 0x20D0 && cp <= 0x20FF )
984- return true; // Combining Diacritical Marks for Symbols
985- if (cp >= 0xFE20 && cp <= 0xFE2F )
986- return true; // Combining Half Marks
987- // Numeric characters (for continuation only)
988- if (cp >= 0x0660 && cp <= 0x0669 )
989- return true; // Arabic-Indic Digits
990- if (cp >= 0x06F0 && cp <= 0x06F9 )
991- return true; // Extended Arabic-Indic Digits
992- if (cp >= 0x0966 && cp <= 0x096F )
993- return true; // Devanagari Digits
994- if (cp >= 0x09E6 && cp <= 0x09EF )
995- return true; // Bengali Digits
996- if (cp >= 0x0E50 && cp <= 0x0E59 )
997- return true; // Thai Digits
998- if (cp >= 0xFF10 && cp <= 0xFF19 )
999- return true; // Fullwidth Digits
1000- return false;
1001- }
// ASCII-only identifier classification.  Non-ASCII bytes and UCNs are not
// classified here: any of them that survived preprocessing is accepted as
// part of an identifier by the caller.

// Return true if c may begin an identifier: a letter (per isalpha), '_' or '$'.
// The <ctype.h> functions require an argument representable as unsigned char
// (or EOF), so the value is cast before the call; passing a negative plain
// char directly would be undefined behavior.
static inline bool is_ident_start_ascii(char c)
{
    return isalpha((unsigned char)c) || c == '_' || c == '$';
}

// Return true if c may continue an identifier: a letter or digit (per
// isalnum), '_' or '$'.  Same unsigned char cast as above.
static inline bool is_ident_cont_ascii(char c)
{
    return isalnum((unsigned char)c) || c == '_' || c == '$';
}
1002853
1003854// Read a UCN (Universal Character Name) \uXXXX or \UXXXXXXXX
1004855// Returns the number of bytes consumed, or 0 if not a valid UCN
@@ -1033,21 +884,24 @@ static int read_ident(char *start)
1033884 uint32_t cp ;
1034885 int len ;
1035886
1036- // Check for UCN at start
887+ // Check for UCN at start (\uXXXX or \UXXXXXXXX)
1037888 len = read_ucn (p , & cp );
1038889 if (len > 0 )
1039890 {
1040- if (!is_ident_start_unicode (cp ))
1041- return 0 ;
891+ p += len ;
892+ }
893+ else if ((unsigned char )* p >= 0x80 )
894+ {
895+ // Non-ASCII UTF-8 start byte
896+ len = utf8_char_len ((unsigned char )* p );
897+ if (len == 0 ) return 0 ;
1042898 p += len ;
1043899 }
1044900 else
1045901 {
1046- // Check for UTF-8 or ASCII start
1047- cp = decode_utf8 (p , & len );
1048- if (!is_ident_start_unicode (cp ))
902+ if (!is_ident_start_ascii (* p ))
1049903 return 0 ;
1050- p += len ;
904+ p ++ ;
1051905 }
1052906
1053907 // Continue reading identifier characters
@@ -1056,16 +910,21 @@ static int read_ident(char *start)
1056910 len = read_ucn (p , & cp );
1057911 if (len > 0 )
1058912 {
1059- if (!is_ident_cont_unicode (cp ))
1060- break ;
1061913 p += len ;
1062914 continue ;
1063915 }
1064916
1065- cp = decode_utf8 (p , & len );
1066- if (!is_ident_cont_unicode (cp ))
917+ if ((unsigned char )* p >= 0x80 )
918+ {
919+ len = utf8_char_len ((unsigned char )* p );
920+ if (len == 0 ) break ;
921+ p += len ;
922+ continue ;
923+ }
924+
925+ if (!is_ident_cont_ascii (* p ))
1067926 break ;
1068- p += len ;
927+ p ++ ;
1069928 }
1070929
1071930 return p - start ;
0 commit comments