In [225]:
import sys
import pandas as pd
import unicodedata
from collections import defaultdict

unicode_category = defaultdict(list)
for c in map(chr, range(sys.maxunicode + 1)):
    unicode_category[unicodedata.category(c)].append(ord(c))

In [226]:
def get_ranges(l):
    ranges = []
    prev_x = 0
    
    for x in sorted(l):
        if not ranges:
            ranges.append([x])
        elif x - prev_x == 1:
            ranges[-1] = [ranges[-1][0], x]
        else:
            ranges.append([x])
        prev_x = x
        
    print(f"reduced {len(l)} ints to {len(ranges)} ranges")
    return ranges

def get_unicode_categories(*categories):
    output = []
    
    for category in categories:
        results = unicode_category[category]
        output.extend(results)
        print(f"{len(results)} for ${category}")
    
    return output

In [227]:
UnicodeLetter = get_unicode_categories('Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl')
UnicodeCombiningMark = get_unicode_categories('Mn', 'Mc')
UnicodeDigit = get_unicode_categories('Nd')
UnicodeConnectorPunctuation = get_unicode_categories('Pc')

ZWNJ = 0x200C
ZWJ = 0x200D

"""

    This implementation can safely ignore the 
    UnicodeEscapeSequence requirement as escape
    sequences are preprocessed out before the 
    Identifier checks.

    GRAMMAR:
    IdentifierStart ::
        UnicodeLetter
        $
        _
        \ UnicodeEscapeSequence
        
"""
IdentifierStart = [
    36, # $
    95, # _
    *UnicodeLetters
]

"""

    GRAMMAR:
    IdentifierPart ::
        IdentifierStart
        UnicodeCombiningMark
        UnicodeDigit
        UnicodeConnectorPunctuation
        <ZWNJ>
        <ZWJ>
        
"""
IdentifierPart = [
    *IdentifierStart,
    *UnicodeCombiningMark,
    *UnicodeDigit,
    *UnicodeConnectorPunctuation,
    ZWNJ,
    ZWJ
]

1788 for $Lu
2151 for $Ll
31 for $Lt
259 for $Lm
121414 for $Lo
236 for $Nl
1826 for $Mn
429 for $Mc
630 for $Nd
10 for $Pc


In [228]:
IdentifierPartRanges = get_ranges(IdentifierPart)
nRanges = len(IdentifierPartRanges)

output = ""
i = 0
for charRange in IdentifierPartRanges:
    charRange = [hex(char) for char in charRange]
    if len(charRange) == 1:
        output += f"(ch == {charRange[0]})"
    else:
        output += f"({charRange[0]} <= ch & ch <= {charRange[1]})"
        
    i += 1
    if (i < nRanges):
        output += "|"
        

print(f"\n({output})")

reduced 128778 ints to 718 ranges

((ch == 0x24)|(0x30 <= ch & ch <= 0x39)|(0x41 <= ch & ch <= 0x5a)|(ch == 0x5f)|(ch == 0x5f)|(0x61 <= ch & ch <= 0x7a)|(ch == 0xaa)|(ch == 0xb5)|(ch == 0xba)|(0xc0 <= ch & ch <= 0xd6)|(0xd8 <= ch & ch <= 0xf6)|(0xf8 <= ch & ch <= 0x2c1)|(0x2c6 <= ch & ch <= 0x2d1)|(0x2e0 <= ch & ch <= 0x2e4)|(ch == 0x2ec)|(ch == 0x2ee)|(0x300 <= ch & ch <= 0x374)|(0x376 <= ch & ch <= 0x377)|(0x37a <= ch & ch <= 0x37d)|(ch == 0x37f)|(ch == 0x386)|(0x388 <= ch & ch <= 0x38a)|(ch == 0x38c)|(0x38e <= ch & ch <= 0x3a1)|(0x3a3 <= ch & ch <= 0x3f5)|(0x3f7 <= ch & ch <= 0x481)|(0x483 <= ch & ch <= 0x487)|(0x48a <= ch & ch <= 0x52f)|(0x531 <= ch & ch <= 0x556)|(ch == 0x559)|(0x560 <= ch & ch <= 0x588)|(0x591 <= ch & ch <= 0x5bd)|(ch == 0x5bf)|(0x5c1 <= ch & ch <= 0x5c2)|(0x5c4 <= ch & ch <= 0x5c5)|(ch == 0x5c7)|(0x5d0 <= ch & ch <= 0x5ea)|(0x5ef <= ch & ch <= 0x5f2)|(0x610 <= ch & ch <= 0x61a)|(0x620 <= ch & ch <= 0x669)|(0x66e <= ch & ch <= 0x6d3)|(0x6d5 <= ch & ch <= 0x6dc)|(