# Unicode Character Finder

In [1]:
from urllib import request
# Location of the Unicode Character Database master file.  Fetched over
# HTTPS so the data cannot be tampered with in transit.
UCD_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'

In [2]:
# Download the whole file into memory as raw bytes; the `with` block
# closes the HTTP response as soon as the read finishes.
with request.urlopen(UCD_URL) as fp:
    octets = fp.read()

# Peek at the first 200 bytes to see what the records look like.
octets[:200]    

b'0000;<control>;Cc;0;BN;;;;;N;NULL;;;;\n0001;<control>;Cc;0;BN;;;;;N;START OF HEADING;;;;\n0002;<control>;Cc;0;BN;;;;;N;START OF TEXT;;;;\n0003;<control>;Cc;0;BN;;;;;N;END OF TEXT;;;;\n0004;<control>;Cc;0;'

In [3]:
# Decode strictly as ASCII: any non-ASCII byte raises UnicodeDecodeError.
# (presumably the file is pure ASCII - TODO confirm against the UCD docs)
text = octets.decode('ascii')
# strip() drops leading/trailing whitespace first, so the final newline
# does not leave an empty string at the end of the list.
lines = text.strip().split('\n')
print(len(lines))
# Sample record: index 65 holds the entry whose code field is 0041 (hex for 65).
lines[65]

30592


'0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;'

In [4]:
def tokenize(*parts):
    """Break one or more strings into a flat list of uppercase words.

    Hyphens are treated as word separators, just like whitespace, so
    ``'less-than'`` becomes ``['LESS', 'THAN']``.  Words are returned in
    the order they appear across ``parts``; duplicates are kept.
    """
    return [
        word
        for chunk in parts
        for word in chunk.replace('-', ' ').upper().split()
    ]

In [5]:
# A single word with no separators comes back as a one-element list.
tokenize('AAA')

['AAA']

In [6]:
# A hyphen splits the string into two words, and the result is uppercased.
tokenize('AAA-bb')

['AAA', 'BB']

In [7]:
# Several arguments are tokenized in order and merged into one flat list.
tokenize('AAA', 'bb-c')

['AAA', 'BB', 'C']

In [8]:
def parse(ucd_line):
    """Extract the character, its name and its name words from one record.

    ``ucd_line`` is a semicolon-separated record whose first field is a
    hexadecimal code point and whose second field is the character name;
    everything after the second semicolon is ignored.

    Returns a ``(char, name, words)`` tuple where ``words`` is the set of
    tokens that ``tokenize`` produces for the name.  A record with fewer
    than three fields raises ``ValueError``.
    """
    # maxsplit=2: only the first two fields are needed, so leave the rest
    # of the record in one unsplit piece.
    hex_code, name, _rest = ucd_line.split(';', 2)
    return chr(int(hex_code, 16)), name, set(tokenize(name))

In [9]:
# Record at index 65 is the one for code 0041 (65 in decimal).
parse(lines[65])

('A', 'LATIN CAPITAL LETTER A', {'A', 'CAPITAL', 'LATIN', 'LETTER'})

In [10]:
# Using a code point as the list index; this relies on the record's position
# matching its code value (NOTE(review): not guaranteed for the whole file).
parse(lines[0xb5])

('µ', 'MICRO SIGN', {'MICRO', 'SIGN'})

In [11]:
# A hyphenated name: the hyphen is a separator, so it yields separate words.
parse(lines[0x3c])

('<', 'LESS-THAN SIGN', {'LESS', 'SIGN', 'THAN'})

In [12]:
def select(query_list, lines):
    """Lazily yield ``(char, name)`` for records matching every query word.

    ``query_list`` is an iterable of strings; they are tokenized together,
    so matching is case-insensitive and hyphens separate words.  A record
    in ``lines`` matches when all query words occur among the words of its
    name.  An empty query matches every record, because the empty set is a
    subset of any set.
    """
    wanted = set(tokenize(*query_list))
    records = map(parse, lines)
    yield from (
        (char, name)
        for char, name, words in records
        if wanted.issubset(words)
    )

In [13]:
# Characters whose names contain both CHESS and BLACK, one per line,
# with a tab between the character and its name.
for char, name in select(['chess', 'black'], lines):
    print(char, name, sep='\t')

♚	BLACK CHESS KING
♛	BLACK CHESS QUEEN
♜	BLACK CHESS ROOK
♝	BLACK CHESS BISHOP
♞	BLACK CHESS KNIGHT
♟	BLACK CHESS PAWN


In [14]:
# Every character with QUESTION in its name, one per line,
# with a tab between the character and its name.
for char, name in select(['question'], lines):
    print(char, name, sep='\t')

?	QUESTION MARK
¿	INVERTED QUESTION MARK
;	GREEK QUESTION MARK
՞	ARMENIAN QUESTION MARK
؟	ARABIC QUESTION MARK
፧	ETHIOPIC QUESTION MARK
᥅	LIMBU QUESTION MARK
⁇	DOUBLE QUESTION MARK
⁈	QUESTION EXCLAMATION MARK
⁉	EXCLAMATION QUESTION MARK
⍰	APL FUNCTIONAL SYMBOL QUAD QUESTION
❓	BLACK QUESTION MARK ORNAMENT
❔	WHITE QUESTION MARK ORNAMENT
⩻	LESS-THAN WITH QUESTION MARK ABOVE
⩼	GREATER-THAN WITH QUESTION MARK ABOVE
⳺	COPTIC OLD NUBIAN DIRECT QUESTION MARK
⳻	COPTIC OLD NUBIAN INDIRECT QUESTION MARK
⸮	REVERSED QUESTION MARK
㉄	CIRCLED IDEOGRAPH QUESTION
꘏	VAI QUESTION MARK
꛷	BAMUM QUESTION MARK
︖	PRESENTATION FORM FOR VERTICAL QUESTION MARK
﹖	SMALL QUESTION MARK
？	FULLWIDTH QUESTION MARK
𑅃	CHAKMA QUESTION MARK
𞥟	ADLAM INITIAL QUESTION MARK
󠀿	TAG QUESTION MARK
