# Unicode Character Finder

In [1]:
from urllib import request
# Fetch over HTTPS so the data file cannot be altered in transit.
# NOTE(review): the `latest` path presumably tracks the newest UCD release, so
# the line count printed further down may differ when the notebook is re-run.
UCD_URL = 'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt'

In [2]:
# Download the raw bytes of the data file. The timeout (in seconds) stops the
# cell from hanging indefinitely if the server stops responding.
with request.urlopen(UCD_URL, timeout=30) as fp:
    octets = fp.read()

# Peek at the first bytes to check that the payload looks like UCD records.
octets[:200]

b'0000;<control>;Cc;0;BN;;;;;N;NULL;;;;\n0001;<control>;Cc;0;BN;;;;;N;START OF HEADING;;;;\n0002;<control>;Cc;0;BN;;;;;N;START OF TEXT;;;;\n0003;<control>;Cc;0;BN;;;;;N;END OF TEXT;;;;\n0004;<control>;Cc;0;'

In [3]:
# UTF-8 is a superset of ASCII: data that decoded as ASCII decodes to the same
# text, but a non-ASCII byte in a later release no longer raises
# UnicodeDecodeError.
text = octets.decode('utf-8')
# strip() removes the trailing newline so split() leaves no empty last record.
lines = text.strip().split('\n')
print(len(lines))
# As the displayed result shows, index 65 (0x41) holds the record for U+0041.
lines[65]

30592


'0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;'

In [4]:
def tokenize(*parts):
    """Break the given strings into one flat list of upper-case words.

    Hyphens are treated as word separators, exactly like whitespace, and the
    words come back in the order they occur across ``parts``.
    """
    return [
        word
        for text in parts
        for word in text.replace('-', ' ').upper().split()
    ]

In [5]:
# Simplest case: one lowercase word becomes a single upper-case token.
tokenize('aaa')

['AAA']

In [6]:
# A hyphen acts as a separator, so the word is split into two tokens.
tokenize('aaa-BB')

['AAA', 'BB']

In [7]:
# Several arguments are tokenized in order into one flat list.
tokenize('aaa', 'BB-c')

['AAA', 'BB', 'C']

In [8]:
def parse(ucd_line):
    """Extract the character, its name and its name words from a UCD record.

    Only the first two ``;``-separated fields are used: the hexadecimal code
    point and the character name. Returns a ``(char, name, words)`` tuple in
    which ``words`` is the set of tokens ``tokenize`` produces for the name.
    """
    # Split at most twice; everything after the name lands in the last item.
    code, name, _rest = ucd_line.split(';', 2)
    code_point = int(code, 16)
    return chr(code_point), name, set(tokenize(name))

In [9]:
# Parse the record displayed earlier (index 65, LATIN CAPITAL LETTER A).
parse(lines[65])

('A', 'LATIN CAPITAL LETTER A', {'A', 'CAPITAL', 'LATIN', 'LETTER'})

In [10]:
# A hex index names the code point being looked up; this works here because
# the record at index 0xb5 is the one for U+00B5, as the result shows.
parse(lines[0xb5])

('µ', 'MICRO SIGN', {'MICRO', 'SIGN'})

In [11]:
# A hyphenated name: LESS-THAN contributes two separate words to the set.
parse(lines[0x3c])

('<', 'LESS-THAN SIGN', {'LESS', 'SIGN', 'THAN'})

In [12]:
def select(query_list, lines):
    """Yield ``(char, name)`` for every record whose name has all query words.

    Args:
        query_list: iterable of query strings. Each one is split into words
            by ``tokenize``, so letter case and hyphens in the query do not
            affect matching.
        lines: iterable of UCD records in the form ``parse`` accepts.

    Yields:
        ``(char, name)`` pairs in the order the records appear in ``lines``.
        Nothing is yielded when the query contains no words at all.
    """
    query_set = set(tokenize(*query_list))
    if not query_set:
        # The empty set is a subset of every set, so without this guard a
        # blank query (only spaces or hyphens) would match every record.
        return
    for line in lines:
        char, name, words = parse(line)
        if query_set <= words:
            yield char, name

In [13]:
# List every character whose name contains both CHESS and BLACK.
for char, name in select(['chess', 'black'], lines):
    print(char, name, sep='\t')

♚	BLACK CHESS KING
♛	BLACK CHESS QUEEN
♜	BLACK CHESS ROOK
♝	BLACK CHESS BISHOP
♞	BLACK CHESS KNIGHT
♟	BLACK CHESS PAWN


In [14]:
def summary(count):
    """Return the sentence that reports how many characters a search found.

    Exactly one match gets the singular wording, more than one gets the
    plural wording with the number filled in, and anything else is reported
    as no match.
    """
    if count == 1:
        return '1 character found.'
    if count > 1:
        return '{} characters found.'.format(count)
    return 'No matching character name.'
    

def finder():
    """Run an interactive search prompt over the module-level ``lines``.

    Each round prints a separator rule, reads a query, prints one
    tab-separated ``char``/``name`` row per match from ``select`` and then
    the ``summary`` sentence. Submitting an empty line ends the loop.
    """
    rule = '━' * 70
    while True:
        print(rule)
        query = input('Words to search:')
        if not query:
            break
        # enumerate() from 1 leaves `count` equal to the number of rows
        # printed; it stays 0 when the search produces nothing.
        count = 0
        for count, (char, name) in enumerate(select(query.split(), lines), 1):
            print('{}\t{}'.format(char, name))
        print(summary(count))


In [15]:
# Start the interactive prompt; submitting an empty line ends the session.
finder()

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Words to search:cat face
🐱	CAT FACE
😸	GRINNING CAT FACE WITH SMILING EYES
😹	CAT FACE WITH TEARS OF JOY
😺	SMILING CAT FACE WITH OPEN MOUTH
😻	SMILING CAT FACE WITH HEART-SHAPED EYES
😼	CAT FACE WITH WRY SMILE
😽	KISSING CAT FACE WITH CLOSED EYES
😾	POUTING CAT FACE
😿	CRYING CAT FACE
🙀	WEARY CAT FACE
10 characters found.
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Words to search:
