# Analysis of Unicode Character Names

In [1]:
import sys
import unicodedata
import collections

# Tally of how often each word appears across character names.
words = collections.Counter()

def named_chars():
    """Yield ``(char, name)`` for every code point that has a Unicode name.

    Code points are visited in ascending order from 0 through
    ``sys.maxunicode`` inclusive. A code point for which
    ``unicodedata.name`` raises ``ValueError`` (no name) is skipped.
    """
    # sys.maxunicode is itself a valid code point, so the upper bound needs
    # the +1 for range() to reach it.
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        try:
            name = unicodedata.name(char)
        except ValueError:  # no such name
            continue
        # Yield outside the try so only the lookup failure is suppressed.
        yield char, name

In [2]:
# Highest code point value this interpreter can represent.
sys.maxunicode

1114111

In [3]:
# Materialize every (char, name) pair once so later cells can slice and count it.
l = [*named_chars()]

In [4]:
# How many code points have a name.
len(l)

122047

In [5]:
# Peek at the first and last ten (char, name) pairs.
l[:10], l[-10:]

([(' ', 'SPACE'),
  ('!', 'EXCLAMATION MARK'),
  ('"', 'QUOTATION MARK'),
  ('#', 'NUMBER SIGN'),
  ('$', 'DOLLAR SIGN'),
  ('%', 'PERCENT SIGN'),
  ('&', 'AMPERSAND'),
  ("'", 'APOSTROPHE'),
  ('(', 'LEFT PARENTHESIS'),
  (')', 'RIGHT PARENTHESIS')],
 [('󠇦', 'VARIATION SELECTOR-247'),
  ('󠇧', 'VARIATION SELECTOR-248'),
  ('󠇨', 'VARIATION SELECTOR-249'),
  ('󠇩', 'VARIATION SELECTOR-250'),
  ('󠇪', 'VARIATION SELECTOR-251'),
  ('󠇫', 'VARIATION SELECTOR-252'),
  ('󠇬', 'VARIATION SELECTOR-253'),
  ('󠇭', 'VARIATION SELECTOR-254'),
  ('󠇮', 'VARIATION SELECTOR-255'),
  ('󠇯', 'VARIATION SELECTOR-256')])

In [6]:
# Start from an empty tally so re-running this cell does not double the counts.
words.clear()
for char, name in named_chars():
    # Hyphens separate words too, e.g. 'VARIATION SELECTOR-256'.
    parts = name.replace('-', ' ').split()
    words.update(parts)

# Number of distinct words found.
len(words)

102743

In [7]:
# Show the 100 most frequent words: count right-aligned in six columns, then the word.
top_words = words.most_common(100)
for word, count in top_words:
    print(f'{count:6d} {word}')

 81593 CJK
 81533 IDEOGRAPH
 80428 UNIFIED
 13393 SYLLABLE
 11735 HANGUL
  9280 LETTER
  3042 SIGN
  2630 WITH
  2557 SMALL
  1887 CAPITAL
  1654 HIEROGLYPH
  1492 LATIN
  1284 ARABIC
  1249 YI
  1234 CUNEIFORM
  1198 SYMBOL
  1152 MATHEMATICAL
  1071 EGYPTIAN
  1014 COMPATIBILITY
   807 FORM
   792 A
   785 DIGIT
   756 TANGUT
   755 COMPONENT
   710 CANADIAN
   710 SYLLABICS
   688 VOWEL
   672 SIGNWRITING
   664 TIMES
   657 BAMUM
   584 BOLD
   583 ANATOLIAN
   576 ARROW
   573 AND
   569 PHASE
   552 LINEAR
   530 GREEK
   516 LIGATURE
   516 MUSICAL
   495 ETHIOPIC
   456 E
   449 FOR
   444 CYRILLIC
   440 COMBINING
   438 DOUBLE
   430 ABOVE
   426 ITALIC
   419 OLD
   403 NUMBER
   400 RIGHT
   397 LEFT
   387 SERIF
   385 U
   385 SANS
   385 RADICAL
   375 CIRCLED
   366 DOTS
   346 SQUARE
   344 TAI
   343 B
   337 O
   332 FINAL
   329 MARK
   301 I
   300 VAI
   295 TWO
   292 HAND
   292 BLACK
   282 ONE
   279 BELOW
   273 DOT
   260 VARIATION
   260 SELECTOR
   257 PAT

In [8]:
# Longest word among all the tokens collected from character names.
max(words, key=len)

'CEONGCHIEUMSSANGCIEUC'

In [9]:
# Pair each one-character word with its count, then order ascending by (count, word).
singles = [(count, word) for word, count in words.items() if len(word) == 1]
singles.sort()

In [10]:
# Number of distinct one-character words.
len(singles)

35

In [11]:
# Walk the sorted pairs backwards so the most frequent come first.
for count, word in singles[::-1]:
    print(f'{count:6d} {word}')

   792 A
   456 E
   385 U
   343 B
   337 O
   301 I
   212 D
   202 C
   160 R
   151 L
   142 S
   140 N
   123 F
   108 M
    96 Y
    96 T
    95 K
    94 H
    93 P
    92 Z
    90 G
    88 V
    85 X
    80 J
    74 W
    61 2
    55 Q
    33 3
    29 1
    22 5
    22 4
    16 6
    11 9
    11 7
     9 8


In [12]:
# Words that occur exactly once across all names, in sorted order.
unique = [word for word, count in words.items() if count == 1]
unique.sort()

In [13]:
# Number of words that occur exactly once.
len(unique)

98448

In [14]:
# First and last fifty of the sorted one-off words.
unique[:50], unique[-50:]

(['001',
  '002',
  '004',
  '005',
  '006',
  '007',
  '008',
  '009',
  '010',
  '011',
  '012',
  '013',
  '014',
  '015',
  '016',
  '017',
  '018',
  '019',
  '022',
  '023',
  '024',
  '026',
  '027',
  '028',
  '029',
  '031',
  '032',
  '033',
  '034',
  '035',
  '036',
  '037',
  '038',
  '039',
  '040',
  '041',
  '042',
  '043',
  '044',
  '045',
  '046',
  '047',
  '048',
  '049',
  '052',
  '053',
  '054',
  '055',
  '056',
  '057'],
 ['ZOP',
  'ZOX',
  'ZRA',
  'ZSA',
  'ZSHA',
  'ZUBUR',
  'ZUM',
  'ZUO',
  'ZUOP',
  'ZUOX',
  'ZURX',
  'ZUT',
  'ZUX',
  'ZWA',
  'ZWARAKAY',
  'ZWJ',
  'ZYGOS',
  'ZYP',
  'ZYR',
  'ZYRX',
  'ZYT',
  'ZYX',
  'ZZAA',
  'ZZAP',
  'ZZAT',
  'ZZAX',
  'ZZEE',
  'ZZEP',
  'ZZEX',
  'ZZIE',
  'ZZIEP',
  'ZZIEX',
  'ZZIP',
  'ZZIT',
  'ZZIX',
  'ZZOP',
  'ZZOX',
  'ZZSA',
  'ZZSYA',
  'ZZUP',
  'ZZUR',
  'ZZURX',
  'ZZUX',
  'ZZY',
  'ZZYA',
  'ZZYP',
  'ZZYR',
  'ZZYRX',
  'ZZYT',
  'ZZYX'])

In [19]:
# The named-character count again, this time formatted as a string.
f'{len(l)}'

'122047'