In [109]:
import os
from dataclasses import dataclass


@dataclass
class Emoji:
    code: str  # starting with u
    emoji: str
    description: str
    group: str
    sub_group: str

In [110]:
current_group = None
current_subgroup = None

emojis: dict[str, Emoji] = dict()


for line in open('emoji-test.txt'):
    if line.startswith('# group:'):
        current_group = line.split(':')[-1].strip()
        current_subgroup = None
        continue
    elif line.startswith('# subgroup:'):
        current_subgroup = line.split(':')[-1].strip()
        continue
    elif line.startswith('#') or line == '\n':
        continue

    code, line = line.split('; ')
    code = 'u' + code.strip().lower().replace(' ', '_')
    code = code.replace('_fe0f', '')

    status, line = line.split('# ')
    status = status.strip()

    if status.strip() != 'fully-qualified':
        continue

    arr = line.split(' ')
    emoji = arr[0]
    version = arr[1]
    description = ' '.join(arr[2:])

    if description.startswith('flag: '):
        continue
    if description.startswith('Japanese'):
        continue

    item = Emoji(code, emoji, description, current_group, current_subgroup)
    emojis[code] = item

In [111]:
standart = set(emojis)
len(emojis)

3491

In [112]:
NOTO_EMOJI_DIR = 'data/noto-emoji'
noto_set = os.listdir(NOTO_EMOJI_DIR)
noto_set = {x[6:-4] for x in noto_set}

In [113]:
missing_png = {
    code
    for code in standart - noto_set
}
missing_standart = noto_set - standart

In [114]:
print(len(missing_png))
{
    code: emojis[code].description
    for code in missing_png
}

0


{}

In [115]:
print(len(missing_standart))
print('\n'.join(sorted(missing_standart)))

85
u0023
u002a
u0030
u0031
u0032
u0033
u0034
u0035
u0036
u0037
u0038
u0039
u1f1e6
u1f1e7
u1f1e8
u1f1e9
u1f1ea
u1f1eb
u1f1ec
u1f1ed
u1f1ee
u1f1ef
u1f1f0
u1f1f1
u1f1f2
u1f1f3
u1f1f4
u1f1f5
u1f1f6
u1f1f7
u1f1f8
u1f1f9
u1f1fa
u1f1fb
u1f1fc
u1f1fd
u1f1fe
u1f1ff
u1f201
u1f202
u1f21a
u1f22f
u1f232
u1f233
u1f234
u1f235
u1f236
u1f237
u1f238
u1f239
u1f23a
u1f250
u1f251
u1f38e
u1f3e3
u1f3ef
u1f3fb
u1f3fc
u1f3fd
u1f3fe
u1f3ff
u1f530
u1f93c_1f3fb
u1f93c_1f3fb_200d_2640
u1f93c_1f3fb_200d_2642
u1f93c_1f3fc
u1f93c_1f3fc_200d_2640
u1f93c_1f3fc_200d_2642
u1f93c_1f3fd
u1f93c_1f3fd_200d_2640
u1f93c_1f3fd_200d_2642
u1f93c_1f3fe
u1f93c_1f3fe_200d_2640
u1f93c_1f3fe_200d_2642
u1f93c_1f3ff
u1f93c_1f3ff_200d_2640
u1f93c_1f3ff_200d_2642
u1f9b0
u1f9b1
u1f9b2
u1f9b3
u20e3
u3297
u3299
ufe82b


Manually checked "extra" png from noto font are:
* '#', '*' symbols
* numbers
* letters
* Japanese words
* extra skin tones for wrestling emoji
* hair color components

In [116]:
description_symbols = set()
for code, emoji in emojis.items():
    description_symbols.update(emoji.description.lower())
description_symbols = sorted(description_symbols)
print(len(description_symbols))
description_symbols

49


['\n',
 ' ',
 '!',
 '#',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ñ',
 '’']

In [122]:
symbols_to_remove = '():!.,'
translation_table = str.maketrans('', '', symbols_to_remove)

description_words = set()
for code, emoji in emojis.items():
    words = emoji.description \
        .translate(translation_table) \
        .replace('’s', '') \
        .lower().split()
    description_words.update(words)

description_words = sorted(description_words)
print(len(description_words))
description_words

1421


['#',
 '*',
 '0',
 '1',
 '10',
 '1st',
 '2',
 '2nd',
 '3',
 '3rd',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'ab',
 'abacus',
 'accordion',
 'adhesive',
 'admission',
 'adult',
 'aerial',
 'aid',
 'airplane',
 'alarm',
 'alembic',
 'alien',
 'alternation',
 'ambulance',
 'american',
 'americas',
 'amphora',
 'amulet',
 'anatomical',
 'anchor',
 'and',
 'angel',
 'anger',
 'angry',
 'anguished',
 'ant',
 'antenna',
 'anxious',
 'apple',
 'aquarius',
 'aries',
 'arm',
 'arrival',
 'arrow',
 'arrows',
 'articulated',
 'artist',
 'arts',
 'asia-australia',
 'asterisk',
 'astonished',
 'astronaut',
 'at',
 'atm',
 'atom',
 'auto',
 'automobile',
 'avocado',
 'away',
 'axe',
 'b',
 'baby',
 'back',
 'backhand',
 'backpack',
 'bacon',
 'badge',
 'badger',
 'badminton',
 'bag',
 'bagel',
 'baggage',
 'bags',
 'baguette',
 'balance',
 'bald',
 'ball',
 'ballet',
 'balloon',
 'ballot',
 'banana',
 'bandage',
 'banjo',
 'bank',
 'banknote',
 'bar',
 'barber',
 'bars',
 'baseball',
 'basket',
 'b