In [1]:
# https://github.com/uclnlp/emoji2vec/tree/master

In [2]:
import os
# Redirect the Hugging Face cache directory. This cell runs ahead of the cell
# that imports `transformers`.
# NOTE(review): the absolute path is machine-specific, and HF_HOME is presumably
# read when the library is imported — confirm before reordering cells.
os.environ.update(HF_HOME="/data/.cache/huggingface")

In [3]:
from transformers import AutoModel
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity of ``a`` and ``b``.

    Computed as ``a @ b.T`` divided by the product of the two norms
    (``numpy.linalg.norm`` with its default settings). No guard is applied
    for zero-norm inputs.
    """
    scale = norm(a) * norm(b)
    return (a @ b.T) / scale
# Load the jinaai/jina-embeddings-v2-base-zh model.
# trust_remote_code is needed to use the encode method.
# NOTE(review): trust_remote_code=True runs Python code shipped with the model
# repository — consider pinning a revision.
model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v2-base-zh',
    trust_remote_code=True,
)

# Sanity check: the same question in English and Chinese should score high.
sentences = ['How is the weather today?', '今天天气怎么样?']
embeddings = model.encode(sentences)
print(cos_sim(embeddings[0], embeddings[1]))

  from .autonotebook import tqdm as notebook_tqdm


0.7860602


In [4]:
# Probe how the raw model places the full-moon emoji relative to the words
# "月亮" (moon) and "汽车" (car). The texts are encoded in the order listed.
v1, v2, v = (model.encode(text) for text in ("月亮", "汽车", "🌕"))
# Displayed as (similarity to moon, similarity to car).
cos_sim(v, v1), cos_sim(v, v2)

(0.3786118, 0.18881394)

In [5]:
def parse_emoji_table(fn):
    """Parse a tab-separated emoji table into ``{key: [descriptions]}``.

    Every non-blank line of ``fn`` must hold exactly two tab-separated
    fields: a description followed by a key. The key is used verbatim as the
    dictionary key. Rows whose description contains "Ideograph" are skipped.

    When a description contains one of the decorative stopwords (for example
    "Squared " or "Keycap ", in title or lower case), a variant with every
    occurrence of the *first* matching stopword removed is stored ahead of
    the full description. A description is stored at most once per key.

    Args:
        fn: Path of the table file. It is read as UTF-8.

    Returns:
        dict mapping each key to its list of descriptions in first-seen order.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    # Built once instead of once per row. "Negative Squared " has to stay
    # ahead of "Squared " so that the longer stopword is tried first.
    sws = ["Negative Squared ", "Circled ", "Curved ", "Squared ", "Keycap ", "Clock Face "]
    stopwords = []
    for w in sws:
        stopwords.extend((w, w.lower()))

    emoji_des = {}
    # The context manager closes the handle; the explicit encoding keeps the
    # emoji readable regardless of the platform's default encoding.
    with open(fn, encoding="utf-8") as table_file:
        for line_no, line in enumerate(table_file, start=1):
            row = line.strip()
            if not row:
                # Blank lines carry no data; skip them rather than crash.
                continue
            fields = row.split("\t")
            if len(fields) != 2:
                raise ValueError(
                    f"{fn}:{line_no}: expected 2 tab-separated fields, got {len(fields)}"
                )
            d, k = fields
            if "Ideograph" in d:
                continue

            names = emoji_des.setdefault(k, [])
            for w in stopwords:
                if w in d:
                    d2 = d.replace(w, "")
                    if d2 not in names:
                        names.append(d2)
                    # Stop at the first matching stopword whether or not the
                    # key was seen before, so a row always yields the same
                    # variants.
                    break
            if d not in names:
                names.append(d)
    return emoji_des
# Parse the first description table. Its keys are code strings that a later
# cell converts to characters with decode_emoji.
emoji_des0 = parse_emoji_table("emoji_table0.txt")


In [6]:
def decode_emoji(emoji_code):
    """Turn a code string into the single character it names.

    The first two characters of ``emoji_code`` are dropped and the remainder
    is read as a hexadecimal code point, e.g. ``"U+1F408"`` -> ``"🐈"``.
    """
    hex_digits = emoji_code[2:]
    code_point = int(hex_digits, 16)
    return chr(code_point)
# Re-key the table by the decoded emoji character and flatten each description
# list into one "<emoji>: a, b, c" string, in a single pass.
emoji_des0 = {
    emoji: f"{emoji}: " + ", ".join(names)
    for emoji, names in (
        (decode_emoji(code), names) for code, names in emoji_des0.items()
    )
}


In [7]:
# Second description table: parse it and flatten each description list into
# one "<key>: a, b, c" string.
emoji_des1 = {
    emoji: f"{emoji}: " + ", ".join(names)
    for emoji, names in parse_emoji_table("emoji_table.txt").items()
}

In [8]:
# Number of entries in each description table.
tuple(map(len, (emoji_des0, emoji_des1)))

(1012, 1661)

In [21]:
e = "🐈"
# Look the same emoji up in both description tables (None where it is absent).
tuple(table.get(e) for table in (emoji_des0, emoji_des1))

('🐈: Cat, Feline, Housecat, Domestic Cat',
 '🐈: domestic cat, kitten, housecat, feline, cat, meow')

In [10]:
def _encode_descriptions(descriptions):
    """Return ``{key: model.encode(text)}`` for every entry of ``descriptions``."""
    return {emoji: model.encode(text) for emoji, text in descriptions.items()}


# One model.encode call per description string, table 0 first, then table 1.
emoji_embeddings0 = _encode_descriptions(emoji_des0)
emoji_embeddings1 = _encode_descriptions(emoji_des1)

In [11]:
# Number of embedded emoji per table.
tuple(map(len, (emoji_embeddings0, emoji_embeddings1)))

(1012, 1661)

In [12]:
# Merge the two embedding tables. Emoji present in both are blended with
# weight r on the table-0 vector and (1 - r) on the table-1 vector; emoji that
# only table 1 knows are taken over unchanged.
r = 0.9
emoji_embeddings = dict(emoji_embeddings0)
for k, v in emoji_embeddings1.items():
    if k in emoji_embeddings:
        emoji_embeddings[k] = r * emoji_embeddings[k] + (1 - r) * v
    else:
        emoji_embeddings[k] = v


In [13]:
import pickle

# Save the merged table and each per-source table under its own file name.
# All three files used to receive the merged dict; each file now gets the
# table its name refers to. Handles are closed by the context managers.
for path, table in (
    ('emoji_embeddings.pkl', emoji_embeddings),
    ('emoji_embeddings0.pkl', emoji_embeddings0),
    ('emoji_embeddings1.pkl', emoji_embeddings1),
):
    with open(path, 'wb') as fh:
        pickle.dump(table, fh)

# Load the merged table back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook or another trusted source.
with open('emoji_embeddings.pkl', 'rb') as fh:
    emoji_embeddings = pickle.load(fh)

In [14]:
# Divide every embedding by its norm (numpy.linalg.norm with default settings).
# The variable name is misspelled ("nomred"); it is kept as-is because other
# cells may reference it.
nomred_emoji_embeddings = {k: v / norm(v) for k, v in emoji_embeddings.items()}

# Write through a context manager so the file is flushed and closed before it
# is read back.
with open('emoji_normed_embeddings.pkl', 'wb') as fh:
    pickle.dump(nomred_emoji_embeddings, fh)

# From here on emoji_embeddings holds the normalised vectors loaded from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('emoji_normed_embeddings.pkl', 'rb') as fh:
    emoji_embeddings = pickle.load(fh)

In [15]:
import numpy as np
# Array of the embedding keys, in the dict's iteration order.
emoji_list = np.array(list(emoji_embeddings))

# emoji_list = np.array([decode_emoji(emoji_code) for emoji_code in emoji_code_list])

In [16]:
# Stack the embedding values along a new first axis, in the dict's iteration
# order (the same order used to build emoji_list).
emoji_Fs = np.stack(tuple(emoji_embeddings.values()))

In [17]:
# Store the feature matrix and its emoji labels together in one .npy file.
# A dict is handed to np.save, so reading it back needs allow_pickle=True.
np.save(
    file="emoji.npy",
    arr=dict(emoji_features=emoji_Fs, emoji_list=emoji_list),
)

# Read the bundle back; .item() unwraps the loaded array into the stored dict.
# NOTE: allow_pickle=True unpickles the file's contents — only load trusted files.
emoji = np.load(
    'emoji.npy',
    allow_pickle=True,
).item()
emoji_features = emoji['emoji_features']
