In [1]:
from transformers import AutoTokenizer, AutoModel
import numpy as np 
import torch
import json

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighting each position by the mask.

    Args:
        model_output: Indexable model result; element 0 is read as the
            token-embeddings tensor.
        attention_mask: Mask tensor. It is unsqueezed on the last axis and
            expanded to the size of the token embeddings, so positions whose
            mask value is 0 contribute nothing to the average.

    Returns:
        The mask-weighted sum of the embeddings along dim 1 divided by the
        mask sum along dim 1.
    """
    embeddings = model_output[0]
    # Broadcast the mask to the embeddings' size so it can be used as weights.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total

# Load AutoModel from huggingface model repository
# The tokenizer and the encoder are loaded from the same checkpoint identifier.
checkpoint_name = "sentence-transformers/bert-base-nli-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

Some weights of the model checkpoint at sentence-transformers/bert-base-nli-mean-tokens were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Input text to embed; a list, so several sentences can be encoded at once.
sentences = ['Travel abroad']

# Tokenize into padded, truncated PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt',
)

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    model_output = model(**encoded_input)

# Reduce the per-token embeddings to one vector per sentence via
# attention-mask-aware mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

In [74]:
import pandas as pd

emoji_df = pd.read_csv('emoji_df.csv')

# Build {emoji: flattened embedding of its description}.
# Columns are addressed by position through .iloc (0 = emoji, 1 = description):
# integer keys on a label-indexed row (row[0], row[1]) rely on a positional
# fallback that pandas has deprecated.
emoji_vector = {}
for emoji, emoji_description in zip(emoji_df.iloc[:, 0], emoji_df.iloc[:, 1]):
    # ========== convert the description into a vector
    # Tokenize the description with the same settings used for the query sentence
    encoded_emoji_desc = tokenizer(emoji_description, padding=True,
                                   truncation=True, max_length=128,
                                   return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_emoji_desc)
    emoji_embeddings = mean_pooling(
        model_output, encoded_emoji_desc['attention_mask'])
    # ==================
    # Store as a flat NumPy array so it can be serialized later.
    emoji_vector[emoji] = emoji_embeddings.numpy().reshape(-1)

class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    The checks use NumPy's abstract scalar base classes (``np.integer`` and
    ``np.floating``) rather than an explicit list of concrete types. This
    covers every signed/unsigned integer and float width and avoids
    referencing aliases such as ``np.float_`` that no longer exist in
    NumPy 2.0.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Args:
            obj: The object the standard encoder could not serialize.

        Returns:
            ``int`` for NumPy integer scalars, ``float`` for NumPy floating
            scalars, and a (nested) ``list`` for ``np.ndarray``.

        Raises:
            TypeError: For any other type, raised by the base-class
                implementation.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
    
# Write the emoji -> vector mapping to disk; NumpyEncoder turns the NumPy
# arrays into plain lists during serialization.
with open('emoji_embeddings.json', mode='w') as out_file:
    json.dump(emoji_vector, out_file, cls=NumpyEncoder)

In [11]:
import json

# Load the stored {emoji: vector-as-list} mapping.
with open('emoji_embeddings.json') as f:
    emoji_embeddings = json.load(f)

# Flatten the query embedding once; it is the same for every emoji.
query_vector = sentence_embeddings.numpy().reshape(-1)

# L1 (sum of absolute differences) distance from the query to each emoji
# vector, in the dictionary's iteration order.
scores = np.array([
    np.sum(np.abs(query_vector - emoji_vec))
    for emoji_vec in emoji_embeddings.values()
])

# Smaller distance means a closer match: keep the 5 lowest-scoring positions
# and map them back to their emoji keys.
topK_idx = np.argsort(scores)[:5]
emoji_keys = list(emoji_embeddings.keys())
recommend_emojis = [emoji_keys[position] for position in topK_idx]

print(recommend_emojis)

['🛃', '🎫', '🗺️', '🗺', '💞']
