In [53]:
import torch
from torch.nn import functional as F
import pandas as pd
import numpy as np
from emoji import demojize
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from tqdm.notebook import tqdm
import time
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=8)
tqdm.pandas()

from settings import AMBIGUITY_PATH, AMBIGUITY_CLUSTER

from src.data.utils import save_to_csv

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [76]:
# Load the sampled tweets into `tweets`. The preview in the next cell shows
# `tweet` and `emojis` columns.
# NOTE(review): the path is an absolute, machine-specific location.
print("Load data...")
path = "/scratch/czestoch/sampled_tweets_bigger.txt.gz"
tweets = pd.read_csv(path, header=0, lineterminator='\n', encoding='utf-8')
# NOTE(review): if the line below is re-enabled, prefer ast.literal_eval over
# eval for parsing values read from a file.
# tweets.emojis = tweets.emojis.parallel_apply(eval)

# tweets = tweets.explode("emojis", ignore_index=True)
# tweets = tweets.drop_duplicates()

Load data...


In [77]:
# Preview the first rows of the loaded tweets.
tweets.head()

Unnamed: 0,tweet,emojis
0,90' - 3 minuti di recupero 0Ô∏è‚É£ - 3Ô∏è‚É£,0Ô∏è‚É£
1,[ AC MILAN 0Ô∏è‚É£-1Ô∏è‚É£ BENEVENTO ‚è±FIN DU MATCH !,0Ô∏è‚É£
2,0Ô∏è‚É£„Ä∞Ô∏èüíØ real quick,0Ô∏è‚É£
3,Ronaldo's last away goal in La Liga? October 1...,0Ô∏è‚É£
4,(L1) ANGERS 0Ô∏è‚É£-2Ô∏è‚É£ LORIENT (L2) (N2) GRANVILL...,0Ô∏è‚É£


In [79]:
# Number of rows currently held in `tweets`.
len(tweets)

163900

In [80]:
# Keep only tweets whose emoji is listed in the AMBIGUITY_CLUSTER file ...
our_emojis = pd.read_csv(AMBIGUITY_CLUSTER, encoding='utf-8').emoji.unique()
tweets = tweets[tweets.emojis.isin(our_emojis)]
# ... and, among those, only emojis with at least 100 non-null tweets.
df = tweets.groupby("emojis").count()
numerous_emojis = df[df.tweet >= 100].index.tolist()
tweets = tweets[tweets.emojis.isin(numerous_emojis)]

In [74]:
# Keep only tweets whose emoji is listed in the AMBIGUITY_CLUSTER file.
our_emojis = pd.read_csv(AMBIGUITY_CLUSTER, encoding='utf-8').emoji.unique()
tweets = tweets[tweets.emojis.isin(our_emojis)]

# Among those, keep only emojis with at least 100 non-null tweets.
df = tweets.groupby("emojis").count()
numerous_emojis = df[df.tweet >= 100].index.tolist()
tweets = tweets[tweets.emojis.isin(numerous_emojis)]

# Work on the first 1000 remaining rows only.
# NOTE(review): looks like a debugging subset; confirm before a full run.
tweets = tweets[:1000]

# Drop the helper objects that are no longer needed.
del our_emojis
del numerous_emojis
del df

def preprocess_tweets(group):
    """Mask the group's emoji once per tweet and demojize the rest of the text.

    Args:
        group: DataFrame with the rows of one ``emojis`` group. Its ``tweet``
            column is overwritten in place.

    Returns:
        The same ``group``, where the first occurrence of the group's emoji in
        each tweet is replaced by ``"<mask>"`` and the result is passed through
        ``demojize``; ``np.nan`` when any tweet of the group cannot be
        processed.
    """
    emoji = group.emojis.unique()[0]
    try:
        # str.replace matches its argument literally, so the emoji must not be
        # regex-escaped: with a leading backslash the keycap-asterisk emoji
        # would never be found and never be masked.
        group.tweet = group.tweet.apply(lambda x: x.replace(emoji, "<mask>", 1))
        group.tweet = group.tweet.apply(demojize)
    except Exception:
        # Best effort: one unprocessable tweet discards the whole group.
        return np.nan
    return group

def get_emoji_softmax_variance(texts):
    """Measure how much the masked-token prediction varies across tweets.

    Args:
        texts: Series with the tweets of one emoji group, in which the emoji
            was replaced by ``"<mask>"``.

    Returns:
        The sum, over the vocabulary entries listed in ``indices``, of the
        variance across tweets of the softmax probability predicted at the
        mask position. ``np.nan`` when no tweet is usable or when
        tokenisation / the model call fails.
    """
    try:
        tokenized = [tokenizer.tokenize(text) for text in texts.tolist()]
        tokenized = [tokens for tokens in tokenized if "<mask>" in tokens]
        if not tokenized:
            return np.nan
        input_ = tokenizer(tokenized, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)
        is_mask = input_["input_ids"] == tokenizer.mask_token_id
        # Use only rows holding exactly one mask token (truncation can remove
        # it), so a single bad tweet does not turn the whole group into NaN.
        single_mask = is_mask.sum(dim=1) == 1
        rows, mask_index = torch.where(is_mask & single_mask.unsqueeze(1))
        if rows.numel() == 0:
            return np.nan
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            logits = model(**input_).logits
        # Softmax only at the mask positions instead of over every token of
        # every tweet; the selected probabilities are the same.
        softmax = F.softmax(logits[rows, mask_index], dim=-1)
        over_our_words = softmax[:, indices].numpy()
        return np.sum(np.var(over_our_words, 0))
    except Exception:
        return np.nan

# Mask each group's emoji in its tweets; groups that failed come back as NaN
# and are dropped.
print("Preprocess tweets...")
tweets = tweets.groupby("emojis").parallel_apply(preprocess_tweets)
tweets = tweets.dropna()

print("Load model...")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
model = AutoModelForMaskedLM.from_pretrained("vinai/bertweet-base")

# Get the vocabulary ids of the emoji words from our data. The ids are taken
# from the tokenizer itself, so this cell does not depend on a `vocab` list
# built in another cell.
our_words = set(pd.read_csv(AMBIGUITY_CLUSTER).word.unique())
indices = np.array(sorted(
    token_id for token, token_id in tokenizer.get_vocab().items() if token in our_words
))

print("Extracting embeddings...")
out = tweets.groupby("emojis").tweet.progress_apply(get_emoji_softmax_variance)
del tweets
out = out.dropna()
out = out.reset_index()

print("Saving...")
# save_to_csv(out, "/scratch/czestoch/softmax_emojis_variances.csv")

Preprocess tweets...
Load model...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Extracting embeddings...


  0%|          | 0/10 [00:00<?, ?it/s]

Saving...


In [75]:
# Show the per-emoji result; after reset_index() the value column keeps the name `tweet`.
out

Unnamed: 0,emojis,tweet
0,¬©Ô∏è,0.003143
1,¬ÆÔ∏è,0.007534
2,‚ÄºÔ∏è,0.006471
3,‚ÅâÔ∏è,0.005759
4,‚Ñ¢Ô∏è,0.020594
5,‚ÑπÔ∏è,0.007747
6,‚ÜîÔ∏è,0.013571
7,‚ÜóÔ∏è,0.007061
8,‚ÜòÔ∏è,0.000491


In [161]:
###############################

In [1]:
import argparse
import torch
from torch.nn import functional as F
import pandas as pd
import numpy as np
from emoji import demojize
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=8)
tqdm.pandas()

from settings import AMBIGUITY_CLUSTER

from src.data.utils import save_to_csv


def preprocess_tweets(group):
    """Replace the group's emoji once per tweet by "[EMOJI]" and demojize the rest.

    Args:
        group: DataFrame with the rows of one ``emojis`` group. Its ``tweet``
            column is overwritten in place.

    Returns:
        The same ``group``, where the first occurrence of the group's emoji in
        each tweet is replaced by ``"[EMOJI]"`` and the result is passed
        through ``demojize``; ``np.nan`` when any tweet of the group cannot be
        processed.
    """
    emoji = group.emojis.unique()[0]
    try:
        # str.replace matches its argument literally, so the emoji must not be
        # regex-escaped: with a leading backslash the keycap-asterisk emoji
        # would never be found and never be replaced.
        group.tweet = group.tweet.apply(lambda x: x.replace(emoji, "[EMOJI]", 1))
        group.tweet = group.tweet.apply(demojize)
    except Exception:
        # Best effort: one unprocessable tweet discards the whole group.
        return np.nan
    return group

def get_embeddings_variance(group):
    """Total variance of the tweet embeddings of one emoji group.

    Args:
        group: Series with the tweets of a single emoji.

    Returns:
        The sum over embedding dimensions of the variance across tweets of the
        model's second output (one vector per tweet), or ``np.nan`` when
        tokenisation or the model call fails.
    """
    try:
        encoded_input = tokenizer(group.tolist(), return_tensors='pt', padding=True, truncation=True)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            # Keep one row per tweet. Selecting row 0 here would leave a single
            # vector, and the variance below would then run over its
            # dimensions instead of across tweets.
            embeddings = model(**encoded_input)[1].numpy()
        return np.sum(embeddings.var(0))
    except Exception:
        return np.nan

    
# Load the sampled tweets and keep the first 1000 rows.
# NOTE(review): the path is absolute and machine-specific, and the 1000-row
# slice looks like a debugging subset; confirm before a full run.
print("Load data...")
path = "/scratch/czestoch/sampled_tweets_bigger.txt.gz"
tweets = pd.read_csv(path, header=0, lineterminator='\n', encoding='utf-8')
tweets = tweets[:1000]

# Replace each group's emoji by "[EMOJI]"; groups that failed come back as NaN
# and are dropped.
print("Preprocess tweets...")
tweets = tweets.groupby("emojis").parallel_apply(preprocess_tweets)
tweets = tweets.dropna()

# Register "[EMOJI]" as an extra special token and resize the model's embedding
# matrix to the new tokenizer length.
print("Load model...")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",\
                                        additional_special_tokens=["[EMOJI]"])
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))
# Vocabulary tokens as a list; later cells use the list position as the index.
vocab = list(tokenizer.encoder.keys())

# The variance computation and the export below are currently disabled.
print("Extracting embeddings...")
# variances = tweets.groupby("emojis").tweet.progress_apply(get_embeddings_variance)
# variances = variances.dropna()
# variances = variances.reset_index().rename({0: "variance"}, axis=1)

# save_to_csv(variances, "/scratch/czestoch/emojis_masked_variances.csv")
# save_to_csv(variances, args.output)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


Load data...
Preprocess tweets...
Load model...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracting embeddings...


In [30]:
# Positions inside `vocab` of the words listed in the AMBIGUITY_CLUSTER file.
our_words = set(pd.read_csv(AMBIGUITY_CLUSTER).word.unique())
indices = []
for vocab_idx, vocab_word in enumerate(vocab):
    if vocab_word in our_words:
        indices.append(vocab_idx)
indices = np.array(indices)

# Run the model on the tweets of one emoji group so the output can be
# inspected in the following cells.
group = tweets[tweets.emojis == '0Ô∏è‚É£']
group = group.tweet
encoded_input = tokenizer(group.tolist(), return_tensors='pt', padding=True, truncation=True)
embeddings = model(**encoded_input)

In [45]:
# Size of the first element of the model output.
embeddings[0].size()

torch.Size([100, 128, 768])

In [46]:
# Size of the second element of the model output.
embeddings[1].size()

torch.Size([100, 768])

In [38]:
# NOTE(review): the cells above index `embeddings` as the raw model output, so
# the recorded result here presumably comes from an earlier run (lower
# execution count) in which `embeddings` was a NumPy array - confirm.
embeddings.shape

(768,)

In [32]:
##################################################################3

In [5]:
# Planned steps:
# explode
# groupby emoji
# mask emoji
# extract tweet embedding

# Work on a copy-free slice holding the first 1000 rows of `tweets`.
tweets1 = tweets[:1000]

def preprocess_tweets(group):
    """Replace every occurrence of the group's emoji in its tweets by "[EMOJI]".

    Args:
        group: DataFrame with the rows of one ``emojis`` group. Its ``tweet``
            column is overwritten in place.

    Returns:
        The same ``group`` with the emoji replaced, or ``np.nan`` when the
        replacement fails.
    """
    import re  # local import: the function also runs inside pandarallel workers

    emoji = group.emojis.unique()[0]
    try:
        # The replacement is regex based, so escape the emoji as a whole. This
        # covers any regex metacharacter (e.g. the "*" of the keycap emoji)
        # without special-casing individual emojis.
        group.tweet = group.tweet.replace(re.escape(emoji), "[EMOJI]", regex=True)
    except Exception:
        return np.nan
    return group

def get_tweet_embedding(text):
    """Embed a single tweet with the loaded model.

    Args:
        text: the tweet text.

    Returns:
        A list of floats: the first row of the model's second output for this
        tweet, or ``np.nan`` when tokenisation or the model call fails.
    """
    try:
        encoded_input = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # Inference only: skip building the autograd graph for every tweet.
        with torch.no_grad():
            return model(**encoded_input)[1][0].numpy().tolist()
    except Exception:
        return np.nan

# Replace each group's emoji by "[EMOJI]"; groups that failed come back as NaN
# and are dropped.
print("Preprocess tweets...")
tweets1 = tweets1.groupby("emojis").parallel_apply(preprocess_tweets)
tweets1 = tweets1.dropna()

# Register "[EMOJI]" as an extra special token and resize the model's embedding
# matrix to the new tokenizer length.
print("Load model...")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",\
                                         additional_special_tokens=["[EMOJI]"])
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))

# One embedding (list of floats, or NaN on failure) per tweet.
print("Extracting embeddings...")
out = tweets1.tweet.progress_apply(get_tweet_embedding)

# Attach the embeddings and drop the tweets for which extraction failed.
tweets1["embedding"] = out
tweets1 = tweets1.dropna()
# save_to_csv(tweets, "/scratch/czestoch/bert_emojis_masked.csv")

Preprocess tweets...
Load model...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Extracting embeddings...


  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Display the masked tweets together with their embeddings.
tweets1

Unnamed: 0,tweet,emojis,embedding
0,90' - 3 minuti di recupero [EMOJI] - 3Ô∏è‚É£,0Ô∏è‚É£,"[0.2714495360851288, -0.17950321733951569, 0.0..."
1,[ AC MILAN [EMOJI]-1Ô∏è‚É£ BENEVENTO ‚è±FIN DU MATCH !,0Ô∏è‚É£,"[0.21306820213794708, -0.20475709438323975, 0...."
2,[EMOJI]„Ä∞Ô∏èüíØ real quick,0Ô∏è‚É£,"[0.30385464429855347, -0.21020273864269257, 0...."
3,Ronaldo's last away goal in La Liga? October 1...,0Ô∏è‚É£,"[0.2292073518037796, -0.15393587946891785, 0.0..."
4,(L1) ANGERS [EMOJI]-2Ô∏è‚É£ LORIENT (L2) (N2) GRAN...,0Ô∏è‚É£,"[0.21869970858097076, -0.05074208974838257, -0..."
...,...,...,...
995,"[EMOJI]0‚É£ for Dhawan who, along with Manish Pa...",5‚É£,"[0.19394651055335999, -0.13764727115631104, 0...."
996,"Feliz [EMOJI][EMOJI], GOAT! üêê",5‚É£,"[0.31594470143318176, -0.17039579153060913, 0...."
997,[EMOJI]0‚É£ apperances for Joel Matip today. üëè,5‚É£,"[0.2831932604312897, -0.17760756611824036, 0.0..."
998,The Final NCAA stats released &amp; our own fi...,5‚É£,"[0.18135130405426025, -0.12419982254505157, 0...."


In [20]:
# Summed per-dimension variance of the embeddings of one emoji's tweets.
np.sum(np.array(tweets1[tweets1.emojis == '0Ô∏è‚É£'].embedding.values.tolist()).var(0))

2.373917717602736

In [21]:
def calculate_variance(group):
    """Return the summed per-dimension variance of a group's embeddings.

    The ``embedding`` column of ``group`` is stacked into a 2-D array (one row
    per entry); the variance is taken along axis 0 and then added up.
    """
    stacked = np.array(group.embedding.values.tolist())
    per_dimension = stacked.var(0)
    return np.sum(per_dimension)

# Apply calculate_variance to every emoji group in parallel and show the result.
variances = tweets1.groupby("emojis").parallel_apply(calculate_variance)
variances

In [None]:
# import gc
# print("Extract embeddings...")
# out, i = [], 0
# for _, text in tweets.tweet.iteritems():
#     out.append(get_tweet_embedding(text))
    
#     if i % 500 == 0:
#         print("checkpoint: %s" % i)
#         gc.collect()
#     i += 1

In [2]:
# Reload a previously exported embeddings file and preview it.
# NOTE(review): absolute, machine-specific path.
test = pd.read_csv("/scratch/czestoch/bert_emojis_masked.csv.gz")
test.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,tweet,emojis,embedding
0,[EMOJI]üÜòPLEASE üÜòWE WILL NOT BE SILENTüò°plz wa...,*‚É£,"[0.3089597523212433, -0.18129678070545197, 0.0..."
1,*‚É£[EMOJI]PLEASE [EMOJI]WE WILL NOT BE SILENT...,üÜò,"[0.28499189019203186, -0.13812287151813507, -0..."
2,*‚É£üÜòPLEASE üÜòWE WILL NOT BE SILENT[EMOJI]plz w...,üò°,"[0.3065902590751648, -0.15257291495800018, -0...."
3,*‚É£üÜòPLEASE üÜòWE WILL NOT BE SILENTüò°plz watch[E...,üìº,"[0.31592175364494324, -0.18171299993991852, -0..."
4,[EMOJI]PLEASE KILL RATE‚ö†Ô∏èRESCUE ONLY‚ö†Ô∏èThank ...,*‚É£,"[0.25296467542648315, -0.2111043781042099, 0.0..."


In [11]:
# Write the table back without the rows that contain missing values.
save_to_csv(test.dropna(), "/scratch/czestoch/bert_emojis_masked.csv")

In [None]:
# Sanity check: every stored embedding should parse to a sequence of length 768.
# Only TypeError is handled (the type of the offending value is printed); a
# failed assertion still stops the loop.
# NOTE(review): eval() executes arbitrary code read from the CSV; consider
# ast.literal_eval for parsing the stored lists.
for _, row in test.iterrows():
    try:
        assert len(eval(row.embedding)) == 768
    except TypeError:
        print(type(row.embedding))

## Get embedding per emoji in tweet

In [None]:
def preprocess_tweets(text):
    """Demojize a tweet and pad every emoji code with spaces.

    Args:
        text: the raw tweet text.

    Returns:
        The output of ``demojize(text)`` passed through
        ``add_spaces_between_emojis``.
    """
    # `demojize` is the name the import cell brings into scope; the `emoji`
    # module itself is not imported there.
    return add_spaces_between_emojis(demojize(text))

def add_spaces_between_emojis(demojified_text):
    """Surround colon-delimited codes with spaces.

    Colons are treated as alternating delimiters: the 1st, 3rd, ... colon gets
    a space in front of it and the 2nd, 4th, ... colon gets a space after it.
    Every colon in the text takes part in this alternation. All other
    characters are copied unchanged.
    """
    pieces = []
    inside_code = False
    for ch in demojified_text:
        if ch != ":":
            pieces.append(ch)
        elif inside_code:
            # closing colon of a code
            pieces.append(": ")
            inside_code = False
        else:
            # opening colon of a code
            pieces.append(" :")
            inside_code = True
    return "".join(pieces)

In [None]:
# Preprocess every tweet in parallel, overwriting the `tweet` column.
tweets.tweet = tweets.tweet.parallel_apply(preprocess_tweets)

In [2]:
# all_emojis = pd.read_csv(AMBIGUITY_CLUSTER).emoji.unique()
# Demojized form of every unique emoji listed in the AMBIGUITY_PATH file.
# `demojize` is the name the import cell brings into scope; the `emoji` module
# itself is not imported there.
all_emojis = pd.read_csv(AMBIGUITY_PATH).emoji.unique()
all_emojis = [demojize(em) for em in all_emojis]

In [3]:
# Find which of our emoji codes the stock tokenizer already knows as one token.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
emojis_in_tokenizer = {}
for em in all_emojis:
    emoji_tensor = tokenizer(em, return_tensors='pt')['input_ids']
    emoji_vocab_idx = emoji_tensor[0][1].item()
    # Three ids mean [start] [emoji] [stop]: the emoji code was not split, so
    # it is a single token. It only counts as known if that token is not the
    # unknown token.
    if emoji_tensor.size(1) == 3 and emoji_vocab_idx != tokenizer.unk_token_id:
        emojis_in_tokenizer[em] = emoji_vocab_idx
print(f"{len(emojis_in_tokenizer)} of our emojis are in this model")

# Register the remaining emoji codes as additional special tokens.
emojis_not_in_tokenizer = set(all_emojis) - set(emojis_in_tokenizer.keys())
original_tokenizer_size = len(tokenizer)
print(f"Original number of tokens: {original_tokenizer_size}")
# Sorted, because the iteration order of a set of strings can differ between
# runs, which would give the added tokens different ids each time.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",
                                          additional_special_tokens=sorted(emojis_not_in_tokenizer))
print(f"Number of tokens after extension: {len(tokenizer)}")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


442 of our emojis are in this model
Original number of tokens: 64001


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Number of tokens after extension: 64884


In [None]:
# all_emojis = pd.read_csv(AMBIGUITY_CLUSTER).emoji.unique()

# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
# emojis_in_tokenizer = {}
# for em in all_emojis:
#     emoji_tensor = tokenizer(emoji.demojize(em), return_tensors='pt')['input_ids']
#     emoji_vocab_idx = emoji_tensor[0][1].item()
#     # if size is 3 it means emoji token was not splitted so it is a known token,
#     # [start] [emoji] [stop]
#     # index 3 stands for an unknown token
#     if emoji_tensor.size(1) == 3 and emoji_vocab_idx != 3:
#         emojis_in_tokenizer[em] = emoji_vocab_idx

# print(f"{len(emojis_in_tokenizer)} of our emojis are in this model")
# del all_emojis

In [4]:
# Token ids of the emojis the stock tokenizer already knew, as a set for fast lookup.
emojis_in_tokenizer_indices = set(emojis_in_tokenizer.values())
# Load the model and resize its embedding matrix to the extended tokenizer length.
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))

def get_emoji_embedding(text):
    """Extract the model features of every emoji token found in ``text``.

    A token counts as an emoji when its id is in ``emojis_in_tokenizer_indices``
    or is at least ``original_tokenizer_size`` (i.e. it was added afterwards).

    Returns:
        ``(features, tokens)``: the rows of the model's first output that
        belong to emoji tokens, as a NumPy array, and the matching token
        strings. ``(np.nan, np.nan)`` when the text has no emoji token or the
        model raises ``IndexError``.
    """
    tokens = np.array(tokenizer.tokenize(text))
    encoded_input = tokenizer(text, return_tensors='pt')

    is_emoji = []
    for id_ in encoded_input['input_ids'][0]:
        token_id = id_.item()
        is_emoji.append(
            token_id in emojis_in_tokenizer_indices or token_id >= original_tokenizer_size
        )

    if not any(is_emoji):
        return np.nan, np.nan

    try:
        hidden_states = model(**encoded_input)[0]
    except IndexError:
        return np.nan, np.nan

    # The flags cover the encoded ids; the first and last flag are dropped to
    # line the mask up with the plain token list.
    emoji_features = hidden_states[0][is_emoji].detach().numpy()
    return emoji_features, tokens[is_emoji[1:-1]]

In [None]:
# Run get_emoji_embedding on every element of `tweets`, with a progress bar.
# NOTE(review): `tweets` is converted with .to_frame() in the next cell, so it
# is presumably a Series of tweet texts at this point - confirm.
out = tweets.progress_apply(get_emoji_embedding)

In [None]:
# Turn the Series into a one-column DataFrame so result columns can be added.
tweets = tweets.to_frame()

In [None]:
# Split the (features, tokens) tuples into two new columns.
# NOTE(review): the temporary DataFrame gets a fresh 0..n-1 index; this
# presumably relies on `tweets` having the same index - confirm.
tweets[["embedding", "emoji"]] = pd.DataFrame(out.tolist())

In [None]:
# Preview the table with the new columns.
tweets.head()

In [None]:
# Drop the rows with missing values (get_emoji_embedding returns NaN when it finds no emoji).
tweets = tweets.dropna()

In [None]:
# Convert the per-tweet NumPy arrays into plain Python lists.
tweets["embedding"] = tweets["embedding"].parallel_apply(lambda x: x.tolist())
tweets["emoji"] = tweets["emoji"].parallel_apply(lambda x: x.tolist())

In [None]:
# Explode the list-valued columns so each row holds one entry per tweet
# (presumably one emoji and its embedding - confirm).
tweets = tweets.set_index(['tweet']).apply(pd.Series.explode).reset_index()
tweets = tweets.dropna()

In [21]:
# Export the per-emoji embeddings.
# NOTE(review): absolute, machine-specific path.
save_to_csv(tweets, "/scratch/czestoch/bert_emojis_with_unknown_emojis.csv")

  0%|          | 0/93390 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (188 > 128). Running this sequence through the model will result in indexing errors


In [30]:
# Number of non-null embeddings collected for each emoji.
tweets.groupby("emoji").embedding.count()

emoji
:1st_place_medal:            467
:2nd_place_medal:            154
:3rd_place_medal:            130
:AB_button_(blood_type):     116
:ATM_sign:                    63
                            ... 
:zany_face:                 1135
:zebra:                      225
:zipper-mouth_face:           98
:zombie:                      47
:zzz:                        272
Name: embedding, Length: 1193, dtype: int64

## Check if unknown tokens are rubbish

In [8]:
def get_embedding(text):
    """Return the model's second output for ``text`` as a NumPy array.

    The text is tokenised into PyTorch tensors and fed to ``model``; element 1
    of the result is detached, moved to the CPU and converted to NumPy.
    """
    model_output = model(**tokenizer(text, return_tensors='pt'))
    second_output = model_output[1]
    return second_output.detach().cpu().numpy()

In [41]:
# Reload the tokenizer with only two extra special tokens and resize the
# model's embedding matrix to match.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm that
# `import emoji` was run.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",\
                                         additional_special_tokens=[":paintbrush_selector:",\
                                                                    emoji.demojize('‚ù§Ô∏è')])
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Embedding(64003, 768)

In [43]:
# Compare the embeddings of one sentence ending in two different emojis.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
# print(f"This is an initially known token: {emoji.demojize('üñåÔ∏è') in emojis_in_tokenizer}")
# print(f"This is an initially known token: {emoji.demojize('‚ù§Ô∏è') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üñåÔ∏è"
text2 = "This is amazing, trust me! ‚ù§Ô∏è"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
print(f"Vectors are the same: {(one_pass == second_pass).all()}")
# The difference is summed with its sign, so positive and negative entries can cancel.
print(f"Difference between vectors: {(one_pass - second_pass).sum()}")

Vectors are the same: False
Difference between vectors: -0.24916860461235046


In [39]:
# Same sentence with two different emojis; the first two lines report whether
# each emoji code is a key of `emojis_in_tokenizer`.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üñåÔ∏è') in emojis_in_tokenizer}")
print(f"This is an initially known token: {emoji.demojize('‚ù§Ô∏è') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üñåÔ∏è"
text2 = "This is amazing, trust me! ‚ù§Ô∏è"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
print(f"Vectors are the same: {(one_pass == second_pass).all()}")
# The difference is summed with its sign, so positive and negative entries can cancel.
print(f"Difference between vectors: {(one_pass - second_pass).sum()}")

This is an initially known token: False
This is an initially known token: False
Vectors are the same: False
Difference between vectors: 1.4294824600219727


In [40]:
# Same sentence with two different emojis; the first two lines report whether
# each emoji code is a key of `emojis_in_tokenizer`.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üòÇ') in emojis_in_tokenizer}")
print(f"This is an initially known token: {emoji.demojize('üíì') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üòÇ"
text2 = "This is amazing, trust me! üíì"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
print(f"Vectors are the same: {(one_pass == second_pass).all()}")
# The difference is summed with its sign, so positive and negative entries can cancel.
print(f"Difference between vectors: {(one_pass - second_pass).sum()}")

This is an initially known token: True
This is an initially known token: True
Vectors are the same: False
Difference between vectors: -0.09044761955738068


In [30]:
# Embed one sentence and compare it element-wise with `one_pass`.
# NOTE(review): `one_pass` is not assigned in this cell, so the result depends
# on whichever cell set it last. `emoji.demojize` also needs the `emoji`
# module itself, which the import cells shown here do not import - confirm.
# print(f"This is an initially known token: {emoji.demojize('üñåÔ∏è') in emojis_in_tokenizer}")
# print(f"This is an initially known token: {emoji.demojize('‚ù§Ô∏è') in emojis_in_tokenizer}")
text2 = "This is amazing, trust me! üñåÔ∏è"
text2 = emoji.demojize(text2)
second_pass = get_embedding(text2)
(one_pass == second_pass).all()

False

In [18]:
# Same emoji at the end of two different sentences: signed sum of the
# difference between the two embeddings.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üñåÔ∏è') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üñåÔ∏è"
text2 = "This is absolutely horrible, never ever try doing it üñåÔ∏è"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

This is an initially known token: False


-0.47473258

In [None]:
### Heart is initially not in the vocabulary

In [19]:
# Same emoji at the end of two different sentences: signed sum of the
# difference between the two embeddings.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('‚ù§Ô∏è') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! ‚ù§Ô∏è"
text2 = "This is absolutely horrible, never ever try doing it ‚ù§Ô∏è"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

This is an initially known token: False


-0.48091918

In [None]:
### Emojis below are in the original vocabulary

In [20]:
# Same emoji at the end of two different sentences: signed sum of the
# difference between the two embeddings.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üòÇ') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üòÇ"
text2 = "This is absolutely horrible, never ever try doing it üòÇ"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

This is an initially known token: True


-0.2963187

In [21]:
# Same emoji at the end of two different sentences: signed sum of the
# difference between the two embeddings.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üíì') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üíì"
text2 = "This is absolutely horrible, never ever try doing it üíì"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

This is an initially known token: True


-0.20267165

In [22]:
# Same emoji at the end of two different sentences: signed sum of the
# difference between the two embeddings.
# NOTE(review): `emoji.demojize` needs the `emoji` module itself, while the
# import cells shown in this notebook only import `demojize` - confirm.
print(f"This is an initially known token: {emoji.demojize('üß°') in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! üß°"
text2 = "This is absolutely horrible, never ever try doing it üß°"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

This is an initially known token: True


-0.60374236

In [None]:
### check if embeddings of unknown tokens that were added to the vocabulary are rubbish
### check which emojis are in the tokenizer and how to extract their embeddings later
### check parallelization
### save embeddings, yupi