In [1]:
import re
import time

import emoji
import numpy as np
import pandas as pd
import torch
from pandarallel import pandarallel
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel
pandarallel.initialize(nb_workers=4)
tqdm.pandas()

from settings import AMBIGUITY_PATH

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Location of the sampled tweets (a CSV with a header row and a "tweet" column).
# NOTE(review): this is an absolute, user-specific path; consider moving it
# into `settings` next to AMBIGUITY_PATH so the notebook runs elsewhere.
path = "/home/jczestochowska/workspace/dlab/emoji-ambiguity/data/interim/sampled_tweets.txt.gz"
tweets = pd.read_csv(
    path,
    header=0,
    lineterminator='\n',
    encoding='utf-8',
)['tweet']

In [3]:
# Number of tweets that were loaded.
len(tweets)

93390

In [4]:
# Preview the first few tweets.
tweets.head()

0    🧡"STAR" Adorable female puppy 3 months old *⃣i...
1       *⃣PLEASE   GUNNER 💉4a COLD, NEEDS HERO&amp;🏡*⃣
2    🆘MAS SHELTER AT CAPACITY💉🚨 *⃣PLEASE   PLS HELP...
3    RT / REPLY to VOTE! *⃣  *⃣  *⃣ Not much sweat,...
4    *⃣🆘PLEASE   🆘WE WILL NOT BE SILENT😡plz watch📼 ...
Name: tweet, dtype: object

In [5]:
# Keep only the first 1000 tweets so the rest of the notebook runs quickly.
tweets = tweets.iloc[:1000]
len(tweets)

1000

In [6]:
def preprocess_tweets(text):
    """Turn emojis in `text` into shortcodes and space-separate the shortcodes.

    The text is first passed through `emoji.demojize`, then through
    `add_spaces_between_emojis`; the result of the latter is returned.
    """
    demojified = emoji.demojize(text)
    return add_spaces_between_emojis(demojified)

def add_spaces_between_emojis(demojified_text):
    """Surround every ``:shortcode:`` in `demojified_text` with single spaces.

    A shortcode is a colon, one or more characters that are neither
    whitespace nor a colon, and a closing colon. Only complete shortcodes are
    padded, so a stray colon elsewhere in the text (URL, time, "Note:") is
    left untouched and cannot throw off the pairing of the colons that follow.

    Args:
        demojified_text: Text in which emojis were already replaced by
            ``:shortcode:`` markers.

    Returns:
        The text with a space inserted before and after each shortcode, e.g.
        ``"a:joy:b"`` becomes ``"a :joy: b"``.
    """
    # \g<0> is the whole match, i.e. the shortcode including both colons.
    return re.sub(r":[^\s:]+:", r" \g<0> ", demojified_text)

In [7]:
# Preprocess every tweet; parallel_apply is pandarallel's multi-worker
# counterpart of Series.apply (workers are configured in the import cell).
tweets = tweets.parallel_apply(preprocess_tweets)

In [8]:
all_emojis = pd.read_csv(AMBIGUITY_PATH).emoji.unique()

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
# Maps each emoji from the ambiguity data to its vocabulary id, but only for
# emojis whose shortcode the tokenizer knows as a single token.
emojis_in_tokenizer = {}
for em in all_emojis:
    emoji_tensor = tokenizer(emoji.demojize(em), return_tensors='pt')['input_ids']
    # Exactly three ids means [start] [emoji] [stop]: the shortcode was kept
    # whole instead of being split into several sub-word pieces.
    if emoji_tensor.size(1) != 3:
        continue
    emoji_vocab_idx = emoji_tensor[0][1].item()
    # A single token can still be the unknown-token placeholder; ask the
    # tokenizer for that id rather than hard-coding it.
    if emoji_vocab_idx != tokenizer.unk_token_id:
        emojis_in_tokenizer[em] = emoji_vocab_idx

print(f"{len(emojis_in_tokenizer)} of our emojis are in this model")
del all_emojis

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


442 of our emojis are in this model


In [9]:
emojis_in_tokenizer_indices = set(emojis_in_tokenizer.values())
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
model = AutoModel.from_pretrained("vinai/bertweet-base")

def get_emoji_embedding(text):
    """Return the contextual embeddings of the known emoji tokens in `text`.

    Args:
        text: A preprocessed tweet (emojis already replaced by space-separated
            shortcodes).

    Returns:
        A pair ``(embeddings, tokens)``: ``embeddings`` is a NumPy array with
        one row per emoji token found, taken from the first element of the
        model output, and ``tokens`` is a NumPy array with the matching token
        strings. ``(np.nan, np.nan)`` is returned when the text contains no
        token whose id is in `emojis_in_tokenizer_indices`, or when the text
        is too long for the model.
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    token_ids = encoded_input['input_ids'][0].tolist()
    # True at every position (special tokens included) that holds a known emoji.
    mask = [token_id in emojis_in_tokenizer_indices for token_id in token_ids]
    if not any(mask):
        return np.nan, np.nan
    # The tokenizer warns that sequences above its maximum length cause
    # indexing errors in the model, so skip them before the forward pass.
    if len(token_ids) > tokenizer.model_max_length:
        return np.nan, np.nan
    try:
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            features = model(**encoded_input)[0]
    except IndexError:
        # Backstop in case the model's limit is stricter than the tokenizer's.
        return np.nan, np.nan
    # Take the token strings from the same ids the mask was built on, so the
    # embeddings and the tokens cannot get out of step.
    emoji_tokens = np.array(tokenizer.convert_ids_to_tokens(token_ids))[mask]
    return features[0][mask].numpy(), emoji_tokens

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [10]:
# Compute the emoji embeddings tweet by tweet with a progress bar
# (progress_apply is made available by the tqdm.pandas() call in the import cell).
out = tweets.progress_apply(get_emoji_embedding)

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (140 > 128). Running this sequence through the model will result in indexing errors





In [15]:
# Promote the Series to a one-column DataFrame so more columns can be attached.
tweets = tweets.to_frame()

In [17]:
# Each element of `out` is an (embedding, emoji) pair; expand the pairs into
# two columns. The new frame reuses `out`'s index so that the assignment,
# which aligns rows by index label, matches every row of `tweets` even when
# the index is not the default 0..n-1 range.
tweets[["embedding", "emoji"]] = pd.DataFrame(out.tolist(), index=out.index)

## Check if unknown tokens are rubbish

In [None]:
def get_embedding(text):
    """Run `text` through the tokenizer and model and return the embeddings.

    Args:
        text: The string to embed.

    Returns:
        The first element of the model output as a NumPy array. For the
        Hugging Face base model loaded here that should be the last hidden
        state, shaped (1, number of tokens, hidden size) — confirm against the
        model documentation if a different model class is used.
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        features = model(**encoded_input)
    return features[0].cpu().numpy()

In [None]:
# Reload the tokenizer with one extra special token for the paintbrush emoji.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    additional_special_tokens=[":paintbrush_selector:"],
)
model = AutoModel.from_pretrained("vinai/bertweet-base")
# The vocabulary grew by the added token, so the embedding table is resized
# to the new tokenizer length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
### Paintbrush is initially not in the vocabulary but was added as an additional token

In [None]:
print(f"This is an initially known token: {'🖌️' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 🖌️"
text2 = "This is absolutely horrible, never ever try doing it 🖌️"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
# The two sentences differ in length, so their (1, tokens, hidden) outputs
# cannot in general be subtracted element-wise. Compare only the emoji, which
# is the last token before the end-of-sequence marker (position -2), and use
# absolute differences so positive and negative deviations do not cancel.
# NOTE(review): assumes the emoji shortcode ends up as a single token here.
np.abs(one_pass[0, -2] - second_pass[0, -2]).sum()

In [None]:
### Heart is initially not in the vocabulary

In [None]:
print(f"This is an initially known token: {'❤️' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! ❤️"
text2 = "This is absolutely horrible, never ever try doing it ❤️"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
# The two sentences differ in length, so their (1, tokens, hidden) outputs
# cannot in general be subtracted element-wise. Compare only the position just
# before the end-of-sequence marker (-2), where the emoji sits, and use
# absolute differences so positive and negative deviations do not cancel.
# NOTE(review): if this shortcode is split into several sub-word pieces,
# position -2 is only its last piece.
np.abs(one_pass[0, -2] - second_pass[0, -2]).sum()

In [None]:
### Emojis below are in the original vocabulary

In [None]:
print(f"This is an initially known token: {'😂' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 😂"
text2 = "This is absolutely horrible, never ever try doing it 😂"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
# The two sentences differ in length, so their (1, tokens, hidden) outputs
# cannot in general be subtracted element-wise. Compare only the emoji, which
# is the last token before the end-of-sequence marker (position -2), and use
# absolute differences so positive and negative deviations do not cancel.
np.abs(one_pass[0, -2] - second_pass[0, -2]).sum()

In [None]:
print(f"This is an initially known token: {'💓' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 💓"
text2 = "This is absolutely horrible, never ever try doing it 💓"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
# The two sentences differ in length, so their (1, tokens, hidden) outputs
# cannot in general be subtracted element-wise. Compare only the emoji, which
# is the last token before the end-of-sequence marker (position -2), and use
# absolute differences so positive and negative deviations do not cancel.
np.abs(one_pass[0, -2] - second_pass[0, -2]).sum()

In [None]:
print(f"This is an initially known token: {'🧡' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 🧡"
text2 = "This is absolutely horrible, never ever try doing it 🧡"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
# The two sentences differ in length, so their (1, tokens, hidden) outputs
# cannot in general be subtracted element-wise. Compare only the emoji, which
# is the last token before the end-of-sequence marker (position -2), and use
# absolute differences so positive and negative deviations do not cancel.
np.abs(one_pass[0, -2] - second_pass[0, -2]).sum()

In [None]:
### check if embeddings of unknown tokens that were added to the vocabulary are rubbish
### check which emojis are in the tokenizer and how to extract their embeddings later
### check parallelization
### save embeddings, yupi