In [1]:
import pandas as pd
import numpy as np
import emoji
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
import time
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=8)
tqdm.pandas()

from settings import AMBIGUITY_PATH, AMBIGUITY_CLUSTER

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [2]:
from src.data.utils import save_to_csv

In [3]:
path = "/scratch/czestoch/sampled_tweets.txt.gz"
tweets = pd.read_csv(path, header=0, lineterminator='\n', encoding='utf-8')['tweet']

In [4]:
len(tweets)

93390

In [5]:
tweets.head()

0    *⃣🆘PLEASE   🆘WE WILL NOT BE SILENT😡plz watch📼 ...
1       *⃣PLEASE   KILL RATE⚠️RESCUE ONLY⚠️Thank you*⃣
2    *⃣💠*⃣💠*⃣💠*⃣💠*⃣💠*⃣💠 How to TRIGGER a LIBERAL wi...
3    "SAFFRON" 🧡Sweet male puppy 3 months 11.5 lbs ...
4          *⃣PLEASE   3 DUMPED IN DROP BOX😡DIES💉2/22*⃣
Name: tweet, dtype: object

In [5]:
# tweets = tweets[:1000]
# len(tweets)

1000

In [6]:
def preprocess_tweets(text):
    return add_spaces_between_emojis(emoji.demojize(text))

def add_spaces_between_emojis(demojified_text):
    new_text = []
    colons = []
    for char in demojified_text:
        if char == ":":
            if colons:
                new_text.append(char + " ")
                colons.pop()
            else:
                colons.append(char)
                new_text.append(" " + char)
        else:
            new_text.append(char)
    return ''.join(new_text)

In [7]:
tweets = tweets.parallel_apply(preprocess_tweets)

In [11]:
all_emojis = pd.read_csv(AMBIGUITY_CLUSTER).emoji.unique()
all_emojis = list(map(lambda x: emoji.demojize(x), all_emojis))

In [18]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
emojis_in_tokenizer = {}
for em in all_emojis:
    emoji_tensor = tokenizer(em, return_tensors='pt')['input_ids']
    emoji_vocab_idx = emoji_tensor[0][1].item()
    # if size is 3 it means emoji token was not splitted so it is a known token,
    # [start] [emoji] [stop]
    # index 3 stands for an unknown token
    if emoji_tensor.size(1) == 3 and emoji_vocab_idx != 3:
        emojis_in_tokenizer[em] = emoji_vocab_idx
print(f"{len(emojis_in_tokenizer)} of our emojis are in this model")

emojis_not_in_tokenizer = set(all_emojis) - set(emojis_in_tokenizer.keys())
original_tokenizer_size = len(tokenizer)
print(f"Original number of tokens: {original_tokenizer_size}")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",\
                                         additional_special_tokens=list(emojis_not_in_tokenizer))
print(f"Number of tokens after extension: {len(tokenizer)}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


442 of our emojis are in this model
Original number of tokens: 64001


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Number of tokens after extension: 64884


In [None]:
# all_emojis = pd.read_csv(AMBIGUITY_CLUSTER).emoji.unique()

# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
# emojis_in_tokenizer = {}
# for em in all_emojis:
#     emoji_tensor = tokenizer(emoji.demojize(em), return_tensors='pt')['input_ids']
#     emoji_vocab_idx = emoji_tensor[0][1].item()
#     # if size is 3 it means emoji token was not splitted so it is a known token,
#     # [start] [emoji] [stop]
#     # index 3 stands for an unknown token
#     if emoji_tensor.size(1) == 3 and emoji_vocab_idx != 3:
#         emojis_in_tokenizer[em] = emoji_vocab_idx

# print(f"{len(emojis_in_tokenizer)} of our emojis are in this model")
# del all_emojis

In [20]:
emojis_in_tokenizer_indices = set(emojis_in_tokenizer.values())
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))

def get_emoji_embedding(text):
    tokenized = np.array(tokenizer.tokenize(text))
    encoded_input = tokenizer(text, return_tensors='pt')
    tokens_ids = encoded_input['input_ids']
    mask = [id_.item() in emojis_in_tokenizer_indices \
            or id_.item() >= original_tokenizer_size for id_ in tokens_ids[0]]
    if any(mask):
        try:
            features = model(**encoded_input)[0]
        except IndexError:
            return np.nan, np.nan
        return features[0][mask][:].detach().numpy(), tokenized[mask[1:-1]]
    else:
        return np.nan, np.nan

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
out = tweets.progress_apply(get_emoji_embedding)

  0%|          | 0/93390 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (188 > 128). Running this sequence through the model will result in indexing errors


In [22]:
tweets = tweets.to_frame()

In [23]:
tweets[["embedding", "emoji"]] = pd.DataFrame(out.tolist())

In [24]:
tweets.head()

Unnamed: 0,tweet,embedding,emoji
0,:keycap_asterisk: :SOS_button: PLEASE :SO...,"[[-0.3886098, 0.19909568, 0.20366114, 0.119760...","[:SOS_button:, :SOS_button:, :pouting_face:, :..."
1,:keycap_asterisk: PLEASE KILL RATE :warning...,"[[-0.18006775, 0.1269213, 0.13839212, 0.054697...","[:warning_selector:, :warning_selector:]"
2,:keycap_asterisk: :diamond_with_a_dot: :key...,"[[0.45944917, -0.12830271, 0.1478465, 0.080913...","[:diamond_with_a_dot:, :diamond_with_a_dot:, :..."
3,"""SAFFRON"" :orange_heart: Sweet male puppy 3 m...","[[0.13286656, -0.041414626, 0.34103054, 0.1911...","[:orange_heart:, :green_heart:]"
4,:keycap_asterisk: PLEASE 3 DUMPED IN DROP B...,"[[-0.42245123, -0.23179615, -0.060957894, -0.1...","[:pouting_face:, :syringe:]"


In [25]:
tweets = tweets.dropna()

In [26]:
tweets["embedding"] = tweets["embedding"].parallel_apply(lambda x: x.tolist())
tweets["emoji"] = tweets["emoji"].parallel_apply(lambda x: x.tolist())

In [27]:
tweets = tweets.set_index(['tweet']).apply(pd.Series.explode).reset_index()
tweets = tweets.dropna()

In [28]:
save_to_csv(tweets, "/scratch/czestoch/bert_emojis_with_unknown_emojis.csv")

In [30]:
tweets.groupby("emoji").embedding.count()

emoji
:1st_place_medal:            467
:2nd_place_medal:            154
:3rd_place_medal:            130
:AB_button_(blood_type):     116
:ATM_sign:                    63
                            ... 
:zany_face:                 1135
:zebra:                      225
:zipper-mouth_face:           98
:zombie:                      47
:zzz:                        272
Name: embedding, Length: 1193, dtype: int64

## Check if unknown tokens are rubbish

In [None]:
def get_embedding(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    features = model(**encoded_input)
    return features[0].detach().cpu().numpy() 

In [None]:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",\
                                         additional_special_tokens=[":paintbrush_selector:"])
model = AutoModel.from_pretrained("vinai/bertweet-base")
model.resize_token_embeddings(len(tokenizer))

In [None]:
### Paintbrush is initially not in the vocabulary but was added as an additional token

In [None]:
print(f"This is an initially known token: {'🖌️' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 🖌️"
text2 = "This is absolutely horrible, never ever try doing it 🖌️"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

In [None]:
### Heart is initially not in the vocabulary

In [None]:
print(f"This is an initially known token: {'❤️' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! ❤️"
text2 = "This is absolutely horrible, never ever try doing it ❤️"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

In [None]:
### Emojis below are in the original vocabulary

In [None]:
print(f"This is an initially known token: {'😂' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 😂"
text2 = "This is absolutely horrible, never ever try doing it 😂"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

In [None]:
print(f"This is an initially known token: {'💓' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 💓"
text2 = "This is absolutely horrible, never ever try doing it 💓"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

In [None]:
print(f"This is an initially known token: {'🧡' in emojis_in_tokenizer}")
text1 = "This is amazing, trust me! 🧡"
text2 = "This is absolutely horrible, never ever try doing it 🧡"
text1 = emoji.demojize(text1)
text2 = emoji.demojize(text2)
one_pass = get_embedding(text1)
second_pass = get_embedding(text2)
(one_pass - second_pass).sum()

In [None]:
### check if embeddinga on unknown tokens that were added to vocabulary are rubish
### check which emojis are in the tokenizer and how to extract their embeddings later
### check parallelization
### save embeddings, yupi