# Embeddings

In [4]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np

In [5]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [6]:
task='emoji'
MODEL = f"cardiffnlp/bertweet-base-{task}"
folder = MODEL.replace('cardiffnlp','../modelos')

tokenizer = AutoTokenizer.from_pretrained(folder)
model = AutoModelForSequenceClassification.from_pretrained(folder)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [4]:
def sentence_clf_output(text):
    # retorna el SequenceClassifierOutput dado un tweet
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input, return_dict=True, output_hidden_states=True)
    ###
    """ArithmeticErrorscores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return np.argmax(scores), scores"""
    return output

In [5]:
example = "Looking forward to Christmas"
output_obj = sentence_clf_output(example)

In [6]:
output_obj.keys()

odict_keys(['logits', 'hidden_states'])

In [7]:
output_obj['hidden_states'][-1].shape

torch.Size([1, 6, 768])

In [8]:
def first_tok_embedding(cfl_output):
    # retorna un numpy array correspondiente al token <s> contextualizado según el tweet
    return cfl_output['hidden_states'][-1][0][0].detach().numpy().reshape(1,768)

In [9]:
emb1 = first_tok_embedding(output_obj)
print(type(emb1))
emb1.shape

<class 'numpy.ndarray'>


(1, 768)

In [10]:
def sum_embedding(cfl_output):
    # retorna un numpy array correspondiente a la suma de los vectores contextualizados
    return cfl_output['hidden_states'][-1][0].detach().numpy().sum(axis=0).reshape(1,768)

In [11]:
emb2 = sum_embedding(output_obj)
print(type(emb2))
emb2.shape

<class 'numpy.ndarray'>


(1, 768)

In [12]:
def logits_embedding(clf_output):
    # retorna el vector de scores de clasificacion (antes de la capa softmax)
    return clf_output['logits'][0].detach().numpy().reshape(1,20)

In [13]:
emb3 = logits_embedding(output_obj)
print(type(emb3))
emb3.shape

<class 'numpy.ndarray'>


(1, 20)

In [14]:
import pickle

path =  "../../../Data/test/df_us_test.pickle"
df_us_test = pickle.load(open(path, "rb"))

In [15]:
len(df_us_test)

50000

In [16]:
hola = np.concatenate([emb1,emb2],axis=0)
hola.shape

(2, 768)

In [17]:
%%time
y_clf_obj = []

for i, texto in enumerate(df_us_test['text']):
    clf_obj = sentence_clf_output(texto)
    y_clf_obj.append(clf_obj)
    if i==50:
        break

CPU times: user 10.7 s, sys: 526 ms, total: 11.2 s
Wall time: 1.88 s


In [18]:
import os

folder = "bertweet_base_emoji"

In [19]:
%%time

embedding_types = [logits_embedding, sum_embedding, first_tok_embedding]
# array_embs = {}

for emb_func in embedding_types:
    # array_embs[emb_func.__name__] = np.concatenate([emb_func(clf_obj) for clf_obj in y_clf_obj], axis=0)
    arr = np.concatenate([emb_func(clf_obj) for clf_obj in y_clf_obj], axis=0)
    np.save(os.path.join(folder,'vec_{}_{}_{}'.format('test',emb_func.__name__,1)), arr)
    

CPU times: user 0 ns, sys: 7.33 ms, total: 7.33 ms
Wall time: 5.5 ms


Tiempos (approx)

In [20]:
print('test set: {} minutos'.format(1.85*1000/60))
print('test set: {} minutos'.format(1.85*10000/60))

test set: 30.833333333333332 minutos
test set: 308.3333333333333 minutos


Tamaños (approx)

In [21]:
print('test set: {} MB'.format(156.8))
print('test set: {} MB'.format(156.8*10))

test set: 156.8 MB
test set: 1568.0 MB


In [22]:
def guardar(y_list,idx,conjunto):
    embedding_types = [logits_embedding, sum_embedding, first_tok_embedding]

    for emb_func in embedding_types:
        arr = np.concatenate([emb_func(clf_obj) for clf_obj in y_list], axis=0)
        np.save(os.path.join(folder,'vec_{}_{}_{}'.format(conjunto,emb_func.__name__,idx)), arr)

In [27]:
y_clf_obj = []
length = len(df_us_test)
save_rate = 50

for i, texto in enumerate(df_us_test['text']):
    idx = 0
    clf_obj = sentence_clf_output(texto)
    y_clf_obj.append(clf_obj)
    if i%(save_rate)==0 and i!=0:
        guardar(y_clf_obj,idx,'test')
        idx += 1
        y_clf_obj = []
        print('archivo guardado: porcentaje = {}%'.format(100*(i)/length))
    if i==200:
        break

archivo guardado: porcentaje = 0.1%
archivo guardado: porcentaje = 0.2%
archivo guardado: porcentaje = 0.3%
archivo guardado: porcentaje = 0.4%
