## Libraries

In [61]:
import pandas as pd
import numpy as np
import pickle

from transformers import AutoTokenizer, AutoModel, BertModel, BertTokenizer
import torch

## Data

In [62]:
# Read the three question splits (train / validation / test) from Excel,
# using each sheet's "id" column as the DataFrame index.
Q_train, Q_val, Q_test = (
    pd.read_excel(split_path, index_col="id")
    for split_path in (
        "../../data/train_task_Q.xlsx",
        "../../data/val_task_Q.xlsx",
        "../../data/test_task_Q.xlsx",
    )
)

## BETO embeddings

Load the pretrained [BETO](https://huggingface.co/dccuchile/bert-base-spanish-wwm-cased) model (cased Spanish BERT) and its tokenizer from [Hugging Face](https://huggingface.co/) 🤗

In [63]:
# Hugging Face Hub identifier of the BETO checkpoint (cased Spanish BERT).
model_name = "dccuchile/bert-base-spanish-wwm-cased"

# `eval()` returns the module itself, so it can be chained onto the load; it
# puts the network in inference mode. Chaining also avoids assigning to `_`,
# which would shadow IPython's "last output" variable.
beto_model = BertModel.from_pretrained(model_name).eval()

# The model is only used as a frozen feature extractor in this notebook, so
# turn gradient tracking off for all of its parameters. (A freshly loaded
# model has no accumulated gradients, so there is nothing to zero.)
beto_model.requires_grad_(False)

# Cased checkpoint: keep the original casing of the input text.
beto_tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [64]:
# Display the tokenizer's special tokens next to their vocabulary ids
# ("[CLS]" and "[UNK]" are the ones used explicitly by get_cls_from_text).
beto_tokenizer.all_special_tokens, beto_tokenizer.all_special_ids

(['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]'], [3, 5, 1, 4, 0])

In [65]:
def get_cls_from_text(text):
    """Return the BETO embedding of the leading ``[CLS]`` token for ``text``.

    The text is whitespace-normalised, tokenised with ``beto_tokenizer``,
    prefixed with ``[CLS]`` and passed through ``beto_model``. The hidden
    state at position 0 of the last layer is returned.

    Parameters
    ----------
    text : object
        Text to embed. Non-string values are converted with ``str``.
        ``None``, float NaN and empty / whitespace-only strings are all
        treated as empty text and encoded as ``[CLS] [UNK]``.

    Returns
    -------
    numpy.ndarray
        1-D array holding the last-layer hidden state of the ``[CLS]`` token.
    """
    # Missing values (None / float NaN, e.g. an empty spreadsheet cell) would
    # otherwise be stringified to "None" / "nan" and embedded as real words.
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    # str.split() with no argument splits on any whitespace run (newlines
    # included), so this collapses the text onto one single-spaced line.
    text = " ".join(str(text).split())

    if text:
        # Leave room for the leading [CLS] within the model's position limit;
        # longer inputs would otherwise fail inside the embedding layer.
        max_len = beto_model.config.max_position_embeddings
        tokens = ["[CLS]"] + beto_tokenizer.tokenize(text)[: max_len - 1]
    else:
        # Placeholder sequence for empty text.
        tokens = ["[CLS]", "[UNK]"]

    # NOTE(review): no trailing "[SEP]" is appended, which differs from the
    # usual BERT input format. Left unchanged on purpose so that previously
    # generated features remain comparable -- confirm whether this is intended.
    input_ids = torch.tensor([beto_tokenizer.convert_tokens_to_ids(tokens)])

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = beto_model(input_ids)

    # Batch item 0, token position 0 ([CLS]).
    return output.last_hidden_state[0, 0].numpy()

In [66]:
def get_embd(df, text_col="Q", embed_fn=None):
    """Embed every text in ``df[text_col]`` and return one row per input row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts; its index supplies the ids of the result.
    text_col : str, default "Q"
        Name of the column that contains the text to embed.
    embed_fn : callable, optional
        Function mapping a single text to a 1-D vector. Defaults to
        ``get_cls_from_text``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same order), one integer-labelled column
        per embedding dimension, indexed by ``df``'s index under the name
        ``"id"``.
    """
    if embed_fn is None:
        embed_fn = get_cls_from_text

    # Iterate the column directly rather than looking rows up by label:
    # a label lookup returns several rows when index labels are repeated and
    # builds a full row Series for every single text.
    embeddings = [embed_fn(text) for text in df[text_col]]

    o = pd.DataFrame(embeddings)
    o.index = df.index.rename("id")
    return o

In [67]:
%%time
# Embed every training question (column "Q"); the %%time magic at the top of
# this cell reports how long the pass takes.
embd_Q_train = get_embd(Q_train, text_col="Q")

CPU times: total: 2min 34s
Wall time: 26 s


In [68]:
%%time
# Embed every validation question (column "Q"); timed by the %%time magic
# at the top of this cell.
embd_Q_val = get_embd(Q_val, text_col="Q")

CPU times: total: 41.2 s
Wall time: 6.91 s


In [69]:
%%time
# Embed every test question (column "Q"); timed by the %%time magic at the
# top of this cell.
embd_Q_test = get_embd(Q_test, text_col="Q")

CPU times: total: 2min 24s
Wall time: 24.2 s


## Save features

In [70]:
# Write each split's embedding table to its own Excel workbook
# (train first, then validation, then test).
for split_features, output_path in (
    (embd_Q_train, "../features/beto_pt_features_train_task_Q.xlsx"),
    (embd_Q_val, "../features/beto_pt_features_val_task_Q.xlsx"),
    (embd_Q_test, "../features/beto_pt_features_test_task_Q.xlsx"),
):
    split_features.to_excel(output_path)