In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import time
#from keras.preprocessing.sequence import pad_sequences


In [2]:
def tok(tokenizer, text):
    """Encode ``text`` with ``tokenizer.encode_plus`` and return its result.

    The encoding options are fixed: special tokens are added, the sequence is
    truncated to at most 510 tokens, 'longest' padding is requested, token
    type ids and the attention mask are included, and PyTorch tensors
    (``return_tensors='pt'``) are requested.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=510,
        padding='longest',
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **encode_options)

In [3]:
def set_embed(df, model, tokenizer):
    """Embed every entry of ``df.review_body`` and return a list of arrays.

    Args:
        df: object exposing an iterable ``review_body`` attribute (e.g. a
            pandas DataFrame with that column).
        model: model forwarded to ``embed``.
        tokenizer: tokenizer forwarded to ``tok``.

    Returns:
        A list with one NumPy array per review, in iteration order.
    """
    # Use the `model` argument (the previous code silently used the global
    # `model_mul`), and move each embedding to the CPU before `.numpy()`,
    # which raises for tensors that live on a CUDA device.
    return [
        embed(model, tok(tokenizer, review)).cpu().numpy()
        for review in df.review_body
    ]

In [4]:
from typing import Callable, List, Optional, Tuple
from torch import nn
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

def embed(model, tokens_tensor):
    """Return a sentence embedding averaged over the tokens of one input.

    The model is run without gradient tracking. ``outputs[2]`` is taken as the
    sequence of per-layer hidden states (this is where ``BertModel`` puts them
    when loaded with ``output_hidden_states=True``). The second-to-last entry
    is selected, its first batch element is taken, and the token vectors are
    averaged along the first remaining dimension.

    Args:
        model: a ``torch.nn.Module`` called as ``model(**tokens_tensor)``.
        tokens_tensor: mapping of model inputs that supports ``.to(device)``
            (e.g. the result of ``tokenizer.encode_plus(..., return_tensors='pt')``).

    Returns:
        A tensor holding the mean token vector; it stays on the device the
        model ran on.
    """
    # Fall back to the CPU when no GPU is present instead of failing on a
    # hard-coded 'cuda' device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to(device)
        model.to(device)
        outputs = model(**tokens_tensor)
        hidden_states = outputs[2]

    # Second-to-last hidden state, first (only) sequence of the batch.
    token_vecs = hidden_states[-2][0]

    # Average over the token dimension to get a single vector.
    return torch.mean(token_vecs, dim=0)

def embed_cls(model, tokens_tensor):
    """Run ``model`` on ``tokens_tensor`` and return ``outputs.pooler_output``.

    Args:
        model: a ``torch.nn.Module`` called as ``model(**tokens_tensor)``; its
            output must expose a ``pooler_output`` attribute.
        tokens_tensor: mapping of model inputs that supports ``.to(device)``.

    Returns:
        The ``pooler_output`` tensor, left on the device the model ran on.
    """
    # Fall back to the CPU when no GPU is present instead of failing on a
    # hard-coded 'cuda' device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to(device)
        model.to(device)
        outputs = model(**tokens_tensor)

    return outputs.pooler_output
        
class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that turns text into BERT embeddings.

    Args:
        bert_tokenizer: tokenizer exposing ``encode_plus``.
        bert_model: model to embed with; it is switched to eval mode here.
        max_length: maximum tokenized length passed to the tokenizer.
        embedding_func: callable ``(model, tokenized) -> tensor`` producing the
            embedding of one tokenized text. When ``None``, the first-token
            vector of the model's first output is used.

    NOTE(review): the constructor arguments are stored under different
    attribute names (``tokenizer``/``model``); scikit-learn's ``get_params``/
    ``clone`` conventionally expect attributes named like the ``__init__``
    parameters -- confirm before using this class inside ``clone``/grid search.
    """

    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 510,
            embedding_func: Optional[Callable] = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            # The previous default was a one-argument lambda, but the function
            # is always invoked as embedding_func(model, tokenized).
            self.embedding_func = self._first_token_embedding

    @staticmethod
    def _first_token_embedding(model, tokenized):
        """Default embedding: first-token vector of the model's first output."""
        with torch.no_grad():
            # Run on whatever device the model's parameters currently live on.
            device = next(model.parameters()).device
            outputs = model(**tokenized.to(device))
        return outputs[0][:, 0, :].squeeze()

    def _tokenize(self, text: str):
        """Tokenize ``text`` and return the tokenizer's ``encode_plus`` result."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='longest',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        """Tokenize one text and pass it, with the model, to ``embedding_func``."""
        tokenized = self._tokenize(text)
        return self.embedding_func(self.model, tokenized)

    def transform(self, text: List[str]):
        """Embed one or more texts and return the stacked embeddings on the CPU.

        Args:
            text: a single string, a ``pandas.Series`` of strings, or another
                iterable of strings. A single string is treated as a
                one-element collection, so its result keeps the same shape as
                before.

        Returns:
            ``torch.stack`` of the per-text embeddings, moved to the CPU.

        Raises:
            RuntimeError: from ``torch.stack`` when ``text`` is empty.
        """
        if isinstance(text, pd.Series):
            text = text.tolist()
        elif isinstance(text, str):
            text = [text]

        # Embed each string separately; handing the whole list to the
        # tokenizer would encode it as a single input.
        return torch.stack(
            [self._tokenize_and_predict(string) for string in text]
        ).cpu()

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

# Data sets

### English dataset translated to Spanish

In [26]:
# Load the train / test / validation splits of the translated dataset.
# NOTE: torch.load unpickles its input, so only use it on trusted files.
eng_to_es_train, eng_to_es_test, eng_to_es_val = (
    torch.load(pkl_path)
    for pkl_path in (
        'amz_train_en_to_es_Translation_Helsinki.pkl',
        'amz_test_en_to_es_Translation_Helsinki.pkl',
        'amz_val_en_to_es_Translation_Helsinki.pkl',
    )
)

### Spanish - original

In [27]:
# Read the `review_body` column of the train / test / validation CSV files.
es_train, es_test, es_val = (
    pd.read_csv(csv_path).review_body
    for csv_path in ('../data/train_es', '../data/test_es', '../data/val_es')
)

# LABSE BERT

In [29]:
# LaBSE checkpoint: tokenizer and encoder come from the same hub id.
tokenizer_mul = BertTokenizer.from_pretrained('pvl/labse_bert')
# output_hidden_states=True makes the model return all hidden states too.
model_mul = BertModel.from_pretrained("pvl/labse_bert", output_hidden_states=True)

In [30]:
# Embed with the model's pooler output (embed_cls); pass embedding_func=embed
# instead to use the averaged hidden-state embedding.
bert_transformer = BertTransformer(
    bert_tokenizer=tokenizer_mul,
    bert_model=model_mul,
    embedding_func=embed_cls,
)
  

In [31]:
import pickle
import itertools

# Datasets to embed and, in the same order, the file each result is saved to.
texts = [es_val, es_test, es_train, eng_to_es_val, eng_to_es_test, eng_to_es_train ]
files_name = ['es_val_labse_embdded.pkl', 'es_test_labse_embdded.pkl', 'es_train_labse_embdded.pkl', 'eng_to_es_val_labse_embdded.pkl', 'eng_to_es_test_labse_embdded.pkl', 'eng_to_es_train_labse_embdded.pkl' ]


for t, out_name in zip(texts, files_name):
    start = time.time()
    # One transform call per text.
    emebedded = [bert_transformer.transform(com) for com in t]
    # Seconds spent embedding.
    print(time.time() - start)

    # Flatten the per-text results into a single list before saving.
    with open(out_name, 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)
    # Seconds spent embedding plus saving.
    print(time.time() - start)

41.027315855026245
41.09771680831909
37.729169607162476
37.7992742061615
1506.2099006175995
1509.6874799728394
37.675663232803345
37.746981382369995
37.727787017822266
37.7990984916687
1512.413660287857
1515.9639060497284


# Spanish BERT

In [32]:
# Spanish BERT checkpoint: tokenizer and encoder come from the same hub id.
tokenizer_mul = BertTokenizer.from_pretrained('Geotrend/bert-base-es-cased')
# output_hidden_states=True makes the model return all hidden states too.
model_mul = BertModel.from_pretrained('Geotrend/bert-base-es-cased', output_hidden_states=True)

Downloading:   0%|          | 0.00/174k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/659 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at Geotrend/bert-base-es-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# The loading output above reports that 'bert.pooler.dense.weight' and
# 'bert.pooler.dense.bias' were newly initialized for this checkpoint, so
# `pooler_output` (what embed_cls returns) comes from a randomly initialized
# layer. Use the hidden-state based `embed` instead; unsqueeze(0) adds a
# leading dimension of size 1, as pooler_output has for a single input.
bert_transformer = BertTransformer(
    tokenizer_mul,
    model_mul,
    embedding_func=lambda model, tokens: embed(model, tokens).unsqueeze(0),
)


In [34]:
import pickle
import itertools

# Datasets to embed and, in the same order, the file each result is saved to.
texts = [es_val, es_test, es_train, eng_to_es_val,eng_to_es_test, eng_to_es_train ]
files_name = ['es_val_bert-base-es-cased_embdded.pkl', 'es_test_bert-base-es-cased_embdded.pkl', 'es_train_bert-base-es-cased_embdded.pkl', 'eng_to_es_val_bert-base-es-cased_embdded.pkl', 'eng_to_es_test_bert-base-es-cased_embdded.pkl', 'eng_to_es_train_bert-base-es-cased_embdded.pkl' ]


for t, out_name in zip(texts, files_name):
    start = time.time()
    # One transform call per text.
    emebedded = [bert_transformer.transform(com) for com in t]
    # Seconds spent embedding.
    print(time.time() - start)

    # Flatten the per-text results into a single list before saving.
    with open(out_name, 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)
    # Seconds spent embedding plus saving.
    print(time.time() - start)

37.54897403717041
37.621267557144165
37.43864393234253
37.51006722450256
1661.582781791687
1664.9422955513
37.30940222740173
37.38084363937378
37.23605251312256
37.309192180633545
1502.438512802124
1505.7900450229645


# BETO BERT

In [35]:
# BETO sentiment-analysis checkpoint: tokenizer and encoder share one hub id.
tokenizer_mul = BertTokenizer.from_pretrained('finiteautomata/beto-sentiment-analysis')
# output_hidden_states=True makes the model return all hidden states too.
model_mul = BertModel.from_pretrained('finiteautomata/beto-sentiment-analysis', output_hidden_states=True)

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/253 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/658 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [36]:
# Embed with the model's pooler output (embed_cls).
bert_transformer = BertTransformer(
    bert_tokenizer=tokenizer_mul,
    bert_model=model_mul,
    embedding_func=embed_cls,
)


In [None]:
import pickle
import itertools

# Datasets to embed and, in the same order, the file each result is saved to.
# Fixed: the fourth name was 'eng_to_es_val_bbeto_embdded.pkl', which broke the
# '<split>_beto_embdded.pkl' pattern used by every other file in this list.
texts = [es_val, es_test, es_train, eng_to_es_val,eng_to_es_test, eng_to_es_train ]
files_name = ['es_val_beto_embdded.pkl', 'es_test_beto_embdded.pkl', 'es_train_beto_embdded.pkl', 'eng_to_es_val_beto_embdded.pkl', 'eng_to_es_test_beto_embdded.pkl', 'eng_to_es_train_beto_embdded.pkl' ]


for t, out_name in zip(texts, files_name):
    start = time.time()
    # One transform call per text.
    emebedded = [bert_transformer.transform(com) for com in t]
    # Seconds spent embedding.
    print(time.time() - start)

    # Flatten the per-text results into a single list before saving.
    with open(out_name, 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)
    # Seconds spent embedding plus saving.
    print(time.time() - start)

37.18480849266052
37.25659394264221
37.0012092590332
37.07319140434265
