In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import time
#from keras.preprocessing.sequence import pad_sequences


In [2]:
def tok(tokenizer, text):
    """Encode one text with ``tokenizer.encode_plus`` and return its result.

    The call asks for special tokens, ``max_length=510`` with truncation
    enabled, ``'longest'`` padding, token-type ids, an attention mask and
    PyTorch tensors (``return_tensors='pt'``).
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=510,
        padding='longest',
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **encode_options)

In [3]:
def set_embed(df, model, tokenizer):
    """Embed every entry of ``df.review_body`` and return the vectors as a list.

    Parameters
    ----------
    df
        Object with a ``review_body`` attribute that iterates over texts.
    model
        Model handed to ``embed`` for every text.
    tokenizer
        Tokenizer handed to ``tok`` for every text.

    Returns
    -------
    list
        One NumPy array per review, in iteration order.
    """
    vectors = []
    for review in df.review_body:
        tokened = tok(tokenizer, review)
        # Use the `model` argument: the previous code silently ignored it and
        # reached for the notebook-global `model_mul` instead.
        # `.cpu()` is required before `.numpy()` because `embed` may leave its
        # result on the GPU.
        vectors.append(embed(model, tokened).cpu().numpy())
    return vectors

In [4]:
from typing import Callable, List, Optional, Tuple
from torch import nn
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

def embed(model, tokens_tensor, device=None):
    """Return one sentence vector: the token-wise mean of the second-to-last hidden layer.

    Parameters
    ----------
    model
        Callable model with a ``.to(device)`` method. Its output must expose
        all hidden states, either as a ``hidden_states`` attribute or as the
        third item (index 2) of the output, which is what the models in this
        notebook produce when loaded with ``output_hidden_states=True``.
    tokens_tensor
        Mapping of keyword arguments for ``model`` that also offers
        ``.to(device)`` (e.g. the object returned by ``tok``).
    device
        Device to run on. Defaults to ``'cuda'`` when available, else ``'cpu'``.

    Returns
    -------
    torch.Tensor
        Mean over the first axis of ``hidden_states[-2][0]`` (the first
        sequence of the batch). The tensor stays on ``device``.
    """
    if device is None:
        # Fall back to the CPU instead of crashing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to(device)
        model.to(device)
        outputs = model(**tokens_tensor)

    # Prefer the named field; plain tuple outputs keep the positional layout
    # the original code relied on.
    hidden_states = getattr(outputs, 'hidden_states', None)
    if hidden_states is None:
        hidden_states = outputs[2]

    # Second-to-last layer, first (and only) sequence in the batch.
    token_vecs = hidden_states[-2][0]

    # Average the token vectors into a single sentence vector.
    return torch.mean(token_vecs, dim=0)

def embed_cls(model, tokens_tensor, device=None):
    """Run ``model`` on the tokens and return its ``pooler_output``.

    Parameters
    ----------
    model
        Callable model with a ``.to(device)`` method whose output has a
        ``pooler_output`` attribute.
    tokens_tensor
        Mapping of keyword arguments for ``model`` that also offers
        ``.to(device)``.
    device
        Device to run on. Defaults to ``'cuda'`` when available, else ``'cpu'``.

    Returns
    -------
    The ``pooler_output`` of the model output, left on ``device``.
    """
    if device is None:
        # Fall back to the CPU instead of crashing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to(device)
        model.to(device)
        outputs = model(**tokens_tensor)

    return outputs.pooler_output
        
class BertTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that turns raw text into model embeddings.

    Each text is tokenized on its own with ``bert_tokenizer.encode_plus`` and
    handed, together with ``bert_model``, to ``embedding_func``. The per-text
    results are stacked into a single tensor and moved to the CPU.

    Parameters
    ----------
    bert_tokenizer
        Tokenizer exposing ``encode_plus``.
    bert_model
        Model exposing ``eval()``; it is switched to eval mode on construction.
    max_length : int, default 510
        Forwarded to the tokenizer as ``max_length`` (truncation is enabled).
    embedding_func : callable, optional
        Called as ``embedding_func(model, tokenized)`` and must return a
        tensor. When ``None``, the first-token vector of the model's first
        output is used (see ``_first_token_embedding``).
    """

    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 510,
            embedding_func=None,
    ):
        # Store every constructor argument unmodified under its own name:
        # BaseEstimator.get_params()/clone()/repr() look them up by that name.
        self.bert_tokenizer = bert_tokenizer
        self.bert_model = bert_model
        self.max_length = max_length
        self.embedding_func = embedding_func

        # Short aliases kept for code that already reads `.tokenizer`/`.model`.
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()

    @staticmethod
    def _first_token_embedding(model, tokenized) -> torch.Tensor:
        """Default embedding: first-token vector of the model's first output."""
        # Put the inputs where the model's weights live, when that can be determined.
        first_param = next(model.parameters(), None)
        if first_param is not None:
            tokenized = tokenized.to(first_param.device)
        with torch.no_grad():
            outputs = model(**tokenized)
        return outputs[0][:, 0, :].squeeze()

    def _tokenize(self, text: str):
        """Tokenize one text and return the tokenizer's ``encode_plus`` result."""
        return self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='longest',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def _tokenize_and_predict(self, text: str) -> torch.Tensor:
        """Tokenize one text and embed it with the configured embedding function."""
        tokenized = self._tokenize(text)
        # The default is resolved here rather than in __init__ so that the
        # stored `embedding_func` parameter is never rewritten. The previous
        # default was a one-argument lambda that failed with TypeError when
        # called with (model, tokenized).
        embedding_func = self.embedding_func
        if embedding_func is None:
            embedding_func = self._first_token_embedding
        return embedding_func(self.model, tokenized)

    def transform(self, text):
        """Embed one text or a collection of texts.

        Parameters
        ----------
        text
            A single string, a ``pandas.Series`` of strings, or any other
            iterable of strings.

        Returns
        -------
        torch.Tensor
            CPU tensor stacking one embedding per text along a new first
            axis. A single string yields a leading axis of length 1.

        Raises
        ------
        ValueError
            If there are no texts to embed.
        """
        if isinstance(text, pd.Series):
            text = text.tolist()
        elif isinstance(text, str):
            # A bare string is one document, not a sequence of characters.
            text = [text]

        # Embed each text separately; passing the whole collection to the
        # tokenizer at once would treat it as a single input.
        embeddings = [self._tokenize_and_predict(string) for string in text]
        if not embeddings:
            raise ValueError("transform() received no texts to embed")
        return torch.stack(embeddings).cpu()

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

# Data sets

### English dataset 

In [13]:
# Keep only the review text column of each English split.
eng_train = pd.read_csv('../data/train_en')['review_body']
eng_test = pd.read_csv('../data/test_en')['review_body']
eng_val = pd.read_csv('../data/val_en')['review_body']

In [7]:
from transformers import RobertaTokenizer, RobertaModel

# Load the `roberta-base` tokenizer and model. `output_hidden_states=True`
# makes the model return the hidden states of every layer, which `embed` reads.
tokenizer_mul = RobertaTokenizer.from_pretrained('roberta-base')
model_mul = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [16]:
# Embed with the model's pooled output (`embed_cls`). Pass `embedding_func=embed`
# instead to mean-pool the second-to-last hidden layer.
bert_transformer = BertTransformer(
    bert_tokenizer=tokenizer_mul,
    bert_model=model_mul,
    embedding_func=embed_cls,
)
  

In [17]:
import pickle
import itertools

# Embed each English split review by review and save one file per split.
texts = [eng_test, eng_val, eng_train]
files_name = [
    'eng_test_roberta_embedded.pkl',
    'eng_val_roberta_embedded.pkl',
    'eng_train_roberta_embedded.pkl',
]

for split_texts, out_path in zip(texts, files_name):
    start = time.time()
    emebedded = [bert_transformer.transform(review) for review in split_texts]
    # Seconds spent embedding this split.
    print(time.time() - start)

    # Each transform() result carries a leading axis; chaining iterates over
    # it so the saved list holds one entry per review.
    flattened = list(itertools.chain.from_iterable(emebedded))
    with open(out_path, 'wb') as out_file:
        torch.save(flattened, out_file)
    # Seconds spent embedding and saving this split.
    print(time.time() - start)

40.398080348968506
40.467787742614746
37.07446479797363
37.14466166496277
1515.7183821201324
1519.1222438812256


## Spanish reviews translated to English – embedding with RoBERTa

In [18]:
# Load the Spanish->English (Helsinki) translations, one file per split.
# NOTE(review): the test split used to load the *val* file and the val split
# the *en_to_es* file, which looked like copy-paste slips. The names below
# follow the `amz_<split>_es_to_en_Translation_Helsinki.pkl` pattern of the
# train file -- confirm that the test file exists under this name.
# NOTE(review): torch.load unpickles its input; only load files you produced.
es_to_eng_train = torch.load('amz_train_es_to_en_Translation_Helsinki.pkl')
es_to_eng_test = torch.load('amz_test_es_to_en_Translation_Helsinki.pkl')
es_to_eng_val = torch.load('amz_val_es_to_en_Translation_Helsinki.pkl')

In [19]:
import pickle
import itertools

# Embed each translated split text by text and save one file per split.
texts = [es_to_eng_val, es_to_eng_test, es_to_eng_train]
files_name = [
    'es_to_eng_val_embedded.pkl',
    'es_to_eng_test_embedded.pkl',
    'es_to_eng_train_embedded.pkl',
]

for split_texts, out_path in zip(texts, files_name):
    start = time.time()
    emebedded = [bert_transformer.transform(review) for review in split_texts]
    # Seconds spent embedding this split.
    print(time.time() - start)

    # Each transform() result carries a leading axis; chaining iterates over
    # it so the saved list holds one entry per text.
    flattened = list(itertools.chain.from_iterable(emebedded))
    with open(out_path, 'wb') as out_file:
        torch.save(flattened, out_file)
    # Seconds spent embedding and saving this split.
    print(time.time() - start)

37.88653063774109
37.96212816238403
37.30016016960144
37.37120580673218
1488.6091227531433
1492.014592409134
