In [1]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import time

#from keras.preprocessing.sequence import pad_sequences


In [2]:
def tok(tokenizer, text):
    return tokenizer.encode_plus(text, 
                                    add_special_tokens=True,
                                    max_length=510,
                                    padding='longest', 
                                    truncation=True,
                                      return_token_type_ids=True,
                                      return_attention_mask=True,
                                      return_tensors='pt'
                                    )

In [3]:
def set_embed(df, model, tokenizer):
    l = []
    for i, review in enumerate(df.review_body):
        tokened = tok(tokenizer, review)
        #print(model(**tokenizer))
        l.append(embed(model_mul, tokened).numpy())
    return l

In [4]:
from typing import Callable, List, Optional, Tuple
from torch import nn
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

def embed(model, tokens_tensor ):

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to('cuda')
        model.to('cuda')
        outputs = model(**tokens_tensor)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        # Stores the token vectors, with shape [6 x 768]
    
    token_vecs_sum = []

    # `token_embeddings` is a [6 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [6 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)
        
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 6 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    return sentence_embedding

def embed_cls(model, tokens_tensor):
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to('cuda')       
        model.to('cuda')
        
        #if torch.cuda.device_count() > 1:
            #print("Let's use", torch.cuda.device_count(), "GPUs!")
            #model = nn.DataParallel(model)
        #else:
            #model.to(device)
        
        outputs = model(**tokens_tensor)
        
        return outputs.pooler_output
        
class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 510,
            embedding_func = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
#         tokenized_text = self.tokenizer.encode_plus(text,
#                                                     add_special_tokens=True,
#                                                     max_length=self.max_length
#                                                     )["input_ids"]
        
        tokenized_text = self.tokenizer.encode_plus(text, 
                                    add_special_tokens=True,
                                    max_length=self.max_length,
                                    padding='longest', 
                                    truncation=True,
                                    return_token_type_ids=True,
                                    return_attention_mask=True,
                                    return_tensors='pt'
                                    )
        return tokenized_text


    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized = self._tokenize(text)

        #embeddings = self.model(**tokenized)
        return self.embedding_func(self.model, tokenized)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()
        
#         return torch.stack([self._tokenize_and_predict(string) for string in text]).cpu()

        return torch.stack([self._tokenize_and_predict(text)]).cpu()

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

# Data sets

### English

In [6]:
FILE_PATH = '../data/jigsaw-toxic-comment-train.csv'

eng_ds = pd.read_csv(FILE_PATH)
eng_text = eng_ds.comment_text

### tranlated Spanish dataset to English -AWS

In [5]:
FILE_PATH = 'val_es_text_only__en_AWS_TRANSLATED-Copy1.csv'

es_ds = pd.read_csv(FILE_PATH)
es_text = es_ds.TranslatedText

# English BERT

In [7]:
tokenizer_mul = BertTokenizer.from_pretrained("distilbert-base-uncased")
model_mul = BertModel.from_pretrained('distilbert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing BertModel: ['distilbert.embeddings.word_embeddings.weight', 'distilbert.embeddings.position_embeddings.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.q_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.k_lin.weight', 'distilbert.transformer.layer.0.attention.k_lin.bias', 'distilbert.transformer.layer.0.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.v_lin.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.0.attention.out_lin.bias', 'distilbert.transformer.layer.0.sa_layer_norm.weight', 'distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.0.ffn.lin1.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'd

In [8]:
#bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed)
bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed_cls)
  

In [None]:
%%time
import pickle
import torch
import itertools

texts = [es_text]
files_name = ['val_es_text_only__en_AWS_TRANSLATED-Copy1_embedded_CLS_english_bert.pkl']

for i,t in enumerate(texts):
    emebedded = []
    for k,com in enumerate(t):
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f) 

In [None]:
%%time
import pickle
import torch
import itertools

texts = [es_text, eng_text]
files_name = ['es_ds_translated_aws_english_bert.pkl', 'jigsaw_toxic-comment_train_embedded_CLS_english_bert.pkl']

for i,t in enumerate(texts):
    emebedded = []
    for k,com in enumerate(t):
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f) 

    


In [10]:
%%time
import pickle
import torch
import itertools

texts = [eng_text]
files_name = ['jigsaw_toxic-comment_train_embedded_CLS_english_bert.pkl']

for i,t in enumerate(texts):
    emebedded = []
    for k,com in enumerate(t):
        print(k/len(t))
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f) 

    


0.0
5e-05
0.0001
0.00015
0.0002
0.00025
0.0003
0.00035
0.0004
0.00045
0.0005
0.00055
0.0006
0.00065
0.0007
0.00075
0.0008
0.00085
0.0009
0.00095
0.001
0.00105
0.0011
0.00115
0.0012
0.00125
0.0013
0.00135
0.0014
0.00145
0.0015
0.00155
0.0016
0.00165
0.0017
0.00175
0.0018
0.00185
0.0019
0.00195
0.002
0.00205
0.0021
0.00215
0.0022
0.00225
0.0023
0.00235
0.0024
0.00245
0.0025
0.00255
0.0026
0.00265
0.0027
0.00275
0.0028
0.00285
0.0029
0.00295
0.003
0.00305
0.0031
0.00315
0.0032
0.00325
0.0033
0.00335
0.0034
0.00345
0.0035
0.00355
0.0036
0.00365
0.0037
0.00375
0.0038
0.00385
0.0039
0.00395
0.004
0.00405
0.0041
0.00415
0.0042
0.00425
0.0043
0.00435
0.0044
0.00445
0.0045
0.00455
0.0046
0.00465
0.0047
0.00475
0.0048
0.00485
0.0049
0.00495
0.005
0.00505
0.0051
0.00515
0.0052
0.00525
0.0053
0.00535
0.0054
0.00545
0.0055
0.00555
0.0056
0.00565
0.0057
0.00575
0.0058
0.00585
0.0059
0.00595
0.006
0.00605
0.0061
0.00615
0.0062
0.00625
0.0063
0.00635
0.0064
0.00645
0.0065
0.00655
0.0066
0.00665
0.0067

In [11]:
1+1

2