In [2]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import time
#from keras.preprocessing.sequence import pad_sequences


In [3]:
def tok(tokenizer, text):
    return tokenizer.encode_plus(text, 
                                    add_special_tokens=True,
                                    max_length=510,
                                    padding='longest', 
                                    truncation=True,
                                      return_token_type_ids=True,
                                      return_attention_mask=True,
                                      return_tensors='pt'
                                    )

In [4]:
def set_embed(df, model, tokenizer):
    l = []
    for i, review in enumerate(df.review_body):
        tokened = tok(tokenizer, review)
        #print(model(**tokenizer))
        l.append(embed(model_mul, tokened).numpy())
    return l

In [5]:
from typing import Callable, List, Optional, Tuple
from torch import nn
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
import torch

def embed(model, tokens_tensor ):

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to('cuda')
        model.to('cuda')
        outputs = model(**tokens_tensor)

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
        hidden_states = outputs[2]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        # Stores the token vectors, with shape [6 x 768]
    
    token_vecs_sum = []

    # `token_embeddings` is a [6 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [6 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)

        # Use `sum_vec` to represent `token`.
        token_vecs_sum.append(sum_vec)
        
    token_vecs = hidden_states[-2][0]

    # Calculate the average of all 6 token vectors.
    sentence_embedding = torch.mean(token_vecs, dim=0)
    
    return sentence_embedding

def embed_cls(model, tokens_tensor):
    #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        tokens_tensor = tokens_tensor.to('cuda')       
        model.to('cuda')
        
        #if torch.cuda.device_count() > 1:
            #print("Let's use", torch.cuda.device_count(), "GPUs!")
            #model = nn.DataParallel(model)
        #else:
            #model.to(device)
        
        outputs = model(**tokens_tensor)
        
        return outputs.pooler_output
        
class BertTransformer(BaseEstimator, TransformerMixin):
    def __init__(
            self,
            bert_tokenizer,
            bert_model,
            max_length: int = 510,
            embedding_func = None,
    ):
        self.tokenizer = bert_tokenizer
        self.model = bert_model
        self.model.eval()
        self.max_length = max_length
        self.embedding_func = embedding_func

        if self.embedding_func is None:
            self.embedding_func = lambda x: x[0][:, 0, :].squeeze()

    def _tokenize(self, text: str) -> Tuple[torch.tensor, torch.tensor]:
        # Tokenize the text with the provided tokenizer
#         tokenized_text = self.tokenizer.encode_plus(text,
#                                                     add_special_tokens=True,
#                                                     max_length=self.max_length
#                                                     )["input_ids"]
        
        tokenized_text = self.tokenizer.encode_plus(text, 
                                    add_special_tokens=True,
                                    max_length=self.max_length,
                                    padding='longest', 
                                    truncation=True,
                                    return_token_type_ids=True,
                                    return_attention_mask=True,
                                    return_tensors='pt'
                                    )
        return tokenized_text


    def _tokenize_and_predict(self, text: str) -> torch.tensor:
        tokenized = self._tokenize(text)

        #embeddings = self.model(**tokenized)
        return self.embedding_func(self.model, tokenized)

    def transform(self, text: List[str]):
        if isinstance(text, pd.Series):
            text = text.tolist()
        
#         return torch.stack([self._tokenize_and_predict(string) for string in text]).cpu()

        return torch.stack([self._tokenize_and_predict(text)]).cpu()

    def fit(self, X, y=None):
        """No fitting necessary so we just return ourselves"""
        return self

# Data sets

### English

In [6]:
FILE_PATH = 'data/jigsaw-toxic-comment-train.csv'

eng_ds = pd.read_csv(FILE_PATH)
eng_text = eng_ds.comment_text

### Spanish

In [7]:
FILE_PATH = 'data/val_es_text_only.csv'

es_ds = pd.read_csv(FILE_PATH)
es_text = es_ds.comment_text

In [9]:
FILE_PATH = 'data/test-processed-seqlen128.csv'
pd.read_csv(FILE_PATH)


Unnamed: 0,id,comment_text,input_word_ids,input_mask,all_segment_id
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,"(101, 17376, 14516, 19165, 56324, 10116, 24542...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"Вполне возможно, но я пока не вижу необходимо...","(101, 511, 53204, 36689, 44504, 117, 11279, 57...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"Quindi tu sei uno di quelli conservativi , ...","(101, 35921, 17938, 13055, 13868, 11381, 10120...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,"(101, 59170, 16822, 99087, 10284, 83972, 51782...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,"(101, 131, 32070, 11759, 131, 11045, 23388, 10...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
63807,63807,"No, non risponderò, come preannunciato. Prefer...","(101, 10657, 117, 10446, 29956, 54609, 102754,...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
63808,63808,"Ciao, I tecnici della Wikimedia Foundation sta...","(101, 51457, 14875, 117, 146, 10361, 101788, 1...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
63809,63809,innnazitutto ti ringrazio per i ringraziamenti...,"(101, 15203, 10219, 46680, 109056, 14382, 2155...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
63810,63810,Kaç olumlu oy gerekiyor? Şu an 7 oldu. Hayır...,"(101, 25444, 13406, 30668, 107357, 183, 10157,...","(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Spanish ds translated to Englisgh - AWS

In [11]:
FILE_PATH = 'translation/val_es_text_only__en_AWS_TRANSLATED.csv'

es_t_eng_aws_ds = pd.read_csv(FILE_PATH)
es_t_eng_aws_text = es_t_eng_aws_ds.TranslatedText

# Labse BERT

In [13]:
tokenizer_mul = BertTokenizer.from_pretrained("pvl/labse_bert")
model_mul = BertModel.from_pretrained('pvl/labse_bert',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

Downloading:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

In [14]:
#bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed)
bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed_cls)
  

In [16]:
import pickle
import itertools

texts = [es_t_eng_aws_text]
files_name = ['es_t_eng_aws_text.pkl']


for i,t in enumerate(texts):
    emebedded = []
    start = time.time()
    for com in t:
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)    
    end = time.time()
    print(end - start)

101.24582242965698


# Multilingual BERT

In [44]:
from transformers import BertTokenizer, TFBertModel

tokenizer_mul = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_mul = TFBertModel.from_pretrained("bert-base-multilingual-cased")


ImportError: 
TFBertModel requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the
installation page: https://www.tensorflow.org/install and follow the ones that match your environment.


In [45]:
bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed_cls)


In [None]:
import pickle
import itertools

texts = [es_text, eng_text]
files_name = ['validation_es_only_text_CLS_multilingual_bert.pkl', 'jigsaw_toxic-comment_train_embedded_CLS_multilingual_bert.pkl']


for i,t in enumerate(texts):
    emebedded = []
    start = time.time()
    for com in t:
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)    
    end = time.time()
    print(end - start)

# xlm-roberta-large-xnli

In [11]:
!pip uninstall transformers -y


Found existing installation: transformers 4.5.1
Uninstalling transformers-4.5.1:
  Successfully uninstalled transformers-4.5.1


In [7]:
!pip install  sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 18.0 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [7]:

from transformers import XLMRobertaModel , XLMRobertaTokenizer

tokenizer_mul = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model_mul = XLMRobertaModel.from_pretrained("xlm-roberta-base", output_hidden_states = True)

In [8]:
from transformers import XLMRobertaTokenizer

In [9]:
bert_transformer = BertTransformer(tokenizer_mul, model_mul, embedding_func=embed_cls)


In [10]:
import pickle
import itertools

texts = [es_text, eng_text]
files_name = ['validation_es_only_text_CLS_xlm-roberta-large-xnli.pkl', 'jigsaw_toxic-comment_train_embedded_CLS_xlm-roberta-large-xnli.pkl']

texts = [eng_text]
files_name = ['jigsaw_toxic-comment_train_embedded_CLS_xlm-roberta-large-xnli_2.pkl']



for i,t in enumerate(texts):
    emebedded = []
    start = time.time()
    for com in t:
        emebedded.append(bert_transformer.transform(com))
    
    with open(files_name[i], 'wb') as f:
        torch.save(list(itertools.chain(*emebedded)), f)    
    end = time.time()
    print(end - start)

4355.596848249435


In [None]:
1+1