In [1]:
from transformers import BertTokenizerFast, BertTokenizer

In [2]:
# Load the pretrained `bert-base-uncased` tokenizer used by the encode/decode cells that follow.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Demo sentence pair: `seq_first` is fed as segment A, `seq_second` as segment B.
seq_first = (
    'hello my name is john nice to meet you '
    'today is a good day is not it'
)
seq_second = 'hello i am marry first time to see you'

In [4]:
# Encode the sentence pair as a single BERT input padded to a fixed length of 64.
# `padding='max_length'` is the supported spelling of the deprecated
# `pad_to_max_length=True`; the produced tensors are the same.
a = tokenizer.encode_plus(
    seq_first,
    seq_second,
    add_special_tokens=True,
    max_length=64,
    padding='max_length',
    truncation=False,
    return_attention_mask=True,
    return_tensors='pt',
)



In [5]:
# Display the encoded pair: input ids, token type ids and attention mask.
a

{'input_ids': tensor([[ 101, 7592, 2026, 2171, 2003, 2198, 3835, 2000, 3113, 2017, 2651, 2003,
         1037, 2204, 2154, 2003, 2025, 2009,  102, 7592, 1045, 2572, 5914, 2034,
         2051, 2000, 2156, 2017,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [6]:
# Decode the ids back to text to check where [CLS], [SEP] and [PAD] were placed.
tokenizer.decode(a['input_ids'][0])

'[CLS] hello my name is john nice to meet you today is a good day is not it [SEP] hello i am marry first time to see you [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [7]:
import random
import torch

class SimpleTokenizer:
    """Hand-rolled builder of BERT-style sentence-pair inputs with random masking.

    Wraps the pretrained ``bert-base-uncased`` tokenizer and lays out
    ``[CLS] A [SEP] B [SEP] [PAD]...`` explicitly, returning the token ids
    together with mask / segment / attention / position vectors.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # end

    def generate_training_embedding(self, seq_a, seq_b, probs_mask=0.15, max_length=64):
        """Build fixed-length inputs for the pair ``(seq_a, seq_b)``.

        Both sequences are split into tokens with the wrapped tokenizer (so the
        tokens exist in its vocabulary instead of falling back to ``[UNK]``),
        truncated longest-first until the pair plus ``[CLS]`` and two ``[SEP]``
        fits in ``max_length``, and then padded with ``[PAD]`` up to
        ``max_length``.

        Args:
            seq_a: Text of the first segment.
            seq_b: Text of the second segment.
            probs_mask: Fraction of the non-special tokens to select for
                masking; the count is ``int(n_tokens * probs_mask)``.
            max_length: Length of every returned sequence.

        Returns:
            dict with:
                'tokens_id':  list of ``max_length`` vocabulary ids.
                'masks':      bool tensor, True at the positions selected for masking.
                'segments':   int32 tensor, 0 for ``[CLS] A [SEP]``, 1 for ``B [SEP]``,
                              0 for padding.
                'attentions': int32 tensor, 1 for real tokens, 0 for padding and for
                              the positions selected for masking.
                'positions':  int32 tensor ``0 .. max_length - 1``.

        Raises:
            ValueError: if ``max_length`` cannot hold the three special tokens.
        """
        n_special = 3  # [CLS] + [SEP] after A + [SEP] after B
        if max_length < n_special:
            raise ValueError(
                'max_length must be at least %d to hold [CLS] and both [SEP] tokens, got %d'
                % (n_special, max_length)
            )

        tokens_a = list(self.tokenizer.tokenize(seq_a))
        tokens_b = list(self.tokenizer.tokenize(seq_b))

        # Longest-first truncation: drop trailing tokens from whichever segment is
        # currently longer until the pair fits next to the special tokens.
        budget = max_length - n_special
        while len(tokens_a) + len(tokens_b) > budget:
            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
            longer.pop()

        tokens_pair = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        n_pad = max_length - len(tokens_pair)
        tokens_all = tokens_pair + ['[PAD]'] * n_pad

        # Candidate positions for masking: every token of A and B, never the
        # special tokens. A starts at index 1, B right after the first [SEP].
        first_b = len(tokens_a) + 2
        indexs_mask_all = list(range(1, len(tokens_a) + 1)) \
            + list(range(first_b, first_b + len(tokens_b)))
        random.shuffle(indexs_mask_all)
        n_masked = int(len(indexs_mask_all) * probs_mask)
        # Explicit long tensor so that an empty selection is still a valid index.
        indexs_masked = torch.tensor(indexs_mask_all[:n_masked], dtype=torch.long)

        t_segments_all = torch.tensor(
            [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1) + [0] * n_pad,
            dtype=torch.int32,
        )
        t_attentions_all = torch.tensor(
            [1] * len(tokens_pair) + [0] * n_pad,
            dtype=torch.int32,
        )
        # NOTE(review): the selected positions are hidden through the attention
        # vector while their token ids are left unchanged (no [MASK] substitution).
        # Confirm this matches what the consuming model expects.
        t_attentions_all[indexs_masked] = 0
        t_masks = torch.zeros(len(tokens_all), dtype=torch.bool)
        t_masks[indexs_masked] = True
        t_position_all = torch.arange(len(tokens_all), dtype=torch.int32)
        # Kept as a plain list of ids, as callers receive it today.
        t_tokens_id = self.tokenizer.convert_tokens_to_ids(tokens_all)

        return {
            'tokens_id': t_tokens_id,
            'masks': t_masks,
            'segments': t_segments_all,
            'attentions': t_attentions_all,
            'positions': t_position_all
        }
    # end
# end

In [8]:
# Instantiate the helper; its constructor loads the pretrained tokenizer.
my_t = SimpleTokenizer()

In [9]:
# Build the inputs for the demo pair. The masked positions are drawn with
# `random` and no seed is set, so the 'masks' entry differs between runs.
my_t.generate_training_embedding(seq_first, seq_second)

{'tokens_id': [101,
  7592,
  2026,
  2171,
  2003,
  2198,
  3835,
  2000,
  3113,
  2017,
  2651,
  2003,
  1037,
  2204,
  2154,
  2003,
  2025,
  2009,
  102,
  7592,
  1045,
  2572,
  5914,
  2034,
  2051,
  2000,
  2156,
  2017,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'masks': tensor([False, False, False, False, False, False, False,  True, False, False,
         False, False, False, False, False, False,  True, False, False, False,
         False, False, False, False, False, False, False,  True, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False]),
 'segments': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
# Toy embedding table: 100 ids, each mapped to a 512-dimensional vector.
embedder = torch.nn.Embedding(num_embeddings=100, embedding_dim=512)

In [17]:
# Toy output projection from the 512-dimensional space back to 100 logits.
decoder = torch.nn.Linear(in_features=512, out_features=100)

In [39]:
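# Weight-tying experiments between `embedder` and `decoder` (all disabled).
# Embedding(100, 512).weight and Linear(512, 100).weight are both shaped (100, 512),
# so only the last form assigns a shape-compatible tensor; the transposed variants
# produce a (512, 100) tensor.
# decoder.weight = torch.nn.Parameter(embedder.weight.t())
# decoder.weight[:] = embedder.weight.T[:]
# decoder.weight = embedder.weight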

In [49]:
# Boolean selector over dim 1 and a small (2, 3, 4) tensor to practise masked selection on.
a = torch.tensor([True, False, True], dtype=torch.bool)
b = torch.arange(24).reshape(2, 3, 4)

In [64]:
# Display the (2, 3, 4) practice tensor.
b

tensor([[[ 0,  1,  2,  3],
         [ 4,  5,  6,  7],
         [ 8,  9, 10, 11]],

        [[12, 13, 14, 15],
         [16, 17, 18, 19],
         [20, 21, 22, 23]]])

In [54]:
# Reshape the selector to (1, 3, 1) and broadcast it to the shape of `b`.
a_new = a.unsqueeze(0).unsqueeze(-1).expand_as(b)

In [66]:
# Keep only the dim-1 entries where `a` is True, then restore the
# (batch, kept, features) layout from the flat result of masked_select.
b.masked_select(a[None, :, None]).view(b.size(0), -1, b.size(-1))

In [70]:
# Take index 0 along dim 1 for every batch item (drops that dimension).
b[:, 0, :]

In [77]:
# torch.gather needs `index` to have as many dimensions as the input, so the
# per-batch index (shape (2, 1)) is given a trailing axis and expanded across
# the last dimension of `b`. The result has shape (2, 1, 4) and holds entry 0
# along dim 1 for each batch item.
row_index = torch.tensor([[0], [0]])
torch.gather(b, 1, row_index[:, :, None].expand(-1, -1, b.shape[-1]))

In [1]:
from transformers import BertTokenizer

In [4]:
# Load the `bert-base-uncased` tokenizer to inspect its vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# Size of the tokenizer's base vocabulary.
tokenizer.vocab_size

30522