In [1]:
import torch
from google.cloud import storage
import tokenizers
from transformers import BertTokenizer
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data.sampler import RandomSampler
import numpy as np
import random
import os

In [10]:
seq_length = 128
accum_multipler = 1
batch_size = 128
epochs = 2
warmup_ratio = 0.06
lr = 5e-4

data_size = os.stat("/mnt/d/data_masked_%s"%seq_length).st_size // batch_size

num_batches = -int(-data_size // batch_size) 
tot_num_steps   = -int(-data_size / batch_size / accum_multipler)  * epochs
warmup_steps = int(tot_num_steps * warmup_ratio)
data_size

18746072

In [3]:
# JUST TRY
# Test Speed:  1.1862 steps /s  :  / bs128 / accum 1 /

data_size = data_size // 40

In [4]:
12 * 3600 * 1.1862 # steps

51243.84

In [9]:
51243 * 128 / 2

3279552.0

In [11]:
data_size = 3279552

In [12]:

print('data_size:      ', data_size)
print('seq_length:     ', seq_length)
print('lr:             ', lr)
print('epochs:         ', epochs)
print('tot_num_steps:  ', tot_num_steps)
print('warmup_steps:   ', warmup_steps)

data_size:       3279552
seq_length:      128
lr:              0.0005
epochs:          2
tot_num_steps:   292906
warmup_steps:    17574


In [13]:
if torch.cuda.is_available():      
    device = torch.device("cuda")
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('CPU')
    device = torch.device("cpu")

GPU: Tesla V100-SXM2-16GB


In [14]:
tokenizer = BertWordPieceTokenizer(vocab_file = 'tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=seq_length)
tokenizer.enable_padding(length=seq_length)

In [15]:
data_original_fn = "/mnt/d/data_original_%s"%seq_length
data_masked_fn   = "/mnt/d/data_masked_%s"%seq_length

In [16]:
import random
i = random.randint(0, 100000)
with open(data_original_fn, "rb") as f:
    data = torch.tensor(np.fromfile(f,dtype=np.int32, count=seq_length, offset=seq_length*i*4))
    
with open(data_masked_fn, "rb") as f:
    data_masked = torch.tensor(np.fromfile(f,dtype=np.int32, count=seq_length, offset=seq_length*i*4))
    


In [17]:
from termcolor import colored
tensor = torch.zeros(())
labels = tensor.new_full(data.shape, -100).int()
labels[data!=data_masked] = data[data!=data_masked]

attention_mask = torch.where(data!=0, torch.ones_like(data), torch.zeros_like(data))

for id, label in zip(data, labels):
    token = tokenizer.id_to_token(id)
    if label >= 0:
        token = colored(token,'red')
    print(token, end=" ")
print()
print()

for id, label in zip(data, labels):
    if not id:
        continue
    token = tokenizer.id_to_token(id)
    if label >= 0:
        token = colored(tokenizer.id_to_token(label), 'blue')
    print(token, end=" ")
print()
print()
for id, label in zip(data_masked, labels):
    if not id:
        continue
    token = tokenizer.id_to_token(id)
    if label >= 0:
        token = colored(token,'red')
    print(token, end=" ")


[CLS] the full . she was watching him and he grinned at her . ‘ stop [31mfishing[0m for compliment ##s . ’ <nl> melan ##ie grinned back , her head on one side . ‘ so [31m,[0m what about you and fi ##zz beau ##mont then ? ’ <nl> it was his turn to hesitate . ‘ what [31mdo[0m you [31mmean[0m ? ’ <nl> she threw him a look full of misc ##hi ##ef [31m.[0m ‘ i asked her if [31myou[0m [31m’[0m d had [31ma[0m row or something . ’ <nl> ‘ [31mthat[0m [31mwas[0m very rude of you . ’ <nl> ‘ probably [31m.[0m but i wanted to know . ’ <nl> ‘ and what did she say [31m?[0m ’ <nl> melan ##ie ’ s eyes gle [SEP] 

[CLS] the full . she was watching him and he grinned at her . ‘ stop [34mfishing[0m for compliment ##s . ’ <nl> melan ##ie grinned back , her head on one side . ‘ so [34m,[0m what about you and fi ##zz beau ##mont then ? ’ <nl> it was his turn to hesitate . ‘ what [34mdo[0m you [34mmean[0m ? ’ <nl> she threw him a look full of misc ##hi ##ef [34m.[0m ‘ i asked 

In [18]:
class textDataset(Dataset):
    def __init__(self, size):
        self.size = size
    def __len__(self):
        return self.size
    def __getitem__(self,i):
        with open(data_original_fn, "rb") as f:
            data = torch.tensor(np.fromfile(f,dtype=np.int32, count=seq_length, offset=seq_length*i*4))

        with open(data_masked_fn, "rb") as f:
            data_masked = torch.tensor(np.fromfile(f,dtype=np.int32, count=seq_length, offset=seq_length*i*4))

        tensor = torch.zeros(())
        labels = tensor.new_full(data.shape, -100).int()
        labels[data!=data_masked] = data[data!=data_masked]
        
        attention_mask = torch.where(data!=0, torch.ones_like(data), torch.zeros_like(data))
        
        return data_masked.long(), labels.long(), attention_mask.long(), data.long()

In [19]:
dataset = textDataset(data_size)
dataloader = DataLoader(dataset, batch_size = batch_size, shuffle=True)

print('Actual batch size:', batch_size * accum_multipler)

print('Batch size per GPU per pass:', batch_size // torch.cuda.device_count())

Actual batch size: 128
Batch size per GPU per pass: 16


In [20]:
'''
from transformers import ElectraForMaskedLM, ElectraForPreTraining
from transformers import ElectraConfig
import torch.nn as nn

generator_config = ElectraConfig(
    vocab_size=50000,
    hidden_size = 64,
    intermediate_size = 128,
    num_attention_heads = 1
)
discriminator_config = ElectraConfig(
    vocab_size=50000
)

generator = nn.DataParallel(ElectraForMaskedLM(config=generator_config))
generator.to(device)
discriminator = nn.DataParallel(ElectraForPreTraining(config=discriminator_config))
discriminator.to(device)
'''

'\nfrom transformers import ElectraForMaskedLM, ElectraForPreTraining\nfrom transformers import ElectraConfig\nimport torch.nn as nn\n\ngenerator_config = ElectraConfig(\n    vocab_size=50000,\n    hidden_size = 64,\n    intermediate_size = 128,\n    num_attention_heads = 1\n)\ndiscriminator_config = ElectraConfig(\n    vocab_size=50000\n)\n\ngenerator = nn.DataParallel(ElectraForMaskedLM(config=generator_config))\ngenerator.to(device)\ndiscriminator = nn.DataParallel(ElectraForPreTraining(config=discriminator_config))\ndiscriminator.to(device)\n'

In [21]:
from transformers import ElectraForMaskedLM, ElectraForPreTraining
from transformers import ElectraConfig
import torch.nn as nn
'''
        vocab_size=30522,
        embedding_size=128,
        hidden_size=256,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=1024,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,

'''

'''
sequence length: 512 -> 128

batch size: 256 -> 128

model hidden dimension: 768 -> 256

embeddings: 768 -> 128
'''
generator_config = ElectraConfig(
    max_position_embeddings=seq_length,
    num_hidden_layers=12,
    vocab_size=50000,
    embedding_size=128,
    hidden_size = 64,
    intermediate_size = 256,
    num_attention_heads=1,
)
discriminator_config = ElectraConfig(
    max_position_embeddings=seq_length,
    num_hidden_layers=12,
    vocab_size=50000,
    embedding_size=128,
    hidden_size=256,
    intermediate_size=1024,
    num_attention_heads=4,
)

generator = nn.DataParallel(ElectraForMaskedLM(config=generator_config))
generator.to(device)
discriminator = nn.DataParallel(ElectraForPreTraining(config=discriminator_config))
discriminator.to(device)

DataParallel(
  (module): ElectraForPreTraining(
    (electra): ElectraModel(
      (embeddings): ElectraEmbeddings(
        (word_embeddings): Embedding(50000, 128, padding_idx=0)
        (position_embeddings): Embedding(128, 128)
        (token_type_embeddings): Embedding(2, 128)
        (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=256, out_features=256, bias=True)
                (key): Linear(in_features=256, out_features=256, bias=True)
                (value): Linear(in_features=256, out_features=256, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfO

In [22]:
discriminator.module.electra.embeddings = generator.module.electra.embeddings

In [23]:
'''

total_steps seems wrong

from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
generator_optimizer = AdamW(generator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)
discriminator_optimizer = AdamW(discriminator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)


total_steps = len(dataloader) * epochs
generator_scheduler = get_linear_schedule_with_warmup(generator_optimizer, 
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)
discriminator_scheduler = get_linear_schedule_with_warmup(discriminator_optimizer, 
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)
                                            
                                            
                                            '''

'\n\ntotal_steps seems wrong\n\nfrom transformers import get_linear_schedule_with_warmup\nfrom transformers import AdamW\ngenerator_optimizer = AdamW(generator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)\ndiscriminator_optimizer = AdamW(discriminator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)\n\n\ntotal_steps = len(dataloader) * epochs\ngenerator_scheduler = get_linear_schedule_with_warmup(generator_optimizer, \n                                            num_warmup_steps = warmup_steps,\n                                            num_training_steps = total_steps)\ndiscriminator_scheduler = get_linear_schedule_with_warmup(discriminator_optimizer, \n                                            num_warmup_steps = warmup_steps,\n                                            num_training_steps = total_steps)\n                                            \n                                            \n                                            '

In [24]:
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
generator_optimizer = AdamW(generator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)
discriminator_optimizer = AdamW(discriminator.parameters(), betas=(0.9, 0.999), lr = lr, weight_decay=0.01)

total_steps = len(dataloader) * epochs
generator_scheduler = get_linear_schedule_with_warmup(generator_optimizer, 
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = tot_num_steps)
discriminator_scheduler = get_linear_schedule_with_warmup(discriminator_optimizer, 
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = tot_num_steps)

In [25]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:


total_t0 = time.time()
for epoch_i in range(0, epochs):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    generator_train_loss = 0
    discriminator_train_loss = 0

    generator.train()
    discriminator.train()
    generator.zero_grad()
    discriminator.zero_grad()
    for step, batch in enumerate(dataloader):
        #generator
        generator_input = batch[0].to(device)
        generator_labels = batch[1].to(device)
        generator_mask = batch[2].to(device)
        generator_original = batch[3].to(device)
        
        generator_loss, generator_scores = generator(generator_input, attention_mask=generator_mask, labels=generator_labels)
        generator_loss = generator_loss.mean()
        generator_train_loss += generator_loss.item()
        generator_loss.backward()
        torch.nn.utils.clip_grad_norm_(generator.parameters(), 1.0)
        
        #discriminator
        discriminator_input = torch.where(generator_labels>=0, torch.argmax(generator_scores,dim=2), generator_original)
        discriminator_labels = torch.where(discriminator_input==generator_original, 
                                           torch.zeros_like(generator_original), torch.ones_like(generator_original))
        discriminator_mask = generator_mask
        
        
        discriminator_loss, discriminator_scores = discriminator(discriminator_input, 
                                                    attention_mask=discriminator_mask, labels=discriminator_labels)
        discriminator_loss = discriminator_loss.mean()
        discriminator_train_loss += discriminator_loss.item()
        discriminator_loss.backward()
        torch.nn.utils.clip_grad_norm_(discriminator.parameters(), 1.0)
        
        if step % accum_multipler == 0 and (accum_multipler == 1 or step != 0):
            generator_optimizer.step()
            generator_scheduler.step()
            discriminator_optimizer.step()
            discriminator_scheduler.step()
            generator.zero_grad()
            discriminator.zero_grad()
        
        if step % 200 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.    Generator Loss: {:.3f}.    Discriminator Loss: {:.3f}.'
                  .format(step, len(dataloader), elapsed, generator_train_loss/40, discriminator_train_loss/40))
            generator_train_loss = 0
            discriminator_train_loss = 0
            



Training...




  Batch   200  of  25,622.    Elapsed: 0:03:17.    Generator Loss: 54.284.    Discriminator Loss: 2.914.
  Batch   400  of  25,622.    Elapsed: 0:06:02.    Generator Loss: 52.188.    Discriminator Loss: 1.462.
  Batch   600  of  25,622.    Elapsed: 0:08:48.    Generator Loss: 48.621.    Discriminator Loss: 0.640.
  Batch   800  of  25,622.    Elapsed: 0:11:35.    Generator Loss: 44.140.    Discriminator Loss: 0.409.
  Batch 1,000  of  25,622.    Elapsed: 0:14:24.    Generator Loss: 40.509.    Discriminator Loss: 0.550.
  Batch 1,200  of  25,622.    Elapsed: 0:17:08.    Generator Loss: 38.512.    Discriminator Loss: 0.437.
  Batch 1,400  of  25,622.    Elapsed: 0:19:54.    Generator Loss: 37.989.    Discriminator Loss: 0.432.
  Batch 1,600  of  25,622.    Elapsed: 0:22:44.    Generator Loss: 37.774.    Discriminator Loss: 0.426.
  Batch 1,800  of  25,622.    Elapsed: 0:25:30.    Generator Loss: 37.747.    Discriminator Loss: 0.428.
  Batch 2,000  of  25,622.    Elapsed: 0:28:16.    Gene

In [27]:
  Batch 16,800  of  25,622.    Elapsed: 3:54:08.    Generator Loss: 24.918.    Discriminator Loss: 0.752.\n
  Batch 17,000  of  25,622.    Elapsed: 3:56:57.    Generator Loss: 24.956.    Discriminator Loss: 0.754.\n
  Batch 17,200  of  25,622.    Elapsed: 3:59:46.    Generator Loss: 24.981.    Discriminator Loss: 0.752.\n
  Batch 17,400  of  25,622.    Elapsed: 4:02:35.    Generator Loss: 24.905.    Discriminator Loss: 0.752.\n
  Batch 17,600  of  25,622.    Elapsed: 4:05:21.    Generator Loss: 24.907.    Discriminator Loss: 0.751.\n
  Batch 17,800  of  25,622.    Elapsed: 4:08:04.    Generator Loss: 24.870.    Discriminator Loss: 0.747.\n
  Batch 18,000  of  25,622.    Elapsed: 4:10:51.    Generator Loss: 24.880.    Discriminator Loss: 0.751.\n
  Batch 18,200  of  25,622.    Elapsed: 4:13:40.    Generator Loss: 24.884.    Discriminator Loss: 0.752.\n
  Batch 18,400  of  25,622.    Elapsed: 4:16:27.    Generator Loss: 24.838.    Discriminator Loss: 0.750.\n
  Batch 18,600  of  25,622.    Elapsed: 4:19:13.    Generator Loss: 24.805.    Discriminator Loss: 0.747.\n
  Batch 18,800  of  25,622.    Elapsed: 4:22:02.    Generator Loss: 24.842.    Discriminator Loss: 0.754.\n
  Batch 19,000  of  25,622.    Elapsed: 4:24:48.    Generator Loss: 24.837.    Discriminator Loss: 0.748.\n







                
                
                
                
                
                
                
                
                

1

In [None]:
torch.save(generator,'electra_small_generator.pth')
torch.save(discriminator,'electra_small_discriminator.pth')

In [11]:

for step, batch in enumerate(dataloader):
    break
    
batch[2]

tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])

In [None]:
discriminator = torch.load('electra_small_discriminator.pth')
discriminator.cuda()
generator = torch.load('electra_small_generator.pth')
generator.cuda()

In [9]:
generator.eval()
discriminator.eval()
with torch.no_grad():
    for step, batch in enumerate(dataloader):
        generator_input = batch[0].to(device)
        generator_labels = batch[1].to(device)
        generator_mask = batch[2].to(device)
        generator_original = batch[3].to(device)

        generator_loss, generator_scores = generator(generator_input, attention_mask=generator_mask, labels=generator_labels)


        #discriminator
        discriminator_input = torch.where(generator_labels>=0, torch.argmax(generator_scores,dim=2), generator_original)
        discriminator_labels = torch.where(discriminator_input==generator_original, 
                                           torch.zeros_like(generator_original), torch.ones_like(generator_original))
        discriminator_mask = generator_mask
        discriminator_loss, discriminator_scores = discriminator(discrimisnator_input, 
                                                    attention_mask=discriminator_mask, labels=discriminator_labels)
        break





In [15]:
generator_loss

tensor([6.6637, 6.1166, 6.7644, 7.4927, 6.7277, 6.4525, 6.4347, 5.8803],
       device='cuda:0')

In [11]:
from termcolor import colored

In [12]:

for id, label in zip(generator_input[0], generator_labels[0]):
    if id == 0:
        continue
    token = tokenizer.id_to_token(id)
    if label > 0:
        token = colored(token,'red')
    print(token, end=" ")
    
print()
print()

for id, label in zip(generator_input[0], generator_labels[0]):
    if id == 0:
        continue
    token = tokenizer.id_to_token(id)
    if label > 0:
        token = colored(tokenizer.id_to_token(label),'red')
    print(token, end=" ")

[CLS] # 架 垃 圾 車 違 反 交 通 [31m##bb[0m [31m1923[0m 喎 , 執 行 [31m[MASK][0m 拆 [31m[MASK][0m 可 以 無 視 法 律 ? <nl> # 1 a : [31m[MASK][0m 咁 小 學 雞 啦 😗 <nl> # [31m[MASK][0m b : # [31m[MASK][0m 🤡 off ， 破 壞 法 治 丫 嗎 [31m[MASK][0m 架 車 唔 係 警 察 [31m[MASK][0m 喎 ， [31m[MASK][0m 咩 有 特 權 ？ 幫 警 察 就 有 特 權 ？ <nl> [31m##らに[0m [31m[MASK][0m c : # 2 咩 拎 野 小 學 雞 你 跟 足 人 地 個 套 玩 [31m[MASK][0m 係 [31m侚[0m [31m##1999[0m [31m桉[0m ? <nl> [SEP] 

[CLS] # 架 垃 圾 車 違 反 交 通 [31m規[0m [31m則[0m 喎 , 執 行 [31m清[0m 拆 [31m就[0m 可 以 無 視 法 律 ? <nl> # 1 a : [31m咪[0m 咁 小 學 雞 啦 😗 <nl> # [31m2[0m b : # [31m1[0m 🤡 off ， 破 壞 法 治 丫 嗎 [31m，[0m 架 車 唔 係 警 察 [31m架[0m 喎 ， [31m憑[0m 咩 有 特 權 ？ 幫 警 察 就 有 特 權 ？ <nl> [31m#[0m [31m3[0m c : # 2 咩 拎 野 小 學 雞 你 跟 足 人 地 個 套 玩 [31m就[0m 係 [31m小[0m [31m學[0m [31m雞[0m ? <nl> [SEP] 

In [14]:
from termcolor import colored

for id, label in zip(discriminator_input[0], discriminator_labels[0]):
    if id == 0:
        continue
    token = tokenizer.id_to_token(id)
    if label > 0:
        token = colored(token,'red')
    print(token, end=" ")
print()
print()

[CLS] # 架 垃 圾 車 違 反 交 通 [31m係[0m [31m#[0m 喎 , 執 行 [31m#[0m 拆 [31m#[0m 可 以 無 視 法 律 ? <nl> # 1 a : [31m#[0m 咁 小 學 雞 啦 😗 <nl> # [31m#[0m b : # [31m#[0m 🤡 off ， 破 壞 法 治 丫 嗎 [31m#[0m 架 車 唔 係 警 察 [31m#[0m 喎 ， [31m#[0m 咩 有 特 權 ？ 幫 警 察 就 有 特 權 ？ <nl> # [31m#[0m c : # 2 咩 拎 野 小 學 雞 你 跟 足 人 地 個 套 玩 [31m#[0m 係 [31m#[0m [31m#[0m [31m#[0m ? <nl> [SEP] 



In [52]:
generator_original[0][discriminator_labels[0] == 1]

tensor([   40, 11948,  4160,  4145,  4451, 13918,  3281,  4882,  6236,  7232,
        18180,  8172, 13287, 26257, 15006,  9942,  6938,  4160,  2473,  2353,
         9978,  4145,  4161,  2684, 11145,  2888,  8946,  3429,  3379, 11840,
         6948,  6349,  7796,  5730, 13287,  3290,  3429,  3379, 11840,  6948,
         6349,  5450,  4149, 13918,  2520,  2518,  6207,  2888,  8946,  8887,
         6906,  8887,  6906,  8391,  2874,  6222, 11851,  7051,  5457],
       device='cuda:0')

In [44]:
discriminator_labels[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [45]:
tokenizer = BertWordPieceTokenizer(vocab_file = 'tokenizer/vocab.txt')
tokenizer.add_special_tokens(["<nl>"])
tokenizer.enable_truncation(max_length=512)
tokenizer.enable_padding(length=512)

In [49]:
tokenizer.decode(generator_original[1].tolist())

'# 112 c : 請 廣 傳 唔 佔 等 auntie 餵 狗 ？ 等 等 等 政 府 明 拖 你 龍 和 道 告 急 ！ ！ 1 ， 沒 公 民 提 名 2 ， 不 下 台 3 ， 沒 普 選 4 ， 沒 公 開 約 實 完 全 沒 回 應 明 玩 9 你 班 愚 民 你 班 垃 圾 左 膠 咩 事 宜 家 唔 佔 士 氣 及 談 判 籌 碼 大 減 ！ 輸 龍 和 運 動 必 輸 ！ # 113 bi : # 112 無 錯 ， 學 聯 賣 港 ， 行 動 升 級 ！ 🔥 # 114 bj : 明 知 傾 唔 掂 都 傾 係 形 象 工 程 有 得 傾 都 衝 就 會 比 人 覺 得 你 無 誠 意 為 佔 而 佔 宜 家 同 捉 棋 一 樣 可 進 可 退 ！ # 115 bc : # 114 而 家 唔 係 幫 你 班 hihi 戴 光 環 呀 👋 傾 乜 呀 ？ 會 答 應 根 本 無 須 傾 # 116 bk : # 115 傾 係 無 用 傾 得 成 先 叫 停 我 地 唔 該 # 117 bl : 今 次 政 府 收 唔 到 科 因 為 活 動 而 家 變 成 市 民 自 發 就 算 佢 同 學 聯 傾 除 非 真 係 傾 到 d 乜 否 則 一 定 收 唔 到 科 # 118 bj :? _?? _??? i? _?? y? ζh? u? {? _? o? _???? n? |?? h?? o? a? l?? n? _?? _??? y? a? p? _? [ email protected ]??? i? i? i? h? i # 119 bm : 怯 就 輸 一 世! 添 華 道 + 龍 和 道 必 須 圍 城 斷 路! 請 問, 大 家 手 無 寸 鐵, 唔 衝 擊, 可 以 做 到 d 乜? 有! 就 係 圍 城, 包 圍 到 滴 水 不 溜, 斷 敵 軍 出 入, 逼 降! 呢 樣 已 經 係 最 和 平 可 以 做 到, 而 又 最 有 效 既 手 段! 你 見 警 方 大 陣 仗 佈 防 就 知, 包 圍 呢 招 係 work! 我 地 唔 出 手, 純 包 圍, 包 圍 到 滴 水 不 溜, 不 準 進 出, 佢 地 敢 再 出 防 暴 就 全 世 界 聲 討! 如 果 連 呢 樣 都 做 唔 到, 我 真 係'

In [50]:
tokenizer.decode(discriminator_input[1].tolist())

'flor 112 c : 請 廣 傳 唔 佔 等 auntie 餵 狗 ？ 等 等 等 政 府 明 拖 你 龍 和 道 告 急 ！ ！ 1 alle 沒 driveusc 提 名 dam ， 不 下 台 3 ， 沒 普 選 4 ， 沒 公 開 約 實 完 全 滌 脧 應 明 玩 9 你 班ⓖ 傜 你 班 垃 圾 左 膠 咩 事 宜 家 唔 flor nova 氣 及 談 判 dak 畄 大 減 ！ 輸 龍 和 durham 鎡 必 輸 ！ # 113 bi : # 112 無 錯 ， 學 聯 賣 港 ， 行 動 升 級 ！ 🔥 # 114 bj : 明 知 傾 唔 掂 都ⓖ 係 形 象 工 程 有 得 傾 都 衝 就 會 比 人 覺 得 你 無 誠 意 為 佔 而 점령 宜 家 同 捉 棋 一 樣ཙ bil 可 退 ！✰ 115 bc : # 114 neck 滌 唔 係 幫 你 班 hihi 戴 光 環 呀 👋 傾 乜 娇 ？ 齿 통 應 根 本 無 須 傾 # 116 bk : www 115 傾 係 無 用 cave 獫 成 先 叫 停 我 地 唔 該 # 117 bl : 今 次 政 府 收 唔 到 科 因 為 活 動 而 家 變 成 市 民 自 發 就 算 佢 同 damned 聯 傾 除 非 真 係 傾 到 d 乜 否 則 一 定 收 唔 到 科 # dozen bj :? _?? 翩?? recently i? _ 🥮? y? ζ ㄽ? u? 琲? _? o?베이? municip?? n? |?? brilliant?? o? a cruel l?? n? _?? _? 畄? y? a? p? _ rights [ email protected ]??? i? i? i? h? i # 119 bm : 怯 就 includes 一 世! 魁 2b 특징 + flor rights 交 必 須 圍 城 斷 路! 請 問 塬 大 家 手 無 寸 鐵, 唔💎 1894, 可 以 做 到 d 乜? 有! 就 係 圍 城, × 2020년 到 滴 水 不 溜, 斷 敵 軍 出 入, 逼 降! 呢 樣 已 經 係 最 和 平 可 以 做 到 しられた 👫 又 1894 有 效 既 躭 畄! 你 見 警 方 大 陣 仗 佈 防 就