# Tokenization for Huggingface Transformer model (GPT2/Reformer/TransformerXL/...) training on HUWIKI

## Get data
<br>
<font size="4"> 
We have a preprocessed huwiki dump (20200520) stored as a list of txt files in a google cloud bucket at <br> 
<code>gs://hungpt2-wikipedia/full_wiki_cleaned/*</code> <br><br>
Here we access it from local storage when neeed <br><br>
</font>

In [8]:
import os

sourceDir = '/home/adamb/Documents/huwiki_preprocessed/'
print(os.listdir(sourceDir))

['xml6_wiki_08', 'xml6_wiki_06', 'xml2_wiki_00', 'xml1_wiki_03', 'xml3_wiki_01', 'xml1_wiki_02', 'xml4_wiki_00', 'xml4_wiki_04', 'xml4_wiki_02', 'xml3_wiki_04', 'xml5_wiki_00', 'xml3_wiki_02', 'xml3_wiki_03', 'xml6_wiki_00', 'xml4_wiki_03', 'xml6_wiki_03', 'xml6_wiki_09', 'xml1_wiki_01', 'xml5_wiki_06', 'xml5_wiki_04', 'xml2_wiki_01', 'xml5_wiki_03', 'xml5_wiki_05', 'xml6_wiki_05', 'xml6_wiki_07', 'xml1_wiki_00', 'xml6_wiki_01', 'xml2_wiki_04', 'xml1_wiki_04', 'xml5_wiki_01', 'xml3_wiki_00', 'xml2_wiki_03', 'xml5_wiki_07', 'xml4_wiki_01', 'xml6_wiki_02', 'xml5_wiki_02', 'xml2_wiki_02', 'xml6_wiki_04']


## Set up tokenizer

ModuleNotFoundError: ignored

**Train BPE tokenizer**

In [None]:
%pip install -q git+https://github.com/huggingface/transformers.git
import transformers
from tokenizers import ByteLevelBPETokenizer

[K     |████████████████████████████████| 3.0MB 13.0MB/s 
[K     |████████████████████████████████| 1.1MB 51.6MB/s 
[K     |████████████████████████████████| 890kB 52.3MB/s 
[?25h  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
paths = glob("/content/preprocessed_wiki/*/*")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=False)

# Customize training
tokenizer.train(files=paths, vocab_size=8000, min_frequency=3, special_tokens=["<|endoftext|>"]) #"[PAD]",

# Save tokenizer
OUT_DIR = "hunwiki_tokenizer"
os.makedirs(OUT_DIR, exist_ok=True)
tokenizer.save(OUT_DIR+"/hunwiki")

TypeError: ignored

In [None]:
tokenizer.save(OUT_DIR+"/hunwiki_tokenizer")

In [None]:
os.getcwd()

'/content'

In [None]:
from google.colab import auth

In [None]:
auth.authenticate_user()

In [None]:
!gsutil cp /content/hunwiki_tokenizer/hunwiki_tokenizer gs://hungpt2-wikipedia/hunwiki_tokenizer

Copying file:///content/hunwiki_tokenizer/hunwiki_tokenizer [Content-Type=application/octet-stream]...
-
Operation completed over 1 objects/209.1 KiB.                                    


In [None]:
!gsutil cp /content/preprocessed_wiki/test/*.txt gs://hungpt2-wikipedia/

Copying file:///content/preprocessed_wiki/test/0.txt [Content-Type=text/plain]...
Copying file:///content/preprocessed_wiki/test/1.txt [Content-Type=text/plain]...
Copying file:///content/preprocessed_wiki/test/2.txt [Content-Type=text/plain]...
Copying file:///content/preprocessed_wiki/test/3.txt [Content-Type=text/plain]...
/
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file:///content/preprocessed_wiki/test/4.txt [Content-Type=text/plain]...
-
Operation completed over 5 objects/101.5 MiB.                                    


Create tokenized files with uniform length samples

In [None]:
MAX_LEN = 256

preopcessed_wikis = glob('/content/preprocessed_wiki/test/*.txt')
examples = []
for i,file in enumerate(preopcessed_wikis):
  with open(file) as f:
    prep_wiki = f.read()
  ids = tokenizer.encode(prep_wiki).ids
  for i in range(0, len(ids) - MAX_LEN + 1, MAX_LEN):  # Truncate in block of MAX_LEN
    examples.append(ids[i : i + MAX_LEN])

with open('train_examples.txt','w') as f:
  f.write('\n'.join(str(example) for example in examples))

In [None]:
!gsutil cp /content/train_examples.txt gs://hungpt2-wikipedia/

Copying file:///content/train_examples.txt [Content-Type=text/plain]...
|
Operation completed over 1 objects/134.9 MiB.                                    


In [None]:
import numpy as np
examples_array = np.array(examples)

In [None]:
print(os.listdir('./preprocessed_wiki/'))

['test']


# Trainer class approach

In [None]:
from transformers import GPT2Config
from transformers import TFGPT2LMHeadModel
from transformers import GPT2TokenizerFast

config = GPT2Config(n_positions=256,
                    n_ctx=256,
                    vocab_size=8000,
                    bos_token_id=0,
                    eos_token_id=0)

model = TFGPT2LMHeadModel(config=config)
gpt_tokenizer = GPT2TokenizerFast('/content/hunwiki_tokenizer/hunwiki-vocab.json','/content/hunwiki_tokenizer/hunwiki-merges.txt')

Exception: ignored

In [None]:
gpt_tokenizer.pad_token='<|endoftext|>'

**Build tf.Dataset**

To check what is the valid input format(features, labels) for the model, load a pretrained GPT2 and do succesful a forward pass, that returns loss, and logits.
We want to replicate the behavior in "_run_model" method of TFTrainer class.

What we need as input are fix length tokenized chunkes as inputs and labels=inputs.

In [None]:
import tensorflow as tf

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices(examples_array)

In [None]:
from transformers import TFTrainer, TFTrainingArguments

training_args = TFTrainingArguments(
    output_dir="/content/hunGPT2",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8
)

trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset 
)

In [None]:
next(iter(train_dataset))

In [None]:
trainer.train()

OperatorNotAllowedInGraphError: ignored

In [None]:
train_dataset = tf.data.TextLineDataset('/content/train_examples.txt')

In [None]:
from transformers import TextDataset

dataset = TextDataset(
    tokenizer=gpt_tokenizer,
    file_path="/content/preprocessed_wiki/test/0.txt",
    block_size=config.n_positions
)

Load hunwiki pretrained tokenizer vocab to tf tokenizer

In [None]:
import json

with open('/content/hunwiki_tokenizer/hunwiki-vocab.json') as json_file:
  vocab = json.load(json_file)

type(vocab)

dict

In [None]:
vocab_words = vocab.keys()

with open('vocab.txt','w') as f:
  f.write("\n".join(list(vocab_words)))

In [None]:
def tokenize_dataset(example):
  return gpt_tokenizer.encode(example)

In [None]:
from glob import glob

text_paths = glob('/content/preprocessed_wiki/test/*.txt')

train_dataset = tf.data.TextLineDataset(text_paths)

In [None]:
gpt_tokenizer

In [None]:
MAX_LEN = 256

def encode_ds(example):
  tokens = gpt_tokenizer.encode(example.numpy().decode('utf-8'),pad_to_max_length=True,max_length=256)
  # tf.keras.preprocessing.sequence.pad_sequences(tokens,padding='post',maxlen=MAX_LEN)
  # tokens= tokens[:MAX_LEN]
  # for i in range(0, len(tokens) - max_len + 1, max_len):  # Truncate in block of block_size
  #   examples.append(tokens[i : i + max_len])
  return [tokens]

def encode_ds_map_fn(example):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text = tf.py_function(encode_ds, 
                                inp=[example], 
                                Tout=(tf.int32))
  encoded_text.set_shape(MAX_LEN)

  return encoded_text

# def pad_ds(example):
#   padded_input = tf.keras.preprocessing.sequence.pad_sequences(.numpy(),padding='post',maxlen=MAX_LEN)
#   return padded_input

encoded_dataset = train_dataset.map(encode_ds_map_fn)
# train_dataset = encoded_dataset.map(pad_ds)

In [None]:
for item in encoded_dataset:
  print(len(item))

Try wikipedia dataset from huggingface/nlp or tfds library

In [None]:
%pip install -q git+https://github.com/huggingface/nlp.git
!pip install apache-beam
%pip install -q apache_beam mwparserfromhell
import nlp

In [None]:
huwiki_nlp = nlp.load_dataset('wikipedia','20200501.hu',beam_runner='DirectRunner')

In [None]:
!pip install tensorflow_datasets -U
import tensorflow_datasets as tfds

In [None]:
huwiki = tfds.load('wikipedia/20200301.hu',split='train')

In [None]:
huwiki_text = huwiki.map(lambda example: example['text']).batch(8)

In [None]:
def convert_to_tf_features(example_batch):
    # Tokenize contexts and questions (as pairs of inputs)
    batch = list(example_batch)
    encodings = gpt_tokenizer.batch_encode_plus(batch, max_length=512)

    # # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methods.
    # start_positions, end_positions = [], []
    # for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
    #     start_idx, end_idx = get_correct_alignement(context, answer)
    #     start_positions.append([encodings.char_to_token(i, start_idx)])
    #     end_positions.append([encodings.char_to_token(i, end_idx-1)])
    
    # if start_positions and end_positions:
    #   encodings.update({'start_positions': start_positions,
    #                     'end_positions': end_positions})
    return encodings

huwiki_text_tokenized = huwiki_text.map(convert_to_tf_features)

In [None]:
tokens = gpt_tokenizer.encode(next(iter(huwiki_text.take(100))).numpy().decode('utf-8'))
examples = []
max_len=512
for i in range(0, len(tokens) - max_len + 1, max_len):  # Truncate in block of block_size
  examples.append(tokens[i : i + max_len])

In [None]:
def tokenize_example(example):
    tokens = gpt_tokenizer.encode(next(iter(example.numpy().decode('utf-8'))))
    max_len=512
    for i in range(0, len(tokens) - max_len + 1, max_len):  # Truncate in block of block_size
      examples.append(tokens[i : i + max_len])
    return example

In [None]:
huwiki_text.map(tokenize_example)

In [None]:
type(gpt_tokenizer.encode(next(iter(huwiki_text.take(1))).numpy().decode('utf-8'),,return_tensors='tf'))
overflowing_tokens

tensorflow.python.framework.ops.EagerTensor

In [None]:
print(next(iter(huwiki_text.take(1))).numpy().decode('utf-8'))