# Train the Tokenizer

In [1]:
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from pathlib import Path
import os

# paths = [str(x) for x in Path(".").glob("text_split/*.txt")]
path = os.path.realpath("../data/dataset_fen_small.txt")

tokenizer_folder = os.path.realpath("./bert-harmon")

MAX_LEN = 128

vocab_size = 16384

In [4]:




# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer(lowercase=False)

# Customize training
tokenizer.train(files=path, vocab_size=vocab_size, min_frequency=1,
                show_progress=True,
                special_tokens=[ '[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', "[MOVESEP]"])
#Save the Tokenizer to disk
tokenizer.save_model(tokenizer_folder)






['/home/michael/Workspace/nlp-chess/src/bert/bert-harmon/vocab.txt']

In [5]:
tokenizer.save_model(tokenizer_folder)

['/home/michael/Workspace/nlp-chess/src/bert/bert-harmon/vocab.txt']

In [6]:
import os


tokenizer = BertWordPieceTokenizer(os.path.abspath(os.path.join(tokenizer_folder, 'vocab.txt')))
# Prepare the tokenizer
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
tokenizer.enable_truncation(max_length=128)
# Test the tokenizer
tokenizer.encode("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1 <MOVE_SEP> e2e4")
# Show the tokens created
tokenizer.encode("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1 <MOVE_SEP> e2e4").tokens

['[CLS]',
 'rnbqkbnr',
 '/',
 'pppppppp',
 '/',
 '8',
 '/',
 '8',
 '/',
 '8',
 '/',
 '8',
 '/',
 'pppppppp',
 '/',
 'rnbqkbnr',
 'w',
 'kq',
 '##k',
 '##q',
 '-',
 '0',
 '1',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 'e2e4',
 '[SEP]']

In [7]:
from transformers import BertConfig
from transformers import BertForMaskedLM

from transformers import BertTokenizer

# Set a configuration for our RoBERTa model
config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize the model from a configuration without pretrained weights
model = BertForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

# Create the tokenizer from a trained one
tokenizer = BertTokenizer.from_pretrained(tokenizer_folder + "/vocab.txt", max_len=MAX_LEN)

Num parameters:  56115712




# Build the Dataset

In [8]:
from datasets import load_dataset
from datasets import set_caching_enabled
raw_datasets = load_dataset('text', data_files=path,
                            split='train')

#cut size in half
raw_datasets = raw_datasets.shuffle(seed=42).select(range(int(len(raw_datasets) / 2.5)))

#raw_datasets = raw_datasets.select(range(10000))
raw_datasets = raw_datasets.train_test_split()


def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, keep_in_memory=True, num_proc=20, remove_columns=["text"])

Using custom data configuration default-20e42cf60b461c77


Downloading and preparing dataset text/default to /home/michael/.cache/huggingface/datasets/text/default-20e42cf60b461c77/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /home/michael/.cache/huggingface/datasets/text/default-20e42cf60b461c77/0.0.0/d86c40dad297bdddf277b406c6a59f0250b5318c400bf23d420a31aff88c84c4. Subsequent calls will reuse this data.


In [9]:
# tokenized_datasets = raw_datasets
#     with open("dataset-tokenized.obj", 'wb') as f:
#         pickle.dump(tokenized_datasets, f)

# block_size = tokenizer.model_max_length

block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    #result["labels"] = result["input_ids"].copy()
    return result
print(tokenized_datasets["train"][1])
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=20,
    keep_in_memory=True
)

print("grouping complete")

small_train_dataset = lm_datasets["train"].shuffle(seed=42).select(range(1000))
# small_train_dataset["labels"] = small_train_dataset["input_ids"]
small_eval_dataset = lm_datasets["test"].shuffle(seed=42).select(range(1000))
# small_eval_dataset["labels"] = small_eval_dataset["input_ids"]
full_train_dataset = lm_datasets["train"]
full_eval_dataset = lm_datasets["test"]

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [2, 7795, 63, 54, 7, 8811, 7, 192, 8523, 7, 444, 7, 705, 7, 3065, 7, 1787, 7, 1295, 32, 6, 6, 8, 339, 29, 1, 30, 2007, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [10]:
len(tokenized_datasets["train"])

3194

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    per_device_train_batch_size=392,
    output_dir='./output', 
    num_train_epochs=2000,)

from transformers import Trainer, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model, args=training_args, train_dataset=full_train_dataset, eval_dataset=full_eval_dataset, data_collator=data_collator
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 3194
  Num Epochs = 2000
  Instantaneous batch size per device = 392
  Total train batch size (w. parallel, distributed & accumulation) = 392
  Gradient Accumulation steps = 1
  Total optimization steps = 18000


Step,Training Loss


In [15]:
!pwd

/home/michael/Workspace/transformers/examples/pytorch/language-modeling


In [9]:
len(raw_datasets)

13857084