In [21]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast

# ------- HYPERPARAMETERS -------
IS_LOWERCASE = True  # True -> uncased tokenizer, False -> cased; passed to the normalizer's `lowercase` flag and used to pick the "uncased"/"cased" suffix of the saved and pushed tokenizer name

In [22]:
# declare the tokenizer around a WordPiece model whose unknown token is "[UNK]"
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [23]:
# declare normalizer: BertNormalizer, with lowercasing switched on/off by IS_LOWERCASE
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=IS_LOWERCASE)

In [24]:
# declare pre-tokenizer: use the library's BertPreTokenizer with its default settings
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [25]:
# configure the WordPiece trainer: target vocabulary size, minimum frequency,
# and the special tokens to reserve.
# NOTE(review): keep the order of `special_tokens` unchanged — presumably it
# determines the ids assigned to these tokens; confirm against the tokenizers docs.
trainer = trainers.WordPieceTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

In [None]:
def get_lines(files):
    """Lazily yield every line of every file in ``files``, whitespace-stripped.

    Args:
        files: Iterable of file paths. Each file is opened as UTF-8 text and
            read in the order given.

    Yields:
        str: Each line with leading and trailing whitespace (including the
        newline) removed. Blank lines are yielded as empty strings.
    """
    for path in files:
        with open(path, 'r', encoding='utf-8') as handle:
            # str.strip applied line by line keeps the generator lazy
            yield from map(str.strip, handle)


# corpus files whose raw lines are fed to the tokenizer trainer
files = [
    "/content/train.csv"
]

# Count the lines across all files up front; the total is used as the
# progress-bar length. Each file is opened in a `with` block so its handle is
# closed deterministically instead of being left for the garbage collector.
total_lines = 0
for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        total_lines += sum(1 for _ in f)

In [None]:
# train the tokenizer on the stripped corpus lines, showing a progress bar
# whose length is the pre-computed total_lines
line_stream = get_lines(files)
progress = tqdm(line_stream, total=total_lines, desc="Training tokenizer")
tokenizer.train_from_iterator(progress, trainer=trainer)

In [None]:
# define post-processor: wrap a single sequence as "[CLS] A [SEP]" and a pair
# as "[CLS] A [SEP] B [SEP]" (per the TemplateProcessing docs, the `:0` / `:1`
# suffixes are the type ids assigned to each piece).
# The templates are plain string literals: they contain no placeholders, so no
# f-prefix is needed.
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    # register each special token with the id it has in the tokenizer's vocabulary
    special_tokens=[
        (token, tokenizer.token_to_id(token))
        for token in ("[CLS]", "[SEP]", "[PAD]", "[UNK]", "[MASK]")
    ],
)

In [None]:
# declare decoder: WordPiece decoder configured with "##" as its prefix
tokenizer.decoder = decoders.WordPiece(prefix="##")

In [None]:
# wrap the trained tokenizer in a transformers fast tokenizer, declaring which
# strings play each special-token role, then save it to a casing-specific folder
special_token_roles = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    **special_token_roles,
)

casing = "uncased" if IS_LOWERCASE else "cased"
path = f"/content/drive/MyDrive/tokenizer_{casing}"
wrapped_tokenizer.save_pretrained(path)

In [None]:
# push the wrapped tokenizer to a repo whose name carries the casing suffix
repo_suffix = "uncased" if IS_LOWERCASE else "cased"
wrapped_tokenizer.push_to_hub(repo_id=f"RooseBERT-tokenizer-{repo_suffix}")