In [1]:
"""
BERT Tokenizer for historical German data 
Input data: sentences extracted from Referenzkorpora zur deutschen Sprachgeschichte 
and German Data from Semeval2020 challenge on LSC

"""

'\nBERT Tokenizer for historical German data \nInput data: sentences extracted from Referenzkorpora zur deutschen Sprachgeschichte \nand German Data from Semeval2020 challenge on LSC\n\n'

In [2]:
__author__ = 'Christin Beck'
__created__ = '31.05.2023'

In [3]:
from icecream import ic

import re
import os

import numpy as np
import torch
from transformers import *
from tokenizers import BertWordPieceTokenizer

import pandas as pd

import logging

import sys

import json

2023-06-12 12:41:12.706269: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Paths of the plain-text corpus files the tokenizer is trained on.
# Nothing is read here; the paths are only handed to the trainer later.
files = [
    "sentences_all.txt",
]

In [5]:
# Tokenizer training configuration.

# Tokens reserved in the vocabulary: the five BERT control tokens
# followed by two additional markers, "<S>" and "<T>".
special_tokens = [
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
    "<S>",
    "<T>",
]

# Target vocabulary size; 30,522 is the standard BERT value and can be changed.
vocab_size = 30522

# Maximum sequence length in tokens. Lowering it results in faster training
# (when the batch size is increased accordingly).
max_length = 512

# Whether longer samples should be truncated; left at False here.
# NOTE(review): no cell visible in this notebook reads this flag - the
# tokenizer itself is configured to truncate to max_length. Confirm it is
# still needed.
truncate_longer_samples = False

In [6]:
# Create an untrained WordPiece tokenizer using the library's default settings.
# NOTE(review): the defaults presumably lowercase the text and may strip
# accents (which would affect German umlauts) - confirm this is intended for
# historical German data.
tokenizer = BertWordPieceTokenizer()

# Learn the vocabulary from the corpus files, reserving the special tokens.
tokenizer.train(
    files=files,
    vocab_size=vocab_size,
    special_tokens=special_tokens,
)

# Truncate every encoded sequence to at most max_length tokens.
tokenizer.enable_truncation(max_length=max_length)






In [7]:
# Persist the trained tokenizer together with a small JSON config file.
model_path = 'pretrained-tokenizer'

# Create the output directory if needed. exist_ok=True replaces the separate
# isdir() check (no check-then-create race) and makedirs also creates any
# missing parent directories should model_path ever become a nested path.
os.makedirs(model_path, exist_ok=True)

# Save the trained tokenizer's files into model_path.
tokenizer.save_model(model_path)

# Tokenizer settings written next to the saved model: the special tokens,
# whether to lower-case, and the maximum sequence length.
# NOTE(review): the extra "<S>" / "<T>" special tokens used for training are
# not recorded here - confirm whether they should be listed as well.
tokenizer_cfg = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    # Same value under a second key, kept exactly as in the original config.
    "max_len": max_length,
}

# The dict is built before the file is opened so that a failure while building
# it cannot leave a truncated config behind; the encoding is explicit so the
# output does not depend on the platform's default locale.
config_path = os.path.join(model_path, "tokenizer_config.json")
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(tokenizer_cfg, f)