In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForMaskedLM, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import TFTrainer, TFTrainingArguments

In [None]:
# Prefer the first GPU when TensorFlow reports at least one; otherwise use the CPU.
if tf.config.list_physical_devices('GPU'):
    tf_device = '/GPU:0'
else:
    tf_device = '/CPU:0'
print(tf_device)

In [None]:
# Load the pretrained tokenizer published under the name "bert-base-german-cased".
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased",
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to ``max_length=512``
    (``truncation=True``, ``padding="max_length"``).

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw string(s).

    Returns:
        Whatever ``tokenizer(...)`` returns for those strings.
    """
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **encoding_options)

def load_and_prepare_dataset(dataset_path, batch_size=16, mlm_probability=0.15):
    """Build a shuffled, batched ``tf.data.Dataset`` for masked-language-model training.

    The file is loaded with the ``'text'`` dataset builder, tokenized with
    ``tokenize_function`` and batched through a ``DataCollatorForLanguageModeling``
    (built around the module-level ``tokenizer``) that applies the masking.

    Args:
        dataset_path: Path to the plain-text training file.
        batch_size: Number of examples per batch (default 16).
        mlm_probability: Masking probability passed to the collator (default 0.15).

    Returns:
        The dataset produced by ``to_tf_dataset``; each batch carries the
        ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``labels`` columns.
    """
    dataset = load_dataset('text', data_files={'train': dataset_path})
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # The collator defaults to PyTorch tensors; request NumPy arrays instead so
    # the TensorFlow input pipeline does not depend on torch.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=mlm_probability,
        return_tensors="np",
    )

    # "labels" only exists after collation. It has to be listed here, otherwise
    # ``to_tf_dataset`` drops it and the model never receives the MLM targets.
    tf_dataset = tokenized_datasets["train"].to_tf_dataset(
        columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    return tf_dataset

In [None]:
# Training corpus location (a relative path) and the dataset built from it.
dataset_path = '../dataset/clean_text_data.txt'
tf_dataset = load_and_prepare_dataset(dataset_path=dataset_path)

In [None]:
# Instantiate the pretrained masked-LM model inside the selected device scope.
with tf.device(tf_device):
    model = TFBertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path="bert-base-german-cased",
    )

In [None]:
# Training schedule and checkpoint settings passed to the trainer.
training_args = TFTrainingArguments(
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    # Checkpointing
    output_dir="./bert-german-checkpoints",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
# Wire the model, the training arguments and the training data into a trainer.
# NOTE(review): TFTrainer is deprecated upstream in favour of Keras `model.fit`.
# Confirm the installed transformers release still provides it and that it
# accepts `tf_dataset`, which `load_and_prepare_dataset` has already batched.
with tf.device(tf_device):
    trainer = TFTrainer(
        train_dataset=tf_dataset,
        args=training_args,
        model=model,
    )

In [None]:
# Start training with the trainer built from `model`, `training_args` and `tf_dataset`.
trainer.train()

In [None]:
# Write the model to a local directory.
model.save_pretrained(save_directory="./bert-german-final")