In [1]:
CUDA_LAUNCH_BLOCKING=1

In [2]:
"""
Finetuning of BERT model using historical German data
Input data: sentences extracted from Referenzkorpora zur deutschen Sprachgeschichte 
and German Data from Semeval2020 challenge on LSC

"""

__author__ = 'Christin Beck'
__created__ = '31.05.2023'

from icecream import ic

import re
import os

import numpy as np
import torch

from transformers import *
from tokenizers import BertWordPieceTokenizer

import pandas as pd

import logging

import sys

import json

2023-07-21 13:27:53.062417: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
tokenizer_path = 'pretrained-tokenizer'
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)
      
#for fine-tuning
model_path = 'fine-tuned-bert/german'
#make the directory if not already there
if not os.path.isdir(model_path):
        os.mkdir(model_path)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file pretrained-tokenizer/config.json
loading configuration file pretrained-tokenizer/config.json


In [4]:
with open('train.txt', 'r') as fin:
    train = [line.rstrip() for line in fin]

with open('test.txt', 'r') as fin:
    test = [line.rstrip() for line in fin]
    
train_dataset = tokenizer(train, max_length=512, padding='max_length', truncation=True, return_special_tokens_mask=True, return_tensors='pt')
train_dataset = train_dataset['input_ids']


test_dataset = tokenizer(test, max_length=512, padding='max_length', truncation=True, return_special_tokens_mask=True, return_tensors='pt')
test_dataset = test_dataset['input_ids']


tensor([[    2,   637,  7065,  ...,     0,     0,     0],
        [    2,   507,  9809,  ...,     0,     0,     0],
        [    2,   506,  2963,  ...,     0,     0,     0],
        ...,
        [    2, 19567,    17,  ...,     0,     0,     0],
        [    2,    53,    15,  ...,     0,     0,     0],
        [    2,   623,  1862,  ...,     0,     0,     0]])


In [5]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    #device = torch.device("cuda")
    device = torch.cuda.set_device(0)

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
torch.cuda.empty_cache()

There are 2 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [6]:
##########Initialize the model
###Config = Europeana BERT
model_config = BertConfig(
    vocab_size=32000,
    hidden_size=768, 
    num_hidden_layers=12, 
    num_attention_heads=12,
    max_position_embeddings=512,
)

#for finetuning
#Load pretrained model
model = AutoModelForMaskedLM.from_pretrained("dbmdz/bert-base-german-cased")
print('No of parameters: ', model.num_parameters())

# Tell pytorch to run this model on the GPU.
model.cuda()


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2, return_tensors='pt') #randomly replaces each token with mask by 20%probability
print(data_collator)
    
##########Initialize Trainer and pass arguments on
training_args = TrainingArguments(
    overwrite_output_dir = False,  #set to False if training continued from checkpoint
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    num_train_epochs=4, #recommendation of BERT authors for fine-tuning: 2-4
    per_device_train_batch_size=8, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=8,  # evaluation batch size
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=10000,
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=2,           # whether you don't have much space so you let only 2 model weights saved in the disk
)    
    

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
    
trainer.train(resume_from_checkpoint="fine-tuned-bert/german/checkpoint-10000")
#trainer.train()
trainer.save_model(model_path)

loading configuration file config.json from cache at /home/scc/christin.beck/.cache/huggingface/hub/models--dbmdz--bert-base-german-cased/snapshots/56c3dce79f5d93e466f3b800d8e57cddfe13a6d4/config.json
Model config BertConfig {
  "_name_or_path": "dbmdz/bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

loading weights file pytorch_model.bin from cache at /home/scc/christin.beck/.cache/huggingface/hub/models--dbmdz--bert-base-german-cased/s

No of parameters:  109960318


using `logging_steps` to initialize `eval_steps` to 1000
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Loading model from fine-tuned-bert/german/checkpoint-10000.


DataCollatorForLanguageModeling(tokenizer=BertTokenizerFast(name_or_path='pretrained-tokenizer', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')


***** Running training *****
  Num examples = 524876
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 8
  Total optimization steps = 16400
  Number of trainable parameters = 109960318
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 10000
  Will skip the first 2 epochs then the first 14400 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/14400 [00:00<?, ?it/s]



Step,Training Loss,Validation Loss
11000,5.0255,4.937214
12000,4.9889,4.896297
13000,4.9431,4.858616
14000,4.8959,
15000,4.8825,4.809449
16000,4.8689,4.795249


***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16
***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16
***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16
***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16
***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16
***** Running Evaluation *****
  Num examples = 131219
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to fine-tuned-bert/german
Configuration saved in fine-tuned-bert/german/config.json
Configuration saved in fine-tuned-bert/german/generation_config.json
Model weights saved in fine-tuned-bert/german/pytorch_model.bin
