In [1]:
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import BertTokenizerFast
from transformers import BertForMaskedLM
from transformers import BertConfig
from transformers import pipeline 
from datasets import load_dataset
from transformers import Trainer
from datasets import DatasetDict
import pandas as pd
import transformers
import datasets
import logging
import torch
import math
import sys


# Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.getLevelName('INFO'), 
                    handlers=[logging.StreamHandler(sys.stdout)], 
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Log versions of dependencies
logger.info(f'[Using Transformers: {transformers.__version__}]')
logger.info(f'[Using Datasets: {datasets.__version__}]')
logger.info(f'[Using Torch: {torch.__version__}]')
logger.info(f'[Using Pandas: {pd.__version__}]')


  from .autonotebook import tqdm as notebook_tqdm


2023-08-31 21:07:54,788 - __main__ - INFO - [Using Transformers: 4.33.0.dev0]
2023-08-31 21:07:54,789 - __main__ - INFO - [Using Datasets: 2.14.4]
2023-08-31 21:07:54,790 - __main__ - INFO - [Using Torch: 2.0.1+cpu]
2023-08-31 21:07:54,790 - __main__ - INFO - [Using Pandas: 2.1.0]


In [5]:
MAX_LENGTH = 512
# Re-create BERT WordPiece tokenizer 
logger.info(f'Re-creating BERT tokenizer using custom vocabulary from [./data/vocab/]')
# config = BertConfig()
tokenizer = BertTokenizerFast.from_pretrained("./data/vocab",padding="max_length", truncation="max_length")
tokenizer.model_max_length = MAX_LENGTH
tokenizer.init_kwargs['model_max_length'] = MAX_LENGTH
tokenizer.save_pretrained("./BTF")
logger.info(f'Tokenizer: {tokenizer}')

2023-08-31 21:11:19,919 - __main__ - INFO - Re-creating BERT tokenizer using custom vocabulary from [./vocab/]
2023-08-31 21:11:19,934 - __main__ - INFO - Tokenizer: BertTokenizerFast(name_or_path='./vocab', vocab_size=3136, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [5]:

MAX_LENGTH = 512
CHUNK_SIZE = 128
TRAIN_EPOCHS = 5
BATCH_SIZE = 128
SAVE_STEPS = 10000
SAVE_TOTAL_LIMIT = 2

LOCAL_DATA_DIR = './'
LOCAL_MODEL_DIR = './finetuned'




# # Re-create original BERT WordPiece tokenizer 
# logger.info(f'Re-creating original BERT tokenizer')
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# logger.info(f'Tokenizer: {tokenizer}')

# Re-create BERT WordPiece tokenizer 
logger.info(f'Re-creating BERT tokenizer using custom vocabulary from [./data/vocab/]')
config = BertConfig()
tokenizer = BertTokenizerFast.from_pretrained("./data/vocab", config=config, padding="max_length", truncation="max_length")
tokenizer.model_max_length = MAX_LENGTH
tokenizer.init_kwargs['model_max_length'] = MAX_LENGTH
tokenizer.save_model("./BTF")
logger.info(f'Tokenizer: {tokenizer}')

# Read dataset 
chunked_datasets = datasets.load_from_disk(LOCAL_DATA_DIR)
logger.info(f'Chunked datasets: {chunked_datasets}')

# Create data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=True, 
                                                mlm_probability=0.15)
    
# Load MLM
logger.info('Loading BertForMaskedLM model')
mlm = BertForMaskedLM(config=config)

# Train MLM
logger.info('Training MLM')
training_args = TrainingArguments(output_dir='/tmp/checkpoints', 
                                    overwrite_output_dir=True, 
                                    optim='adamw_torch',
                                    num_train_epochs=TRAIN_EPOCHS,
                                    per_device_train_batch_size=BATCH_SIZE,
                                    evaluation_strategy='epoch',
                                    save_steps=SAVE_STEPS, 
                                    save_total_limit=SAVE_TOTAL_LIMIT)
trainer = Trainer(model=mlm, 
                    args=training_args, 
                    data_collator=data_collator,
                    train_dataset=chunked_datasets['train'],
                    eval_dataset=chunked_datasets['validation'])

# Evaluate trained model for perplexity
eval_results = trainer.evaluate()
logger.info(f"Perplexity before training: {math.exp(eval_results['eval_loss']):.2f}")

trainer.train()

eval_results = trainer.evaluate()
logger.info(f"Perplexity after training: {math.exp(eval_results['eval_loss']):.2f}")

# Save trained model to local model directory
logger.info(f'Saving trained MLM to [{LOCAL_MODEL_DIR}/]')
trainer.save_model(LOCAL_MODEL_DIR)

2023-08-31 17:44:36,311 - __main__ - INFO - Re-creating BERT tokenizer using custom vocabulary from [./vocab/]
2023-08-31 17:44:36,319 - __main__ - INFO - Tokenizer: BertTokenizerFast(name_or_path='./vocab', vocab_size=3136, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)
2023-08-31 17:44:36,327 - __main__ - INFO - Chunked datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 177
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 18
    })
})
2023-08-31 17:44:36,328 - __main__ - INFO - Loading BertForMaskedLM model
2023-08-31 17:44:37,851 - __main__ - INFO - Training MLM


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 3/3 [00:01<00:00,  1.71it/s]

2023-08-31 17:44:40,941 - __main__ - INFO - Perplexity before training: 35577.18



                                              
 20%|██        | 2/10 [01:54<06:49, 51.22s/it]

{'eval_loss': 9.679203033447266, 'eval_runtime': 4.4669, 'eval_samples_per_second': 4.03, 'eval_steps_per_second': 0.672, 'epoch': 1.0}


                                              
 40%|████      | 4/10 [04:08<05:48, 58.12s/it]

{'eval_loss': 9.505746841430664, 'eval_runtime': 4.3143, 'eval_samples_per_second': 4.172, 'eval_steps_per_second': 0.695, 'epoch': 2.0}


 60%|██████    | 6/10 [05:58<03:39, 54.93s/it]
 60%|██████    | 6/10 [06:02<03:39, 54.93s/it]

{'eval_loss': 9.336270332336426, 'eval_runtime': 4.7532, 'eval_samples_per_second': 3.787, 'eval_steps_per_second': 0.631, 'epoch': 3.0}


                                              
 80%|████████  | 8/10 [08:21<01:59, 59.55s/it]

{'eval_loss': 9.20265007019043, 'eval_runtime': 4.8844, 'eval_samples_per_second': 3.685, 'eval_steps_per_second': 0.614, 'epoch': 4.0}


                                               
100%|██████████| 10/10 [10:13<00:00, 61.38s/it]


{'eval_loss': 9.226667404174805, 'eval_runtime': 4.324, 'eval_samples_per_second': 4.163, 'eval_steps_per_second': 0.694, 'epoch': 5.0}
{'train_runtime': 613.7378, 'train_samples_per_second': 1.442, 'train_steps_per_second': 0.016, 'train_loss': 9.502236938476562, 'epoch': 5.0}


100%|██████████| 3/3 [00:02<00:00,  1.33it/s]

2023-08-31 17:54:59,078 - __main__ - INFO - Perplexity after training: 10688.52
2023-08-31 17:54:59,084 - __main__ - INFO - Saving trained MLM to [./finetuned/]





In [6]:
eval_results

{'eval_loss': 9.276925086975098,
 'eval_runtime': 4.18,
 'eval_samples_per_second': 4.306,
 'eval_steps_per_second': 0.718,
 'epoch': 5.0}