<a href="https://colab.research.google.com/github/drjaehongmin/hello-world/blob/master/BERT_Pre_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
# Jae-Hong Min
# Last Edit 04 Aug 2021
# Initially Based Off of: https://github.com/gmihaila/ml_things.git

# Parameters Panel

# Import necessary libraries
from google.colab import drive
from google.colab import files
# Mounts the Google Drive

drive.mount('/content/drive')

#=================================#
# Experimental Parameters         #
#=================================#

tokenizer_file_path = '/content/drive/MyDrive/Academic Research/W266 Final Project/test/'
#bert_model_selected = '/content/drive/MyDrive/Academic Research/W266 Final Project/models/drBERT_small_v2'
#tokenizer_file_path = "bert-base-cased"
#bert_model_selected = "bert-base-cased"


train_data_file_source='/content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/train.txt', 
eval_data_file_source ='/content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/test.txt', 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# BERT Pretraining

## Initialization

In [96]:
# Install transformers library.
!pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions.
!pip install -q git+https://github.com/gmihaila/ml_things.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [97]:
# Import necessary libraries
import io
import os
import math
import torch
import warnings
from tqdm.notebook import tqdm
from ml_things import plot_dict, fix_text
from transformers import (
                          CONFIG_MAPPING,
                          MODEL_FOR_MASKED_LM_MAPPING,
                          MODEL_FOR_CAUSAL_LM_MAPPING,
                          PreTrainedTokenizer,
                          TrainingArguments,
                          AutoConfig,
                          AutoTokenizer,
                          AutoModelWithLMHead,
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          LineByLineTextDataset,
                          TextDataset,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask,
                          DataCollatorForPermutationLanguageModeling,
                          PretrainedConfig,
                          Trainer,
                          set_seed,
                          )

# Set seed for reproducibility,
set_seed(42)

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [99]:

class ModelDataArguments(object):
   def __init__(self, train_data_file=None, eval_data_file=None, 
               line_by_line=False, mlm=False, mlm_probability=0.15, 
               whole_word_mask=False, plm_probability=float(1/6), 
               max_span_length=5, block_size=-1, overwrite_cache=False, 
               model_type=None, model_config_name=None, tokenizer_name=None, 
               model_name_or_path= None, model_cache_dir=None):
    
    # Make sure CONFIG_MAPPING is imported from transformers module.
    if 'CONFIG_MAPPING' not in globals():
      raise ValueError('Could not find `CONFIG_MAPPING` imported! Make sure' \
                       ' to import it from `transformers` module!')

    # Make sure model_type is valid.
    if (model_type is not None) and (model_type not in CONFIG_MAPPING.keys()):
      raise ValueError('Invalid `model_type`! Use one of the following: %s' % 
                       (str(list(CONFIG_MAPPING.keys()))))
      
    # Make sure that model_type, model_config_name and model_name_or_path 
    # variables are not all `None`.
    if not any([model_type, model_config_name, model_name_or_path]):
      raise ValueError('You can`t have all `model_type`, `model_config_name`,' \
                       ' `model_name_or_path` be `None`! You need to have' \
                       'at least one of them set!')
    
    # Check if a new model will be loaded from scratch.
    if not any([model_config_name, model_name_or_path]):
      # Setup warning to show pretty. This is an overkill
      warnings.formatwarning = lambda message,category,*args,**kwargs: \
                               '%s: %s\n' % (category.__name__, message)
      # Display warning.
      warnings.warn('Check Inputs")

    # Check if a new tokenizer wants to be loaded.
    # This feature is not supported!
    if not any([tokenizer_name, model_name_or_path]):
      # Can't train tokenizer from scratch here! Raise error.
      raise ValueError('Check Inputs")

      
    # Set all data related arguments.
    self.train_data_file = train_data_file
    self.eval_data_file = eval_data_file
    self.line_by_line = line_by_line
    self.mlm = mlm
    self.whole_word_mask = whole_word_mask
    self.mlm_probability = mlm_probability
    self.plm_probability = plm_probability
    self.max_span_length = max_span_length
    self.block_size = block_size
    self.overwrite_cache = overwrite_cache

    # Set all model and tokenizer arguments.
    self.model_type = model_type
    self.model_config_name = model_config_name
    self.tokenizer_name = tokenizer_name
    self.model_name_or_path = model_name_or_path
    self.model_cache_dir = model_cache_dir
    return


def get_model_config(args: ModelDataArguments):
  # Check model configuration.
  if args.model_config_name is not None:
    # Use model configure name if defined.
    model_config = AutoConfig.from_pretrained(args.model_config_name, 
                                      cache_dir=args.model_cache_dir)

  elif args.model_name_or_path is not None:
    # Use model name or path if defined.
    model_config = AutoConfig.from_pretrained(args.model_name_or_path, 
                                      cache_dir=args.model_cache_dir)

  else:
    # Use config mapping if building model from scratch.
    model_config = CONFIG_MAPPING[args.model_type]()

  # Make sure `mlm` flag is set for Masked Language Models (MLM).
  if (model_config.model_type in ["bert", "roberta", "distilbert", 
                                  "camembert"]) and (args.mlm is False):
    raise ValueError('BERT and RoBERTa-like models do not have LM heads ' \
                    'butmasked LM heads. They must be run setting `mlm=True`')
  
  # Adjust block size for xlnet.
  if model_config.model_type == "xlnet":
    # xlnet used 512 tokens when training.
    args.block_size = 512
    # setup memory length
    model_config.mem_len = 1024
  
  return model_config


def get_tokenizer(args: ModelDataArguments):
  # Check model configuration.
  if args.model_config_name is not None:
    # Use model configure name if defined.
    model_config = AutoConfig.from_pretrained(args.model_config_name, 
                                      cache_dir=args.model_cache_dir)

  elif args.model_name_or_path is not None:
    # Use model name or path if defined.
    model_config = AutoConfig.from_pretrained(args.model_name_or_path, 
                                      cache_dir=args.model_cache_dir)
    
  tokenizer = BertTokenizer.from_pretrained(tokenizer_file_path)
    
  # Setp data block size.
  if args.block_size <= 0:
    # Set block size to maximum length of tokenizer.
    # Input block size will be the max possible for the model.
    # Some max lengths are very large and will cause a
    args.block_size = tokenizer.model_max_length
  else:
    # Never go beyond tokenizer maximum length.
    args.block_size = min(args.block_size, tokenizer.model_max_length)

  return tokenizer
  

def get_model(args: ModelDataArguments, model_config):
  # Make sure MODEL_FOR_MASKED_LM_MAPPING and MODEL_FOR_CAUSAL_LM_MAPPING are 
  # imported from transformers module.
  if ('MODEL_FOR_MASKED_LM_MAPPING' not in globals()) and \
                ('MODEL_FOR_CAUSAL_LM_MAPPING' not in globals()):
    raise ValueError('Could not find `MODEL_FOR_MASKED_LM_MAPPING` and' \
                     ' `MODEL_FOR_MASKED_LM_MAPPING` imported! Make sure to' \
                     ' import them from `transformers` module!')
    
  # Check if using pre-trained model or train from scratch.
  if args.model_name_or_path:
    # Use pre-trained model.
    if type(model_config) in MODEL_FOR_MASKED_LM_MAPPING.keys():
      # Masked language modeling head.
      return AutoModelForMaskedLM.from_pretrained(
                        args.model_name_or_path,
                        from_tf=bool(".ckpt" in args.model_name_or_path),
                        config=model_config,
                        cache_dir=args.model_cache_dir,
                        )
    elif type(model_config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():
      # Causal language modeling head.
      return AutoModelForCausalLM.from_pretrained(
                                          args.model_name_or_path, 
                                          from_tf=bool(".ckpt" in 
                                                        args.model_name_or_path),
                                          config=model_config, 
                                          cache_dir=args.model_cache_dir)
    else:
      raise ValueError(
          'Invalid `model_name_or_path`! It should be in %s or %s!' % 
          (str(MODEL_FOR_MASKED_LM_MAPPING.keys()), 
           str(MODEL_FOR_CAUSAL_LM_MAPPING.keys())))
    
  else:
    # Use model from configuration - train from scratch.
      print("Training new model from scratch!")
      return AutoModelWithLMHead.from_config(config)


def get_dataset(args: ModelDataArguments, tokenizer: PreTrainedTokenizer, 
                evaluate: bool=False):
  # Get file path for either train or evaluate.
  file_path = args.eval_data_file if evaluate else args.train_data_file

  # Check if `line_by_line` flag is set to `True`.
  if args.line_by_line:
    # Each example in data file is on each line.
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, 
                                 block_size=args.block_size)
    
  else:
    # All data in file is put together without any separation.
    return TextDataset(tokenizer=tokenizer, file_path=file_path, 
                       block_size=args.block_size, 
                       overwrite_cache=args.overwrite_cache)


def get_collator(args: ModelDataArguments, model_config: PretrainedConfig, 
                 tokenizer: PreTrainedTokenizer):
  # Special dataset handle depending on model type.
  if model_config.model_type == "xlnet":
    # Configure collator for XLNET.
    return DataCollatorForPermutationLanguageModeling(
                                          tokenizer=tokenizer,
                                          plm_probability=args.plm_probability,
                                          max_span_length=args.max_span_length,
                                          )
  else:
    # Configure data for rest of model types.
    if args.mlm and args.whole_word_mask:
      # Use whole word masking.
      return DataCollatorForWholeWordMask(
                                          tokenizer=tokenizer, 
                                          mlm_probability=args.mlm_probability,
                                          )
    else:
      # Regular language modeling.
      return DataCollatorForLanguageModeling(
                                          tokenizer=tokenizer, 
                                          mlm=args.mlm, 
                                          mlm_probability=args.mlm_probability,
                                          )


In [100]:
# Define arguments for data, tokenizer and model arguments.
# See comments in `ModelDataArguments` class.
model_data_args = ModelDataArguments(
                                    train_data_file='/content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/train.txt', 
                                    eval_data_file='/content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/test.txt', 
                                    line_by_line=True, 
                                    mlm=True,
                                    whole_word_mask=True,
                                    mlm_probability=0.15,
                                    plm_probability=float(1/6), 
                                    max_span_length=5,
                                    block_size=50, 
                                    overwrite_cache=False, 
                                    model_type='bert', 
                                    model_config_name='bert-base-cased', 
                                    #tokenizer_name='bert-base-cased', 
                                    tokenizer_name = "root",
                                    #model_name_or_path='bert-base-cased',
                                    model_name_or_path = None,
                                    model_cache_dir=None,
                                    )

training_args = TrainingArguments(
                          # The output directory where the model predictions 
                          # and checkpoints will be written.
                          output_dir='pretrain_bert',

                          # Overwrite the content of the output directory.
                          overwrite_output_dir=True,

                          # Whether to run training or not.
                          do_train=True, 
                          
                          # Whether to run evaluation on the dev or not.
                          do_eval=True,
                          
                          # Batch size GPU/TPU core/CPU training.
                          per_device_train_batch_size=10,
                          
                          # Batch size  GPU/TPU core/CPU for evaluation.
                          per_device_eval_batch_size=100,

                          # evaluation strategy to adopt during training
                          # `no`: No evaluation during training.
                          # `steps`: Evaluate every `eval_steps`.
                          # `epoch`: Evaluate every end of epoch.
                          evaluation_strategy='steps',

                          # How often to show logs. I will se this to 
                          # plot history loss and calculate perplexity.
                          logging_steps=700,

                          # Number of update steps between two 
                          # evaluations if evaluation_strategy="steps".
                          # Will default to the same value as l
                          # logging_steps if not set.
                          eval_steps = None,
                          
                          # Set prediction loss to `True` in order to 
                          # return loss for perplexity calculation.
                          prediction_loss_only=True,

                          # The initial learning rate for Adam. 
                          # Defaults to 5e-5.
                          learning_rate = 5e-5,

                          # The weight decay to apply (if not zero).
                          weight_decay=0,

                          # Epsilon for the Adam optimizer. 
                          # Defaults to 1e-8
                          adam_epsilon = 1e-8,

                          # Maximum gradient norm (for gradient 
                          # clipping). Defaults to 0.
                          max_grad_norm = 1.0,
                          # Total number of training epochs to perform 
                          # (if not an integer, will perform the 
                          # decimal part percents of
                          # the last epoch before stopping training).
                          num_train_epochs = 2,

                          # Number of updates steps before two checkpoint saves. 
                          # Defaults to 500
                          save_steps = -1,
                          )

using `logging_steps` to initialize `eval_steps` to 700
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [101]:
# Load model configuration.
print('Loading model configuration...')
config = get_model_config(model_data_args)

# Load model tokenizer.
print('Loading model`s tokenizer...')
tokenizer = get_tokenizer(model_data_args)

# Loading model.
print('Loading actual model...')
model = get_model(model_data_args, config)

# Resize model to fit all tokens in tokenizer.
model.resize_token_embeddings(len(tokenizer))

Loading model configuration...


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



Loading model`s tokenizer...


loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

Didn't find file /content/drive/MyDrive/Academic Research/W266 Final Project/test/added_tokens.json. 

Loading actual model...
Training new model from scratch!


Embedding(30522, 768)

In [102]:
# Setup train dataset if `do_train` is set.
print('Creating train dataset...')
train_dataset = get_dataset(model_data_args, tokenizer=tokenizer, evaluate=False) if training_args.do_train else None

# Setup evaluation dataset if `do_eval` is set.
print('Creating evaluate dataset...')
eval_dataset = get_dataset(model_data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

# Get data collator to modify data format depending on type of model used.
data_collator = get_collator(model_data_args, config, tokenizer)

# Check how many logging prints you'll have. This is to avoid overflowing the 
# notebook with a lot of prints. Display warning to user if the logging steps 
# that will be displayed is larger than 100.
if (len(train_dataset) // training_args.per_device_train_batch_size \
    // training_args.logging_steps * training_args.num_train_epochs) > 100:
  # Display warning.
  warnings.warn('Your `logging_steps` value will will do a lot of printing!' \
                ' Consider increasing `logging_steps` to avoid overflowing' \
                ' the notebook with a lot of prints!')

Creating features from dataset file at /content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/train.txt


Creating train dataset...


Creating features from dataset file at /content/drive/MyDrive/Academic Research/W266 Final Project/BERT_Training_Corpus/test.txt


Creating evaluate dataset...




## Train

In [None]:
# Initialize Trainer.
print('Loading `trainer`...')
trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=train_dataset,
                  eval_dataset=eval_dataset,
                  )


# Check model path to save.
if training_args.do_train:
  print('Start training...')

  # Setup model path if the model to train loaded from a local path.
  model_path = (model_data_args.model_name_or_path 
                if model_data_args.model_name_or_path is not None and 
                os.path.isdir(model_data_args.model_name_or_path) 
                else None
                )
  # Run training.
  trainer.train(model_path=model_path)
  # Save model.
  trainer.save_model()

  # For convenience, we also re-save the tokenizer to the same directory,
  # so that you can share your model easily on huggingface.co/models =).
  if trainer.is_world_process_zero():
    tokenizer.save_pretrained(training_args.output_dir)

Loading `trainer`...


***** Running training *****
  Num examples = 1858362
  Num Epochs = 2
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 371674


Start training...


Step,Training Loss,Validation Loss
700,7.1551,6.792943
1400,6.7209,6.614862
2100,6.5829,6.507611
2800,6.5015,6.441
3500,6.4546,6.395853
4200,6.3415,6.288786
4900,6.2273,6.130356
5600,6.1231,5.998371
6300,5.9897,5.867303
7000,5.8652,5.701867


***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num examples = 206486
  Batch size = 100
***** Running Evaluation *****
  Num e

In [79]:
import matplotlib.pyplot as plt
%matplotlib inline

# Keep track of train and evaluate loss.
loss_history = {'train_loss':[], 'eval_loss':[]}

# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}

# Loop through each log history.
for log_history in trainer.state.log_history:

  if 'loss' in log_history.keys():
    # Deal with trianing loss.
    loss_history['train_loss'].append(log_history['loss'])
    perplexity_history['train_perplexity'].append(math.exp(log_history['loss']))
    
  elif 'eval_loss' in log_history.keys():
    # Deal with eval loss.
    loss_history['eval_loss'].append(log_history['eval_loss'])
    perplexity_history['eval_perplexity'].append(math.exp(log_history['eval_loss']))

# Plot Losses.
plot_dict(loss_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Loss', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

print()

# Plot Perplexities.
plot_dict(perplexity_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Perplexity', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)



ImportError: ignored

<Figure size 1944x432 with 1 Axes>






ImportError: ignored

<Figure size 1944x432 with 1 Axes>

## **Evaluate**

For the final evaluation we can have a separate test set that we use to do our final perplexity evaluation. For simplicity I used the same validation text file for the final evaluation. That is the reason I get the same results as the last validation perplexity plot value.

In [80]:
# check if `do_eval` flag is set.
if training_args.do_eval:
  
  # capture output if trainer evaluate.
  eval_output = trainer.evaluate()
  # compute perplexity from model loss.
  perplexity = math.exp(eval_output["eval_loss"])
  print('\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))
else:
  print('No evaluation needed. No evaluation data provided, `do_eval=False`!')

***** Running Evaluation *****
  Num examples = 5001
  Batch size = 100



Evaluate Perplexity:     538.85


In [81]:
model.save_pretrained("/content/drive/MyDrive/Academic Research/W266 Final Project/models/drBERT_v2")

Configuration saved in /content/drive/MyDrive/Academic Research/W266 Final Project/models/drBERT_small_v2/config.json
Model weights saved in /content/drive/MyDrive/Academic Research/W266 Final Project/models/drBERT_small_v2/pytorch_model.bin
