<a href="https://colab.research.google.com/github/gpandu/BERT-Pretraining/blob/main/BERT_pre_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers
!pip install datasets
!pip install transformers
!pip install apache_beam
! pip install -U accelerate

Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Using cached dill-0.3.7-py3-none-any.whl (115 kB)
Installing collected packages: dill
  Attempting uninstall: dill
    Found existing installation: dill 0.3.1.1
    Uninstalling dill-0.3.1.1:
      Successfully uninstalled dill-0.3.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
apache-beam 2.52.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.[0m[31m
[0mSuccessfully installed dill-0.3.7
Collecting dill<0.3.2,>=0.3.1.1 (from apache_beam)
  Using cached dill-0.3.1.1-py3-none-any.whl
Installing collected packages: dill
  Attempting uninstall: dill
    Found existing installation: dill 0.3.7
    Uninstalling dill-0.3.7:
      Successfully uninstalled dill-0.3.7
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This

In [None]:
from datasets import load_dataset
dataset = load_dataset("bookcorpus", split="train")

Downloading builder script:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.48k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

In [2]:
# download and prepare cc_news dataset
from datasets import load_dataset
dataset = load_dataset("cc_news", split="train")

In [3]:
dataset

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url'],
    num_rows: 708241
})

In [4]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
    BertWordPieceTokenizer
)

from transformers import BertTokenizerFast

In [5]:
tokenizer_batch_len = 1000
def get_training_corpus():
    for start_idx in range(0, len(dataset), tokenizer_batch_len):
        samples = dataset[start_idx : start_idx + tokenizer_batch_len]
        yield samples["text"]

In [6]:
special_tokens = [
  "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]
vocab_size = 30_522
max_length = 512

In [7]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

In [8]:
# train new tokenizer
bert_tokenizer = tokenizer.train_new_from_iterator(text_iterator=get_training_corpus(), vocab_size=32_000)
bert_tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [8]:
tokenizer = BertTokenizerFast.from_pretrained("/content/tokenizer/")

In [9]:
ids = tokenizer.encode("but just one look at a minion sent him practically catatonic .", "but just one look at a minion sent him practically catatonic .")
tokens = tokenizer.decode(ids)
tokens


'[CLS] but just one look at a minion sent him practically catatonic. [SEP] but just one look at a minion sent him practically catatonic. [SEP]'

In [10]:
def encode(examples):
  """Mapping function to tokenize the sentences passed without truncation"""
  return tokenizer(examples["text"], return_special_tokens_mask=True)

train_dataset = dataset.map(encode, batched=True)

In [11]:
train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

In [12]:
from itertools import chain

max_seq_length = 512
# https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_seq_length:
        total_length = (total_length // max_seq_length) * max_seq_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result




train_dataset = train_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_seq_length}")
# convert them from lists to torch tensors
#train_dataset.set_format("tf")

In [13]:
train_dataset.set_format("torch")

In [14]:
train_dataset

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 82134
})

In [15]:
# initialize the model with the config
from transformers import BertConfig, BertForMaskedLM
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

In [16]:
# initialize the data collator, randomly masking 20% (default is 15%) of the tokens for the Masked Language
# Modeling (MLM) task
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [23]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/content/model/",          # output directory to where save model checkpoint
    evaluation_strategy="no",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=10, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
    #load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=1,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

In [24]:
# initialize the trainer and pass everything to it
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

RuntimeError: ignored

In [22]:
# train the model
trainer.train()

RuntimeError: ignored