# Train your own BERT

Sometimes you need to build your own BERT because no pretrained model is available (e.g. for a low-resource language). Here are the steps to do so.

1. Choose your corpus
2. Building your tokenizer
3. Creating an input pipeline
4. Training the model with the MLM objective


In [None]:
!pip install datasets
!pip install 'transformers[torch]'
!pip install zstandard
!pip install accelerate -U


In [None]:
import datasets
from datasets import DatasetInfo
#from transformers import AutoTokenizer
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import (
    AutoTokenizer,
    #BertTokenizerFast,
    AutoConfig,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    AutoModelForSequenceClassification,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    is_torch_tpu_available,
    set_seed,
    DataCollatorWithPadding,

)
from itertools import chain
import os
import torch

In [None]:
# Report GPU memory. This needs a CUDA device, so connect to a GPU runtime first.
# torch.cuda.mem_get_info() returns a (free, total) pair of byte counts, so the
# second value is the device's total memory, not the "available" memory.
if torch.cuda.is_available():
    print(f'(Free memory, Total memory){torch.cuda.mem_get_info()}')
else:
    # Avoid an exception on CPU-only runtimes; tell the reader what to do instead.
    print('CUDA is not available - connect to a GPU runtime to check memory.')

(Free memory, Available Memory)(15727394816, 15835398144)


## 1. Choose your corpus

If you don't have your own corpus, you can use the [OSCAR](https://huggingface.co/datasets/oscar-corpus/OSCAR-2301/viewer/af) dataset, a multilingual corpus (provided, of course, that a subset exists for your language). Here we choose language="af" (Afrikaans). Accessing the OSCAR dataset requires a Hugging Face access token from your account.

For the purpose of demonstration, we restrict the dataset size to training (1000 rows) and test (100 rows) only.

In [None]:
# Log in to the Hugging Face Hub from the notebook; the OSCAR download below
# is called with token=True and therefore relies on the stored credentials.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the "af" subset of OSCAR-2301 (train split) and cut it down to a small
# demo-sized DatasetDict: 1000 rows for "train" and 100 rows for "test".
# NOTE(review): no seed is passed to train_test_split, so the sampled rows
# presumably differ between runs - confirm if reproducibility matters.
ds = (datasets.load_dataset("oscar-corpus/OSCAR-2301", token=True, language="af", split="train").
      train_test_split(train_size=1000, test_size=100))

Downloading builder script:   0%|          | 0.00/22.3k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/37.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the train/test splits held in `ds`.
ds

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 100
    })
})

In [None]:
# Show the column names of every split.
ds.column_names

{'train': ['id', 'text', 'meta'], 'test': ['id', 'text', 'meta']}

In [None]:
# Keep only the "text" column: every other column found in the train split
# is removed from the dataset.
columns_to_drop = [name for name in ds["train"].column_names if name != "text"]
ds = ds.remove_columns(columns_to_drop)

In [None]:
def get_training_corpus(ds, batch_size=1000):
    """Lazily yield the ``"text"`` column of ``ds`` in consecutive batches.

    The original version ignored its loop index and yielded the *entire*
    ``ds["text"]`` column once per step, so a dataset with more than one
    batch worth of rows was fed to the tokenizer trainer several times over.
    Each step now yields only its own slice of rows.

    Args:
        ds: A dataset-like object that supports ``len(ds)`` and slicing
            (``ds[a:b]``), where a slice can be indexed with ``"text"`` to get
            the list of texts for those rows.
        batch_size: Number of rows per yielded batch. Defaults to 1000, the
            step size used by the original implementation.

    Returns:
        A generator of lists of texts, one list per batch; the last batch may
        be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is 0 (raised by ``range``).
    """
    return (
        ds[start : start + batch_size]["text"]
        for start in range(0, len(ds), batch_size)
    )


## 2. Building your tokenizer
First, initialise a tokenizer for the new language, based on the tokenizer of the chosen model.

Then we will train the tokenizer for this language. We can limit the vocabulary size if desired.

Once trained, we can view the tokenizer under the files.

### Task 1
What is stored in the following files?
1. 'tokenizer_af/tokenizer_config.json',
2. 'tokenizer_af/special_tokens_map.json',
3. 'tokenizer_af/vocab.txt',
4. 'tokenizer_af/added_tokens.json',
5. 'tokenizer_af/tokenizer.json'

In [None]:
# Name of the pretrained checkpoint whose tokenizer type and model configuration are reused below.
model_checkpoint ="bert-base-uncased"

In [None]:
# Load the tokenizer of the chosen checkpoint; it serves as the template
# for the new tokenizer.
tokenizer_bert_uncased = AutoTokenizer.from_pretrained(model_checkpoint)
# Iterator over the training texts used to train the new tokenizer.
training_corpus = get_training_corpus(ds["train"])
# Train a new tokenizer on the corpus, asking for a vocabulary size of 25000.
new_tokenizer = tokenizer_bert_uncased.train_new_from_iterator(text_iterator = training_corpus, vocab_size=25000)


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Write the newly trained tokenizer's files to the "tokenizer_af" directory.
new_tokenizer.save_pretrained("tokenizer_af")

('tokenizer_af/tokenizer_config.json',
 'tokenizer_af/special_tokens_map.json',
 'tokenizer_af/vocab.txt',
 'tokenizer_af/added_tokens.json',
 'tokenizer_af/tokenizer.json')

In [None]:
# Show the vocabulary size of the newly trained tokenizer.
new_tokenizer.vocab_size

25000

Once the tokenizer is saved, you can load it anytime to train your new BERT model.

In [None]:

# Reload the tokenizer saved in "tokenizer_af"; this `tokenizer` object is the
# one used by the rest of the notebook.
tokenizer = AutoTokenizer.from_pretrained("tokenizer_af")
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")

The max length for the tokenizer is: 512


In [None]:
def return_tokenized_inputs(dataset, tokenizer):
    """Tokenize the ``"text"`` field of a batch for masked-language-model pretraining.

    Sequences are truncated to ``tokenizer.model_max_length`` but deliberately
    NOT padded and NOT converted to tensors here. The tokenized examples are
    concatenated and re-chunked afterwards, so padding at this stage would
    bake pad tokens into the middle of the training chunks (and, with every
    row padded to the batch maximum, turn the later grouping into a no-op).
    Padding is left to the data collator at batch time.

    Args:
        dataset: A batch mapping with a ``"text"`` entry holding a list of
            strings (the shape handed over by ``Dataset.map(batched=True)``).
        tokenizer: Callable tokenizer exposing ``model_max_length``.

    Returns:
        Whatever the tokenizer returns for the batch; the special-tokens mask
        is requested so that the masking collator can skip special tokens.
    """
    return tokenizer(
        dataset["text"],
        return_special_tokens_mask=True,
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

# Preprocess the dataset: tokenize every split in batches, passing the
# tokenizer through fn_kwargs and dropping the raw "text" column afterwards.
tokenized_datasets = ds.map(return_tokenized_inputs, batched=True, remove_columns=["text"], fn_kwargs={"tokenizer": tokenizer})

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized train split.
tokenized_datasets["train"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 1000
})

In [None]:


# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples, max_seq_length=None):
    """Concatenate a batch column-wise and re-split it into fixed-size chunks.

    Every column of the batch (e.g. ``input_ids``, ``token_type_ids``) is
    flattened into one long list, which is then cut into consecutive chunks
    of ``max_seq_length`` items, because the model can only be trained on
    sequences up to that length.

    When the flattened batch is at least ``max_seq_length`` long, the
    remainder that does not fill a whole chunk is dropped. A batch shorter
    than ``max_seq_length`` is returned as one single, shorter chunk.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            All columns are expected to have the same flattened length; the
            first column's length is used for all of them.
        max_seq_length: Chunk size. Defaults to the module-level
            ``tokenizer.model_max_length`` when omitted.

    Returns:
        A dict with the same keys as ``examples`` whose values are lists of
        chunks. An empty batch yields an empty dict.
    """
    if not examples:
        # Nothing to group; avoids indexing into an empty key list.
        return {}
    if max_seq_length is None:
        max_seq_length = tokenizer.model_max_length

    # Concatenate all texts, column by column.
    concatenated = {
        key: list(chain.from_iterable(rows)) for key, rows in examples.items()
    }
    total_length = len(next(iter(concatenated.values())))
    # Drop the small remainder; padding it instead would also be possible.
    if total_length >= max_seq_length:
        total_length = (total_length // max_seq_length) * max_seq_length

    # Split by chunks of max_seq_length.
    return {
        key: [
            tokens[start : start + max_seq_length]
            for start in range(0, total_length, max_seq_length)
        ]
        for key, tokens in concatenated.items()
    }

In [None]:
# Concatenate the tokenized examples of each batch and re-split them into chunks.
tokenized_datasets = tokenized_datasets.map(group_texts, batched=True)
# shuffle dataset
tokenized_datasets = tokenized_datasets.shuffle(seed=34)
# len() of the train/test dictionary is the number of splits (2), not the
# number of rows, so count the rows of every split explicitly.
# The token count assumes each row holds tokenizer.model_max_length tokens.
total_chunks = sum(len(split) for split in tokenized_datasets.values())
print(f"the dataset contains in total {total_chunks * tokenizer.model_max_length} tokens")


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

the dataset contains in total 1024 tokens


## 3. Creating an input pipeline


In [None]:
# Reuse the architecture hyper-parameters of the chosen checkpoint, but size
# the embedding matrix and the padding index from the newly trained tokenizer.
# Otherwise the model keeps the checkpoint's original vocabulary size, which
# does not match the new tokenizer's vocabulary.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id,
)
# Build the model from the configuration only: the weights are freshly
# initialised, no pretrained weights are loaded.
model = AutoModelForMaskedLM.from_config(config)

In [None]:
# Hyper-parameters for the masked-language-model pretraining run:
# one epoch, batch size 16 per device, outputs written under
# "<model_checkpoint>-oscar-af" (overwriting an existing directory).
# NOTE(review): the recorded run finished after 63 steps with an empty
# "Training Loss" table; presumably the default logging interval is longer
# than the run - consider setting logging_steps if loss logging is wanted.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}-oscar-af",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    overwrite_output_dir=True,
    num_train_epochs=1,
    save_steps=10_000,
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False
)



In [None]:
# Collator that builds the masked-language-modelling batches; 0.15 is the
# masking probability passed to it.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Show the (rows, columns) shape of every split.
tokenized_datasets.shape

{'train': (1000, 4), 'test': (100, 4)}

In [None]:
# Wire the freshly initialised model, the training arguments, both dataset
# splits and the MLM collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator = data_collator
)

In [None]:
# Run the pretraining loop.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=63, training_loss=9.029562329489087, metrics={'train_runtime': 116.7764, 'train_samples_per_second': 8.563, 'train_steps_per_second': 0.539, 'total_flos': 263204812800000.0, 'train_loss': 9.029562329489087, 'epoch': 1.0})

In [None]:
# Save the pretrained model; the sentiment task below loads it from this path.
model_saving_path = "shiny_new_bert_af"
trainer.save_model(model_saving_path)

## Other Resources
[Pretraining BERT](https://huggingface.co/blog/pretraining-bert)

https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb

# Task 2  Perform sentiment analysis
Fine-tune your shiny new BERT on a downstream task: sentiment analysis with the [Sentiment Analysis Dataset](https://huggingface.co/datasets/senti_lex) for the Afrikaans language.
Use the colabs from the previous session to guide you.

Estimated time taken 20-30 minutes.

In [None]:
# Load the "af" configuration of senti_lex and split its train split into train/test parts.
# NOTE(review): no seed is passed, so the split presumably changes between runs - confirm.
raw_dataset = datasets.load_dataset('senti_lex', "af", split="train").train_test_split(test_size=0.2) # 80% train, 20% test

In [None]:
# Display the train/test splits of the sentiment dataset.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['word', 'sentiment'],
        num_rows: 1839
    })
    test: Dataset({
        features: ['word', 'sentiment'],
        num_rows: 460
    })
})

In [None]:
# Look at the first training example (a word and its sentiment value).
raw_dataset["train"][0]

{'word': 'soliede', 'sentiment': 1}

In [None]:
# Sentiment classes, plus the lookup tables between class index and class label.
labels = [0, 1]
id2label = dict(enumerate(labels))
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
def tokenize_function(batch, tokenizer, label2id):
    """Tokenize a batch of words and attach integer class labels.

    Sequences are truncated to ``tokenizer.model_max_length`` but not padded
    and not converted to tensors here: the notebook pads each training batch
    with a padding collator, so padding the whole mapped batch up front is
    redundant.

    Args:
        batch: Mapping with a ``"word"`` list (the texts) and a
            ``"sentiment"`` list (the raw labels), as handed over by
            ``Dataset.map(batched=True)``.
        tokenizer: Callable tokenizer exposing ``model_max_length``; its
            return value must support item assignment.
        label2id: Mapping from raw sentiment value to class index.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.

    Raises:
        KeyError: If a sentiment value is missing from ``label2id``. The
            previous ``label2id.get`` silently produced ``None`` labels in
            that case, which only surfaced later during training.
    """
    tokenized_batch = tokenizer(
        batch["word"],
        max_length=tokenizer.model_max_length,
        truncation=True,
    )
    tokenized_batch["labels"] = [label2id[label] for label in batch["sentiment"]]
    return tokenized_batch

In [None]:
# Tokenize both splits in batches and drop the original columns, so that only
# the tokenizer outputs and the "labels" column remain.
senti_tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, fn_kwargs={"tokenizer": tokenizer, "label2id":label2id},
                                    remove_columns=raw_dataset['train'].column_names) #use fn_kwargs to pass any arguments to the tokenizing function

Map:   0%|          | 0/1839 [00:00<?, ? examples/s]

Map:   0%|          | 0/460 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized sentiment splits.
senti_tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1839
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 460
    })
})

In [None]:
# Switch the dataset's output format to "torch" (modifies the dataset in place).
senti_tokenized_dataset.set_format(type="torch")

In [None]:
# Collator used by the sentiment Trainer to assemble padded batches.
senti_data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding = True) # pads to the max sequence length in a batch

In [None]:
!pip install evaluate
# Load the "accuracy" metric; compute_metrics reads it as a module-level name.
import evaluate
accuracy = evaluate.load("accuracy")



In [None]:
import numpy as np
def compute_metrics(eval_pred):
    """Score an evaluation result with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into a pair of (model outputs, reference labels).
    The predicted class of each row is the index of its largest output along
    axis 1; the return value is whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Load the model saved at model_saving_path with a 2-label sequence-classification
# head; the recorded warning shows that the pooler and classifier weights are
# newly initialised.
senti_model = AutoModelForSequenceClassification.from_pretrained(model_saving_path, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at shiny_new_bert_af and are newly initialized: ['bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:

# Hyper-parameters for fine-tuning on the sentiment task: two epochs with
# evaluation and checkpointing after every epoch, reloading the best
# checkpoint once training ends.
# NOTE(review): newer transformers releases reportedly rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
senti_training_args = TrainingArguments(
    output_dir= model_saving_path + "_sentiment_af_trained",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:

# Trainer for the sentiment task: classification model, tokenized splits,
# padding collator, and accuracy reporting through compute_metrics.
senti_trainer = Trainer(
    model=senti_model,
    args=senti_training_args,
    train_dataset=senti_tokenized_dataset["train"],
    eval_dataset=senti_tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=senti_data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop.
senti_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.703536,0.513043
2,0.678800,0.730295,0.578261


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=920, training_loss=0.6654566557511039, metrics={'train_runtime': 113.8014, 'train_samples_per_second': 32.319, 'train_steps_per_second': 8.084, 'total_flos': 19845870794820.0, 'train_loss': 0.6654566557511039, 'epoch': 2.0})

In [None]:
# Save the fine-tuned sentiment model to its own directory.
senti_model_saving_path = "super_shiny_new_bert_af_sentiment"
senti_trainer.save_model(senti_model_saving_path)