In [1]:
# Install the transformers library if you haven't already
# You can install it using: pip install transformers

from transformers import BertTokenizer, BertForMaskedLM
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments




In [2]:
# Location of the plain-text training corpus (relative path)
corpus_file_path = 'datasets/v1-v2-combine_400mb.txt'

# Pre-trained 'bert-base-uncased' tokenizer and the matching masked-LM model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Read the whole corpus into memory as a single string (no tokenization happens here).
# NOTE(review): `text` is only referenced by commented-out code in the cells
# visible in this notebook — confirm it is still needed before keeping this read.
with open(corpus_file_path, 'r', encoding='utf-8') as corpus_handle:
    text = corpus_handle.read()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# NOTE(review): disabled experiment — it would encode `text`, decode the ids back
# to a string, and tokenize that string. `tokenized_text` is not referenced by any
# later cell visible here; confirm whether this cell can be removed.
# tokenized_text = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text)))

In [4]:

# Build the training dataset directly from the corpus file on disk.
# The masked-LM settings (mlm=True, mlm_probability) are configured on the
# data collator in a later cell, not here.
# NOTE(review): TextDataset is flagged as deprecated in recent transformers
# releases — confirm against the installed version.
dataset = TextDataset(
    file_path=corpus_file_path,
    tokenizer=tokenizer,
    # Tune block_size to the corpus size and the available memory
    block_size=512,
)




In [5]:

# Training configuration: output location, run length, batch size, and
# checkpoint cadence (save_steps) / retention cap (save_total_limit).
training_args = TrainingArguments(
    output_dir='models/bert_mlm_output',
    overwrite_output_dir=True,
    per_device_train_batch_size=4,  # size this to fit the GPU memory
    num_train_epochs=1,             # raise for more passes over the corpus
    save_total_limit=2,
    save_steps=10_000,
)

# Language-modeling collator with MLM enabled at probability 0.15
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

# Tie the model, dataset, collator and arguments together in a Trainer
trainer = Trainer(
    train_dataset=dataset,
    data_collator=data_collator,
    args=training_args,
    model=model,
)


In [6]:
%%time
# Train the model
trainer.train()

  Decoupled weight decay to apply.


Step,Training Loss
500,2.4176
1000,1.8299
1500,1.6246
2000,1.4849
2500,1.4022
3000,1.3169
3500,1.2706
4000,1.2271
4500,1.1872
5000,1.1417




CPU times: user 3h 25min 32s, sys: 7min 4s, total: 3h 32min 37s
Wall time: 2h 21min 32s


TrainOutput(global_step=17903, training_loss=1.1051811180334215, metrics={'train_runtime': 8491.6228, 'train_samples_per_second': 33.733, 'train_steps_per_second': 2.108, 'total_flos': 7.53944922169344e+16, 'train_loss': 1.1051811180334215, 'epoch': 1.0})