In [2]:
from transformers import AlbertConfig, AlbertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tokenizer import SamplingAlbertTokenizer
from dataset import BatchedLineByLineTextDataset

In [4]:
albert_tokenizer = SamplingAlbertTokenizer('tokenizer_65536.model', do_lower_case=False)
vocab_size = len(albert_tokenizer.get_vocab())

In [6]:
dataset = BatchedLineByLineTextDataset(
    tokenizer=albert_tokenizer,
    file_path="./corpus_train.txt",
    block_size=128,
)

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=albert_tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
albert_tiny_config = {
    "attention_probs_dropout_prob": 0.0,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 312,
    "embedding_size": 128,
    "initializer_range": 0.02,
    "intermediate_size": 1248 ,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 4,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": vocab_size,
    "ln_type":"postln"
}

config = AlbertConfig(**albert_tiny_config)

In [10]:
model = AlbertForMaskedLM(config=config)
model.num_parameters()

9870600

In [11]:
import datetime

training_args = TrainingArguments(
    output_dir='albert_chkpt',
    logging_dir=f'runs/lm_{datetime.datetime.now().strftime("%H%M_%Y%m%d")}',
    logging_first_step=True,
    logging_steps=100,
    overwrite_output_dir=True,
    learning_rate=0.000176,
    num_train_epochs=5,
    per_gpu_train_batch_size=64,
    save_steps=2000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=112618.0, style=ProgressStyle(description…



In [16]:
trainer.save_model("./hk_albert")