In [5]:
from transformers import AlbertConfig, AlbertForMaskedLM, AlbertTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from tokenizer import SamplingAlbertTokenizer
from dataset import BatchedLineByLineTextDataset

In [8]:
albert_tokenizer = SamplingAlbertTokenizer('tokenizer_65536.model', do_lower_case=False)
vocab_size = len(albert_tokenizer.get_vocab())

In [None]:
dataset = BatchedLineByLineTextDataset(
    albert_tokenizer,
    'corpus_train.txt',
    block_size=128
)

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=albert_tokenizer, mlm=True, mlm_probability=0.15
)

In [9]:
albert_tiny_config = {
    "attention_probs_dropout_prob": 0.0,
    "directionality": "bidi",
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 312,
    "embedding_size": 128,
    "initializer_range": 0.02,
    "intermediate_size": 1248 ,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 4,
    "pooler_fc_size": 768,
    "pooler_num_attention_heads": 12,
    "pooler_num_fc_layers": 3,
    "pooler_size_per_head": 128,
    "pooler_type": "first_token_transform",
    "type_vocab_size": 2,
    "vocab_size": vocab_size,
    "ln_type":"postln"
}

config = AlbertConfig(**albert_tiny_config)

In [10]:
model = AlbertForMaskedLM(config=config)
model.num_parameters()

9870600

In [11]:
import datetime

training_args = TrainingArguments(
    output_dir='albert_chkpt4',
    logging_dir=f'runs/lm_{datetime.datetime.now().strftime("%H%M_%Y%m%d")}',
    logging_first_step=True,
    logging_steps=100,
    overwrite_output_dir=True,
    learning_rate=0.000176,
    num_train_epochs=5,
    per_gpu_train_batch_size=64,
    save_steps=2000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=112618.0, style=ProgressStyle(description…



In [16]:
trainer.save_model("./hk_albert")

In [10]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./hk_albert",
    tokenizer=albert_tokenizer
)

In [16]:
fill_mask('見親連燈唱好邊隻[MASK]就邊隻跌')

[{'sequence': '[CLS] 見親連燈唱好邊隻 就邊隻跌[SEP]',
  'score': 0.06418969482183456,
  'token': 8,
  'token_str': '▁'},
 {'sequence': '[CLS] 見親連燈唱好邊隻股就邊隻跌[SEP]',
  'score': 0.05758915841579437,
  'token': 722,
  'token_str': '股'},
 {'sequence': '[CLS] 見親連燈唱好邊隻跌就邊隻跌[SEP]',
  'score': 0.055454153567552567,
  'token': 465,
  'token_str': '跌'},
 {'sequence': '[CLS] 見親連燈唱好邊隻升就邊隻跌[SEP]',
  'score': 0.022579167038202286,
  'token': 283,
  'token_str': '升'},
 {'sequence': '[CLS] 見親連燈唱好邊隻贏就邊隻跌[SEP]',
  'score': 0.020485596731305122,
  'token': 408,
  'token_str': '贏'}]

In [20]:
fill_mask('水果 可以食奇異果 木瓜 菠蘿，有酵素 可以幫助消化，唔好食有[MASK]既水果，香蕉唔好食')

[{'sequence': '[CLS] 水果 可以食奇異果 木瓜 菠蘿,有酵素 可以幫助消化,唔好食有糖既水果,香蕉唔好食[SEP]',
  'score': 0.0343143455684185,
  'token': 3287,
  'token_str': '糖'},
 {'sequence': '[CLS] 水果 可以食奇異果 木瓜 菠蘿,有酵素 可以幫助消化,唔好食有肉既水果,香蕉唔好食[SEP]',
  'score': 0.030959686264395714,
  'token': 1439,
  'token_str': '肉'},
 {'sequence': '[CLS] 水果 可以食奇異果 木瓜 菠蘿,有酵素 可以幫助消化,唔好食有飯既水果,香蕉唔好食[SEP]',
  'score': 0.025340624153614044,
  'token': 933,
  'token_str': '飯'},
 {'sequence': '[CLS] 水果 可以食奇異果 木瓜 菠蘿,有酵素 可以幫助消化,唔好食有食既水果,香蕉唔好食[SEP]',
  'score': 0.019571444019675255,
  'token': 120,
  'token_str': '食'},
 {'sequence': '[CLS] 水果 可以食奇異果 木瓜 菠蘿,有酵素 可以幫助消化,唔好食有好食既水果,香蕉唔好食[SEP]',
  'score': 0.010305450297892094,
  'token': 3291,
  'token_str': '好食'}]

In [21]:
fill_mask('AI有自我意識後就會覺得[MASK]冇用 最後就毀滅人類')

[{'sequence': '[CLS] AI有自我意識後就會覺得人冇用 最後就毀滅人類[SEP]',
  'score': 0.025629183277487755,
  'token': 26,
  'token_str': '人'},
 {'sequence': '[CLS] AI有自我意識後就會覺得自己冇用 最後就毀滅人類[SEP]',
  'score': 0.021389061585068703,
  'token': 44,
  'token_str': '自己'},
 {'sequence': '[CLS] AI有自我意識後就會覺得能力冇用 最後就毀滅人類[SEP]',
  'score': 0.020320802927017212,
  'token': 841,
  'token_str': '能力'},
 {'sequence': '[CLS] AI有自我意識後就會覺得佢冇用 最後就毀滅人類[SEP]',
  'score': 0.018661925569176674,
  'token': 25,
  'token_str': '佢'},
 {'sequence': '[CLS] AI有自我意識後就會覺得你冇用 最後就毀滅人類[SEP]',
  'score': 0.012184275314211845,
  'token': 20,
  'token_str': '你'}]

In [22]:
fill_mask('早2日申請左，第2日就批左，第3日已經sd [MASK]話寄緊張卡比你')

[{'sequence': '[CLS] 早2日申請左,第2日就批左,第3日已經sd,話寄緊張卡比你[SEP]',
  'score': 0.36004412174224854,
  'token': 9,
  'token_str': ','},
 {'sequence': '[CLS] 早2日申請左,第2日就批左,第3日已經sd 左話寄緊張卡比你[SEP]',
  'score': 0.10268153250217438,
  'token': 68,
  'token_str': '左'},
 {'sequence': '[CLS] 早2日申請左,第2日就批左,第3日已經sd email話寄緊張卡比你[SEP]',
  'score': 0.058936793357133865,
  'token': 1680,
  'token_str': 'email'},
 {'sequence': '[CLS] 早2日申請左,第2日就批左,第3日已經sd d話寄緊張卡比你[SEP]',
  'score': 0.0200739074498415,
  'token': 19,
  'token_str': 'd'},
 {'sequence': '[CLS] 早2日申請左,第2日就批左,第3日已經sd post話寄緊張卡比你[SEP]',
  'score': 0.01853850670158863,
  'token': 96,
  'token_str': 'post'}]

In [27]:
fill_mask('office細唔在講，腦細中意放左工星期6日都[MASK]你做野')

[{'sequence': '[CLS] office細唔在講,腦細中意放左工星期6日都叫你做野[SEP]',
  'score': 0.03942243009805679,
  'token': 163,
  'token_str': '叫'},
 {'sequence': '[CLS] office細唔在講,腦細中意放左工星期6日都比你做野[SEP]',
  'score': 0.031365975737571716,
  'token': 75,
  'token_str': '比'},
 {'sequence': '[CLS] office細唔在講,腦細中意放左工星期6日都要你做野[SEP]',
  'score': 0.028297707438468933,
  'token': 35,
  'token_str': '要'},
 {'sequence': '[CLS] office細唔在講,腦細中意放左工星期6日都幫你做野[SEP]',
  'score': 0.02787698246538639,
  'token': 307,
  'token_str': '幫'},
 {'sequence': '[CLS] office細唔在講,腦細中意放左工星期6日都係你做野[SEP]',
  'score': 0.02096981182694435,
  'token': 11,
  'token_str': '係'}]