In [1]:
# prepare data
from utils.util import conll_iob
from utils.util import get_reader, train_model, create_model, save_model, parse_args, get_tagset
# Locations of the English train / dev .conll files.
train_file = "./training_data/EN-English/en_train.conll"
dev_file = "./training_data/EN-English/en_dev.conll"

# Name of the pretrained encoder handed to both readers.
encoder_model = "roberta-base"

# One reader per split; each call builds its own tag vocabulary from conll_iob.
train_reader = get_reader(
    file_path=train_file,
    target_vocab=get_tagset(conll_iob),
    encoder_model=encoder_model,
)
dev_reader = get_reader(
    file_path=dev_file,
    target_vocab=get_tagset(conll_iob),
    encoder_model=encoder_model,
)


2021-12-01 09:53:07 - INFO - reader - Reading file ./training_data/EN-English/en_train.conll
2021-12-01 09:53:25 - INFO - reader - Finished reading 15300 instances from file ./training_data/EN-English/en_train.conll
2021-12-01 09:53:36 - INFO - reader - Reading file ./training_data/EN-English/en_dev.conll
2021-12-01 09:53:37 - INFO - reader - Finished reading 800 instances from file ./training_data/EN-English/en_dev.conll


In [5]:
# Sanity check: number of training sentences and size of the dev reader.
print(len(train_reader.sentences))
print(len(dev_reader))
from typing import List
def write_roberta_pretrain_sentences(data: List[List], filename: str, encoding: str = "utf-8"):
    """Write every sentence from every collection in ``data`` to a text file.

    Each sentence is written on its own line, in the order the collections
    and their sentences are iterated. An existing file is overwritten.

    Args:
        data: A list of sentence collections (e.g. one per dataset split);
            every element of every collection must be a ``str``.
        filename: Path of the text file to create or overwrite.
        encoding: Text encoding of the output file. Defaults to UTF-8 so the
            result does not depend on the platform's locale encoding and
            non-ASCII sentences round-trip when the file is read back as UTF-8.
    """
    # An explicit encoding avoids UnicodeEncodeError / mojibake on systems whose
    # default locale encoding is not UTF-8.
    with open(filename, "w", encoding=encoding) as f:
        for sentences in data:
            for sentence in sentences:
                f.write(sentence)
                f.write("\n")

# Dump the train sentences followed by the dev sentences, one per line.
pretrain_splits = [train_reader.sentences, dev_reader.sentences]
write_roberta_pretrain_sentences(data=pretrain_splits, filename="roberta.txt")
    

15300
800


In [4]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import os

# Continue masked-language-model pretraining of RoBERTa on the sentences in roberta.txt.
# Imported inside the cell so the cell stays runnable without touching the import block above.
from transformers.trainer_utils import get_last_checkpoint

encoder_model = "roberta-base"
output_dir = "./roberta-retrained"

tokenizer = RobertaTokenizer.from_pretrained(encoder_model)
model = RobertaForMaskedLM.from_pretrained(encoder_model)

# save_total_limit rotates older checkpoints away, so a fixed "checkpoint-1000" path
# stops existing after a few saves. Look up the newest checkpoint instead; None means
# "start from the base model". get_last_checkpoint lists the directory, so only call
# it when the directory exists.
checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None

# One training example per line of the corpus file.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="roberta.txt",
    block_size=512,
)
# Dynamic masking: 15% of tokens are selected for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=500,
    save_total_limit=2,
    seed=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)
# Resuming through the Trainer restores optimizer, LR-scheduler and step counter as
# well as the weights, instead of restarting the schedule from step 0.
train_output = trainer.train(resume_from_checkpoint=checkpoint)

# Checkpoints are only written every save_steps, so the steps after the last checkpoint
# would otherwise be lost; persist the final weights and the tokenizer explicitly.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Keep the TrainOutput as the cell's displayed value.
train_output

loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /Users/malong/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /Users/malong/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta-base/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer_config.json from cache at None
loading file https://huggingface.co/roberta-base/resolve/main/tokenizer.json from cache at /Users/malong/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e1

{'loss': 0.3033, 'learning_rate': 4.503475670307845e-05, 'epoch': 0.5}


Model weights saved in ./roberta-retrained/checkpoint-500/pytorch_model.bin
 20%|█▉        | 1000/5035 [1:38:09<6:40:03,  5.95s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-1000
Configuration saved in ./roberta-retrained/checkpoint-1000/config.json


{'loss': 0.6139, 'learning_rate': 4.006951340615691e-05, 'epoch': 0.99}


Model weights saved in ./roberta-retrained/checkpoint-1000/pytorch_model.bin
 30%|██▉       | 1500/5035 [2:26:36<5:40:00,  5.77s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-1500
Configuration saved in ./roberta-retrained/checkpoint-1500/config.json


{'loss': 0.962, 'learning_rate': 3.5104270109235354e-05, 'epoch': 1.49}


Model weights saved in ./roberta-retrained/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-500] due to args.save_total_limit
 40%|███▉      | 2000/5035 [3:15:51<5:26:36,  6.46s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-2000
Configuration saved in ./roberta-retrained/checkpoint-2000/config.json


{'loss': 0.9578, 'learning_rate': 3.0139026812313804e-05, 'epoch': 1.99}


Model weights saved in ./roberta-retrained/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-1000] due to args.save_total_limit
 50%|████▉     | 2500/5035 [4:04:46<4:43:15,  6.70s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-2500
Configuration saved in ./roberta-retrained/checkpoint-2500/config.json


{'loss': 0.8525, 'learning_rate': 2.5173783515392257e-05, 'epoch': 2.48}


Model weights saved in ./roberta-retrained/checkpoint-2500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-1500] due to args.save_total_limit
 60%|█████▉    | 3000/5035 [4:53:27<3:26:47,  6.10s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-3000
Configuration saved in ./roberta-retrained/checkpoint-3000/config.json


{'loss': 0.9111, 'learning_rate': 2.0208540218470706e-05, 'epoch': 2.98}


Model weights saved in ./roberta-retrained/checkpoint-3000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-2000] due to args.save_total_limit
 70%|██████▉   | 3500/5035 [5:42:11<2:43:42,  6.40s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-3500
Configuration saved in ./roberta-retrained/checkpoint-3500/config.json


{'loss': 0.8219, 'learning_rate': 1.5243296921549157e-05, 'epoch': 3.48}


Model weights saved in ./roberta-retrained/checkpoint-3500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-2500] due to args.save_total_limit
 79%|███████▉  | 4000/5035 [6:30:39<1:46:26,  6.17s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-4000
Configuration saved in ./roberta-retrained/checkpoint-4000/config.json


{'loss': 0.8071, 'learning_rate': 1.0278053624627607e-05, 'epoch': 3.97}


Model weights saved in ./roberta-retrained/checkpoint-4000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-3000] due to args.save_total_limit
 89%|████████▉ | 4500/5035 [7:19:19<56:16,  6.31s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-4500
Configuration saved in ./roberta-retrained/checkpoint-4500/config.json


{'loss': 0.7983, 'learning_rate': 5.312810327706058e-06, 'epoch': 4.47}


Model weights saved in ./roberta-retrained/checkpoint-4500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-3500] due to args.save_total_limit
 99%|█████████▉| 5000/5035 [8:12:24<03:40,  6.30s/it]Saving model checkpoint to ./roberta-retrained/checkpoint-5000
Configuration saved in ./roberta-retrained/checkpoint-5000/config.json


{'loss': 0.8031, 'learning_rate': 3.475670307845084e-07, 'epoch': 4.97}


Model weights saved in ./roberta-retrained/checkpoint-5000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-4000] due to args.save_total_limit
100%|██████████| 5035/5035 [8:15:41<00:00,  4.67s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 5035/5035 [8:15:41<00:00,  5.91s/it]

{'train_runtime': 29741.9841, 'train_samples_per_second': 2.707, 'train_steps_per_second': 0.169, 'train_loss': 0.7823451268519048, 'epoch': 5.0}





TrainOutput(global_step=5035, training_loss=0.7823451268519048, metrics={'train_runtime': 29741.9841, 'train_samples_per_second': 2.707, 'train_steps_per_second': 0.169, 'train_loss': 0.7823451268519048, 'epoch': 5.0})

wandb: Network error (ProxyError), entering retry loop.
