In [4]:
# prepare data
from utils.utils import conll_iob
from utils.utils import get_reader, train_model, create_model, save_model, parse_args, get_tagset

# Name of the pretrained encoder; handed to both readers below.
encoder_model = "roberta-base"

# Paths to the English train / dev .conll files.
train_file = "./training_data/EN-English/en_train.conll"
dev_file = "./training_data/EN-English/en_dev.conll"

# Build one reader per split; each gets its own tagset built from conll_iob.
train_reader = get_reader(
    file_path=train_file,
    target_vocab=get_tagset(conll_iob),
    encoder_model=encoder_model,
)
dev_reader = get_reader(
    file_path=dev_file,
    target_vocab=get_tagset(conll_iob),
    encoder_model=encoder_model,
)


2021-11-16 18:16:21 - INFO - reader - Reading file ./training_data/EN-English/en_train.conll
2021-11-16 18:16:49 - INFO - reader - Finished reading 15300 instances from file ./training_data/EN-English/en_train.conll
2021-11-16 18:17:00 - INFO - reader - Reading file ./training_data/EN-English/en_dev.conll
2021-11-16 18:17:02 - INFO - reader - Finished reading 800 instances from file ./training_data/EN-English/en_dev.conll


In [9]:
# Sanity check of the split sizes. Use the built-in len() instead of calling
# the __len__ dunder directly.
print(len(train_reader.sentences))
print(len(dev_reader))
from typing import List
def write_roberta_pretrain_sentences(data: List[List[str]], filename: str) -> None:
    """Write every sentence to ``filename``, one per line, wrapped as ``<s> ... </s>``.

    Args:
        data: Groups of sentences (e.g. one list per dataset split). Groups and
            the sentences inside them are written in the order given.
        filename: Path of the text file to create. Opened in ``"w"`` mode, so an
            existing file is overwritten.

    NOTE(review): the literal ``<s>`` / ``</s>`` markers end up in the text file.
    If the tokenizer that later reads this file also adds its own special
    tokens, they would presumably be duplicated -- confirm this is intended.
    """
    # Pin the encoding so the corpus does not depend on the platform's default
    # locale (non-ASCII sentences could otherwise fail or be mis-encoded).
    with open(filename, "w", encoding="utf-8") as f:
        f.writelines(
            "<s> " + sentence + " </s>" + "\n"
            for sentences in data
            for sentence in sentences
        )

# Dump the train sentences followed by the dev sentences into roberta.txt.
write_roberta_pretrain_sentences(
    data=[train_reader.sentences, dev_reader.sentences],
    filename="roberta.txt",
)
    

15300
800


In [12]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Continue masked-language-model training of the pretrained encoder on the
# sentences previously written to roberta.txt.
encoder_model = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(encoder_model)
model = RobertaForMaskedLM.from_pretrained(encoder_model)

# Each line of roberta.txt becomes one training example.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="roberta.txt",
    block_size=512,
)

# Collator configured for MLM with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./roberta-retrained",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    save_steps=500,        # a checkpoint is written every 500 optimization steps
    save_total_limit=2,    # older checkpoints beyond this count are deleted
    seed=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
train_result = trainer.train()

# Checkpoints are only written every `save_steps` steps, so the weights after
# the last optimization step are lost unless the step count happens to be a
# multiple of `save_steps`. Persist the final model explicitly, together with
# the tokenizer so the output directory can be loaded on its own.
trainer.save_model()  # no argument: saves into training_args.output_dir
tokenizer.save_pretrained(training_args.output_dir)

# Keep the TrainOutput as the cell's displayed value.
train_result


***** Running training *****
  Num examples = 16100
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2013
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize



Downloading:  18%|█▊        | 86.0M/478M [4:08:52<5:20:46, 21.4kB/s]Saving model checkpoint to ./roberta-retrained/checkpoint-500
Configuration saved in ./roberta-retrained/checkpoint-500/config.json


{'loss': 2.6728, 'learning_rate': 3.758072528564332e-05, 'epoch': 0.25}


Model weights saved in ./roberta-retrained/checkpoint-500/pytorch_model.bin

Downloading:  18%|█▊        | 86.0M/478M [4:36:17<5:20:46, 21.4kB/s]Saving model checkpoint to ./roberta-retrained/checkpoint-1000
Configuration saved in ./roberta-retrained/checkpoint-1000/config.json


{'loss': 2.4639, 'learning_rate': 2.516145057128664e-05, 'epoch': 0.5}


Model weights saved in ./roberta-retrained/checkpoint-1000/pytorch_model.bin

Downloading:  18%|█▊        | 86.0M/478M [5:03:46<5:20:46, 21.4kB/s]Saving model checkpoint to ./roberta-retrained/checkpoint-1500
Configuration saved in ./roberta-retrained/checkpoint-1500/config.json


{'loss': 2.3705, 'learning_rate': 1.2742175856929956e-05, 'epoch': 0.75}


Model weights saved in ./roberta-retrained/checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-500] due to args.save_total_limit

Downloading:  18%|█▊        | 86.0M/478M [5:30:56<5:20:46, 21.4kB/s]Saving model checkpoint to ./roberta-retrained/checkpoint-2000
Configuration saved in ./roberta-retrained/checkpoint-2000/config.json


{'loss': 2.195, 'learning_rate': 3.229011425732737e-07, 'epoch': 0.99}


Model weights saved in ./roberta-retrained/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [roberta-retrained/checkpoint-1000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)



100%|██████████| 2013/2013 [1:49:14<00:00,  3.26s/it]

{'train_runtime': 7462.9776, 'train_samples_per_second': 2.157, 'train_steps_per_second': 0.27, 'train_loss': 2.4235724009984296, 'epoch': 1.0}





TrainOutput(global_step=2013, training_loss=2.4235724009984296, metrics={'train_runtime': 7462.9776, 'train_samples_per_second': 2.157, 'train_steps_per_second': 0.27, 'train_loss': 2.4235724009984296, 'epoch': 1.0})