In [1]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [2]:
# Train a byte-level BPE tokenizer on the promoter corpus and write its
# vocabulary/merge files into ./cornModel.
path = ["data/train.promoter.txt"]

# save_model() writes into an existing directory; create it up front so the
# cell also works on a fresh checkout.
Path("cornModel").mkdir(parents=True, exist_ok=True)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=path,
    vocab_size=5000,
    min_frequency=2,
    # NOTE(review): the order presumably fixes the ids of the special tokens
    # (<s>=0, <pad>=1, </s>=2, ...) -- keep it unless the model config changes.
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
tokenizer.save_model("cornModel")

['cornModel/vocab.json', 'cornModel/merges.txt']

In [1]:
import torch
from pathlib import Path
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import RobertaTokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import os
# Expose only the GPU with index 1 to CUDA for this process.
# NOTE(review): this is assigned after `import torch`; it presumably only takes
# effect if CUDA has not been initialised yet -- confirm, or move it above the
# torch import.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
from torch.utils.data import Dataset

class cornDataset(Dataset):
    """In-memory dataset of BPE-encoded lines read from ``./data/*promoter.txt``.

    Each non-blank line of every matching file becomes one example: the list
    of token ids produced by the tokenizer stored in ``./cornModel``, wrapped
    as ``<s> ... </s>`` and truncated to 512 tokens.

    Args:
        evaluate: Accepted but currently unused; the same files are loaded
            regardless of its value.
    """

    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./cornModel/vocab.json",
            "./cornModel/merges.txt",
        )
        # BertProcessing takes (sep, cls), in that order.
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []
        # Path.glob yields files in filesystem order; sort so that the example
        # order is the same on every machine and every run.
        for src_file in sorted(Path("./data/").glob("*promoter.txt")):
            text = src_file.read_text(encoding="utf-8")
            # Blank lines would encode to nothing but the two special tokens.
            lines = [line for line in text.splitlines() if line.strip()]
            if not lines:
                continue
            self.examples.extend(enc.ids for enc in tokenizer.encode_batch(lines))

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a 1-D tensor of int64 token ids."""
        return torch.tensor(self.examples[i], dtype=torch.long)

In [3]:
# Architecture of the masked-language model that is trained from scratch.
config = RobertaConfig(
    vocab_size=5000,
    # RoBERTa numbers positions starting at padding_idx + 1 (= 2 with the
    # default padding index), so inputs truncated to 512 tokens need
    # 512 + 2 position-embedding slots. With 512 a full-length sequence
    # indexes past the end of the embedding table.
    max_position_embeddings=514,
    num_attention_heads=6,
    num_hidden_layers=6,
    type_vocab_size=1,
)

# Trainer settings: one epoch, a checkpoint every 500 steps, and at most two
# checkpoints kept in the output directory.
training_args = TrainingArguments(
    output_dir="./cornModel",
    overwrite_output_dir=True,
    num_train_epochs=1,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
)

In [4]:
# Randomly initialised RoBERTa masked-LM built from `config`.
model = RobertaForMaskedLM(config=config)

# `model_max_length` is the supported name of the deprecated `max_len` kwarg.
tokenizer = RobertaTokenizerFast.from_pretrained("./cornModel", model_max_length=512)

dataset = cornDataset()

# Builds masked-LM batches, selecting 15% of the tokens for prediction.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
# Write the final weights next to the tokenizer files in ./cornModel.
trainer.save_model("./cornModel")

  "torch.norm is deprecated and may be removed in a future PyTorch release. "


Step,Training Loss
500,7.307
1000,6.898
1500,6.7829
2000,6.7468
2500,6.7153
3000,6.6951
3500,6.6926


  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
  "torch.norm is deprecated and may be removed in a future PyTorch release. "
