In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ModuleNotFoundError as e:
    print("not in colab")
    pass
import os
base_dir = "/content/drive/MyDrive/semeval2022"
if not os.path.exists(base_dir):
  !pip install -r requirements.txt
  base_dir = ""
else:
  !pip install -r /content/drive/MyDrive/semeval2022/requirements.txt
  !cp -rf /content/drive/MyDrive/semeval2022/*.py . 
  !cp -rf /content/drive/MyDrive/semeval2022/utils .
  !cp -rf /content/drive/MyDrive/semeval2022/model .

In [None]:
# prepare data
from utils.util import wnut_iob
import os
from utils.util import get_reader, train_model, create_model, save_model, parse_args, get_tagset
# Encoder checkpoint name handed to both readers below.
encoder_model = "bert-base-uncased"

# Resolve the train/dev CoNLL files relative to base_dir.
train_file, dev_file = (
    os.path.join(base_dir, rel_path)
    for rel_path in (
        "training_data/EN-English/en_train.conll",
        "training_data/EN-English/en_dev.conll",
    )
)

# Build one reader per split (train first, then dev); each gets its own tagset
# built from wnut_iob.
train_reader, dev_reader = (
    get_reader(
        file_path=split_file,
        target_vocab=get_tagset(wnut_iob),
        encoder_model=encoder_model,
    )
    for split_file in (train_file, dev_file)
)


In [None]:
# Report the size of each split: the train reader's sentence list and the dev
# reader's own length.
print(len(train_reader.sentences))
print(len(dev_reader))
from typing import List
# Plain-text file that will hold the sentences used for further pretraining.
pretrain_txt = os.path.join(base_dir, "pretrain.txt")
def write_pretrain_sentences(data: List[List[str]], filename: str):
    """Write every sentence from every collection in ``data`` to ``filename``.

    Each sentence is written on its own line, in the order the collections and
    their sentences are given. An existing file is overwritten.

    Args:
        data: Collections of sentence strings (e.g. one list per data split).
        filename: Path of the text file to create.
    """
    # Pin UTF-8 so non-ASCII sentences are written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        for sentences in data:
            for sentence in sentences:
                f.write(sentence)
                f.write("\n")

# Dump the train sentences followed by the dev sentences into the pretraining file.
write_pretrain_sentences(
    data=[train_reader.sentences, dev_reader.sentences],
    filename=pretrain_txt,
)
    

In [None]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModelForPreTraining
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Continue masked-language-model (MLM) training of the encoder on the sentences
# written to pretrain_txt; checkpoints go under <base_dir>/self_pretrain/<encoder>.
output_dir = os.path.join(base_dir, "self_pretrain", encoder_model)
tokenizer = AutoTokenizer.from_pretrained(encoder_model)
# The collator below produces only MLM `labels`. AutoModelForPreTraining maps
# BERT checkpoints to BertForPreTraining, which returns a loss only when
# `next_sentence_label` is supplied as well, so the Trainer would get no loss.
# Load the MLM-only head, which matches what the collator provides.
model = AutoModelForMaskedLM.from_pretrained(encoder_model)
# One training example per line of the text file, truncated to block_size tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=pretrain_txt,
    block_size=512,
)
# Randomly masks 15% of the tokens in each batch for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    save_steps=500,        # write a checkpoint every 500 optimisation steps
    save_total_limit=2,    # keep only the two most recent checkpoints
    seed=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)
trainer.train()