# BERT fine-tuning for semantic shifts

We run this notebook in Google Colab to fine-tune BERT, with masked language modeling, on the old corpus (`ccoha1.txt`).

In [None]:
# !pip install datasets transformers

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import os
import shutil

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

In [None]:
class SaverCallback(TrainerCallback):
    """Back up checkpoints to Google Drive and prune stale local ones.

    Every time the Trainer saves a checkpoint, ``on_save`` does two things:

    * if the global step is a multiple of ``copy_every``, the freshly saved
      ``checkpoint-<step>`` directory is copied into ``drive_dir``;
    * the local checkpoint written ``save_steps`` steps earlier is deleted
      so that checkpoints do not pile up on the local disk.

    Args:
        output_dir: Directory in which the Trainer writes its
            ``checkpoint-<step>`` folders.
        drive_dir: Directory that receives the backups. The default is the
            same Drive folder the notebook reads its dataset and resume
            checkpoint from.
        copy_every: Back up the checkpoints whose step is a multiple of
            this value.
        save_steps: Number of steps between two consecutive saves; it has
            to match the Trainer's own saving interval for the pruning to
            hit an existing checkpoint.

    Raises:
        ValueError: If ``copy_every`` or ``save_steps`` is not positive.
    """

    def __init__(
        self,
        output_dir="/content/bert-semeval2020-clm",
        drive_dir="/content/drive/MyDrive",
        copy_every=5000,
        save_steps=500,
    ):
        super().__init__()
        if copy_every <= 0 or save_steps <= 0:
            raise ValueError("copy_every and save_steps must be positive")
        self.output_dir = output_dir
        self.drive_dir = drive_dir
        self.copy_every = copy_every
        self.save_steps = save_steps

    def on_save(self, args, state, control, logs=None, **kwargs):
        """Back up the new checkpoint if due, then drop the previous one."""
        step = state.global_step
        if step % self.copy_every == 0:
            self._backup(step)
        self._prune(step - self.save_steps)

    def _local_checkpoint(self, step):
        """Return the local directory of the checkpoint saved at *step*."""
        return os.path.join(self.output_dir, f"checkpoint-{step}")

    def _backup(self, step):
        """Copy the checkpoint saved at *step* into the Drive folder."""
        print("copying checkpoint", step)
        target = os.path.join(self.drive_dir, f"checkpoint-{step}")
        try:
            # dirs_exist_ok mirrors `cp -r`, which merges into an existing
            # destination instead of failing.
            shutil.copytree(
                self._local_checkpoint(step), target, dirs_exist_ok=True
            )
        except OSError as err:
            # Best effort: a failed backup must not abort the training run.
            print(f"could not copy checkpoint {step}: {err}")

    def _prune(self, step):
        """Delete the local checkpoint saved at *step*, if there is one."""
        stale = self._local_checkpoint(step)
        if step <= 0 or not os.path.isdir(stale):
            return
        print(f"deleting checkpoint {step}")
        try:
            shutil.rmtree(stale)
        except OSError as err:
            # Best effort: leftover files only cost disk space.
            print(f"could not delete checkpoint {step}: {err}")

In [None]:
# Name of the pretrained checkpoint whose vocabulary is loaded.
tokenizer_checkpoint = "bert-base-cased"

# Load the matching tokenizer, asking for the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_checkpoint,
    use_fast=True,
)

def tokenize_function(examples):
    """Tokenize a batch of raw text with the module-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to
            encode.

    Returns:
        The tokenizer output for ``examples["text"]``, with every sequence
        cut to the tokenizer's maximum model input length.
    """
    # Nothing downstream shortens the sequences, so an over-long line would
    # reach the model as-is; truncate here to stay within its input limit.
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Load the corpus file through the "text" builder. The path is relative, so
# it is resolved against the current working directory.
dataset = load_dataset(
    "text",
    data_files="drive/MyDrive/datasets/ccoha1.txt",
    name="semeval2020-ccoha1",
)

In [None]:
# Tokenize the corpus batch by batch and drop the raw "text" column, leaving
# only the tokenizer outputs.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

In [None]:
# Sanity check: decode the first tokenized example back to text. Index the
# row first so only that example is read, not the whole "input_ids" column.
tokenizer.decode(tokenized_dataset["train"][0]["input_ids"])

In [None]:
# Use the GPU when one is available instead of failing on a CPU-only runtime.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_checkpoint = "bert-base-cased"
model = AutoModelForMaskedLM.from_pretrained(
    model_checkpoint
).to(device).train()

# Masked language modeling: 15% of the tokens are selected for masking.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15
)

training_args = TrainingArguments(
    "bert-semeval2020-clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    # SaverCallback removes the checkpoint written 500 steps earlier, so the
    # saving interval is pinned here rather than left to the library default.
    save_steps=500
)

# NOTE(review): evaluation runs on the training split, so the reported eval
# loss says nothing about generalization — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)

# Back up checkpoints to Google Drive and prune old local ones while training.
saver_callback = SaverCallback()
trainer.add_callback(saver_callback)

In [None]:
# Resume training from a checkpoint stored on Google Drive (presumably one
# backed up by SaverCallback — verify it exists before running this cell).
# The path is passed positionally on purpose: the name of this parameter has
# changed between transformers releases.
trainer.train("/content/drive/MyDrive/checkpoint-40000")