In [None]:
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

%load_ext autoreload
%autoreload 2

In [None]:
def load_diachronic_for_finetuning(datapath="data/nytimes_dataset.txt", start_date="2019-01-01", end_date="2020-12-31", encoding="utf-8"):
    """
    Read in a diachronic dataset with one "%Y-%m-%d<TAB>sentence" entry per line;
    similar to evolvemb.diachronic_utils.load_diachronic_dataset but returns sentences as str, not list of words

    Dates are compared as plain strings, which orders them correctly only as long as they are
    zero-padded in the same "%Y-%m-%d" format as start_date/end_date. Reading stops at the first
    line whose date lies after end_date, i.e., the file is expected to be in chronological order.

    Inputs:
        - datapath [str]: path to a dataset with tab-separated dates (in the same format as start/end_date)
                and sentences. Since these sentences will later be passed as is to the transformer,
                they shouldn't be too long, i.e., not whole documents. (default: "data/nytimes_dataset.txt")
        - start_date [str]: earliest date at and after which the sentences should be taken (default: "2019-01-01")
        - end_date [str]: latest date until which the sentences should be included (default: "2020-12-31")
        - encoding [str]: text encoding used to read the file (default: "utf-8")
    Returns:
        - sentences [list: str]: list of sentences (as complete strings) in chronological order
    Raises:
        - ValueError: if a non-empty line does not contain a tab separating date and sentence
    """
    sentences = []
    # explicit encoding so the result does not depend on the platform's default locale
    with open(datapath, encoding=encoding) as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            # blank lines (e.g. a trailing newline at the end of the file) carry no data
            if not line:
                continue
            # split on the first tab only: the sentence itself may contain further tabs
            d, sep, s = line.partition("\t")
            if not sep:
                raise ValueError(
                    "Line %i of %s is not in the format 'date<TAB>sentence': %r" % (line_no, datapath, line)
                )
            if d < start_date:
                continue
            elif d > end_date:
                break
            # some longer words mistakenly can end with "." due to the tokenizer; remove this!
            # (words of up to 3 characters such as "Mr." or "it." are left untouched)
            # keep single strings since the tokenization is done by the transformer model
            sentences.append(" ".join([w if len(w) <= 3 or not w.endswith(".") else w[:-1] for w in s.split()]))
    print("Dataset contains %i sentences between %s and %s" % (len(sentences), start_date, end_date))
    return sentences

In [None]:
# read the sentences of the selected time span and wrap them in a huggingface dataset
# with a single "text" column
sentences = load_diachronic_for_finetuning(
    datapath="data/nytimes_dataset.txt",
    start_date="2019-04-01",
    end_date="2020-12-31",
)
dataset = Dataset.from_dict({"text": sentences})

In [None]:
# get model-specific tokenizer
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# tokenize dataset (i.e. the "text" column); truncation=True cuts every sentence to the
# model's maximum input length so that a single overly long line cannot crash the training
tokenized_dataset = dataset.map(
    lambda x: tokenizer(x["text"], truncation=True),
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)
# load model
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
# define some stuff for the training
# the collator is configured with mlm_probability=0.15 for the masked language modeling objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
# NOTE(review): newer transformers releases may expect `eval_strategy` instead of
# `evaluation_strategy` - confirm against the installed version
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
)
# NOTE: eval_dataset is the very same data as train_dataset, i.e., the evaluation loss
# reported after each epoch is measured on the training data, not on held-out sentences
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
)
# train
trainer.train()

In [None]:
# evaluate on the trainer's eval_dataset and report the perplexity,
# i.e., the exponential of the evaluation loss
eval_results = trainer.evaluate()
perplexity = np.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity: .2f}")

In [None]:
# save model + tokenizer into the same folder (named after the checkpoint with an "_ft" suffix)
finetuned_dir = f"data/{model_checkpoint}_ft"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)