In [9]:
import pandas as pd
import os
from huggingface_hub import notebook_login
import transformers
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments
import math
from transformers import AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling

In [35]:
block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [36]:
print("starting notebook login")
notebook_login()

In [37]:
print("done with notebook login")
wiki_train = "wikitext"
wiki_val = "wikitext-2-raw-v1"
sports_train = "data/sports_article_data.csv"
sports_val = "data/sports_article_data.csv"
scouting_train = "data/unlabeled_scouting.csv"
scouting_val = "data/unlabeled_scouting.csv"
base_model = "bert-base-uncased" # replace with best base model as determined by Amol's baseline experiments

In [38]:
def run_pretraining_stage(datasets, model_loc, stage_name):
    if model_loc == "amanm27/" + base_model:
        # first pre-training stage
        model_checkpoint = base_model
    else:
        # already pre-trained on something, so resume where we left off
        model_checkpoint = model_loc
    print("Reading model from " + model_checkpoint)
    tokenizer = BertTokenizer.from_pretrained(base_model, use_fast=True)
    def tokenize_function(examples):
        return tokenizer(examples["text"])
    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
    lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    model_name = model_checkpoint.split("/")[-1]
    training_args = TrainingArguments(model_name + stage_name, evaluation_strategy = "epoch", learning_rate=2e-5, weight_decay=0.01, push_to_hub=True)
    model_loc += stage_name 
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    trainer = Trainer(model=model, args=training_args, train_dataset=lm_datasets["train"], eval_dataset=lm_datasets["validation"], data_collator=data_collator)
    trainer.train()
    eval_results = trainer.evaluate()
    print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    trainer.push_to_hub()
    print("Done with " + stage_name + " stage")
    return model_loc

In [39]:
def run_pretraining_experiment(include_wiki, include_sports, include_scouting):
    model_loc = "amanm27/"
    model_loc += base_model
    if include_wiki:
        datasets = load_dataset(wiki_train, wiki_val)
        model_loc = run_pretraining_stage(datasets, model_loc, "-wiki")
        print("Saved model checkpoint to " + model_loc)
    if include_sports:
        datasets = load_dataset("text", data_files={"train": sports_train, "validation": sports_val})
        model_loc = run_pretraining_stage(datasets, model_loc, "-sports")
        print("Saved model checkpoint to " + model_loc)
    if include_scouting:
        datasets = load_dataset("text", data_files={"train": scouting_train, "validation": scouting_val})
        model_loc = run_pretraining_stage(datasets, model_loc, "-scouting")
        print("Saved model checkpoint to " + model_loc)
    print("Saved final model checkpoint to " + model_loc)

In [40]:
print("Starting experiment: wiki")
run_pretraining_experiment(True, False, False)
print("Done with experiment: wiki")

Starting experiment: wiki --> sports --> scouting


Reusing dataset wikitext (/Users/amanmalhotra/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

Reading model from bert-base-uncased


loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /Users/amanmalhotra/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of BertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForMaskedLM for predictions without further training.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from

Saved model checkpoint to amanm27/bert-base-uncased-wiki


Using custom data configuration default-d4d7d22c135e9a72
Reusing dataset text (/Users/amanmalhotra/.cache/huggingface/datasets/text/default-d4d7d22c135e9a72/0.0.0/08f6fb1dd2dab0a18ea441c359e1d63794ea8cb53e7863e6edf8fc5655e47ec4)


  0%|          | 0/2 [00:00<?, ?it/s]

Reading model from amanm27/bert-base-uncased-wiki


loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /Users/amanmalhotra/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /Users/amanmalhotra/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841

OSError: amanm27/bert-base-uncased-wiki does not appear to have a file named config.json.

In [None]:
print("Starting experiment: sports")
run_pretraining_experiment(False, True, False)
print("Done with experiment: sports")

In [None]:
print("Starting experiment: scouting")
run_pretraining_experiment(False, False, True)
print("Done with experiment: scouting")

In [None]:
print("Starting experiment: wiki --> sports")
run_pretraining_experiment(True, True, False)
print("Done with experiment: wiki --> sports")

In [None]:
print("Starting experiment: wiki --> scouting")
run_pretraining_experiment(True, False, True)
print("Done with experiment: wiki --> scouting")

In [None]:
print("Starting experiment: sports --> scouting")
run_pretraining_experiment(False, True, True)
print("Done with experiment: sports --> scouting")

In [None]:
print("Starting experiment: wiki --> sports --> scouting")
run_pretraining_experiment(True, True, True)
print("Done with experiment: wiki --> sports --> scouting")