In [1]:
# Mount Google Drive at /content/drive (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import logging
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertModel, AdamW

from dataset import SquadDataset
from model import QAModel
from preprocess import SquadPreprocessor, SquadPlausibleAnswersPreprocessor


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# Notebook front-ends such as Colab/Jupyter can attach a handler to the root logger
# before user code runs, in which case a plain basicConfig() call silently does
# nothing. force=True (Python 3.8+) replaces any existing root handlers so that
# INFO-level messages are actually emitted.
logging.basicConfig(level=logging.INFO, force=True)


In [23]:
def train_model(preprocessor, base_model, frac_train_data, frac_val_data, batch_size=8, n_epoch=10, log_every=1,
                eval_every=10,
                save_every=200, checkpoint_fn=None, force_cpu=False, save_model_prefix="/content/drive/MyDrive/model_checkpoint "
                ) -> None:
    """
    Fine-tunes transformer model with custom head on custom data.

    The function builds train/validation data loaders from the preprocessor's encodings,
    wraps a pre-trained 'distilbert-base-uncased' encoder in `base_model`, and runs an
    AdamW (lr=5e-5) training loop that periodically prints the training loss, evaluates
    on one shuffled validation batch, and saves a checkpoint.

    Parameters
    ----------
    preprocessor (SquadPreprocessor, SquadPlausibleAnswersPreprocessor) - pre-processor class (not an instance);
                it is instantiated with no arguments and must provide `get_encodings`.
    base_model (nn.Module) - model class, sub-class of nn.Module. It is constructed as
                `base_model(transformer_model=..., device=...)` and must provide `compute_loss` and `save`.
    frac_train_data (float) - fraction of training data to sample randomly. Useful with limited memory.
    frac_val_data (float) - fraction of validation data to sample randomly.
    batch_size (int) - batch size for training (evaluation always uses a batch size of 64).
    n_epoch (int) - number of epochs for training.
    log_every (int) - steps frequency to print training loss. 0 disables logging.
    eval_every (int) - steps frequency to print eval loss. 0 disables evaluation.
    save_every (int) - steps frequency to save checkpoint. 0 disables checkpointing.
    checkpoint_fn (None or str) - if str, uses as filename to load a checkpoint model, to continue training.
                The checkpoint must contain the keys 'epoch', 'train_iter', 'model_state_dict' and
                'optimizer_state_dict'. Training restarts at the epoch stored in the checkpoint.
    force_cpu - forces CPU, even on systems with detectable CUDA. Useful for old CUDA architectures,
                which aren't supported anymore
    save_model_prefix (str) - prefix to save the model checkpoint; files are written to
                f"{save_model_prefix}_model_{train_iter}.pt".
                NOTE: the default value ends with a space, so default file names contain a space
                before "_model_". The default is left untouched for backward compatibility.

    Returns
    -------
    None. Progress is printed to stdout and checkpoints are written via `model.save`.
    """

    sp = preprocessor()
    train_enc, val_enc = sp.get_encodings(random_sample_train=frac_train_data, random_sample_val=frac_val_data,
                                          return_tensors="pt")

    train_ds = SquadDataset(train_enc)
    val_ds = SquadDataset(val_enc)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # Shuffled so that every evaluation step below scores a different random batch.
    eval_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)

    # Optional (currently disabled): freeze only the DistilBert embedding parameters.
    # for name, param in dbm.named_parameters():
    #     if name.startswith('embeddings'):
    #         param.requires_grad = False
    if force_cpu or not torch.cuda.is_available():
        device = torch.device("cpu")
    else:
        device = torch.device("cuda")

    epoch = 0
    train_iter = 0
    # Placeholder that ends up in checkpoints written before the first evaluation has run.
    loss_eval = 1000

    checkpoint = None
    if checkpoint_fn is not None:
        # NOTE(security): torch.load unpickles the file, which can execute arbitrary code.
        # Only load checkpoints from sources you trust.
        checkpoint = torch.load(checkpoint_fn, map_location=device)
        # Step back by one so the epoch in which the checkpoint was written is run again.
        # Kept as an int so the epoch counter (and the value stored in later checkpoints)
        # does not silently turn into a float.
        epoch = int(checkpoint['epoch']) - 1
        train_iter = checkpoint['train_iter']

    model = base_model(transformer_model=dbm, device=device)

    if checkpoint is not None:
        model.load_state_dict(checkpoint['model_state_dict'])

    logging.info(f"Using device: {device}")

    model.to(device)
    model.train()

    # NOTE(review): transformers.AdamW is deprecated in recent transformers releases in favour of
    # torch.optim.AdamW. It is not swapped here because the default weight decay differs between
    # the two, which would change training behaviour - confirm before migrating.
    optimizer = AdamW(model.parameters(), lr=5e-5)
    if checkpoint is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    while epoch < n_epoch:
        epoch += 1

        for train_data in train_dl:
            train_iter += 1
            optimizer.zero_grad()
            model_out = model(train_data)
            loss = model.compute_loss(*model_out)
            loss.backward()
            optimizer.step()

            # A frequency of 0 disables the corresponding action instead of raising ZeroDivisionError.
            if log_every and train_iter % log_every == 0:
                print('Train: Epoch: %d, iter: %d, avg. loss: %.2f' % (epoch, train_iter, loss))

            if eval_every and train_iter % eval_every == 0:
                model.eval()
                try:
                    with torch.no_grad():  # Disable gradient tracking for evaluation
                        # A fresh iterator over the shuffled loader yields one random validation batch.
                        eval_data = next(iter(eval_dl))
                        eval_out = model(eval_data)
                        loss_eval = model.compute_loss(*eval_out)
                finally:
                    # Always return to training mode, even if evaluation fails.
                    model.train()
                print('\nEval: Epoch: %d, iter: %d, avg. loss: %.2f\n' % (epoch, train_iter, loss_eval))

            if save_every and train_iter % save_every == 0:
                model.save(f"{save_model_prefix}_model_{train_iter}.pt", train_iter=train_iter,
                           epoch=epoch,
                           optimizer=optimizer,
                           train_loss=loss, eval_loss=loss_eval)



In [24]:
# Dropbox links to the two model checkpoint files.
plausible_model_url = "https://www.dropbox.com/s/9mkiaxfipof8orm/model_plausible.pt?dl=1"
possible_model_url = "https://www.dropbox.com/s/fgyyokik58wsvha/model_possible.pt?dl=1"

# Folder name for model files (not referenced in the visible cells; presumably the
# local download target - TODO confirm).
model_folder = "models"

# Model name -> download URL.
mappings = dict(
    model_plausible=plausible_model_url,
    model_possible=possible_model_url,
)

In [None]:
if __name__ == '__main__':
    # Train the QA model with the plausible-answers preprocessor on a 0.010 fraction
    # of both the training and the validation data, forcing CPU execution.
    # Checkpoints are written with the relative prefix "plausible".
    train_model(
        preprocessor=SquadPlausibleAnswersPreprocessor,
        base_model=QAModel,
        frac_train_data=0.010,
        frac_val_data=0.010,
        force_cpu=True,
        save_model_prefix="plausible",
    )


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Train: Epoch: 1, iter: 1, avg. loss: 6.24
Train: Epoch: 1, iter: 2, avg. loss: 6.03
Train: Epoch: 1, iter: 3, avg. loss: 6.14
Train: Epoch: 1, iter: 4, avg. loss: 5.94
Train: Epoch: 1, iter: 5, avg. loss: 5.92
Train: Epoch: 1, iter: 6, avg. loss: 5.86
Train: Epoch: 1, iter: 7, avg. loss: 5.84
Train: Epoch: 1, iter: 8, avg. loss: 5.67
Train: Epoch: 1, iter: 9, avg. loss: 5.72
Train: Epoch: 1, iter: 10, avg. loss: 5.58

Eval: Epoch: 1, iter: 10, avg. loss: 5.11

Train: Epoch: 1, iter: 11, avg. loss: 5.23
Train: Epoch: 1, iter: 12, avg. loss: 5.07
Train: Epoch: 1, iter: 13, avg. loss: 5.14
Train: Epoch: 1, iter: 14, avg. loss: 4.86
Train: Epoch: 1, iter: 15, avg. loss: 4.60
Train: Epoch: 1, iter: 16, avg. loss: 4.98
Train: Epoch: 1, iter: 17, avg. loss: 4.88
Train: Epoch: 1, iter: 18, avg. loss: 4.37
Train: Epoch: 1, iter: 19, avg. loss: 4.42
Train: Epoch: 1, iter: 20, avg. loss: 4.75

Eval: Epoch: 1, iter: 20, avg. loss: 4.60

Train: Epoch: 1, iter: 21, avg. loss: 4.34
Train: Epoch: 1, i