In [2]:
import sys
sys.path.append("..")

In [28]:
# Diagnostic: snapshot of GPU utilisation and memory via the NVIDIA CLI.
# NOTE(review): the execution count (In [28]) is higher than that of the
# training cell (In [27]), so this reflects GPU state after the failed run,
# not the state at notebook start.
!nvidia-smi

Fri Aug 27 08:50:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3080    Off  | 00000000:01:00.0 Off |                  N/A |
| 30%   31C    P8    11W / 320W |   9689MiB / 10015MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import argparse
import os
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from transformers import BertLMHeadModel, AdamW
from stats import AverageMeterSet, StatTracker
from bert_custom_dataset import CHILDESDataset
from utils import save_pkl

In [4]:
# Notebook configuration: data locations, output location and hyperparameters.
train_path='../../../Data/model_datasets/eng/train.txt'
val_path='../../../Data/model_datasets/eng/validation.txt'
result_dir='./'
experiment_name='test_finetune'
# Learning rate must be numeric: the optimizer compares and multiplies it,
# so the previous string value '5e-5' could not be passed through as-is.
lr=5e-5
# NOTE(review): the DataLoader cell further down hard-codes batch_size=6 and the
# training call hard-codes n_epochs=1 / lr=5e-5, so these values are currently
# not what the recorded run used -- confirm which ones are intended.
batch_size=1
n_epochs=5
seed=0
patience=5

In [5]:
def test_finetuned_model(model, dataloader, device, stat_tracker, epoch=1, prefix='test'):
    """Evaluate ``model`` on ``dataloader`` and record its mean batch accuracy.

    For every batch the model is run in eval mode, the arg-max token at each
    position is compared with ``batch['labels']`` at the same position, and the
    fraction of matches per sequence is averaged over the batch. The per-batch
    values are accumulated in an ``AverageMeterSet`` and written to
    ``stat_tracker`` once the whole loader has been consumed.

    Args:
        model: Model called as ``model(**batch)``; its output must expose ``.logits``.
        dataloader: Iterable of dict-like batches of tensors containing a ``'labels'`` entry.
        device: Device every tensor of the batch is moved to before the forward pass.
        stat_tracker: Object whose ``record_stats`` receives the averaged statistics.
        epoch: Epoch index forwarded to ``AverageMeterSet.averages``.
        prefix: Prefix forwarded to ``AverageMeterSet.averages`` (e.g. ``'val'``).

    Returns:
        The ``'accuracy'`` entry of the accumulated averages.
    """
    model.eval()
    test_stats = AverageMeterSet()
    # Evaluation never back-propagates. Without no_grad() autograd keeps every
    # intermediate activation (including the vocabulary-sized logits) alive,
    # which needlessly inflates GPU memory use.
    with torch.no_grad():
        for batch in dataloader:
            for key in batch:
                batch[key] = batch[key].to(device)
            labels = batch['labels']
            outputs = model(**batch)
            # argmax over raw logits selects the same token as argmax over
            # log-softmax (monotonic transform), so the softmax is skipped.
            predicted_ids = torch.argmax(outputs.logits, dim=-1)
            n_matches = torch.eq(predicted_ids, labels).int()
            # NOTE(review): every position counts towards the denominator, so any
            # padded / ignored label positions are scored as misses, and position i
            # is compared with label i (no one-token shift) -- confirm this is the
            # intended metric for a causal LM head.
            max_sequence_len = n_matches.size(1)
            avg_sequence_accs = torch.sum(n_matches, 1) / max_sequence_len
            batch_accuracy = float((torch.sum(avg_sequence_accs, 0) / avg_sequence_accs.size(0)).item())
            test_stats.update('accuracy', batch_accuracy, n=1)
    stat_tracker.record_stats(test_stats.averages(epoch, prefix=prefix))

    return test_stats.avgs['accuracy']

In [6]:
def finetune_model(model, train_dataloader, val_dataloader, device, stat_tracker, n_epochs=5, lr=5e-5, log_every=1000):
    """Fine-tune ``model`` with AdamW and validate after every epoch.

    Each training step moves the batch to ``device``, runs the forward pass,
    back-propagates the loss returned by the model (``outputs[0]``) and updates
    the weights. Per-batch accuracy and loss are accumulated in an
    ``AverageMeterSet``; after each epoch the validation accuracy is computed
    with ``test_finetuned_model`` and both sets of statistics are recorded on
    ``stat_tracker`` and printed.

    Args:
        model: Model called as ``model(**batch)``; its output must expose
            ``.logits`` and carry the loss as its first element.
        train_dataloader: Iterable of dict-like training batches with a ``'labels'`` entry.
        val_dataloader: Loader passed to ``test_finetuned_model`` after each epoch.
        device: Device every batch tensor is moved to.
        stat_tracker: Object whose ``record_stats`` receives the epoch statistics.
        n_epochs: Number of passes over ``train_dataloader``.
        lr: Learning rate; numeric strings such as ``'5e-5'`` are accepted and
            converted to ``float``.
        log_every: Print the step index every ``log_every`` steps (step 0
            included); ``0`` or ``None`` disables step logging.

    Returns:
        The same ``model`` object, updated in place.
    """
    # float() lets callers hand over the learning rate as parsed text.
    optimizer = AdamW(model.parameters(), lr=float(lr))
    for epoch in range(n_epochs):
        model.train()
        epoch_stats = AverageMeterSet()
        for step,batch in enumerate(train_dataloader):
            # Printing every single step floods the notebook output on large
            # loaders, so progress is only reported periodically.
            if log_every and step % log_every == 0:
                print(step)
            optimizer.zero_grad()
            for key in batch:
                batch[key] = batch[key].to(device)
            outputs = model(**batch)
            ##get train accuracy
            # The metric is computed on detached logits under no_grad so it adds
            # nothing to the autograd graph; argmax over raw logits equals argmax
            # over log-softmax, so no vocabulary-sized softmax tensor is created.
            with torch.no_grad():
                predicted_ids = torch.argmax(outputs.logits.detach(), dim=-1)
                n_matches = torch.eq(predicted_ids, batch['labels']).int()
                max_sequence_len = n_matches.size(1)
                avg_sequence_accs = torch.sum(n_matches, 1) / max_sequence_len
                batch_accuracy = float((torch.sum(avg_sequence_accs, 0) / avg_sequence_accs.size(0)).item())
            epoch_stats.update('accuracy', batch_accuracy, n=1)
            ## get loss
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            epoch_stats.update('loss', float(loss), n=1)
        # Validation switches the model to eval mode; model.train() at the top
        # of the loop restores training mode for the next epoch.
        val_accuracy = test_finetuned_model(model, val_dataloader, device, stat_tracker, epoch, prefix="val")
        stat_tracker.record_stats(epoch_stats.averages(epoch, prefix="train"))
        print(str(epoch))
        print("train acc: "+ str(epoch_stats.avgs['accuracy']))
        print("val acc: "+ str(val_accuracy))
        print("loss :"+ str(epoch_stats.avgs['loss']))

    return model

In [7]:
# Create the output directory for this experiment (a no-op if it already exists).
experiment_dir = os.path.join(result_dir, experiment_name)
os.makedirs(experiment_dir, exist_ok=True)
# Hyperparameter dump is disabled here: there is no `args` namespace in the
# notebook, so nothing is written to hyperparameters.pkl by this cell.
#save_pkl(experiment_dir, vars(args), "hyperparameters.pkl")
# Seed PyTorch's RNG; as the last expression, the returned Generator is shown as the cell output.
torch.manual_seed(seed)

<torch._C.Generator at 0x7fb00c0522d0>

In [8]:
# Use the GPU when one is available and fall back to CPU otherwise, matching
# the device selection used by the script-style cell at the end of the notebook.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
torch.cuda.empty_cache()
# Load the pretrained multilingual BERT checkpoint with a language-modelling
# head, configured as a decoder, and move it to the selected device.
model = BertLMHeadModel.from_pretrained("bert-base-multilingual-uncased", return_dict=True, is_decoder = True)
model = model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
# Diagnostic: snapshot of GPU utilisation and memory via the NVIDIA CLI.
# NOTE(review): executed as In [26], i.e. after most other cells had already
# run, so the reported memory includes whatever this kernel had allocated by then.
!nvidia-smi

Fri Aug 27 08:49:45 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3080    Off  | 00000000:01:00.0 Off |                  N/A |
| 30%   34C    P8    10W / 320W |   7237MiB / 10015MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [11]:
# Build the training and validation datasets from the configured text files.
# Original note: sequences use a max length of 100 (can be reduced if needed).
# NOTE(review): no length is passed here, so the limit is presumably a
# CHILDESDataset default -- confirm in bert_custom_dataset.
train_dataset = CHILDESDataset(train_path)
val_dataset = CHILDESDataset(val_path)

In [25]:
# Ask PyTorch to release its cached, currently unused GPU memory.
# NOTE(review): presumably added while debugging the out-of-memory error of the
# training cell; it is not expected to free memory still held by live tensors
# or by other processes -- confirm against the PyTorch documentation.
torch.cuda.empty_cache()

In [21]:
# Training batches are reshuffled every epoch; validation keeps the dataset
# order because shuffling is not needed for evaluation and only makes the
# batch composition (and thus the batch-averaged accuracy) vary between runs.
# NOTE(review): the batch size is hard-coded to 6 here while the config cell
# defines batch_size=1 -- confirm which value is intended.
train_dl = DataLoader(train_dataset, batch_size=6, shuffle=True)
val_dl = DataLoader(val_dataset, batch_size=6, shuffle=False)

In [13]:
# Statistics recorder for this run; it logs under <experiment_dir>/tensorboard-log
# (the constructor echoes the resolved log_dir, see the cell output).
stat_tracker = StatTracker(log_dir=os.path.join(experiment_dir,"tensorboard-log"))

log_dir: ./test_finetune/tensorboard-log


In [22]:
# Sanity check: length of the training loader (number of batches per epoch at the configured batch size).
len(train_dl)

293268

In [23]:
# Sanity check: length of the validation loader (number of batches per evaluation pass).
len(val_dl)

73263

In [27]:
# Run one fine-tuning epoch; `model` is rebound to the object returned by finetune_model.
# NOTE(review): n_epochs and lr are hard-coded here rather than taken from the
# config cell. The recorded run stopped at step 0 with a CUDA out-of-memory
# error (see the output below).
model = finetune_model(model, train_dl, val_dl, device, stat_tracker, n_epochs=1, lr=5e-5)

0


RuntimeError: CUDA out of memory. Tried to allocate 486.00 MiB (GPU 0; 9.78 GiB total capacity; 7.82 GiB already allocated; 325.88 MiB free; 7.96 GiB reserved in total by PyTorch)

In [None]:
    # Script-style version of the experiment driven by command-line arguments.
    # NOTE(review): this cell is indented like the body of a function and calls
    # get_args(), which is not defined anywhere in the visible notebook; it was
    # never executed (In [None]) -- confirm whether it belongs in a main() of a
    # separate script.
    args = get_args()
    experiment_dir = os.path.join(args.result_dir, args.experiment_name)
    os.makedirs(experiment_dir, exist_ok=True)
    # Persist the parsed arguments next to the results for reproducibility.
    save_pkl(experiment_dir, vars(args), "hyperparameters.pkl")
    torch.manual_seed(args.seed)

    # Prefer the GPU, fall back to CPU when CUDA is unavailable.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    torch.cuda.empty_cache()
    model = BertLMHeadModel.from_pretrained("bert-base-multilingual-uncased", return_dict=True, is_decoder = True)
    model = model.to(device)

    train_dataset = CHILDESDataset(args.train_path)
    train_dl = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_dataset = CHILDESDataset(args.val_path)
    # Validation keeps the dataset order: shuffling is not needed for evaluation
    # and only makes batch composition vary between runs.
    val_dl = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)

    stat_tracker = StatTracker(log_dir=os.path.join(experiment_dir,"tensorboard-log"))

    model = finetune_model(model, train_dl, val_dl, device, stat_tracker, args.n_epochs, args.lr)

    # NOTE(review): this stores the whole model object rather than just its
    # weights; loading it back presumably requires the same class definitions
    # to be importable -- confirm this is the intended checkpoint format.
    torch.save(model, os.path.join(experiment_dir,"model.pt"))