In [13]:
import os
import time

import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data

from datasets import PascalVOCDataset
from model import SSD300, MultiBoxLoss
from utils import *

In [14]:
# --- Data settings ---
data_folder = './'  # directory that holds the data files
keep_difficult = True  # whether objects considered difficult to detect are used

# --- Model settings ---
# Only a couple of knobs: the SSD300 has a very specific structure.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
n_classes = len(label_map)  # number of object types, taken from the size of label_map

In [15]:
# Learning parameters
def find_checkpoints():
    """
    List the '.tar' files in the current working directory, newest first.

    ``os.listdir`` returns entries in arbitrary order, so the result is sorted
    by modification time (most recent first). That makes ``checkpoint[0]`` a
    deterministic choice when several files match.

    NOTE: every file ending in '.tar' matches, not only model checkpoints;
    keep unrelated archives out of the working directory.

    :return: list of file names (not full paths); empty list if nothing matches
    """
    tar_files = [file for file in os.listdir() if file.endswith('.tar')]
    return sorted(tar_files, key=os.path.getmtime, reverse=True)


checkpoint = find_checkpoints()  # candidate checkpoint files, newest first; empty list if none
batch_size = 8  # batch size
workers = 2  # number of workers for loading data in the DataLoader
print_freq = 200  # print training status every __ batches
lr = 1e-4  # learning rate
interval = 10  # how many epochs to run at a time
weight_decay = 1e-4  # weight decay (L2 penalty)

cudnn.benchmark = True  # turn on cuDNN benchmark mode

In [17]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    One epoch's training.

    For every batch yielded by ``train_loader`` this runs a forward pass,
    computes the loss, back-propagates and takes one optimizer step. Running
    timing and loss statistics are printed every ``print_freq`` batches, and a
    summary is printed after the last batch. If ``train_loader`` yields no
    batches, the function only switches the model to training mode and returns.

    Reads the notebook-level names ``device``, ``print_freq`` and
    ``AverageMeter``.

    :param train_loader: DataLoader for training data; each batch is unpacked
                         as ``(images, boxes, labels, _)``
    :param model: model
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number
    """
    model.train()  # switch the model to training mode

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss

    n_batches = len(train_loader)  # constant for the epoch, so computed once
    start = time.time()

    # Batches
    for i, (images, boxes, labels, _) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to default device
        images = images.to(device)
        boxes = [b.to(device) for b in boxes]
        labels = [l.to(device) for l in labels]

        # Forward prop.
        predicted_locs, predicted_scores = model(images)

        # Loss
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)

        # Backward prop. and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the running loss average by the number of images in the batch
        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - start)

        # Restart the clock so the next data_time sample covers only data loading
        start = time.time()

        # Print status
        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(epoch, i, n_batches,
                                                                  batch_time=batch_time,
                                                                  data_time=data_time, loss=losses))

        # End-of-epoch summary (note: reported as epoch + 1, unlike the status line above)
        if i == n_batches - 1:
            print("*************************************************")
            print('Epoch: {num}\t'
                  'Loss: {loss.avg:.4f}\t'.format(num=epoch+1,loss=losses))
            print("************************************************")

    # No explicit `del` of the per-batch locals: they are released when the
    # function returns, and deleting them raised UnboundLocalError whenever the
    # loader produced no batches.


In [18]:

"""
Training.
"""
# Initialize model or load checkpoint
if not checkpoint:
    # No checkpoint file was found: build the data lists and start from scratch
    create_data_lists(voc07_path='VOC2007', voc12_path='VOC2012', output_folder='./')

    start_epoch = 0
    model = SSD300(n_classes=n_classes)

    # Hand the optimizer only the parameters that require gradients
    param_list = [param for param in model.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(params=param_list, lr=lr, weight_decay=weight_decay)

else:
    # `checkpoint` is rebound here from the list of file names to the loaded
    # dict, so the cell that defines `checkpoint` must be re-run before this
    # cell is executed a second time.
    #
    # map_location=device remaps the stored tensors onto the device selected
    # for this session, so a checkpoint saved on a GPU can still be resumed on
    # a CPU-only host.
    #
    # SECURITY: torch.load unpickles arbitrary Python objects (here, a whole
    # model and optimizer). Only load checkpoint files you trust.
    # NOTE(review): on PyTorch versions where torch.load defaults to
    # weights_only=True, loading these pickled objects needs
    # weights_only=False; confirm against the installed version.
    checkpoint = torch.load(checkpoint[0], map_location=device)
    start_epoch = checkpoint['epoch'] + 1
    print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
    model = checkpoint['model']
    optimizer = checkpoint['optimizer']

# Train `interval` epochs per run, whether starting fresh or resuming
epochs = start_epoch + interval

# Move to default device
model = model.to(device)
criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

# Custom dataloaders
train_dataset = PascalVOCDataset(data_folder,
                                 split='train',
                                 keep_difficult=keep_difficult)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                           collate_fn=train_dataset.collate_fn, num_workers=workers,
                                           pin_memory=True)  # note that we're passing the collate function here

# Epochs
for epoch in range(start_epoch, epochs):

    # One epoch's training
    train(train_loader=train_loader,
          model=model,
          criterion=criterion,
          optimizer=optimizer,
          epoch=epoch)

    # Save checkpoint after every epoch so training can be resumed
    save_checkpoint(epoch, model, optimizer)


Loaded checkpoint from epoch 90.

Epoch: [90][0/2069]	Batch Time 1.824 (1.824)	Data Time 1.474 (1.474)	Loss 1.4282 (1.4282)	
Epoch: [90][200/2069]	Batch Time 0.365 (0.368)	Data Time 0.001 (0.008)	Loss 1.9816 (2.3876)	
Epoch: [90][400/2069]	Batch Time 0.368 (0.368)	Data Time 0.001 (0.004)	Loss 2.1730 (2.4216)	
Epoch: [90][600/2069]	Batch Time 0.377 (0.369)	Data Time 0.001 (0.003)	Loss 2.2886 (2.4271)	
Epoch: [90][800/2069]	Batch Time 0.375 (0.371)	Data Time 0.000 (0.002)	Loss 1.6009 (2.4391)	
Epoch: [90][1000/2069]	Batch Time 0.379 (0.372)	Data Time 0.001 (0.002)	Loss 2.1774 (2.4392)	
Epoch: [90][1200/2069]	Batch Time 0.377 (0.373)	Data Time 0.001 (0.002)	Loss 2.6392 (2.4256)	
Epoch: [90][1400/2069]	Batch Time 0.381 (0.375)	Data Time 0.001 (0.002)	Loss 2.4821 (2.4262)	
Epoch: [90][1600/2069]	Batch Time 0.389 (0.376)	Data Time 0.001 (0.001)	Loss 2.4958 (2.4336)	
Epoch: [90][1800/2069]	Batch Time 0.396 (0.377)	Data Time 0.001 (0.001)	Loss 2.6324 (2.4311)	
Epoch: [90][2000/2069]	Batch Tim