In [1]:
from data import *
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
from ssd import build_ssd
import os
import os.path as osp
import sys
import time

import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import functional as F
import torchvision as tv
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torch.nn.init as init
import torch.utils.data as data
import torchvision.datasets

import numpy as np
from PIL import Image
import argparse
from matplotlib import pyplot as plt
import pandas as pd
import cv2 
import pickle as pkl
import random
import tarfile
import collections
import math
import datetime

In [2]:
if sys.version_info[0] == 2:
    import xml.etree.cElementTree as ET
else:
    import xml.etree.ElementTree as ET

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
def adjust_learning_rate(optimizer, gamma, step):
    """Sets the learning rate to the initial LR decayed by 10 at every
        specified step
    # Adapted from PyTorch Imagenet example:
    # https://github.com/pytorch/examples/blob/master/imagenet/main.py
    """
    lr = lr * (gamma ** (step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def xavier(param):
    init.xavier_uniform_(param)


def weights_init(m):
    if isinstance(m, nn.Conv2d):
        xavier(m.weight.data)
        m.bias.data.zero_()


def create_vis_plot(_xlabel, _ylabel, _title, _legend):
    return viz.line(
        X=torch.zeros((1,)).cpu(),
        Y=torch.zeros((1, 3)).cpu(),
        opts=dict(
            xlabel=_xlabel,
            ylabel=_ylabel,
            title=_title,
            legend=_legend
        )
    )


def update_vis_plot(iteration, loc, conf, window1, window2, update_type,
                    epoch_size=1):
    viz.line(
        X=torch.ones((1, 3)).cpu() * iteration,
        Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size,
        win=window1,
        update=update_type
    )
    # initialize epoch plot on first iteration
    if iteration == 0:
        viz.line(
            X=torch.zeros((1, 3)).cpu(),
            Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(),
            win=window2,
            update=True
        )

In [5]:
def train(pick="", resume=None, start_iter=0):
    
    print("Pickle File: "+ str(pick))
    
    cfg = voc
    dataset = VOCDetection(root=dataset_root, image_sets=[('2012', 'train')],
                            transform=SSDAugmentation(cfg['min_dim'],
                             MEANS))

    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net
    
    if cuda:
        net = torch.nn.DataParallel(ssd_net)
        cudnn.benchmark = True
        
    if resume:
        print('Resuming training, loading {}...'.format(resume))
        ssd_net.load_weights(resume)
    else:
        vgg_weights = torch.load(basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)

    if cuda:
         net = net.to(device)

    if not resume:
        print('Initializing weights...')
        # initialize newly added layers' weights with xavier method
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    optimizer = optim.SGD(net.parameters(), lr, momentum,weight_decay)
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, cuda)

    net.train()
    name = 'train'
    
    # loss counters
    loc_loss = 0
    conf_loss = 0
    epoch = 0
    print('Loading the dataset...')

    epoch_size = len(dataset) // batch_size
    print('Training SSD on: ',name)
#    print('Using the specified args:')
#     print(args)

    step_index = 0

    data_loader = data.DataLoader(dataset, batch_size=32, num_workers=4, shuffle=True,\
                               collate_fn=detection_collate,pin_memory=True)
    
    print("Number of images in the training set = " + str(len(dataset)))
    print("Number of images in a mini-batch = "+str(batch_size))
    print("Number of mini-batches = " + str(len(data_loader)))
    
    
     # create batch iterator
    batch_iterator = iter(data_loader)
    print("STARTING - ITERATIONS")
    
    # Stats for pickle
    l_loss = []
    c_loss = []
    itr = []
    
    for iteration in range(start_iter, 500):
        
        if visdom and iteration != 0 and (iteration % epoch_size == 0):
            update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None,
                             'append', epoch_size)
            # reset epoch loss counters
            loc_loss = 0
            conf_loss = 0
            epoch += 1

        if iteration in cfg['lr_steps']:
            step_index += 1
            adjust_learning_rate(optimizer, gamma, step_index)

            
        ## load train data
        #images, targets = next(batch_iterator)
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(data_loader)
            images, targets = next(batch_iterator)


        
        if cuda:
            images = Variable(images.cuda())
            targets = [Variable(ann.cuda(), volatile=True) for ann in targets]
        else:
            images = Variable(images)
            targets = [Variable(ann, volatile=True) for ann in targets]
        
        # forward
        t0 = time.time()
        out = net(images)

        # backprop
        optimizer.zero_grad()

        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c

        loss.backward()

        optimizer.step()

        t1 = time.time()
        loc_loss += loss_l.data.item()
        conf_loss += loss_c.data.item()

        l_loss.append(loss_l.data.item())
        c_loss.append(loss_c.data.item())
        itr.append(iteration)
        
        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.data.item()), end=' ')
            currentDT = datetime.datetime.now()
            print (currentDT.strftime("%H:%M:%S %p"))
            print("\n")
            
        if iteration != 0 and iteration % 10 == 0:
            print('Saving state, iter:', iteration)
            iter_name = math.ceil(iteration/100)*100
            torch.save(ssd_net.state_dict(), 'weights/ssd300_' + str(pick) + '_' +repr(iter_name) + '.pth')
            with open('stats_'+str(pick)+'.pkl','wb') as f:
                pkl.dump([l_loss, c_loss, itr], f)
                

    torch.save(ssd_net.state_dict(),
               save_folder + data_set + '.pth')

In [6]:
data_set = 'VOC'
dataset_root = '//datasets/ee285f-public/PascalVOC2012/'
basenet = 'weights/vgg16_reducedfc.pth'
batch_size = 32
resume = None
start_iter = 0
num_workers = 4
cuda = True

learning_rate = lr = 1e-5
momentum = 0.9
weight_decay = 5e-4
gamma = 0.1

visdom = False
save_folder = 'trained_weights/'
if not os.path.exists(save_folder):
    os.mkdir(save_folder)

# resume = 'ssd300_mAP_77.43_v2.pth'

In [7]:
for lr in [1e-2]:
    train(pick=str(lr))

Pickle File: 0.01
Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS




timer: 59.0077 sec.
iter 0 || Loss: 25.4825 || 03:35:31 AM


timer: 1.9066 sec.
iter 10 || Loss: nan || 03:36:41 AM


Saving state, iter: 10
timer: 2.3963 sec.
iter 20 || Loss: nan || 03:39:27 AM


Saving state, iter: 20
timer: 2.2903 sec.
iter 30 || Loss: nan || 03:41:45 AM


Saving state, iter: 30
timer: 3.0067 sec.
iter 40 || Loss: nan || 03:44:31 AM


Saving state, iter: 40
timer: 4.6964 sec.
iter 50 || Loss: nan || 03:46:48 AM


Saving state, iter: 50
timer: 1.7022 sec.
iter 60 || Loss: nan || 03:49:28 AM


Saving state, iter: 60
timer: 2.5008 sec.
iter 70 || Loss: nan || 03:51:52 AM


Saving state, iter: 70
timer: 2.3073 sec.
iter 80 || Loss: nan || 03:54:31 AM


Saving state, iter: 80
timer: 3.0099 sec.
iter 90 || Loss: nan || 03:56:53 AM


Saving state, iter: 90
timer: 2.1964 sec.
iter 100 || Loss: nan || 03:59:33 AM


Saving state, iter: 100
timer: 2.0050 sec.
iter 110 || Loss: nan || 04:01:47 AM


Saving state, iter: 110
timer: 4.2930 sec.
iter 120 || Loss: nan || 04:04:40 AM

KeyboardInterrupt: 

In [8]:
for lr in [1e-1, 1e-4]:
    train(pick=str(lr))

Pickle File: 0.1
Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS




timer: 5.3077 sec.
iter 0 || Loss: 26.4228 || 04:31:42 AM


timer: 2.2957 sec.
iter 10 || Loss: nan || 04:33:49 AM


Saving state, iter: 10
timer: 2.2003 sec.
iter 20 || Loss: nan || 04:36:49 AM


Saving state, iter: 20
timer: 2.8018 sec.
iter 30 || Loss: nan || 04:38:59 AM


Saving state, iter: 30
timer: 2.7966 sec.
iter 40 || Loss: nan || 04:42:02 AM


Saving state, iter: 40
timer: 2.9045 sec.
iter 50 || Loss: nan || 04:44:11 AM


Saving state, iter: 50
timer: 3.6124 sec.
iter 60 || Loss: nan || 04:47:19 AM


Saving state, iter: 60
timer: 2.1992 sec.
iter 70 || Loss: nan || 04:49:23 AM


Saving state, iter: 70
timer: 3.4068 sec.
iter 80 || Loss: nan || 04:52:20 AM


Saving state, iter: 80
timer: 2.4023 sec.
iter 90 || Loss: nan || 04:54:32 AM


Saving state, iter: 90
timer: 2.6002 sec.
iter 100 || Loss: nan || 04:57:28 AM


Saving state, iter: 100
timer: 3.7998 sec.
iter 110 || Loss: nan || 04:59:31 AM


Saving state, iter: 110
timer: 2.3064 sec.
iter 120 || Loss: nan || 05:02:24 AM


RuntimeError: CUDA out of memory. Tried to allocate 703.12 MiB (GPU 0; 10.92 GiB total capacity; 7.01 GiB already allocated; 619.50 MiB free; 2.73 GiB cached)

In [None]:
for lr in [1e-4]:
    train(pick=str(lr))

Pickle File: 0.0001
Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS




timer: 56.4220 sec.
iter 0 || Loss: 26.0719 || 08:14:38 AM


timer: 3.1056 sec.
iter 10 || Loss: 21.0108 || 08:15:50 AM


Saving state, iter: 10
timer: 3.6074 sec.
iter 20 || Loss: 15.6640 || 08:18:40 AM


Saving state, iter: 20
timer: 3.6040 sec.
iter 30 || Loss: 15.6045 || 08:21:08 AM


Saving state, iter: 30
timer: 1.6076 sec.
iter 40 || Loss: 15.2274 || 08:23:55 AM


Saving state, iter: 40
timer: 2.1958 sec.
iter 50 || Loss: 15.0279 || 08:26:23 AM


Saving state, iter: 50
timer: 1.8016 sec.
iter 60 || Loss: 15.0481 || 08:29:06 AM


Saving state, iter: 60
timer: 1.8047 sec.
iter 70 || Loss: 14.5396 || 08:31:54 AM


Saving state, iter: 70
timer: 2.0944 sec.
iter 80 || Loss: 14.8459 || 08:34:35 AM


Saving state, iter: 80
timer: 1.9997 sec.
iter 90 || Loss: 13.8513 || 08:37:35 AM


Saving state, iter: 90
timer: 2.2004 sec.
iter 100 || Loss: 13.9908 || 08:40:07 AM


Saving state, iter: 100
timer: 2.1047 sec.
iter 110 || Loss: 12.9512 || 08:43:01 AM


Saving state, iter: 110
timer: 1.70