In [1]:
from data import *
from utils.augmentations import SSDAugmentation
from layers.modules import MultiBoxLoss
from ssd import build_ssd

import os
import os.path as osp
import sys
import time

import torch
import torch.nn as nn
from torch.nn import functional as F
import torchvision as tv
import torch.optim as optim

import torch.utils.data as data
import torchvision.datasets

import numpy as np
from PIL import Image
import argparse

from matplotlib import pyplot as plt
import pandas as pd
import cv2 
import pickle as pkl
import random
import tarfile
import collections
import math
import datetime
import pickle

from opts import *
import xml.etree.ElementTree as ET

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
def weights_init(m):
    """Initialise a convolution layer in place; meant for ``Module.apply``.

    For ``nn.Conv2d`` modules the weight is filled with Xavier-uniform values
    and the bias, when the layer has one, is zeroed. Every other module type
    is left untouched.

    Args:
        m: any ``nn.Module``; only ``nn.Conv2d`` instances are modified.
    """
    if isinstance(m, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight.data)
        # Conv2d(..., bias=False) stores bias as None, so guard before zeroing.
        if m.bias is not None:
            m.bias.data.zero_()

In [13]:
def train(device, resume=None, momentum=0.9, max_iter=150):
    """Train the SSD network on the VOC 2012 ``train`` split with SGD.

    Relies on module-level settings brought in by the notebook's star
    imports (``voc``, ``MEANS``, ``dataset_root``, ``basenet``, ``lr``,
    ``weight_decay``, ``gamma``, ``batch_size``, ``data_set``).

    Args:
        device: target device, e.g. ``'cuda'`` or ``'cpu'``. A falsy value
            leaves the model and the batches where they are.
        resume: path of a checkpoint to continue from. When falsy, the VGG
            base weights are loaded from ``basenet`` and the extra,
            localisation and confidence layers are Xavier-initialised.
        momentum: SGD momentum; also embedded in the output file names.
        max_iter: number of mini-batch iterations to run.

    Side effects:
        * Every 10 iterations (except iteration 0) writes
          ``weights/ssd_VOC_<momentum>_<N>.pth`` where ``N`` is the iteration
          rounded *up* to a multiple of 100, so checkpoints inside the same
          hundred overwrite each other, and pickles
          ``[loc_losses, conf_losses, iterations]`` to
          ``stats_momentum_<momentum>.pkl``.
        * After the loop writes the final statistics and
          ``weights/<data_set>_momentum_<momentum>.pth``.

    Returns:
        None.
    """
    # initialize config
    cfg = voc
    dataset = VOCDetection(root=dataset_root, image_sets=[('2012', 'train')],
                           transform=SSDAugmentation(cfg['min_dim'], MEANS))
    ssd_net = build_ssd('train', cfg['min_dim'], cfg['num_classes'])
    net = ssd_net

    # `device` is a non-empty string for both 'cuda' and 'cpu', so its
    # truthiness cannot be used to decide whether a GPU is in play.
    use_cuda = bool(device) and torch.device(device).type == 'cuda'

    if use_cuda:
        # DataParallel requires the wrapped module to live on a CUDA device,
        # so it is only applied for GPU runs.
        net = torch.nn.DataParallel(ssd_net)
        torch.backends.cudnn.benchmark = True
    if device:
        net = net.to(device)

    if resume:
        print('Resuming training, loading previous training at ',resume)
        ssd_net.load_weights(resume)
    else:
        vgg_weights = torch.load(basenet)
        print('Loading base network...')
        ssd_net.vgg.load_state_dict(vgg_weights)
        print('Initializing weights...')
        ssd_net.extras.apply(weights_init)
        ssd_net.loc.apply(weights_init)
        ssd_net.conf.apply(weights_init)

    # Keyword arguments are essential here: the fourth positional parameter
    # of optim.SGD is `dampening`, not `weight_decay`.
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum,
                          weight_decay=weight_decay)

    # Positional arguments kept exactly as before; the last one was
    # torch.cuda.is_available() and is presumably the loss's "use GPU" flag
    # (TODO confirm against layers.modules.MultiBoxLoss), so it now follows
    # the device actually used for training.
    criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, use_cuda)

    net.train()
    name = 'train'

    print('Loading the dataset...')
    print('Training SSD on: ',name)

    step_index = 0

    # Use the configured batch size so the loader matches what is reported.
    train_data_loader = data.DataLoader(dataset, batch_size=batch_size,
                                        num_workers=4, shuffle=True,
                                        collate_fn=detection_collate,
                                        pin_memory=True)

    print("Number of images in the training set = " + str(len(dataset)))
    print("Number of images in a mini-batch = "+str(batch_size))
    print("Number of mini-batches = " + str(len(train_data_loader)))

    # create batch iterator
    batch_iterator = iter(train_data_loader)
    print("STARTING - ITERATIONS")

    # Stats for pickle and plotting
    l_loss = []
    c_loss = []
    itr = []

    # torch.save does not create missing directories.
    os.makedirs('weights', exist_ok=True)
    stats_path = 'stats_momentum_' + str(momentum) + '.pkl'

    for iteration in range(0, max_iter):

        # Step-wise learning-rate decay at the iterations listed in the config.
        if iteration in cfg['lr_steps']:
            step_index += 1
            lr_dec = lr * (gamma ** (step_index))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr_dec

        # load train data, restarting the loader once it is exhausted
        try:
            images, targets = next(batch_iterator)
        except StopIteration:
            batch_iterator = iter(train_data_loader)
            images, targets = next(batch_iterator)

        if device:
            images = images.to(device)
            targets = [ann.to(device) for ann in targets]

        # forward
        t0 = time.time()
        out = net(images)

        # backprop
        optimizer.zero_grad()
        loss_l, loss_c = criterion(out, targets)
        loss = loss_l + loss_c
        loss.backward()
        optimizer.step()
        t1 = time.time()

        l_loss.append(loss_l.item())
        c_loss.append(loss_c.item())
        itr.append(iteration)

        if iteration % 10 == 0:
            print('timer: %.4f sec.' % (t1 - t0))
            print('iter ' + repr(iteration) + ' || Loss: %.4f ||' % (loss.item()), end=' ')
            currentDT = datetime.datetime.now()
            print (currentDT.strftime("%H:%M:%S %p"))
            print("\n")

        if iteration != 0 and iteration % 10 == 0:
            print('Saving state, iter:', iteration)
            # Rounded up to the next multiple of 100: a rolling checkpoint
            # per hundred iterations rather than one file per save.
            iter_name = math.ceil(iteration/100)*100
            torch.save(ssd_net.state_dict(), 'weights/ssd_VOC_' +str(momentum)+ '_' + repr(iter_name) + '.pth')
            with open(stats_path,'wb') as f:
                pkl.dump([l_loss, c_loss, itr], f)

    # Persist the statistics once more so iterations after the last periodic
    # save are not lost.
    with open(stats_path,'wb') as f:
        pkl.dump([l_loss, c_loss, itr], f)

    torch.save(ssd_net.state_dict(),
               'weights/' + data_set+ '_momentum_' + str(momentum) + '.pth')

In [10]:
!cat opts.py

#Initialize pointers
basenet = 'weights/vgg16_reducedfc.pth'
data_set = 'VOC'
dataset_root = voc_root = '//datasets/ee285f-public/PascalVOC2012/'
save_folder = 'trained_weights/'
trained_model = 'ssd_pretrained.pth'
eval_save_folder = 'eval/'
devkit_path = 'devkit_path/'
output_dir = "out/"

#Run related metaparameters

batch_size = 32
resume = None

#Optimization metaparameters
lr = 1e-3
momentum = 0.9
weight_decay = 5e-4
gamma = 0.1
    
confidence_threshold = 0.01
top_k = 5
cleanup = True

YEAR = '2012'
dataset_mean = (104, 117, 123)
set_type = 'train'


## Please run the cells below twice

In [None]:
# Momentum 0.1 experiment, continuing from a previously saved checkpoint.
momentum = 0.1
train(device, resume="weights/ssd_VOC_0.1_200.pth", momentum=momentum)

Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS
timer: 56.4053 sec.
iter 0 || Loss: 25.0151 || 06:51:25 AM


timer: 3.3949 sec.
iter 10 || Loss: 15.6071 || 06:52:36 AM


Saving state, iter: 10
timer: 2.7956 sec.
iter 20 || Loss: 15.3809 || 06:54:59 AM


Saving state, iter: 20
timer: 3.8933 sec.
iter 30 || Loss: 15.2311 || 06:57:36 AM


Saving state, iter: 30
timer: 2.4992 sec.
iter 40 || Loss: 14.7566 || 07:00:01 AM


Saving state, iter: 40
timer: 1.9987 sec.
iter 50 || Loss: 14.2782 || 07:02:39 AM


Saving state, iter: 50
timer: 2.8956 sec.
iter 60 || Loss: 13.8230 || 07:04:54 AM


Saving state, iter: 60


In [8]:
# Momentum 10 experiment; `resume` is whatever the notebook namespace holds.
momentum = 10
train(device, resume=resume, momentum=momentum)

Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS
timer: 59.9189 sec.
iter 0 || Loss: 25.7631 || 07:43:35 AM


timer: 3.8020 sec.
iter 10 || Loss: nan || 07:44:47 AM


Saving state, iter: 10
timer: 2.0916 sec.
iter 20 || Loss: nan || 07:47:01 AM


Saving state, iter: 20
timer: 3.8022 sec.
iter 30 || Loss: nan || 07:50:00 AM


Saving state, iter: 30
timer: 1.7055 sec.
iter 40 || Loss: nan || 07:52:05 AM


Saving state, iter: 40
timer: 1.4999 sec.
iter 50 || Loss: nan || 07:55:12 AM


Saving state, iter: 50
timer: 2.0949 sec.
iter 60 || Loss: nan || 07:57:25 AM


Saving state, iter: 60
timer: 1.9010 sec.
iter 70 || Loss: nan || 08:00:25 AM


Saving state, iter: 70
timer: 1.4930 sec.
iter 80 || Loss: nan || 08:02:38 AM


Saving state, iter: 80
timer: 2.2078 sec.
iter 90 || Loss: nan || 08:05:43 AM


Saving state, 

FileNotFoundError: [Errno 2] No such file or directory: 'trained_weights/VOC.pth'

In [14]:
# Momentum 0.01 experiment; `resume` is whatever the notebook namespace holds.
momentum = 0.01
train(device, resume=resume, momentum=momentum)

Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS
timer: 3.0926 sec.
iter 0 || Loss: 26.1997 || 09:56:15 AM


timer: 2.5999 sec.
iter 10 || Loss: 16.2860 || 09:58:19 AM


Saving state, iter: 10
timer: 2.5046 sec.
iter 20 || Loss: 15.6932 || 10:01:21 AM


Saving state, iter: 20
timer: 2.4899 sec.
iter 30 || Loss: 14.9477 || 10:03:35 AM


Saving state, iter: 30
timer: 3.1277 sec.
iter 40 || Loss: 14.7577 || 10:06:33 AM


Saving state, iter: 40
timer: 1.6084 sec.
iter 50 || Loss: 14.4329 || 10:08:39 AM


Saving state, iter: 50
timer: 3.9029 sec.
iter 60 || Loss: 14.3015 || 10:11:43 AM


Saving state, iter: 60
timer: 2.6942 sec.
iter 70 || Loss: 13.8160 || 10:13:48 AM


Saving state, iter: 70
timer: 3.4991 sec.
iter 80 || Loss: 13.8718 || 10:16:44 AM


Saving state, iter: 80
timer: 2.1974 sec.
iter 90 || Loss: 12.7

In [15]:
# Momentum 1.5 experiment; `resume` is whatever the notebook namespace holds.
momentum = 1.5
train(device, resume=resume, momentum=momentum)

Loading base network...
Initializing weights...
Loading the dataset...
Training SSD on:  train
Number of images in the training set = 5717
Number of images in a mini-batch = 32
Number of mini-batches = 179
STARTING - ITERATIONS
timer: 2.9991 sec.
iter 0 || Loss: 26.9479 || 10:36:11 AM


timer: 2.1992 sec.
iter 10 || Loss: 410151948047.0588 || 10:38:21 AM


Saving state, iter: 10
timer: 2.0081 sec.
iter 20 || Loss: nan || 10:41:16 AM


Saving state, iter: 20
timer: 1.9995 sec.
iter 30 || Loss: nan || 10:43:37 AM


Saving state, iter: 30
timer: 4.0017 sec.
iter 40 || Loss: nan || 10:46:22 AM


Saving state, iter: 40
timer: 1.8991 sec.
iter 50 || Loss: nan || 10:48:47 AM


Saving state, iter: 50
timer: 2.0058 sec.
iter 60 || Loss: nan || 10:51:34 AM


Saving state, iter: 60
timer: 1.9007 sec.
iter 70 || Loss: nan || 10:53:52 AM


Saving state, iter: 70
timer: 2.0991 sec.
iter 80 || Loss: nan || 10:56:59 AM


Saving state, iter: 80
timer: 1.6039 sec.
iter 90 || Loss: nan || 10:59:12 AM


S

KeyboardInterrupt: 