In [37]:
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
from model import SSD300, MultiBoxLoss, PredictionConvolutions
from datasets import PascalVOCDataset
from utils import *

In [38]:
# Data parameters
data_folder = './'  # directory handed to the dataset as the location of the data files
keep_difficult = True  # whether objects marked as difficult to detect are kept

# Model parameters
n_classes = len(label_map)  # one class per entry of label_map
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [39]:
# Sanity check: show the label mapping, the class count derived from it, and the selected device
label_map, n_classes, device

({'piso_parquet': 1, 'piso_ceramica': 2, 'background': 0},
 3,
 device(type='cuda'))

In [40]:
# Learning parameters
# Path to the model checkpoint to resume from; set to None to start from a new model
checkpoint = './checkpoints/checkpoint_ssd300.pth.tar'

# Data loading
batch_size = 2  # samples per batch
workers = 4  # number of DataLoader worker processes

# Schedule
iterations = 120000  # number of iterations to train
decay_lr_at = [80000, 100000]  # iteration counts after which the learning rate is decayed
decay_lr_to = 0.1  # fraction of the current learning rate kept at each decay

# SGD hyper-parameters
lr = 0.001  # learning rate
momentum = 0.9  # momentum
weight_decay = 0.0005  # weight decay

# Gradient clipping value, or None to disable. Enable it if gradients explode, which may happen at larger
# batch sizes (sometimes at 32) - you will recognize it by a sorting error in the MultiBox loss calculation
grad_clip = None

# Logging
print_freq = 200  # print training status every __ batches

cudnn.benchmark = True


In [41]:
def main():
    """
    Training.

    Builds a new ``SSD300`` when the module-level ``checkpoint`` is ``None``; otherwise loads the checkpoint file,
    replaces the model's prediction head with a fresh ``PredictionConvolutions(n_classes)``, freezes the layers
    nested under the model's sub-modules and then unfreezes the new prediction head plus conv4_3, conv7 and
    conv8_2 .. conv11_2. An SGD optimizer is then created over the parameters that still require gradients and
    the model is trained for 150 epochs starting at ``start_epoch``.

    Module-level configuration (``checkpoint``, ``decay_lr_at``, ``lr``, ...) is only read, never overwritten,
    so the function can be called again in the same kernel. ``start_epoch`` and ``epoch`` are published as
    globals.

    :return: tuple ``(epoch, model, optimizer)`` with the last epoch index run, the trained model and the optimizer
    """
    global start_epoch, epoch

    # Initialize model or load checkpoint
    if checkpoint is None:
        start_epoch = 0
        model = SSD300(n_classes=n_classes)
        print('\nLoaded new SSD300 model')
        loaded = {}

    else:  # load the checkpoint (pre-trained model)
        # Keep the loaded contents in a local: overwriting the global path would make a second call to
        # main() pass a dict to torch.load.
        # NOTE(review): the checkpoint stores a whole pickled model; recent PyTorch releases may require
        # an explicit weights_only=False here - confirm against the installed version.
        loaded = torch.load(checkpoint)
        start_epoch = loaded['epoch'] + 1
        print('\nLoaded checkpoint from epoch %d.\n' % start_epoch)
        model = loaded['model']
        model.pred_convs = PredictionConvolutions(n_classes)  # swap in new prediction layers

        # Freeze every layer nested under the model's sub-modules.
        # NOTE(review): parameters registered directly on the model or on its direct children (if any)
        # are not visited by this loop and keep their current requires_grad - confirm this is intended.
        for child in model.children():
            for layer in child.children():
                for param in layer.parameters():
                    param.requires_grad = False

        # Unfreeze the prediction layers (one line is printed per parameter, not per layer)
        print("\n")
        for layer in model.pred_convs.children():
            for param in layer.parameters():
                param.requires_grad = True
                print(str(layer) + "<- No Frizada")

        # Unfreeze the layers listed below:
        #  conv 4_3, conv 7, conv 8_2, conv 9_2, conv 10_2, conv 11_2 + pred convs
        layers = [model.base.conv4_3, model.base.conv7, model.aux_convs.conv8_2, model.aux_convs.conv9_2,
                  model.aux_convs.conv10_2, model.aux_convs.conv11_2]
        for layer in layers:
            for param in layer.parameters():
                param.requires_grad = True
                print(str(layer) + "<- No Frizada")

    # Always build a new optimizer. Previously nothing was assigned when the checkpoint carried an
    # 'optimizer' entry, so the first use of `optimizer` below raised UnboundLocalError. A stored optimizer
    # is not reused because the prediction layers were just replaced and most layers frozen, so its
    # parameter groups would presumably not match the parameters being trained now.
    if 'optimizer' in loaded:
        print("Optimizer del checkpoint descartado (capas predictivas reemplazadas). Generando uno nuevo...")
    else:
        print("Optimizer no encontrado. Generando uno...")
    # Twice the default learning rate for biases, as in the original Caffe repo
    biases = list()
    not_biases = list()
    for param_name, param in model.named_parameters():
        if param.requires_grad:
            if param_name.endswith('.bias'):
                biases.append(param)  # biases are kept apart from the other parameters
            else:                     # because they get a different learning rate
                not_biases.append(param)
    optimizer = torch.optim.SGD(params=[{'params': biases, 'lr': 2 * lr}, {'params': not_biases}],
                                lr=lr, momentum=momentum, weight_decay=weight_decay)

    # Move to default device
    print("Pasando el modelo a ", device)
    model = model.to(device)
    criterion = MultiBoxLoss(priors_cxcy=model.priors_cxcy).to(device)

    # Custom dataloaders
    print("Seteando dataset y dataloader")
    train_dataset = PascalVOCDataset(data_folder,
                                     split='train',
                                     keep_difficult=keep_difficult)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                               collate_fn=train_dataset.collate_fn, num_workers=workers,
                                               pin_memory=True)  # note that we're passing the collate function here

    # Convert the iteration counts at which the learning rate decays into epochs by dividing them by the
    # number of iterations per epoch.
    # The paper trains for 120,000 iterations with a batch size of 32, decays after 80,000 and 100,000 iterations
    # NOTE(review): the divisor 8 is hard-coded and differs from batch_size - confirm which is intended.
    # max(1, ...) avoids a ZeroDivisionError when the dataset has fewer than 8 samples.
    iters_per_epoch = max(1, len(train_dataset) // 8)
    # Kept local so that calling main() again does not re-convert values that are already in epochs
    decay_epochs = [it // iters_per_epoch for it in decay_lr_at]
    # The number of epochs is fixed to 150 beyond the starting epoch (experimental setting); the
    # iteration-based epoch count was computed and immediately overwritten, so it is no longer computed
    epochs = start_epoch + 150

    # Epochs
    print("Arrancando loop de entrenamiento.\nEpochs: ", epochs)
    print("start epoch:", start_epoch)
    for epoch in range(start_epoch, epochs):
        print("epoch: "+ str(epoch) + "/" + str(epochs))
        # Decay learning rate at particular epochs
        if epoch in decay_epochs:
            adjust_learning_rate(optimizer, decay_lr_to)

        # One epoch's training
        train(train_loader=train_loader,
              model=model,
              criterion=criterion,
              optimizer=optimizer,
              epoch=epoch)

        # Save checkpoint
        #save_checkpoint(epoch, model, optimizer)
    return epoch, model, optimizer

In [42]:
def train(train_loader, model, criterion, optimizer, epoch):
    """
    Run a single training epoch.

    :param train_loader: DataLoader for training data; every batch unpacks into four items, of which the
                         first three are used as images, boxes and labels
    :param model: model being trained
    :param criterion: MultiBox loss
    :param optimizer: optimizer
    :param epoch: epoch number, only used in the status line
    """
    model.train()  # put the model in training mode

    # Running statistics for this epoch
    losses = AverageMeter()  # loss
    data_time = AverageMeter()  # data loading time
    batch_time = AverageMeter()  # time for the full step (loading + forward + backward)

    status_template = ('Epoch: [{0}][{1}/{2}]\t'
                       'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                       'Loss {loss.val:.4f} ({loss.avg:.4f})\t')

    tick = time.time()

    for i, (images, boxes, labels, _) in enumerate(train_loader):
        # Everything since the previous tick was spent waiting for this batch
        data_time.update(time.time() - tick)

        # Transfer the batch to the default device
        images = images.to(device)  # (batch_size (N), 3, 300, 300)
        boxes = [box.to(device) for box in boxes]
        labels = [lab.to(device) for lab in labels]

        # Forward pass followed by the loss
        predicted_locs, predicted_scores = model(images)  # (N, 8732, 4), (N, 8732, n_classes)
        loss = criterion(predicted_locs, predicted_scores, boxes, labels)  # scalar

        # Backward pass
        optimizer.zero_grad()
        loss.backward()

        # Optional gradient clipping, controlled by the module-level grad_clip
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Parameter update
        optimizer.step()

        losses.update(loss.item(), images.size(0))
        batch_time.update(time.time() - tick)

        tick = time.time()

        # Status is reported only every print_freq batches
        if i % print_freq != 0:
            continue

        print(predicted_locs.shape)
        print(predicted_scores.shape)

        print(status_template.format(epoch, i, len(train_loader),
                                     batch_time=batch_time,
                                     data_time=data_time, loss=losses))
    del predicted_locs, predicted_scores, images, boxes, labels  # free some memory since their histories may be stored



In [43]:
# Run training; main() returns the tuple (epoch, model, optimizer)
state = main()


Loaded checkpoint from epoch 232.



Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada
Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))<- No Frizada


epoch: 266/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [266][0/55]	Batch Time 2.859 (2.859)	Data Time 2.806 (2.806)	Loss 1.8309 (1.8309)	
epoch: 267/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [267][0/55]	Batch Time 2.928 (2.928)	Data Time 2.874 (2.874)	Loss 2.9065 (2.9065)	
epoch: 268/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [268][0/55]	Batch Time 2.837 (2.837)	Data Time 2.781 (2.781)	Loss 2.8324 (2.8324)	
epoch: 269/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [269][0/55]	Batch Time 2.849 (2.849)	Data Time 2.794 (2.794)	Loss 2.7451 (2.7451)	
epoch: 270/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [270][0/55]	Batch Time 3.069 (3.069)	Data Time 3.017 (3.017)	Loss 3.0272 (3.0272)	
epoch: 271/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [271][0/55]	Batch Time 2.928 (2.928)	Data Time 2.873 (2.873)	Loss 3.0378 (3.0378)	
epoch: 272/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch

epoch: 319/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [319][0/55]	Batch Time 2.852 (2.852)	Data Time 2.798 (2.798)	Loss 2.5039 (2.5039)	
epoch: 320/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [320][0/55]	Batch Time 2.957 (2.957)	Data Time 2.905 (2.905)	Loss 3.5099 (3.5099)	
epoch: 321/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [321][0/55]	Batch Time 2.923 (2.923)	Data Time 2.868 (2.868)	Loss 2.0647 (2.0647)	
epoch: 322/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [322][0/55]	Batch Time 2.891 (2.891)	Data Time 2.836 (2.836)	Loss 1.8839 (1.8839)	
epoch: 323/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [323][0/55]	Batch Time 2.831 (2.831)	Data Time 2.778 (2.778)	Loss 1.2887 (1.2887)	
epoch: 324/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [324][0/55]	Batch Time 2.857 (2.857)	Data Time 2.805 (2.805)	Loss 1.6999 (1.6999)	
epoch: 325/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch

epoch: 372/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [372][0/55]	Batch Time 2.932 (2.932)	Data Time 2.880 (2.880)	Loss 1.6243 (1.6243)	
epoch: 373/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [373][0/55]	Batch Time 3.133 (3.133)	Data Time 3.080 (3.080)	Loss 3.2312 (3.2312)	
epoch: 374/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [374][0/55]	Batch Time 2.951 (2.951)	Data Time 2.899 (2.899)	Loss 2.2884 (2.2884)	
epoch: 375/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [375][0/55]	Batch Time 2.865 (2.865)	Data Time 2.812 (2.812)	Loss 1.3294 (1.3294)	
epoch: 376/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [376][0/55]	Batch Time 2.929 (2.929)	Data Time 2.874 (2.874)	Loss 1.4102 (1.4102)	
epoch: 377/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch: [377][0/55]	Batch Time 2.839 (2.839)	Data Time 2.785 (2.785)	Loss 1.3710 (1.3710)	
epoch: 378/382
torch.Size([2, 8732, 4])
torch.Size([2, 8732, 3])
Epoch

In [44]:
# main() returned the tuple (epoch, model, optimizer); repackage it as a dict with named fields.
# The isinstance guard keeps this cell idempotent: once `state` is a dict, re-running the cell is a no-op
# instead of failing on state[0].
if isinstance(state, tuple):
    state = {'epoch': state[0],
             'model': state[1],
             'optimizer': state[2]}

In [45]:
# Display the packed state dict; the output includes the full module listing of the model
state

{'epoch': 381,
 'model': SSD300(
   (base): VGGBase(
     (conv1_1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (conv1_2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (conv2_1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (conv2_2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
     (conv3_1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (conv3_2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (conv3_3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
     (conv4_1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
     (conv4_2):

In [46]:
#torch.save(state, 'checkpoint_fede.pth.tar')