# Treinamento da rede para o CUHK03

In [1]:
%load_ext autoreload
%autoreload 2

In [16]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import torch as tc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from  torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
%matplotlib inline
import torchvision.datasets as dset
import numpy as np
import cv2

from center_loss import CenterLoss
from frw import L2_penalty_with_constant
from reid_model import ReID_Net
from datasets import CUHK03
from custom_transforms import RandomTranslation
from evaluation import pairwise_dist, generate_split_cuhk03, get_topk_results
from utils import MetricAverager, save_checkpoint, load_checkpoint
import time
import shutil

## Dataset, transformações de data augmentation e DataLoaders

In [3]:
# Per-split preprocessing pipelines.
# The [0, 1] -> [-1, 1] rescaling uses transforms.Normalize (computes (t - 0.5) / 0.5, i.e. t/0.5 - 1)
# rather than transforms.Lambda(lambda ...): a lambda cannot be pickled, which breaks
# DataLoader(num_workers > 0) on platforms where worker processes are spawned instead of forked.
data_transforms = {
    'train': transforms.Compose([
        RandomTranslation(0.1),   # translation applied on the numpy array
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),    # range [0.0, 1.0]
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    # range [-1.0, 1.0]
    ]),
    'valid': transforms.Compose([
        transforms.ToTensor(),    # range [0.0, 1.0]
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    # range [-1.0, 1.0]
    ])
}

In [4]:
# Both splits are built from the same .npz archive; the 'valid' split passes train=False
# (presumably selecting the held-out portion inside CUHK03 — TODO confirm).
# NOTE(review): the name `datasets` shadows the `datasets` module imported from torchvision.
datasets = dict(
    train=CUHK03('../reid/datasets/cuhk03_train_correct.npz', transform=data_transforms['train']),
    valid=CUHK03('../reid/datasets/cuhk03_train_correct.npz', train=False, transform=data_transforms['valid']),
)

batch_size = 128
# Only the training loader shuffles its samples.
dset_loaders = {
    split: DataLoader(datasets[split], batch_size=batch_size, shuffle=(split == 'train'), num_workers=4)
    for split in ('train', 'valid')
}

In [5]:
# Sanity check: shape, dtype and value range of the images (X) and labels (y) of each split.
for split_name in ('train', 'valid'):
    split = datasets[split_name]
    for arr in (split.X, split.y):
        print(arr.shape, arr.dtype, arr.min(), arr.max())

(12165, 128, 48, 3) uint8 0 255
(12165,) int64 0 1266
(966, 128, 48, 3) uint8 0 255
(966,) int64 0 99


In [6]:
# Number of batches produced per epoch by each loader.
len(dset_loaders['train']), len(dset_loaders['valid'])

(96, 8)

In [7]:
# Peek at the first ten entries of val_cam_ids (presumably one camera id per validation sample — TODO confirm).
dset_loaders['valid'].dataset.val_cam_ids[:10]

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

In [8]:
# Weight of the center-loss term (train_epoch multiplies the center loss by its lambda_cl argument).
lambda_cl = 0.01
# CenterLoss hyperparameter (presumably the center update rate — TODO confirm against center_loss.py).
alpha = 0.5
# Number of distinct labels in the training split.
num_classes = len(np.unique(datasets['train'].y))
print('num_classes: ', num_classes)

# Epoch limit handed to train().
epochs = 500
# Weight of the L2 penalty that train_epoch adds for the FRW-related weights.
beta_frw = 0.0001

# Optimizer learning rate and weight decay.
lr = 0.001
weight_decay = 0.001

num_classes:  1267


## Funções de treinamento

### Para uma época

In [46]:
def _scalar_value(var):
    """Return the single value held by a 0-dim or 1-element Variable/tensor as a numpy scalar.

    PyTorch <= 0.3 yields 1-element losses (``.numpy()[0]`` works); PyTorch >= 0.4 yields 0-dim
    losses, for which ``[0]`` raises ``IndexError``. Flattening first handles both cases and
    returns exactly the same value on the old versions.
    """
    return var.data.cpu().numpy().reshape(-1)[0]


def _extract_features(model, loader, use_cuda):
    """Run ``model`` over every batch of ``loader`` without gradients and return the stacked features."""
    batches = []
    for data, _target in loader:
        if use_cuda:
            data = data.cuda()
        if hasattr(torch, 'no_grad'):
            # PyTorch >= 0.4: ``volatile`` no longer disables autograd, ``no_grad`` does.
            with torch.no_grad():
                features, _pred = model(data)
        else:
            # PyTorch <= 0.3 fallback.
            features, _pred = model(Variable(data, volatile=True))
        batches.append(features.data)
    return torch.cat(batches)


def train_epoch(model, optimizer, epoch, criterions, train_loader, val_loader, use_cuda, lambda_cl, beta_frw,
                query_splits, cuda_device=None):
    """Train ``model`` for one epoch and evaluate it with CMC top-k on the validation set.

    Args:
        model: network returning ``(features, pred)`` for a batch of images.
        optimizer: optimizer stepping the model parameters.
        epoch: epoch number, used only for logging.
        criterions: sequence ``[softmax criterion, center-loss criterion, FRW penalty]``; the first
            two are called as ``f(pred_or_features, target)``, the third as ``f(weight)``.
        train_loader: loader yielding ``(data, target)`` training batches.
        val_loader: loader over the validation set; its ``dataset.y`` holds the labels.
        use_cuda: move each batch to the GPU when true.
        lambda_cl: weight applied to the center loss.
        beta_frw: weight applied to the FRW L2 penalty of the parameters named
            ``frw.weight`` and ``fc_embeddings.3.weight``.
        query_splits: iterable of dicts with ``'query'`` and ``'gallery'`` index collections
            into the validation set, one per CMC trial.
        cuda_device: accepted for interface compatibility; not used by this function.

    Returns:
        dict with ``metrics['train']['losses']`` (epoch means of the center, softmax and total
        loss), ``metrics['train']['accuracy']`` and ``metrics['valid']['cmc']['topk'][k]`` for
        ``k`` in ``(1, 5, 10)``, averaged over ``query_splits``.
    """
    print("Training... Epoch = %d" % epoch)

    softmax_criterion, center_criterion, frw_criterion = criterions[0], criterions[1], criterions[2]

    tlosses = {
        'center_loss': MetricAverager(),
        'softmax_loss': MetricAverager(),
        'total': MetricAverager(),
    }

    # The penalized parameters are the same objects for the whole epoch, so look them up once
    # instead of scanning every named parameter on every batch.
    frw_params = [p for name, p in model.named_parameters()
                  if name in ('frw.weight', 'fc_embeddings.3.weight')]

    train_accum = 0
    n_train_samples = 0
    n_batches = len(train_loader)

    # ---- training pass ----
    model.train(True)
    t0 = time.time()
    for i, (data, target) in enumerate(train_loader):

        if use_cuda:
            data = data.cuda()
            target = target.cuda()
        data, target = Variable(data), Variable(target)

        features, pred = model(data)

        # Softmax loss + weighted center loss
        softmax_loss = softmax_criterion(pred, target)
        center_loss = lambda_cl * center_criterion(features, target)
        loss = softmax_loss + center_loss

        # Add the L2 penalty on the FRW layer weights
        for p in frw_params:
            loss = loss + beta_frw * frw_criterion(p)

        _, preds = torch.max(pred.data, 1)
        # int(...) keeps the counter a plain Python int whether .sum() returns a number
        # (PyTorch <= 0.3) or a 0-dim tensor (>= 0.4), so the accuracy below is a true fraction.
        train_accum += int((preds == target.data).sum())
        n_train_samples += target.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Note: the logged center loss is already scaled by lambda_cl.
        tlosses['center_loss'].update(_scalar_value(center_loss))
        tlosses['softmax_loss'].update(_scalar_value(softmax_loss))
        tlosses['total'].update(_scalar_value(loss))

        t1 = time.time() - t0
        train_string = 'Epoch {}, Batch {}/{} ({:.1f}s): Center loss {:.4f}, Softmax loss {:.4f}, ' \
            'total {:.4f}'.format(epoch, i+1, n_batches, t1, tlosses['center_loss'],
                                  tlosses['softmax_loss'], tlosses['total'])
        print(train_string, end='\r' if i < n_batches-1 else '\n')

    # Guard against an empty loader instead of raising ZeroDivisionError.
    train_acc = train_accum / n_train_samples if n_train_samples else 0.0
    print('Train accuracy: {:.2%}'.format(train_acc))

    # ---- validation pass ----
    # Extract the features of all validation images, compute the pairwise euclidean distances and
    # evaluate top-1/5/10 CMC in the single-gallery-shot setting: query and gallery hold images from
    # different camera views, one randomly selected image per identity in each set. This is repeated
    # for every query split and the results are averaged.
    model.train(False)
    val_feats = _extract_features(model, val_loader, use_cuda)
    dists = pairwise_dist(val_feats)
    y_val = val_loader.dataset.y

    ranks = [1, 5, 10]
    valid_cmc = {
        'topk': {k: MetricAverager() for k in ranks}
    }

    N = len(query_splits)
    for i, indices in enumerate(query_splits):

        # Row j holds the gallery positions of the 10 best matches for query j.
        gallery_matches = get_topk_results(dists, k=10, indices=indices)
        if gallery_matches.is_cuda:
            gallery_matches = gallery_matches.cpu()
        gallery_matches = gallery_matches.numpy()

        n_query = len(indices['query'])
        y_query = y_val[indices['query']]
        y_gal = y_val[indices['gallery']]

        # hits[j, r] is True when the r-th ranked gallery sample shares query j's identity.
        hits = y_gal[gallery_matches[:n_query]] == y_query[:, None]

        for k in ranks:
            # Fraction of queries whose identity appears among the first k matches.
            valid_cmc['topk'][k].update(int(hits[:, :k].any(axis=1).sum()) / n_query)

        print('Validation CMC (split {:>2}/{}): top-1: {:.2%} | top-5: {:.2%} | top-10: {:.2%}'.format(
             i+1, N, valid_cmc['topk'][1], valid_cmc['topk'][5], valid_cmc['topk'][10]),
             end='\r' if i < N-1 else '\n')

    print('-'*80)

    metrics = {
        'train': {
            'losses': {
                'center_loss': tlosses['center_loss'].mean,
                'softmax_loss': tlosses['softmax_loss'].mean,
                'total': tlosses['total'].mean,
            },
            'accuracy': train_acc,
        },
        'valid': {
            'cmc': {
                'topk': {
                    k: valid_cmc['topk'][k].mean for k in ranks
                }
            }
        },
    }

    return metrics

### Para o treinamento completo

In [44]:
def train(epochs, model=None, resume=None, checkpoint_fn=None, **train_args):
    """Run the full training loop with LR scheduling, best-model checkpointing and early stopping.

    Args:
        epochs: number of the last epoch to run; epochs are numbered from 1.
        model: network to train. When ``None`` a ``ReID_Net(num_classes)`` is created.
        resume: checkpoint reference handed to ``load_checkpoint`` to continue a previous run.
        checkpoint_fn: optional filename template, formatted with ``epoch + 1``, forwarded to
            ``save_checkpoint`` whenever the validation top-1 improves.
        **train_args: consumed here: ``num_classes`` (required when ``model`` is ``None``),
            ``alpha``, ``lr``, ``weight_decay``, ``scheduler_patience`` (default 40),
            ``scheduler_factor`` (default 0.1) and ``patience`` (default 50). Everything else
            (``train_loader``, ``val_loader``, ``use_cuda``, ``lambda_cl``, ``beta_frw``, ...) is
            forwarded unchanged to ``train_epoch``.

    Returns:
        ``(model, train_history)`` where ``train_history`` is the list of per-epoch metric dicts.

    Raises:
        KeyError: when a required entry of ``train_args`` is missing.
    """
    use_cuda = train_args.get('use_cuda')

    # Pop every option train_epoch does not accept, whether or not a model was passed in;
    # otherwise it would be forwarded through **train_args and rejected as an unexpected keyword.
    num_classes = train_args.pop('num_classes', None)
    if model is None:
        if num_classes is None:
            raise KeyError('num_classes')
        model = ReID_Net(num_classes)

    alpha = train_args.pop('alpha')
    lr = train_args.pop('lr')
    weight_decay = train_args.pop('weight_decay')
    scheduler_patience = train_args.pop('scheduler_patience', 40)
    scheduler_factor = train_args.pop('scheduler_factor', 0.1)
    patience = train_args.pop('patience', 50)

    # NLLLoss (CrossEntropyLoss = log_softmax + NLLLoss)
    nllloss = nn.NLLLoss()
    # FRW L2 penalty
    frw_l2_penalty = L2_penalty_with_constant(200)

    if use_cuda:
        nllloss = nllloss.cuda()
        model = model.cuda()
        frw_l2_penalty = frw_l2_penalty.cuda()

    # The optimizer must exist before a checkpoint is loaded: load_checkpoint receives it as an
    # argument (previously it was referenced before assignment, so resuming always failed).
    optimizer4nn = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # LR scheduler driven by the validation top-1 (higher is better, hence mode='max').
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer4nn, mode='max', patience=scheduler_patience,
                                               factor=scheduler_factor, verbose=True)

    best_val_loss = None
    last_epoch = 0
    train_history = []

    if resume is not None:
        # NOTE(review): the scheduler returned by load_checkpoint is discarded in favour of the
        # freshly built one above; restore it here if load_checkpoint yields a usable scheduler.
        _, last_epoch, best_val_loss, train_history = load_checkpoint(resume, model, optimizer4nn)
        num_classes = model.centers.size(0)
    elif num_classes is None:
        # A pre-built model was supplied without num_classes: read it from the centers.
        num_classes = model.centers.size(0)

    centerloss = CenterLoss(model, model.centers, num_classes, alpha)
    criterions = [nllloss, centerloss, frw_l2_penalty]

    # Despite its name, 'best_val_loss' stores the best validation top-1. Compare against None
    # explicitly so a stored value of 0.0 is not replaced by -inf.
    max_val_top1 = best_val_loss if best_val_loss is not None else -np.inf

    no_improve = 0

    print('train_args:', train_args)

    # Generate the query/gallery splits used for the validation CMC.
    val_dataset = train_args['val_loader'].dataset
    query_splits = [generate_split_cuhk03(val_dataset.y, val_dataset.val_cam_ids) for _ in range(20)]

    # Epochs are 1-based, so the upper bound is inclusive.
    for epoch in range(last_epoch + 1, epochs + 1):

        metrics = train_epoch(model, optimizer4nn, epoch, criterions, query_splits=query_splits, **train_args)
        val_top1 = metrics['valid']['cmc']['topk'][1]

        scheduler.step(val_top1)
        train_history.append(metrics)

        if val_top1 > max_val_top1:
            print("Best Valid. top-1 so far / Epoch's Valid. top-1: {:.2%} | {:.2%}".format(
                max_val_top1, val_top1))
            print('Salvando melhor modelo...')
            max_val_top1 = val_top1
            state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_val_loss': max_val_top1,
                'train_softmax_loss': metrics['train']['losses']['softmax_loss'],
                'history': train_history,
            }
            opts = {'is_best': False}
            if checkpoint_fn is not None:
                # NOTE(review): the filename carries epoch + 1 while state['epoch'] stores epoch.
                opts.update({'filename': checkpoint_fn.format(epoch+1)})
            save_checkpoint(state, **opts)
            no_improve = 0
            print('-'*80)
        else:
            no_improve += 1
            if no_improve > patience:
                print('Early stopping at epoch {}.'.format(epoch))
                break

    return model, train_history

## Treinamento: parâmetros e loop de treinamento

In [14]:
# Keyword arguments for train(). Hyperparameters are taken from the variables defined in the
# hyperparameter cell (lambda_cl, alpha, num_classes, beta_frw, lr, weight_decay) instead of being
# repeated as literals, so there is a single place to tune them.
train_params = {
    'train_loader': dset_loaders['train'],
    'val_loader': dset_loaders['valid'],
    'num_classes': num_classes,
    'lr': lr,
    'weight_decay': weight_decay,
    'use_cuda': True,
    'lambda_cl': lambda_cl,
    'alpha': alpha,
    'beta_frw': beta_frw,
    'cuda_device': None,
    # Early-stopping patience (epochs without validation top-1 improvement).
    'patience': 50,
    # ReduceLROnPlateau settings.
    'scheduler_patience': 40,
    'scheduler_factor': 0.1,
}

In [47]:
# Launch the training run; the string is passed to train() as the checkpoint filename template.
# NOTE(review): `model` and `hist` are only bound when train() returns, so after a
# KeyboardInterrupt neither name is (re)assigned by this cell.
try:
    model, hist = train(epochs, model=None, checkpoint_fn='cuhk03_correct_epoch_{}.pth', **train_params)
except KeyboardInterrupt:
    print('Interrompendo...')

train_args: {'cuda_device': None, 'val_loader': <torch.utils.data.dataloader.DataLoader object at 0x7f2eeb19c2e8>, 'train_loader': <torch.utils.data.dataloader.DataLoader object at 0x7f2eeb19c358>, 'beta_frw': 0.0001, 'lambda_cl': 0.01, 'use_cuda': True}
Training... Epoch = 1
Epoch 1, Batch 96/96 (21.3s): Center loss 0.7963, Softmax loss 7.0263, total 7.9578
Train accuracy: 0.22%
Validation CMC (split 20/20): top-1: 5.45% | top-5: 16.10% | top-10: 26.45%
--------------------------------------------------------------------------------
Min. Valid. top-1 / Epoch's Valid. top-1: -inf% | 5.45%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 2
Epoch 2, Batch 96/96 (21.3s): Center loss 0.1330, Softmax loss 6.2328, total 6.3763
Train accuracy: 1.49%
Validation CMC (split 20/20): top-1: 15.00% | top-5: 36.60% | top-10: 53.35%
--------------------------------------------------------------------------------
Min. Valid.

Epoch 46, Batch 96/96 (21.2s): Center loss 0.4597, Softmax loss 0.2417, total 0.7030
Train accuracy: 99.07%
Validation CMC (split 20/20): top-1: 67.30% | top-5: 91.55% | top-10: 95.80%
--------------------------------------------------------------------------------
Training... Epoch = 47
Epoch 47, Batch 96/96 (21.2s): Center loss 0.5193, Softmax loss 0.3197, total 0.8406
Train accuracy: 97.97%
Validation CMC (split 20/20): top-1: 73.65% | top-5: 94.90% | top-10: 98.05%
--------------------------------------------------------------------------------
Training... Epoch = 48
Epoch 48, Batch 96/96 (21.2s): Center loss 0.4540, Softmax loss 0.2337, total 0.6895
Train accuracy: 99.22%
Validation CMC (split 20/20): top-1: 74.80% | top-5: 95.70% | top-10: 98.10%
--------------------------------------------------------------------------------
Training... Epoch = 49
Epoch 49, Batch 96/96 (21.3s): Center loss 0.4460, Softmax loss 0.2313, total 0.6790
Train accuracy: 99.28%
Validation CMC (split 20/

Validation CMC (split 20/20): top-1: 70.80% | top-5: 93.25% | top-10: 97.60%
--------------------------------------------------------------------------------
Training... Epoch = 101
Epoch 101, Batch 96/96 (21.2s): Center loss 0.4365, Softmax loss 0.2071, total 0.6460
Train accuracy: 99.28%
Validation CMC (split 20/20): top-1: 78.50% | top-5: 96.10% | top-10: 98.75%
--------------------------------------------------------------------------------
Training... Epoch = 102
Epoch 102, Batch 96/96 (21.1s): Center loss 0.4502, Softmax loss 0.2040, total 0.6566
Train accuracy: 99.31%
Validation CMC (split 20/20): top-1: 80.75% | top-5: 96.60% | top-10: 99.00%
--------------------------------------------------------------------------------
Min. Valid. top-1 / Epoch's Valid. top-1: 80.15% | 80.75%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 103
Epoch 103, Batch 96/96 (21.2s): Center loss 0.4129, Softmax loss 0.1762

Epoch 128, Batch 96/96 (21.2s): Center loss 0.4173, Softmax loss 0.1854, total 0.6052
Train accuracy: 99.50%
Validation CMC (split 20/20): top-1: 74.30% | top-5: 95.70% | top-10: 98.00%
--------------------------------------------------------------------------------
Training... Epoch = 129
Epoch 129, Batch 96/96 (21.2s): Center loss 0.4169, Softmax loss 0.1831, total 0.6024
Train accuracy: 99.55%
Validation CMC (split 20/20): top-1: 80.65% | top-5: 96.25% | top-10: 98.70%
--------------------------------------------------------------------------------
Training... Epoch = 130
Epoch 130, Batch 96/96 (21.2s): Center loss 0.4147, Softmax loss 0.1763, total 0.5935
Train accuracy: 99.65%
Validation CMC (split 20/20): top-1: 79.60% | top-5: 96.35% | top-10: 98.55%
--------------------------------------------------------------------------------
Training... Epoch = 131
Epoch 131, Batch 96/96 (21.2s): Center loss 0.4018, Softmax loss 0.1677, total 0.5719
Train accuracy: 99.66%
Validation CMC (sp

Epoch 155, Batch 96/96 (21.2s): Center loss 0.3946, Softmax loss 0.1736, total 0.5707
Train accuracy: 99.65%
Validation CMC (split 20/20): top-1: 76.40% | top-5: 96.05% | top-10: 98.55%
--------------------------------------------------------------------------------
Training... Epoch = 156
Epoch 156, Batch 96/96 (21.2s): Center loss 0.4729, Softmax loss 0.2410, total 0.7164
Train accuracy: 98.91%
Validation CMC (split 20/20): top-1: 78.10% | top-5: 96.45% | top-10: 99.00%
--------------------------------------------------------------------------------
Training... Epoch = 157
Epoch 157, Batch 96/96 (21.1s): Center loss 0.4359, Softmax loss 0.1980, total 0.6364
Train accuracy: 99.31%
Validation CMC (split 20/20): top-1: 80.75% | top-5: 97.00% | top-10: 99.15%
--------------------------------------------------------------------------------
Training... Epoch = 158
Epoch 158, Batch 96/96 (21.2s): Center loss 0.3971, Softmax loss 0.1629, total 0.5625
Train accuracy: 99.61%
Validation CMC (sp

In [48]:
# NOTE(review): this filename does not follow the 'cuhk03_correct_epoch_{}.pth' template used by the
# training cell, so it is presumably a checkpoint from an earlier run — confirm before relying on it.
chkp = torch.load('cuhk03_epoch_156.pth')

In [21]:
# List the entries stored in the loaded checkpoint.
list(chkp.keys())

['best_val_loss', 'epoch', 'optimizer', 'train_softmax_loss', 'state_dict']

In [52]:
# Best validation metric, epoch and training softmax loss recorded in the checkpoint.
chkp['best_val_loss'], chkp['epoch'], chkp['train_softmax_loss']

(1.0733552536656779, 155, 0.12763088082687724)