# Treinamento do modelo para Transfer Learning usando Market-1501 e CUHK03

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import torch as tc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from  torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
%matplotlib inline
import torchvision.datasets as dset
import numpy as np
import cv2

from center_loss import CenterLoss
from frw import L2_penalty_with_constant
from reid_model import ReID_Net
from datasets import Market1501, CUHK03, ConcatenateDataset
from custom_transforms import RandomTranslation
from evaluation import pairwise_squared_euclidian_distance, generate_query, \
                        get_topk_results, apk, mapk, calculate_mAP
from utils import MetricAverager, save_checkpoint, load_checkpoint
import time
import shutil

## Dataset, transformações de _data augmentation_ e DataLoaders

In [3]:
def _to_signed_unit_range(t):
    """Rescale a tensor from the [0.0, 1.0] range to [-1.0, 1.0]."""
    return t/0.5 - 1


# Per-split preprocessing pipelines. Only the training pipeline augments.
data_transforms = {
    'train': transforms.Compose([
        RandomTranslation(0.1),   # random translation, applied on the numpy array
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),    # range [0.0, 1.0]
        transforms.Lambda(_to_signed_unit_range),    # range [-1.0, 1.0]
    ]),
    'valid': transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(_to_signed_unit_range),
    ]),
}

In [51]:
# (key, dataset class, archive path) for every source dataset. The same
# archive backs both splits; `train=False` selects the validation part.
_dataset_specs = (
    ('market', Market1501, '../reid/datasets/market_train_correct.npz'),
    ('cuhk03', CUHK03, '../reid/datasets/cuhk03_train_correct.npz'),
)

individual_datasets = {}
for _key, _dataset_cls, _npz_path in _dataset_specs:
    individual_datasets[_key] = {
        'train': _dataset_cls(_npz_path, transform=data_transforms['train']),
        'valid': _dataset_cls(_npz_path, train=False, transform=data_transforms['valid']),
    }

# Debugging tip: to iterate faster, truncate the `X` and `y` arrays of the
# individual training sets (e.g. to their first 3000 samples) before they
# are concatenated below.

# NOTE: this rebinds the name `datasets`, hiding the torchvision module of
# the same name imported at the top of the notebook.
datasets = {
    split: ConcatenateDataset(individual_datasets['market'][split],
                              individual_datasets['cuhk03'][split])
    for split in ('train', 'valid')
}

batch_size = 128
# Only the training loader shuffles its samples.
dset_loaders = {
    split: DataLoader(datasets[split], batch_size=batch_size,
                      shuffle=(split == 'train'), num_workers=4)
    for split in ('train', 'valid')
}

In [52]:
def _print_array_stats(split):
    """Print shape, dtype, minimum and maximum of a split's X and y arrays."""
    for label, values in (('X:', split.X), ('y:', split.y)):
        print(label, values.shape, values.dtype, values.min(), values.max())


def _print_train_valid_stats(splits):
    """Print the array statistics of the 'train' and 'valid' entries of `splits`."""
    print('Treinamento:')
    _print_array_stats(splits['train'])
    print('Validação:')
    _print_array_stats(splits['valid'])


market = individual_datasets['market']
print('Market1501:')
_print_train_valid_stats(market)

cuhk03 = individual_datasets['cuhk03']
print('\nCUHK03:')
_print_train_valid_stats(cuhk03)

print()
print('Tamanho dataset concatenado: ')
_print_train_valid_stats(datasets)

Market1501:
Treinamento:
X: (11253, 128, 48, 3) uint8 0 255
y: (11253,) int64 0 650
Validação:
X: (1683, 128, 48, 3) uint8 0 255
y: (1683,) int64 0 99

CUHK03:
Treinamento:
X: (12165, 128, 48, 3) uint8 0 255
y: (12165,) int64 0 1266
Validação:
X: (966, 128, 48, 3) uint8 0 255
y: (966,) int64 0 99

Tamanho dataset concatenado: 
Treinamento:
X: (23418, 128, 48, 3) uint8 0 255
y: (23418,) int64 0 1917
Validação:
X: (2649, 128, 48, 3) uint8 0 255
y: (2649,) int64 0 199


In [53]:
# Show the `num_classes` attribute of each concatenated split (train, valid).
datasets['train'].num_classes, datasets['valid'].num_classes

(1918, 200)

In [54]:
# Show how many batches each loader yields per pass (train, valid).
len(dset_loaders['train']), len(dset_loaders['valid'])

(183, 21)

## Funções de treinamento

### Para uma época

In [42]:
def train_epoch(model, optimizer, epoch, criterions, train_loader, val_loader, use_cuda,
                lambda_cl, beta_frw, query_splits, cuda_device=None):
    """Train `model` for one epoch, then score it on the validation set.

    Args:
        model: network whose forward pass returns a `(features, pred)` pair.
        optimizer: optimizer stepped once per training batch.
        epoch: epoch number; only used in the progress messages.
        criterions: indexable collection of three callables:
            `criterions[0](pred, target)` is the classification loss,
            `criterions[1](features, target)` is the center loss (scaled by
            `lambda_cl`), and `criterions[2](param)` is the L2 penalty
            applied to selected weights (scaled by `beta_frw`).
        train_loader: iterable of `(data, target)` training batches.
        val_loader: iterable of `(data, target)` validation batches; its
            `.dataset` must expose the attributes `y` and `cam_ids`.
        use_cuda: when truthy, batches are moved to the GPU with `.cuda()`.
        lambda_cl: weight of the center loss in the total loss.
        beta_frw: weight of the L2 penalty on the 'frw.weight' and
            'fc_embeddings.3.weight' parameters.
        query_splits: sized sequence of query selections, each forwarded
            to `calculate_mAP` as its first argument.
        cuda_device: accepted for interface compatibility; not used here.

    Returns:
        dict with the layout
        `{'train': {'losses': {'center_loss', 'softmax_loss', 'total'},
        'accuracy'}, 'valid': {'map'}}`, holding the epoch averages.
    """
    print("Training... Epoch = %d" % epoch)

    train_accum = 0
    n_train_samples = 0

    tlosses = {
        'center_loss': MetricAverager(),
        'softmax_loss': MetricAverager(),
        'total': MetricAverager(),
    }

    # The penalized weights are the same Parameter objects on every batch,
    # so look them up once instead of rescanning all parameters per batch.
    penalized_names = ('frw.weight', 'fc_embeddings.3.weight')
    penalized_params = [p for (name, p) in model.named_parameters()
                        if name in penalized_names]

    # ---- Training pass -------------------------------------------------
    model.train(True)
    t0 = time.time()
    for i, (data, target) in enumerate(train_loader):

        if use_cuda:
            data = data.cuda()
            target = target.cuda()
        data, target = Variable(data), Variable(target)

        features, pred = model(data)

        # Softmax loss + center loss
        softmax_loss = criterions[0](pred, target)
        center_loss = lambda_cl * criterions[1](features, target)
        loss = softmax_loss + center_loss

        # Add the L2 penalty on the FRW layer weights
        for p in penalized_params:
            loss += beta_frw * criterions[2](p)

        # Running count of correctly classified training samples
        _, preds = torch.max(pred.data, 1)
        correct = (preds == target.data).sum()
        train_accum += correct
        n_train_samples += target.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tlosses['center_loss'].update(center_loss.data.cpu().numpy()[0])
        tlosses['softmax_loss'].update(softmax_loss.data.cpu().numpy()[0])
        tlosses['total'].update(loss.data.cpu().numpy()[0])

        # Progress line, overwritten in place until the last batch
        t1 = time.time() - t0
        train_string = 'Epoch {}, Batch {}/{} ({:.1f}s): Center loss {:.4f}, Softmax loss {:.4f}, ' \
            'total {:.4f}'.format(epoch, i+1, len(train_loader), t1, tlosses['center_loss'],
                                  tlosses['softmax_loss'], tlosses['total'])
        print(train_string, end='\r' if i < len(train_loader)-1 else '\n')

    train_acc = train_accum / n_train_samples
    print('Train accuracy: {:.2%}'.format(train_acc))

    # ---- Validation pass -----------------------------------------------
    # Validation: in the validation phase, we extract the features of all the validation images.
    # Then, we calculate the pairwise euclidian distance for all samples and calculate
    # the Mean Average Precision for the rankings in a Cross-camera search configuration: the query and
    # gallery sets have images from different camera views and, for each person identity, one image
    # is randomly selected for each set. This is done for N query/gallery split configurations.

    t0 = time.time()
    model.train(False)

    val_feats = []

    valid_metrics = {
        'map': MetricAverager(),
    }

    # Only the embeddings are needed here; labels come from the dataset below.
    for data, _ in val_loader:

        if use_cuda:
            data = data.cuda()
        # volatile=True: legacy (pre-0.4) PyTorch flag for inference-only Variables
        data = Variable(data, volatile=True)

        features, _ = model(data)
        val_feats.append(features.data.cpu())

    val_feats = torch.cat(val_feats)

    # Calculates mAP for the validation set, as if it was a Query-Gallery set
    val_dataset = val_loader.dataset
    val_cam_ids = val_dataset.cam_ids
    y_val = val_dataset.y

    # Gallery: may or may not include distractors
    y_gallery = y_val
    gal_cam_ids = val_cam_ids

    for i, query_indices_by_cam in enumerate(query_splits):
        val_map = calculate_mAP(query_indices_by_cam, val_feats, y_gallery, gal_cam_ids)
        valid_metrics['map'].update(val_map)

        t1 = time.time() - t0
        print('Validation mAP (split {:>2}/{}) ({:.1f}s): {:.2%}'.format(i+1, len(query_splits),
                t1, valid_metrics['map'].mean), end='\r' if i+1 < len(query_splits) else '\n')

    print('-'*80)

    metrics = {
        'train': {
            'losses': {
                'center_loss': tlosses['center_loss'].mean,
                'softmax_loss': tlosses['softmax_loss'].mean,
                'total': tlosses['total'].mean,
            },
            'accuracy': train_acc,
        },
        'valid': {
            'map': valid_metrics['map'].mean
        }
    }

    return metrics

### Para o treinamento completo

In [43]:
def train(epochs, model=None, resume=None, checkpoint_fn=None, **train_args):
    """Run the full training loop with LR scheduling, checkpointing and early stopping.

    Args:
        epochs: exclusive upper bound of the epoch counter; epochs
            `last_epoch + 1` through `epochs - 1` are run.
        model: network to train. When None, a `ReID_Net` is built from
            `train_args['num_classes']`.
        resume: checkpoint reference forwarded to `load_checkpoint`; when
            given, the starting epoch, best validation score and history
            are taken from it.
        checkpoint_fn: optional filename template; it is formatted with
            `epoch + 1` and passed to `save_checkpoint` as `filename`.
        **train_args: consumed here: `num_classes` (required only when
            `model` is None), `alpha`, `lr`, `weight_decay` (all required),
            `scheduler_patience` (default 35), `scheduler_factor`
            (default 0.1) and `patience` (default 50). `use_cuda` and
            `val_loader` are read here and, together with every remaining
            key, forwarded to `train_epoch`.

    Returns:
        `(model, train_history)`, where `train_history` is the list of
        per-epoch metric dicts returned by `train_epoch`.
    """
    best_val_loss = None
    last_epoch = 0

    train_history = []

    use_cuda = train_args.get('use_cuda')

    if model is None:
        num_classes = train_args.pop('num_classes')
        model = ReID_Net(num_classes)
    else:
        # 'num_classes' must never reach train_epoch, which does not accept it.
        train_args.pop('num_classes', None)
        num_classes = model.centers.size(0)

    # NLLLoss
    nllloss = nn.NLLLoss() #CrossEntropyLoss = log_softmax + NLLLoss

    # FRW L2 penalty
    frw_l2_penalty = L2_penalty_with_constant(200)

    if use_cuda:
        nllloss = nllloss.cuda()
        model = model.cuda()
        frw_l2_penalty = frw_l2_penalty.cuda()

    alpha = train_args.pop('alpha')

    # The optimizer has to exist before resuming, because load_checkpoint
    # receives it as an argument.
    lr = train_args.pop('lr')
    weight_decay = train_args.pop('weight_decay')
    optimizer4nn = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    if resume is not None:
        # NOTE(review): the scheduler returned by load_checkpoint is discarded
        # and a fresh one is built below, matching the original statement
        # order; confirm whether the restored scheduler should be reused.
        _, last_epoch, best_val_loss, train_history = load_checkpoint(resume, model, optimizer4nn)
        num_classes = model.centers.size(0)

    centerloss = CenterLoss(model, model.centers, num_classes, alpha)
    criterions = [nllloss, centerloss, frw_l2_penalty]

    # LR Scheduler: mode='max' because the monitored quantity is the validation mAP
    scheduler_patience = train_args.pop('scheduler_patience', 35)
    scheduler_factor = train_args.pop('scheduler_factor', 0.1)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer4nn, mode='max',
                                               patience=scheduler_patience,
                                               factor=scheduler_factor, verbose=True)

    # Explicit None check so that a restored score of 0.0 is not discarded.
    max_val_map = best_val_loss if best_val_loss is not None else -np.inf

    patience = train_args.pop('patience', 50)
    no_improve = 0

    print('train_args:', train_args, '\n')

    # Generate the query/gallery splits once, so every epoch is validated
    # on the same splits.
    val_dataset = train_args['val_loader'].dataset
    query_splits = [generate_query(val_dataset.y, val_dataset.cam_ids)
                    for _ in range(5)]

    for epoch in range(last_epoch+1, epochs):

        metrics = train_epoch(model, optimizer4nn, epoch, criterions,
                              query_splits=query_splits, **train_args)

        scheduler.step(metrics['valid']['map'])
        train_history.append(metrics)

        # Save checkpoint and EarlyStop
        if metrics['valid']['map'] > max_val_map:
            print("Max. Valid. mAP / Epoch's Valid. mAP: {:.2%} | {:.2%}".format(
                max_val_map, metrics['valid']['map']))
            print('Salvando melhor modelo...')
            max_val_map = metrics['valid']['map']
            state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                # Key name kept for checkpoint compatibility; the value is the best mAP.
                'best_val_loss': max_val_map,
                'train_softmax_loss': metrics['train']['losses']['softmax_loss'],
                'history': train_history,
            }
            opts = {'is_best': False}
            if checkpoint_fn is not None:
                opts.update({'filename': checkpoint_fn.format(epoch+1)})
            save_checkpoint(state, **opts)
            no_improve = 0
            print('-'*80)
        else:
            no_improve += 1
            if no_improve > patience:
                print('Early stopping at epoch {}.'.format(epoch))
                break

    return model, train_history

## Parâmetros de treinamento e treino

In [56]:
epochs = 500

# Keyword arguments for train(); keys it does not consume are forwarded
# to train_epoch().
train_params = dict(
    lr=0.001,                      # Adam learning rate
    weight_decay=0.001,            # Adam weight decay
    lambda_cl=0.01,                # weight of the center loss in the total loss
    alpha=0.5,                     # passed to CenterLoss
    num_classes=len(np.unique(datasets['train'].y)),
    train_loader=dset_loaders['train'],
    val_loader=dset_loaders['valid'],
    use_cuda=True,
    beta_frw=0.0001,               # weight of the L2 penalty on the FRW weights
    cuda_device=None,
    patience=55,                   # early-stopping patience, in epochs
    scheduler_patience=40,         # ReduceLROnPlateau patience
    scheduler_factor=0.1,          # ReduceLROnPlateau factor
)

In [57]:
# Filename template for checkpoints; train() fills in the epoch number.
checkpoint_template = 'market1501_and_cuhk03_map_epoch_{}.pth'

try:
    model, train_history = train(
        epochs,
        model=None,
        checkpoint_fn=checkpoint_template,
        **train_params
    )
except KeyboardInterrupt:
    # Let a manual interrupt stop the run without a traceback.
    print('Interrompendo...')

train_args: {'val_loader': <torch.utils.data.dataloader.DataLoader object at 0x7fcf98d86668>, 'lambda_cl': 0.01, 'cuda_device': None, 'train_loader': <torch.utils.data.dataloader.DataLoader object at 0x7fcf98d861d0>, 'beta_frw': 0.0001, 'use_cuda': True} 

Training... Epoch = 1
Epoch 1, Batch 183/183 (40.6s): Center loss 0.6226, Softmax loss 6.6646, total 7.3641
Train accuracy: 2.66%
Validation mAP (split  5/5) (3.6s): 30.69%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: -inf% | 30.69%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 2
Epoch 2, Batch 183/183 (40.7s): Center loss 0.1829, Softmax loss 5.1115, total 5.2953
Train accuracy: 11.72%
Validation mAP (split  5/5) (3.4s): 41.85%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: 30.69% | 41.85%
Salvando melhor mode

Epoch 53, Batch 183/183 (43.6s): Center loss 0.5403, Softmax loss 0.3030, total 0.8457
Train accuracy: 97.90%
Validation mAP (split  5/5) (2.9s): 76.27%
--------------------------------------------------------------------------------
Training... Epoch = 54
Epoch 54, Batch 183/183 (43.8s): Center loss 0.5430, Softmax loss 0.3048, total 0.8501
Train accuracy: 97.88%
Validation mAP (split  5/5) (3.0s): 77.13%
--------------------------------------------------------------------------------
Training... Epoch = 55
Epoch 55, Batch 183/183 (44.2s): Center loss 0.5349, Softmax loss 0.2980, total 0.8353
Train accuracy: 98.04%
Validation mAP (split  5/5) (3.0s): 75.13%
--------------------------------------------------------------------------------
Training... Epoch = 56
Epoch 56, Batch 183/183 (43.8s): Center loss 0.5394, Softmax loss 0.3050, total 0.8468
Train accuracy: 97.72%
Validation mAP (split  5/5) (2.9s): 76.04%
----------------------------------------------------------------------------

Epoch 83, Batch 183/183 (41.6s): Center loss 0.5172, Softmax loss 0.2717, total 0.7914
Train accuracy: 98.41%
Validation mAP (split  5/5) (6.3s): 76.63%
--------------------------------------------------------------------------------
Training... Epoch = 84
Epoch 84, Batch 183/183 (41.8s): Center loss 0.5171, Softmax loss 0.2709, total 0.7905
Train accuracy: 98.34%
Validation mAP (split  5/5) (7.5s): 79.33%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: 78.60% | 79.33%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 85
Epoch 85, Batch 183/183 (41.4s): Center loss 0.5123, Softmax loss 0.2679, total 0.7827
Train accuracy: 98.42%
Validation mAP (split  5/5) (6.8s): 78.14%
--------------------------------------------------------------------------------
Training... Epoch = 86
Epoch 86, Batch 183/183 (41.1s): Center loss 0.5161, Softmax loss 0.

Validation mAP (split  5/5) (4.4s): 81.79%
--------------------------------------------------------------------------------
Training... Epoch = 141
Epoch 141, Batch 183/183 (41.0s): Center loss 0.3510, Softmax loss 0.1235, total 0.4769
Train accuracy: 99.94%
Validation mAP (split  5/5) (3.7s): 81.63%
--------------------------------------------------------------------------------
Training... Epoch = 142
Epoch 142, Batch 183/183 (40.8s): Center loss 0.3531, Softmax loss 0.1228, total 0.4783
Train accuracy: 99.94%
Validation mAP (split  5/5) (4.2s): 81.82%
--------------------------------------------------------------------------------
Training... Epoch = 143
Epoch 143, Batch 183/183 (40.8s): Center loss 0.3518, Softmax loss 0.1231, total 0.4774
Train accuracy: 99.94%
Validation mAP (split  5/5) (4.2s): 81.58%
--------------------------------------------------------------------------------
Training... Epoch = 144
Epoch 144, Batch 183/183 (40.9s): Center loss 0.3523, Softmax loss 0.1221, 