# Treinamento - Market-1501

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import torch as tc
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
from  torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler
import matplotlib.pyplot as plt
%matplotlib inline
import torchvision.datasets as dset
import numpy as np
import cv2

from center_loss import CenterLoss
from frw import L2_penalty_with_constant
from reid_model import ReID_Net
from datasets import Market1501
from custom_transforms import RandomTranslation
from evaluation import calculate_mAP, generate_query
from utils import MetricAverager, save_checkpoint, load_checkpoint
import time
import shutil


## Dataset, transformações de _data augmentation_ e DataLoaders

In [3]:
# Per-split preprocessing pipelines: the training pipeline adds random
# translation and horizontal flipping, the validation one only converts and
# rescales the image.
data_transforms = dict(
    train=transforms.Compose([
        RandomTranslation(0.1),                 # translation applied to the numpy array
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),                  # range [0.0, 1.0]
        transforms.Lambda(lambda img: img / 0.5 - 1),   # range [-1.0, 1.0]
    ]),
    valid=transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda img: img / 0.5 - 1),
    ]),
)

In [4]:
# NOTE(review): this rebinds `datasets`, shadowing the name imported by
# `from torchvision import datasets, transforms`; kept because the following
# cells index this dict by split name.
# 'train' and 'valid' are both read from the training archive ('valid' with
# train=False); 'test' comes from the test archive and reuses the validation
# transforms.
datasets = dict(
    train=Market1501('../reid/datasets/market_train_correct.npz',
                     transform=data_transforms['train']),
    valid=Market1501('../reid/datasets/market_train_correct.npz', train=False,
                     transform=data_transforms['valid']),
    test=Market1501('../reid/datasets/market_test_correct.npz', train=False, test=True,
                    transform=data_transforms['valid']),
)

batch_size = 128
# One loader per split; only the training loader shuffles its samples.
dset_loaders = {
    split: DataLoader(datasets[split], batch_size=batch_size,
                      shuffle=(split == 'train'), num_workers=4)
    for split in ('train', 'valid', 'test')
}

In [5]:
# For every split, show shape, dtype, minimum and maximum of the image array
# (X) and of the label array (y).
for heading, split_name in (('Train data:', 'train'),
                            ('\nValidation data:', 'valid'),
                            ('\nTest data:', 'test')):
    print(heading)
    for array in (datasets[split_name].X, datasets[split_name].y):
        print(array.shape, array.dtype, array.min(), array.max())

Train data:
(11253, 128, 48, 3) uint8 0 255
(11253,) int64 0 650

Validation data:
(1683, 128, 48, 3) uint8 0 255
(1683,) int64 0 99

Test data:
(16483, 128, 48, 3) uint8 0 255
(16483,) int64 1 750


In [6]:
# Number of batches per split, in (train, valid, test) order.
tuple(len(dset_loaders[split_name]) for split_name in ('train', 'valid', 'test'))

(88, 14, 129)

In [7]:
# Peek at the first ten camera ids stored on the validation dataset.
dset_loaders['valid'].dataset.cam_ids[:10]

array([0, 0, 0, 3, 3, 3, 4, 4, 4, 1])

In [8]:
# --- Loss weighting ---
lambda_cl = 0.01     # multiplier applied to the center loss
alpha = 0.5          # value handed to CenterLoss as `alpha`
beta_frw = 1e-4      # multiplier applied to the FRW L2 penalty

# --- Optimisation ---
epochs = 500
lr = 1e-3
weight_decay = 1e-3

# --- Data-dependent settings ---
# Number of distinct identity labels in the training split.
num_classes = np.unique(datasets['train'].y).size
print('num_classes: ', num_classes)

num_classes:  651


## Funções de treinamento

### Uma época

In [60]:
def _scalar_value(loss):
    """Return the single number stored in a loss ``Variable``/tensor.

    ``reshape(-1)[0]`` reads the element whether NumPy sees the loss as a
    one-element 1-D array or as a 0-D array; a plain ``[0]`` index raises
    ``IndexError`` on the latter.
    """
    return loss.data.cpu().numpy().reshape(-1)[0]


def train_epoch(model, optimizer, epoch, criterions, train_loader, val_loader, use_cuda, lambda_cl, beta_frw,
                query_splits, cuda_device=None):
    """Train ``model`` for one epoch, then compute the validation mAP.

    Args:
        model: network called as ``model(batch)`` and returning ``(features, pred)``.
        optimizer: optimizer stepped once per training batch.
        epoch: epoch number; only used in the progress messages.
        criterions: sequence indexed as ``[0]`` classification loss called with
            ``(pred, target)``, ``[1]`` center loss called with
            ``(features, target)`` and ``[2]`` penalty called with each of the
            ``frw.weight`` / ``fc_embeddings.3.weight`` parameters.
        train_loader: iterable of ``(data, target)`` training batches.
        val_loader: iterable of ``(data, target)`` validation batches whose
            ``dataset`` exposes ``y`` and ``cam_ids``.
        use_cuda: when true, batches are moved to the GPU with ``.cuda()``.
        lambda_cl: multiplier applied to the center loss.
        beta_frw: multiplier applied to the parameter penalty.
        query_splits: sequence of query definitions; each one is passed as the
            first argument of ``calculate_mAP``.
        cuda_device: not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        dict with the structure
        ``{'train': {'losses': {'center_loss', 'softmax_loss', 'total'},
        'accuracy'}, 'valid': {'map'}}``. The losses and the mAP are the
        ``.mean`` of their ``MetricAverager``; the logged center loss is the
        value already multiplied by ``lambda_cl``.
    """
    print("Training... Epoch = %d" % epoch)

    n_correct = 0
    n_train_samples = 0

    tlosses = {
        'center_loss': MetricAverager(),
        'softmax_loss': MetricAverager(),
        'total': MetricAverager(),
    }

    # The set of penalised parameters does not change between batches, so it is
    # looked up once instead of scanning every named parameter on each step.
    penalised_params = [p for (name, p) in model.named_parameters()
                        if name == 'frw.weight' or name == 'fc_embeddings.3.weight']

    # ---- Training ----
    model.train(True)
    t0 = time.time()
    for i, (data, target) in enumerate(train_loader):

        if use_cuda:
            data = data.cuda()
            target = target.cuda()
        data, target = Variable(data), Variable(target)

        features, pred = model(data)

        # Softmax loss + weighted center loss
        softmax_loss = criterions[0](pred, target)
        center_loss = lambda_cl * criterions[1](features, target)
        loss = softmax_loss + center_loss

        # Add the L2 penalty on the FRW layer weights
        for p in penalised_params:
            loss += beta_frw * criterions[2](p)

        # Top-1 training accuracy bookkeeping; int() keeps the running count a
        # plain Python integer whether .sum() yields a number or a tensor.
        _, preds = torch.max(pred.data, 1)
        n_correct += int((preds == target.data).sum())
        n_train_samples += target.size(0)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tlosses['center_loss'].update(_scalar_value(center_loss))
        tlosses['softmax_loss'].update(_scalar_value(softmax_loss))
        tlosses['total'].update(_scalar_value(loss))

        t1 = time.time() - t0
        # The averagers themselves are formatted with '{:.4f}'
        # (assumes MetricAverager supports format specs — TODO confirm).
        train_string = 'Epoch {}, Batch {}/{} ({:.1f}s): Center loss {:.4f}, Softmax loss {:.4f}, ' \
            'total {:.4f}'.format(epoch, i+1, len(train_loader), t1, tlosses['center_loss'],
                                  tlosses['softmax_loss'], tlosses['total'])
        # Overwrite the same console line until the last batch of the epoch.
        print(train_string, end='\r' if i < len(train_loader)-1 else '\n')

    train_acc = n_correct / n_train_samples
    print('Train accuracy: {:.2%}'.format(train_acc))

    # ---- Validation ----
    # The features of every validation image are extracted, then the mean
    # average precision (mAP) is computed once per pre-generated query split and
    # averaged over the splits. The whole validation set is passed as gallery.
    t0 = time.time()
    model.train(False)

    val_feats = []
    for data, _ in val_loader:
        if use_cuda:
            data = data.cuda()
        data = Variable(data, volatile=True)

        features, _ = model(data)
        val_feats.append(features.data.cpu())

    # Features are concatenated in loader order and handed over together with
    # dataset.y / dataset.cam_ids, so val_loader presumably has to iterate in
    # dataset order (shuffle=False) — verify against calculate_mAP.
    val_feats = torch.cat(val_feats)

    val_dataset = val_loader.dataset
    y_gallery = val_dataset.y
    gal_cam_ids = val_dataset.cam_ids

    val_map_avg = MetricAverager()
    for i, query_indices_by_cam in enumerate(query_splits):
        val_map = calculate_mAP(query_indices_by_cam, val_feats, y_gallery, gal_cam_ids)
        val_map_avg.update(val_map)

        t1 = time.time() - t0
        print('Validation mAP (split {:>2}/{}) ({:.1f}s): {:.2%}'.format(i+1, len(query_splits),
                t1, val_map_avg.mean), end='\r' if i+1 < len(query_splits) else '\n')

    print('-'*80)

    metrics = {
        'train': {
            'losses': {
                'center_loss': tlosses['center_loss'].mean,
                'softmax_loss': tlosses['softmax_loss'].mean,
                'total': tlosses['total'].mean,
            },
            'accuracy': train_acc,
        },
        'valid': {
            'map': val_map_avg.mean
        }
    }

    return metrics

### Treinamento total

In [57]:
def train(epochs, model=None, resume=None, checkpoint_fn=None, **train_args):
    """Train a re-identification network with early stopping on validation mAP.

    Args:
        epochs: number of the last epoch to run; epochs are numbered from 1 (or
            from the epoch after the resumed one) up to and including ``epochs``.
        model: network to train. When ``None`` a ``ReID_Net`` is built from
            ``train_args['num_classes']``.
        resume: checkpoint reference handed to ``load_checkpoint``; ``None``
            starts from scratch.
        checkpoint_fn: optional filename pattern with one ``{}`` placeholder,
            passed to ``save_checkpoint`` as ``filename``.
        **train_args: the keys below are consumed here, everything else is
            forwarded unchanged to ``train_epoch`` (``val_loader`` is required,
            ``use_cuda`` is read here and also forwarded):
            ``num_classes`` (required only when ``model`` is ``None``),
            ``alpha``, ``lr``, ``weight_decay`` (required),
            ``scheduler_patience`` (default 35), ``scheduler_factor``
            (default 0.1), ``patience`` (default 50).

    Returns:
        ``(model, train_history)`` where ``train_history`` is the list of the
        per-epoch metric dicts returned by ``train_epoch``.

    Raises:
        KeyError: if a required key is missing from ``train_args``.
    """
    best_val_loss = None
    last_epoch = 0

    train_history = []

    use_cuda = train_args.get('use_cuda')

    # 'num_classes' is always removed from train_args because train_epoch does
    # not accept it; previously it leaked through whenever a model was supplied.
    num_classes = train_args.pop('num_classes', None)
    built_here = model is None
    if built_here:
        if num_classes is None:
            raise KeyError('num_classes')
        model = ReID_Net(num_classes)
    if resume is not None or not built_here:
        # One center per class: read the class count from the model itself.
        num_classes = model.centers.size(0)

    # NLLLoss
    nllloss = nn.NLLLoss() #CrossEntropyLoss = log_softmax + NLLLoss

    # FRW L2 penalty
    frw_l2_penalty = L2_penalty_with_constant(200)

    if use_cuda:
        nllloss = nllloss.cuda()
        model = model.cuda()
        frw_l2_penalty = frw_l2_penalty.cuda()

    alpha = train_args.pop('alpha')

    # CenterLoss
    centerloss = CenterLoss(model, model.centers, num_classes, alpha)
    criterions = [nllloss, centerloss, frw_l2_penalty]

    # optimizer4nn
    lr = train_args.pop('lr')
    weight_decay = train_args.pop('weight_decay')
    optimizer4nn = optim.Adam(model.parameters(),lr=lr, weight_decay=weight_decay)

    if resume is not None:
        # The checkpoint is loaded only now because load_checkpoint needs the
        # optimizer, which did not exist yet at the point where this call used
        # to be made (UnboundLocalError). The returned scheduler is ignored, as
        # before, since a new one is created right below.
        # NOTE(review): confirm whether the stored scheduler should be restored.
        _, last_epoch, best_val_loss, train_history = load_checkpoint(resume, model, optimizer4nn)

    # LR Scheduler
    scheduler_patience = train_args.pop('scheduler_patience', 35)
    scheduler_factor = train_args.pop('scheduler_factor', 0.1)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer4nn, mode='max', patience=scheduler_patience,
                                               factor=scheduler_factor, verbose=True)

    # 'best_val_loss' actually stores the best validation mAP (see `state`
    # below). An explicit None check keeps a stored 0.0 instead of replacing it.
    max_val_map = best_val_loss if best_val_loss is not None else float('-inf')

    patience = train_args.pop('patience', 50)
    no_improve = 0

    print('train_args:', train_args)

    # Generate once the query/gallery splits reused by every validation pass
    val_dataset = train_args['val_loader'].dataset
    query_splits = []
    for i in range(10):
        query_splits.append(generate_query(val_dataset.y, val_dataset.cam_ids))

    # Inclusive upper bound so that `epochs` epochs are run on a fresh start.
    for epoch in range(last_epoch+1, epochs+1):

        metrics = train_epoch(model, optimizer4nn, epoch, criterions, query_splits=query_splits, **train_args)

        scheduler.step(metrics['valid']['map'])
        train_history.append(metrics)

        if metrics['valid']['map'] > max_val_map:
            print("Max. Valid. mAP / Epoch's Valid. mAP: {:.2%} | {:.2%}".format(
                max_val_map, metrics['valid']['map']))
            print('Salvando melhor modelo...')
            max_val_map = metrics['valid']['map']
            state = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'best_val_loss': max_val_map,
                'train_softmax_loss': metrics['train']['losses']['softmax_loss'],
                'history': train_history,
            }
            opts = {'is_best': False}
            if checkpoint_fn is not None:
                # NOTE(review): the filename carries epoch+1 while the stored
                # 'epoch' is `epoch`; left untouched to keep existing file names.
                opts.update({'filename': checkpoint_fn.format(epoch+1)})
            save_checkpoint(state, **opts)
            no_improve = 0
            print('-'*80)
        else:
            no_improve += 1
            # Stop once more than `patience` consecutive epochs fail to improve.
            if no_improve > patience:
                print('Early stopping at epoch {}.'.format(epoch))
                break

    return model, train_history

## Loop de treinamento


In [14]:
# Keyword arguments for train(). The hyperparameters come from the
# configuration cell (lambda_cl, alpha, beta_frw, lr, weight_decay,
# num_classes) so that each value is defined in exactly one place.
train_params = {
    'train_loader': dset_loaders['train'],
    'val_loader': dset_loaders['valid'],
    'num_classes': num_classes,
    'lr': lr,
    'weight_decay': weight_decay,
    'use_cuda': True,
    'lambda_cl': lambda_cl,
    'alpha': alpha,
    'beta_frw': beta_frw,
    'cuda_device': None,
    'patience': 50,              # early-stopping patience, in epochs
    'scheduler_patience': 35,    # ReduceLROnPlateau patience, in epochs
    'scheduler_factor': 0.1,     # ReduceLROnPlateau factor
}

In [61]:
# Launch training; a manual kernel interrupt is reported instead of raising.
try:
    model, hist = train(
        epochs,
        checkpoint_fn='market1501_map_epoch_{}.pth',
        **train_params
    )
except KeyboardInterrupt:
    print('Interrompendo...')

train_args: {'train_loader': <torch.utils.data.dataloader.DataLoader object at 0x7ff922478d68>, 'beta_frw': 0.0001, 'val_loader': <torch.utils.data.dataloader.DataLoader object at 0x7ff922478e48>, 'lambda_cl': 0.01, 'use_cuda': True, 'cuda_device': None}
Training... Epoch = 1
Epoch 1, Batch 88/88 (19.9s): Center loss 0.5211, Softmax loss 5.4276, total 6.0953
Train accuracy: 6.98%
Validation mAP (split 10/10) (4.8s): 35.88%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: -inf% | 35.88%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 2
Epoch 2, Batch 88/88 (20.1s): Center loss 0.1684, Softmax loss 3.7638, total 3.9547
Train accuracy: 24.96%
Validation mAP (split 10/10) (3.3s): 47.56%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: 35.88% | 47.56%
Salvando melhor modelo..

Epoch 24, Batch 88/88 (19.6s): Center loss 0.2882, Softmax loss 0.2127, total 0.5018
Train accuracy: 98.52%
Validation mAP (split 10/10) (2.4s): 75.22%
--------------------------------------------------------------------------------
Training... Epoch = 25
Epoch 25, Batch 88/88 (19.6s): Center loss 0.2863, Softmax loss 0.2058, total 0.4930
Train accuracy: 98.80%
Validation mAP (split 10/10) (2.4s): 75.31%
--------------------------------------------------------------------------------
Max. Valid. mAP / Epoch's Valid. mAP: 75.24% | 75.31%
Salvando melhor modelo...
--------------------------------------------------------------------------------
Training... Epoch = 26
Epoch 26, Batch 88/88 (19.6s): Center loss 0.2814, Softmax loss 0.1952, total 0.4776
Train accuracy: 98.97%
Validation mAP (split 10/10) (2.6s): 72.65%
--------------------------------------------------------------------------------
Training... Epoch = 27
Epoch 27, Batch 88/88 (19.6s): Center loss 0.2864, Softmax loss 0.2033,

Epoch 84, Batch 88/88 (19.6s): Center loss 0.2675, Softmax loss 0.1208, total 0.3906
Train accuracy: 99.53%
Validation mAP (split 10/10) (2.4s): 75.52%
--------------------------------------------------------------------------------
Training... Epoch = 85
Epoch 85, Batch 88/88 (19.6s): Center loss 0.2606, Softmax loss 0.1139, total 0.3769
Train accuracy: 99.62%
Validation mAP (split 10/10) (2.5s): 74.52%
--------------------------------------------------------------------------------
Training... Epoch = 86
Epoch 86, Batch 88/88 (19.6s): Center loss 0.2542, Softmax loss 0.1081, total 0.3647
Train accuracy: 99.65%
Validation mAP (split 10/10) (2.4s): 76.20%
--------------------------------------------------------------------------------
Training... Epoch = 87
Epoch 87, Batch 88/88 (19.6s): Center loss 0.2623, Softmax loss 0.1162, total 0.3808
Train accuracy: 99.57%
Validation mAP (split 10/10) (2.5s): 74.47%
--------------------------------------------------------------------------------

Validation mAP (split 10/10) (2.7s): 76.91%
--------------------------------------------------------------------------------
Early stopping at epoch 145.
