# Active FULL Training Code (Basic - Global)

In [1]:
from datasets.wider_global_dataset import build_wider_dataloader
from datasets.text_test_datasets import build_text_test_loader
from datasets.image_test_datasets import build_image_test_loader
from models.encoder import Model, MLP
from evaluators.global_evaluator import GlobalEvaluator
from loss.loss import crossmodal_triplet_loss, cos_distance
from loggers.logger import Logger
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import DistanceMetric
import os

import torch.nn as nn
import torch.optim as optim

from configs.args import load_arg_parser

## config

In [2]:
parser = load_arg_parser()
cfg = parser.parse_args("")
cfg.data_root = "/data/aiyucui2/wider"
root = cfg.data_root

# data path
cfg.anno_path = os.path.join(root, cfg.anno_path)
cfg.img_dir = os.path.join(root, cfg.img_dir)
cfg.val_anno_path = os.path.join(root, cfg.val_anno_path)
cfg.val_img_dir = os.path.join(root, cfg.val_img_dir)
cfg.gt_file_fn = os.path.join(root, cfg.gt_file_fn)

# meta data path
cfg.cheap_candidate_fn = os.path.join(root, cfg.cheap_candidate_fn)
cfg.vocab_path = os.path.join(root, cfg.vocab_path)

# sys path
cfg.model_path = os.path.join(root, cfg.model_path)
cfg.output_path = os.path.join(root, cfg.output_path)
ckpt_root = "/shared/rsaas/aiyucui2/wider_person/checkpoints/reID"
load_exp_name = "dist_fn_cosine_imgbb_resnet18_capbb_bigru_embed_size_512_batch_96_lr_0.0001_captype_sent_img_meltlayer_8"
cfg.load_ckpt_fn = '0' #os.path.join(ckpt_root, load_exp_name, "stage1.pt")
cfg.debug = False
cfg.embed_size = 512
cfg.batch_size = 96
cfg.img_backbone_opt = "resnet18"
cfg.cap_backbone_opt = "bigru"
cfg.dim = (384,128)
cfg.dist_fn_opt = "cosine"
cfg.np = False
cfg.img_num_cut = 6
cfg.img_num_cut = 1 if not cfg.np else cfg.img_num_cut
cfg.cap_embed_type='sent'
# exp_name
exp_name = "dist_fn_{}_imgbb_{}_capbb_{}_embed_size_{}_batch_{}_lr_{}_captype_{}".format(cfg.dist_fn_opt,
                                                                       cfg.img_backbone_opt,
                                                                       cfg.cap_backbone_opt,
                                                                       cfg.embed_size,
                                                                       cfg.batch_size,
                                                                       cfg.lr,
                                                                                         cfg.cap_embed_type)
# logger
logger = Logger("test.txt") #os.path.join(cfg.output_path, cfg.exp_name+".txt"))
print(exp_name)

dist_fn_cosine_imgbb_resnet18_capbb_bigru_embed_size_512_batch_96_lr_0.0001_captype_sent


In [3]:
print(cfg)

Namespace(anno_path='/data/aiyucui2/wider/wider/train/train_anns_train.json', batch_size=96, cap_backbone_opt='bigru', cap_embed_type='sent', cheap_candidate_fn='/data/aiyucui2/wider/wider_graph/summer_best_val1_top100.pkl', cheap_eval=True, ckpt_freq=10, cos_margin=0.5, data_root='/data/aiyucui2/wider', debug=False, dim=(384, 128), dist_fn_opt='cosine', embed_size=512, experiment_name='default', gt_file_fn='/data/aiyucui2/wider/wider/val1/val_label.json', image_melt_layer=1, img_backbone_opt='resnet18', img_dir='/data/aiyucui2/wider/wider/train/img', img_num_cut=1, load_ckpt_fn='0', load_model_path='starter_bert_resnet50_2048.pt', lr=0.0001, mode='train', model_path='/data/aiyucui2/wider/checkpoints/reID/', momentum=0.9, np=False, num_epochs=25, num_epochs_stage1=20, num_epochs_stage2=60, num_gpu=1, num_gpus=1, num_workers=8, optimizer='Adam', output_path='/data/aiyucui2/wider/outputs/reID/', print_freq=50, regional_embed_size=256, step_size=15, text_melt_layer=0, token_length=40, val

## Loading data

In [4]:
# train loader
train_loader = build_wider_dataloader(cfg)

# test loader (loading image and text separately)
test_text_loader = build_text_test_loader(cfg) 
test_image_loader = build_image_test_loader(cfg) 


[ds] load annotations from /data/aiyucui2/wider/wider/train/train_anns_train.json
size of dataset: 74264


In [5]:
print(len(train_loader.dataset.person2label.values()))



12003


In [6]:
import matplotlib.pyplot as plt
import numpy as np
import torchvision
# functions to show an image
print(test_text_loader.dataset[1])
print(train_loader.dataset[1][5])
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
dataiter = iter(test_image_loader)
images, labels = dataiter.next()

# show images
imshow(images[0])

(tensor([  9,  14,   6,   8,   2,  24,   4,  10,  97, 144,  58,   4,  59,  17,
          3,  19,  16,  50, 494, 123,  30, 141,   3,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]), 'train_query/p8848_s17661.jpg')
tensor([ 51,  21,  16,  20,  39,  80,  23,   7,   2,  10, 289,  38,   7,  31,
          2,  35,  69,  42,  27,  54,  39,   7,   4,  24,  36,   4, 176,  10,
         13,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


<Figure size 640x480 with 1 Axes>

## Define Model

In [7]:
# import torch.nn as nn
import torchvision.models as models
import torch
import torch.nn.functional as F 

from models.encoder import Model, MLP               
      

In [8]:
model = Model(embed_size=cfg.embed_size, 
              image_opt=cfg.img_backbone_opt, 
              caption_opt=cfg.cap_backbone_opt,
              cap_embed_type=cfg.cap_embed_type,
              img_num_cut=cfg.img_num_cut,
              regional_embed_size=cfg.regional_embed_size).cuda()

if cfg.load_ckpt_fn != "0":
    logger.log("[Model] load pre-trained model from %s." % cfg.load_ckpt_fn)
    ckpt = torch.load(cfg.load_ckpt_fn)
    model.load_state_dict(ckpt["model"], False)
    
id_cls = nn.Sequential(
    nn.Linear(cfg.embed_size, 164),
    nn.Softmax()
).cuda()
img_mlp = MLP(cfg.regional_embed_size, cfg.embed_size).cuda()
cap_mlp = MLP(cfg.embed_size, cfg.embed_size).cuda()

### Distance Metrics

In [9]:
import torch
def triplet_cos_loss(x, pos, neg, margin=0.2):
    def cos_dist(x,y):
        # import pdb; pdb.set_trace()
        return 1 - torch.sum(x*y, 1) / (torch.norm(x, dim=1)*torch.norm(y, dim=1))
    pos_dist = cos_dist(x, pos)
    neg_dist = cos_dist(x, neg)
    scores = torch.clamp(pos_dist - neg_dist + margin, min=0)
    return scores.mean()
    

if cfg.dist_fn_opt == "euclidean":
    dist_fn = DistanceMetric.get_metric('euclidean').pairwise
    triplet_loss = nn.TripletMarginLoss()
elif cfg.dist_fn_opt == "cosine":
    dist_fn = cos_distance
    triplet_loss = triplet_cos_loss

### Train Misc Setup

In [10]:
from evaluators.evaluator import Evaluator
from evaluators.global_evaluator import GlobalEvaluator
from evaluators.np_evaluator import NPEvaluator
from attentions.rga_attention import RGA_attend_one_to_many_batch, RGA_attend_one_to_many
# from tqdm import tqdm
import torch


In [11]:
if cfg.np:
    Evaluator = NPEvaluator
else:
    Evaluator = GlobalEvaluator

evaluator = Evaluator(img_loader=test_image_loader, 
                          cap_loader=test_text_loader, 
                          gt_file_path=cfg.gt_file_fn,
                          embed_size=cfg.embed_size,
                          logger=logger,
                          dist_fn_opt="euclidean")
cos_evaluator = Evaluator(img_loader=test_image_loader, 
                          cap_loader=test_text_loader, 
                          gt_file_path=cfg.gt_file_fn,
                          embed_size=cfg.embed_size,
                          logger=logger,
                          dist_fn_opt="cosine")


def build_graph_optimizer(models):
    if not isinstance(models, list):
        models = [models]
    params_to_optimize = []
    for model in models:
        if model and hasattr(model, '_parameters'):
            for param in model.parameters():
                if param.requires_grad == True:
                    params_to_optimize.append(param)
    return params_to_optimize

In [12]:
print(type(cos_evaluator))

<class 'evaluators.global_evaluator.GlobalEvaluator'>


In [None]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim 
import torchnet as tnt
import os

import torch.nn.functional as F
from models.model import build_dual_encoder
from loss.loss import triplet_cos_loss, crossmodal_triplet_loss

def regional_alignment_text(fulls, parts, p2fs, dist_fn_opt):
    start_index = 0
    aligned = []
    for i, jump in enumerate(p2fs):
        curr_parts = parts[start_index:start_index + jump]
        start_index += jump
        curr_full = fulls[i:i+1]
        aligned.append(RGA_attend_one_to_many(curr_full, curr_parts, dist_fn_opt))
    return torch.cat(aligned)

def regional_alignment_image(fulls, parts, dist_fn_opt):
    return RGA_attend_one_to_many_batch(fulls, parts, dist_fn_opt)
  
    
class Manager:
    def __init__(self, args, logger):
        self.cfg = args
        self._init_models()
        self._init_criterion()
        self.log = logger.log
    
    def _init_criterion(self):
        if self.cfg.dist_fn_opt == "cosine":
            self.triplet_loss = triplet_cos_loss
        elif self.cfg.dist_fn_opt == "euclidean":
            self.triplet_loss = nn.TripletMarginLoss()
        self.cls_loss = nn.CrossEntropyLoss()
        self.log("[Trainer][init] criterion initialized.")

    def _init_models(self):
        self.model = Model(embed_size=self.cfg.embed_size, 
                          image_opt=self.cfg.img_backbone_opt, 
                          caption_opt=self.cfg.cap_backbone_opt,
                          cap_embed_type=self.cfg.cap_embed_type,
                          img_num_cut=self.cfg.img_num_cut,
                          regional_embed_size=self.cfg.regional_embed_size).cuda()
        self.id_cls = nn.Linear(cfg.embed_size, cfg.num_ids)
        self.rga_img_mlp = MLP(self.cfg.regional_embed_size, self.cfg.embed_size).cuda()
        self.rga_cap_mlp = MLP(self.cfg.embed_size, self.cfg.embed_size).cuda()
        
        # load ckpt
        self.reset_ckpt()
        
        # gpu
        self.all_models = {
            "model": self.model,
            "id_cls": self.id_cls, 
            "rga_img_mlp": self.rga_img_mlp,
            "rga_cap_mlp": self.rga_cap_mlp,
        }
        self.log("[Trainer][init] model initialized.")

    def reset_ckpt(self):
        self.start_epoch = 0
        self.acc_history = []
        self.best_acc = (0, self.start_epoch)
        if cfg.load_ckpt_fn == "0":
            self.log("[Trainer][init] initialize fresh model.")
            return
        ckpt = torch.load(cfg.load_ckpt_fn)
        self.start_epoch = ckpt["epoch"] + 1
        self.acc_history = ckpt["acc_history"]
        for name, network in self.all_models.items():
            if name in ckpt:
                network.load_state_dict(ckpt[name], False)
                self.log("[Trainer][init] load pre-trained %s from %s." % (network, cfg.load_ckpt_fn))

              
    def save_ckpt(self, epoch, acc, fn):
        # update acc history 
        self.acc_history.append((acc, epoch))
        if acc > self.best_acc[0]:
            self.best_acc = (acc, epoch)
        # ckpt 
        ckpt = {
            "epoch": epoch,
            "acc_history": self.acc_history,
            "best_acc": self.best_acc,
            }
        for name, network in self.all_models.items():
            ckpt[name] = network.module.state_dict() if isinstance(network, nn.DataParallel) else network.state_dict(),

        path = os.path.join(self.cfg.model_path, fn)
        torch.save(ckpt, path)
            
    def todevice(self, batch):
        ret = []
        for arg in batch:
            if isinstance(arg, torch.Tensor):
                arg = arg.cuda()
            ret.append(arg)
        return tuple(ret)
    
    def melt_img_layer(self, num_layer_to_melt=1):
        if isinstance(model, nn.DataParallel):
            self.model.module.img_backbone.melt_layer(8 - num_layer_to_melt)
        else:
            self.model.img_backbone.melt_layer(8 - num_layer_to_melt)
     
    def train_epoch_global(train_data, optimizer, epoch, note="train"):
        self.model.train()
        cum_tri_loss, cum_id_loss = 0.0, 0.0
        for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note, epoch)):
            # load data
            data = self.todevice(data)
            (img,pos_img,neg_img, cap, pos_cap, neg_cap, pid, pos_pid, neg_pid) = data
            
            # encode
            img, pos_img, neg_img = self.model(img), self.model(pos_img), self.model(neg_img)
            cap, pos_cap, neg_cap = self.model(cap), self.model(pos_cap), self.model(neg_cap)

            # loss
            tri_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                                  cap, pos_cap, neg_cap, 
                                                  self.triplet_loss, self.cfg.dist_fn_opt)  
            id_loss = self.cls_loss(self.id_cls(img), pid) +  self.cls_loss(self.id_cls(cap), pid)
            loss = tri_loss + id_loss

            # backpropagation
            optimizer.zero_grad(); loss.backward(); optimizer.step()
            # log
            cum_tri_loss += tri_loss.item()
            cum_id_loss += id_loss.item()
            if (i+1) % self.cfg.print_freq == 0:
                out_string = "[ep-%d, bs-%d] " % (epoch, i)
                out_string += "[tri-loss] %.6f, " % cum_tri_loss / self.cfg.print_freq
                out_string += "[id-loss] %.6f, " % (cum_id_loss / self.cfg.print_freq)
                self.log(out_string)
                cum_tri_loss, cum_id_loss = 0.0, 0.0
                
    def train_epoch_regional(train_data, optimizer, epoch, note="train"):
        self.model.train(); self.rga_img_mlp.train(); self.rga_cap_mlp.train()

        cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss, cum_id_loss = 0.0, 0.0, 0.0, 0.0
        for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
            # load data
            data = self.todevice(data)
            (img, pos_img, neg_img, 
             cap, pos_cap, neg_cap,
             nps, pos_nps, neg_nps,
             n2c, pos_n2c, neg_n2c,
             pid, pos_pid, neg_pid) = data


            img, img_part = self.model(img)
            pos_img, pos_img_part = self.model(pos_img)
            neg_img, neg_img_part = self.model(neg_img)
            cap, pos_cap, neg_cap = self.model(cap), self.model(pos_cap), self.model(neg_cap)
            
            N, M, T = nps.size()
            nps = self.rga_cap_mlp(self.model(nps.reshape(-1, T))).reshape(N, M, -1)
            pos_nps = self.rga_cap_mlp(self.model(pos_nps.reshape(-1, T))).reshape(N, M, -1)
            neg_nps = self.rga_cap_mlp(self.model(neg_nps.reshape(-1, T))).reshape(N, M, -1)
            
            # part
            img_part = self.rga_img_mlp(img_part)
            pos_img_part = self.rga_img_mlp(pos_img_part)
            neg_img_part = self.rga_img_mlp(neg_img_part)

            img_part = RGA_attend_one_to_many_batch(cap, img_part, self.cfg.dist_fn_opt)
            pos_img_part = RGA_attend_one_to_many_batch(pos_cap, pos_img_part, self.cfg.dist_fn_opt)
            neg_img_part = RGA_attend_one_to_many_batch(neg_cap, neg_img_part, self.cfg.dist_fn_opt)
            #cap_part = regional_alignment_text(img, nps, n2c, cfg.dist_fn_opt)
            #pos_cap_part = regional_alignment_text(pos_img, pos_nps, pos_n2c, cfg.dist_fn_opt)
            #neg_cap_part = regional_alignment_text(neg_img, neg_nps, neg_n2c, cfg.dist_fn_opt)
            cap_part = RGA_attend_one_to_many_batch(img, nps, self.cfg.dist_fn_opt)
            pos_cap_part = RGA_attend_one_to_many_batch(pos_img, pos_nps, self.cfg.dist_fn_opt)
            neg_cap_part = RGA_attend_one_to_many_batch(neg_img, neg_nps, self.cfg.dist_fn_opt)

            # loss
            tri_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                                  cap, pos_cap, neg_cap, 
                                                  triplet_loss, self.cfg.dist_fn_opt) 
            tri_image_regional_loss =  crossmodal_triplet_loss(img_part,pos_img_part,neg_img_part, 
                                                  cap, pos_cap, neg_cap, 
                                                  triplet_loss, self.cfg.dist_fn_opt) 
            tri_text_regional_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                                  cap_part, pos_cap_part, neg_cap_part, 
                                                  triplet_loss, self.cfg.dist_fn_opt) 
            id_loss = self.cls_loss(self.id_cls(img), pid) +  self.cls_loss(self.id_cls(cap), pid)


            loss = tri_loss + tri_image_regional_loss  + tri_text_regional_loss + id_loss

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            # log
            cum_tri_loss += tri_loss.item()
            cum_tri_image_regional_loss += tri_image_regional_loss.item()
            cum_tri_text_regional_loss += tri_text_regional_loss.item()
            cum_id_loss += id_loss.item()
            
            if (i+1) % self.cfg.print_freq == 0:
                out_string = "[ep-%d, bs-%d] " % (epoch, i)
                out_string += "[id-loss] %.6f, " % (cum_id_loss / self.cfg.print_freq)
                out_string += "[tri-loss] %.6f, " % cum_tri_loss / self.cfg.print_freq
                out_string += "[img_rga] %.6f, " %  cum_tri_image_regional_loss / self.cfg.print_freq
                out_string += "[cap_rga] %.6f " % cum_tri_text_regional_loss / self.cfg.print_freq
                self.log(out_string)
                cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss, cum_id_loss = 0.0, 0.0, 0.0, 0.0
               
            
            
    def train_epoch_id(train_data, optimizer, epoch, note="train"):
        model.train()
        cum_loss = 0.0
        for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
            # load data
            data = self.todevice(data)
            (img,pos_img,neg_img, cap, pos_cap, neg_cap, pid, pos_pid, neg_pid) = data
            img = model(img)
            cap = model(cap)

            # loss
            loss = 0.0
            loss = loss + self.cls_loss(self.id_cls(img), pid) +  self.cls_loss(self.id_cls(cap), pid)

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            cum_loss += loss.item()

            # log
            if (i+1) % self.cfg.print_freq == 0:
                print("ep-%d, bs-%d, [id-loss] %.6f" % (epoch, i, cum_loss / self.cfg.print_freq))
                cum_loss = 0.0

### Id Loss initialization

In [16]:
def train_epoch_stage1(train_data, model, classifier, optimizer, cls_loss, note="train"):
    model.train()
    classifier.train()
    cum_loss = 0.0
    for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
        # load data
        (img, pos_img, neg_img, 
         cap, pos_cap, neg_cap, 
         pid, pos_pid, neg_pid) = data
        img, pos_img, neg_img = model(img.cuda()), model(pos_img.cuda()), model(neg_img.cuda())
        cap, pos_cap, neg_cap = model(cap.cuda()), model(pos_cap.cuda()), model(neg_cap.cuda())
        
        # loss
        loss = 0.0
        loss = loss + cls_loss(classifier(img), pid.cuda()) +  cls_loss(classifier(cap), pid.cuda())
        # import pdb; pdb.set_trace()
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cum_loss += loss.item()
        
        # log
        if (i+1) % 64 == 0:
            print("batch %d, loss %.6f" % (i, cum_loss/64))
            cum_loss = 0.0
    return model


if True:
    num_ids = len(train_loader.dataset.person2label.values())
    classifier = nn.Sequential(
        nn.Linear(cfg.embed_size, num_ids),
        # nn.Softmax()
    )
    classifier.cuda()
    # stage 1 - image channel forzen
    cls_loss = nn.CrossEntropyLoss()
    model.img_backbone.melt_layer(0)
    param_to_optimize = build_graph_optimizer([model, classifier])
    optimizer = optim.Adam(param_to_optimize, lr=2e-4)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
    for epoch in range(15):
        model = train_epoch_stage1(train_loader, model, classifier, optimizer, cls_loss, "train-stage-1")
        acc = evaluator.evaluate(model)
        logger.log('[euclidean][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
        acc = cos_evaluator.evaluate(model)
        logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
        scheduler.step()
    

HBox(children=(IntProgress(value=1, bar_style='info', description='train-stage-1, epoch0', max=1, style=Progre…




RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [15]:
#torch.save({
#    "model": model, 
#    "id_cls": classifier,
#}, "resnet18_bigru_512_id_initalized.pt")

### Matching Loss

In [14]:
def train_epoch_global(train_data, model, img_mlp_rga, cap_mlp_rga, optimizer, triplet_loss, logger, note="train"):
    model.train()
    cum_tri_loss, cum_id_loss = 0.0, 0.0
    for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
        # load data
        (img,pos_img,neg_img, cap, pos_cap, neg_cap, pid, pos_pid, neg_pid) = data
        img, pos_img, neg_img = model(img.cuda()), model(pos_img.cuda()), model(neg_img.cuda())
        cap, pos_cap, neg_cap = model(cap.cuda()), model(pos_cap.cuda()), model(neg_cap.cuda())
        
        # loss
        tri_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                              cap, pos_cap, neg_cap, 
                                              triplet_loss, cfg.dist_fn_opt) 
        # id_loss = cls_loss(classifier(img), pid.cuda()) +  cls_loss(classifier(cap), pid.cuda())
        
        loss = tri_loss + id_loss
        
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
       
        
        # log
        cum_tri_loss += tri_loss.item()
        if (i+1) % 64 == 0:
            logger.log("batch %d, [tri-loss] %.6f" % (i, cum_tri_loss/64))
            cum_tri_loss = 0.0
    return model



def regional_alignment_text(fulls, parts, p2fs, dist_fn_opt):
    start_index = 0
    aligned = []
    for i, jump in enumerate(p2fs):
        curr_parts = parts[start_index:start_index + jump]
        start_index += jump
        curr_full = fulls[i:i+1]
        aligned.append(RGA_attend_one_to_many(curr_full, curr_parts, dist_fn_opt))
    return torch.cat(aligned)

def regional_alignment_image(fulls, parts, dist_fn_opt):
    return RGA_attend_one_to_many_batch(fulls, parts, dist_fn_opt)
  
    
def train_epoch_regional(train_data, model, img_mlp_rga, cap_mlp_rga, optimizer, triplet_loss, logger, note="train"):
    model.train(); img_mlp_rga.train(); cap_mlp_rga.train()
    
    cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss = 0.0, 0.0, 0.0
    for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
        # load data
        (img, pos_img, neg_img, 
         cap, pos_cap, neg_cap,
         nps, pos_nps, neg_nps,
         n2c, pos_n2c, neg_n2c,
         pid, pos_pid, neg_pid) = data
        
        
        img, img_part = model(img.cuda())
        pos_img, pos_img_part = model(pos_img.cuda())
        neg_img, neg_img_part = model(neg_img.cuda())
        cap, pos_cap, neg_cap = model(cap.cuda()), model(pos_cap.cuda()), model(neg_cap.cuda())
        N, M, T = nps.size()
        nps, pos_nps, neg_nps = model(nps.cuda().reshape(-1, T)), model(pos_nps.cuda().reshape(-1, T)), model(neg_nps.cuda().reshape(-1, T))
        
        # part
        img_part, pos_img_part, neg_img_part = img_mlp_rga(img_part), img_mlp_rga(pos_img_part), img_mlp_rga(neg_img_part)
        
        nps, pos_nps, neg_nps = cap_mlp_rga(nps.reshape(N, M, -1)), cap_mlp_rga(pos_nps.reshape(N, M, -1)), cap_mlp_rga(neg_nps.reshape(N, M, -1))
        
        img_part = RGA_attend_one_to_many_batch(cap, img_part, cfg.dist_fn_opt)
        pos_img_part = RGA_attend_one_to_many_batch(pos_cap, pos_img_part, cfg.dist_fn_opt)
        neg_img_part = RGA_attend_one_to_many_batch(neg_cap, neg_img_part, cfg.dist_fn_opt)
        #cap_part = regional_alignment_text(img, nps, n2c, cfg.dist_fn_opt)
        #pos_cap_part = regional_alignment_text(pos_img, pos_nps, pos_n2c, cfg.dist_fn_opt)
        #neg_cap_part = regional_alignment_text(neg_img, neg_nps, neg_n2c, cfg.dist_fn_opt)
        cap_part = RGA_attend_one_to_many_batch(img, nps, cfg.dist_fn_opt)
        pos_cap_part = RGA_attend_one_to_many_batch(pos_img, pos_nps, cfg.dist_fn_opt)
        neg_cap_part = RGA_attend_one_to_many_batch(neg_img, neg_nps, cfg.dist_fn_opt)
        
        # loss
        tri_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                              cap, pos_cap, neg_cap, 
                                              triplet_loss, cfg.dist_fn_opt) 
        tri_image_regional_loss =  crossmodal_triplet_loss(img_part,pos_img_part,neg_img_part, 
                                              cap, pos_cap, neg_cap, 
                                              triplet_loss, cfg.dist_fn_opt) 
        tri_text_regional_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                              cap_part, pos_cap_part, neg_cap_part, 
                                              triplet_loss, cfg.dist_fn_opt) 
        
        
        loss = tri_loss + tri_image_regional_loss  + tri_text_regional_loss
        
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
       
        
        # log
        cum_tri_loss += tri_loss.item()
        cum_tri_image_regional_loss += tri_image_regional_loss.item()
        cum_tri_text_regional_loss += tri_text_regional_loss.item()
        
        if (i+1) % 64 == 0:
            logger.log("batch %d, [tri-loss] %.6f, [img_rga] %.6f, [cap_rga] %.6f" % (i, 
                                                                                      cum_tri_loss/64, 
                                                                                     cum_tri_image_regional_loss / 64, 
                                                                                     cum_tri_text_regional_loss / 64))
            cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss = 0.0, 0.0, 0.0
    return model

train_epoch = train_epoch_regional if cfg.np else train_epoch_global

# stage 1 - image channel forzen
model.img_backbone.melt_layer(8)
param_to_optimize = build_graph_optimizer([model, img_mlp, cap_mlp])
optimizer = optim.Adam(param_to_optimize, lr=2e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
for epoch in range(0):
    model = train_epoch(train_loader, model, img_mlp, cap_mlp, optimizer, triplet_loss, logger, "train-stage-1")
    if cfg.np:
        acc = evaluator.evaluate(model,  img_mlp, cap_mlp)
    else:
        acc = evaluator.evaluate(model)
    logger.log('[euclidean][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    if cfg.np:
        acc = cos_evaluator.evaluate(model,  img_mlp, cap_mlp)
    else:
        acc = cos_evaluator.evaluate(model)
    logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    scheduler.step()
    

    
# stage 2 - train all
model.img_backbone.melt_layer(7)
param_to_optimize = build_graph_optimizer([model, img_mlp, cap_mlp])
optimizer = optim.Adam(param_to_optimize, lr=1e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
for epoch in range(40):
    model = train_epoch(train_loader, model, img_mlp, cap_mlp, optimizer, triplet_loss, logger, "train-stage-2")
    if cfg.np:
        acc = evaluator.evaluate(model,  img_mlp, cap_mlp)
    else:
        acc = evaluator.evaluate(model)
    logger.log('[euclidean][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    if cfg.np:
        acc = cos_evaluator.evaluate(model,  img_mlp, cap_mlp)
    else:
        acc = cos_evaluator.evaluate(model)
    logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    scheduler.step()

HBox(children=(IntProgress(value=1, bar_style='info', description='train-stage-2, epoch0', max=1, style=Progre…




NameError: name 'id_loss' is not defined

In [None]:
# stage 2 - train all
model.img_backbone.melt_layer(0)
param_to_optimize = build_graph_optimizer([model])
optimizer = optim.Adam(param_to_optimize, lr=2e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=15)
for epoch in range(60):
    model = train_epoch(train_loader, model, optimizer, triplet_loss, logger, "train-stage-2")
    acc = evaluator.evaluate(model)
    logger.log('[euclidean][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    acc = cos_evaluator.evaluate(model)
    logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
    
    scheduler.step()