In [1]:
from datasets.wider_part_dataset import build_wider_dataloader
from datasets.text_test_datasets import build_text_test_loader
from datasets.image_test_datasets import build_image_test_loader
from models.encoder import Model, MLP
from evaluators.global_evaluator import GlobalEvaluator
from evaluators.np_evaluator import NPEvaluator
from loss.loss import crossmodal_triplet_loss, cos_distance, triplet_cos_loss
from loggers.logger import Logger
from manager import build_graph_optimizer
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import DistanceMetric

from attentions.rga_attention import RGA_attend_one_to_many_batch, RGA_attend_one_to_many
import os

import torch.nn as nn
import torch.optim as optim

from configs.args import load_arg_parser

In [2]:
parser = load_arg_parser()
cfg = parser.parse_args("")
cfg.data_root = "/data/aiyucui2/wider"
root = cfg.data_root

# data path
cfg.anno_path = os.path.join(root, cfg.anno_path)
cfg.img_dir = os.path.join(root, cfg.img_dir)
cfg.val_anno_path = os.path.join(root, cfg.val_anno_path)
cfg.val_img_dir = os.path.join(root, cfg.val_img_dir)
cfg.gt_file_fn = os.path.join(root, cfg.gt_file_fn)

# meta data path
cfg.cheap_candidate_fn = os.path.join(root, cfg.cheap_candidate_fn)
cfg.vocab_path = os.path.join(root, cfg.vocab_path)
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
# sys path
cfg.model_path = os.path.join(root, cfg.model_path)
cfg.output_path = os.path.join(root, cfg.output_path)
ckpt_root = "/shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline"
load_exp_name = "dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0001_captype_sent_img_meltlayer_2_cos_margin_0.2_np_False"
cfg.load_ckpt_fn = os.path.join(ckpt_root, load_exp_name, "stage_1_id_last.pt")
cfg.debug = False
cfg.embed_size = 1024
cfg.batch_size = 96
cfg.img_backbone_opt = "resnet50"
cfg.num_gpus = 1
cfg.cap_backbone_opt = "bigru"
cfg.dim = (384,128)
cfg.dist_fn_opt = "cosine"
cfg.np = True
cfg.img_num_cut = 6
cfg.img_num_cut = 1 if not cfg.np else cfg.img_num_cut
cfg.sent_token_length = 60
cfg.np_token_length = 6
cfg.num_np_per_sent = 10



cfg.cap_embed_type='sent'
# exp_name
cfg.exp_name = 'debug'
cfg.model_path = os.path.join("/shared/rsaas/aiyucui2/wider_person", cfg.model_path, cfg.exp_name)
cfg.output_path = os.path.join("/shared/rsaas/aiyucui2/wider_person", cfg.output_path, cfg.exp_name)

if not os.path.exists(cfg.model_path):
    os.mkdir(cfg.model_path)
if not os.path.exists(cfg.output_path):
    os.mkdir(cfg.output_path)
# logger
logger = Logger("test_np.txt") #os.path.join(cfg.output_path, cfg.exp_name+".txt"))
print(cfg.exp_name)

debug


In [3]:
from evaluators.evaluator import *
import torch
import numpy as np
# from attentions.rga_attention import RGA_attend_one_to_many_batch
from sklearn.neighbors import DistanceMetric
from loss.loss import cos_distance
import torch.nn.functional as F


class NPEvaluator(Evaluator):
    def __init__(self, img_loader, cap_loader, gt_file_path,  embed_size, logger, dist_fn_opt):
        super(NPEvaluator, self).__init__(img_loader, cap_loader, gt_file_path,  embed_size, logger)
        # dist fn
        self.dist_fn_opt = dist_fn_opt
        if dist_fn_opt == 'euclidean':
            self.dist = DistanceMetric.get_metric('euclidean').pairwise
        else:
            self.dist = cos_distance
        
    def populate_img_db(self, encoder, img_mlp):
        K = self.embed_size
        self.global_imgs = []
        self.img_parts = []
        encoder.eval(); img_mlp.eval()
        with torch.no_grad():
            for i, data in tqdm(enumerate(self.img_loader),desc='build db global imgs'):
                img, file_names = data
                img_em, img_part = encoder(img.cuda())
                self.img_parts.append(img_mlp(img_part))
                self.global_imgs.append(img_em)
        self.global_imgs = torch.cat(self.global_imgs)
        self.img_parts = torch.cat(self.img_parts)
        return self.global_imgs
    
    def populate_cap_db(self, encoder, text_mlp):
        K = self.embed_size
        encoder.eval(); text_mlp.eval()
        self.global_caps = []
        self.cap_parts = []
        self.n2cs = []
        with torch.no_grad():
            for i, batch in tqdm(enumerate(self.cap_loader),desc='build db global caps'):
                caps, nps, n2c, file_names = batch
                N, M, T = nps.size()
                global_cap = encoder(caps.cuda())
                nps = encoder(nps.reshape(N*M, T).cuda())
                self.global_caps.append(global_cap)
                self.n2cs.append(n2c)
                self.cap_parts.append(text_mlp(nps).reshape(N, M, -1))
        self.global_caps = torch.cat(self.global_caps)
        self.cap_parts = torch.cat(self.cap_parts)
        self.n2cs = torch.cat(self.n2cs)
        return self.global_caps
    
    def regional_alignment_image(self, caps, img_parts, dist_fn_opt):
        scoremats = []
        N, M, K = img_parts.size()
        for cap in tqdm(caps, "scoremat_rga_img"):
            with torch.no_grad():
                parts = RGA_attend_one_to_many_batch(cap[None], img_parts, dist_fn_opt)
                if dist_fn_opt == "cosine":
                    scores = 1 - F.cosine_similarity(cap[None], parts)
                else:
                    scores = F.pairwise_distance(cap[None], parts)
                scoremats.append(scores.detach().cpu().numpy())
        return np.array(scoremats)
    
    def regional_alignment_text(self, imgs, cap_parts, n2cs, dist_fn_opt):
        scoremats = []
        N, M, K = cap_parts.size()
        for cap_part, n2c in tqdm(zip(cap_parts, n2cs), "scoremat_rga_cap(nps)"):
            with torch.no_grad():
                parts = RGA_attend_one_to_many_batch(imgs, cap_part[None,:n2c,:].expand(imgs.size(0), n2c, imgs.size(1)), dist_fn_opt)
                if dist_fn_opt == "cosine":
                    scores = 1 - F.cosine_similarity(imgs, parts)
                else:
                    scores = F.pairwise_distance(imgs, parts)
                scoremats.append(scores.detach().cpu().numpy())
        return np.array(scoremats)
    
    def evaluate(self, encoder, mlp_img, mlp_text, output_path="tmp.txt"):
        # compute global features
        self.populate_img_db(encoder, mlp_img)
        self.populate_cap_db(encoder, mlp_text)
    
        # global eval
        # scoremat = self.retrieval()
        scoremat_global, scoremat_img_rga, scoremat_cap_rga = self.retrieval()
        acc = self.compute_acc(scoremat_global, output_path)
        self.logger.log("[global] R@1: %.4f | R@5: %.4f | R@10: %.4f" % (acc['top-1'], acc['top-5'], acc['top-10']))
        acc = self.compute_acc(scoremat_img_rga, output_path)
        self.logger.log("[img_rga] R@1: %.4f | R@5: %.4f | R@10: %.4f" % (acc['top-1'], acc['top-5'], acc['top-10']))
        acc = self.compute_acc(scoremat_cap_rga, output_path)
        self.logger.log("[cap_rga] R@1: %.4f | R@5: %.4f | R@10: %.4f" % (acc['top-1'], acc['top-5'], acc['top-10']))
        
        acc = self.compute_acc(scoremat_global + 0.5*scoremat_img_rga + 0.5*scoremat_cap_rga, output_path)
        self.logger.log("[fusion] R@1: %.4f | R@5: %.4f | R@10: %.4f" % (acc['top-1'], acc['top-5'], acc['top-10']))
        return acc                         
            
    
    def retrieval(self):
        querys = self.global_caps.cpu().detach().numpy()
        candidates = self.global_imgs.cpu().detach().numpy()
        scoremat = self.dist(querys, candidates)
        scoremat2 = self.regional_alignment_image(self.global_caps, self.img_parts, self.dist_fn_opt)
        scoremat3 = self.regional_alignment_text(self.global_imgs, self.cap_parts, self.n2cs, self.dist_fn_opt)
        return scoremat, scoremat2, scoremat3

In [4]:
# train loader
train_loader = build_wider_dataloader(cfg)
# test loader (loading image and text separately)
test_text_loader = build_text_test_loader(cfg) 
test_image_loader = build_image_test_loader(cfg) 

# Evaluator
Evaluator = NPEvaluator if cfg.np else GlobalEvaluator
evaluator = Evaluator(img_loader=test_image_loader, 
                          cap_loader=test_text_loader, 
                          gt_file_path=cfg.gt_file_fn,
                          embed_size=cfg.embed_size,
                          logger=logger,
                          dist_fn_opt="cosine")

[ds] load annotations from /data/aiyucui2/wider/wider/train/train_anns_train.json
size of dataset: 74264


In [5]:
if False:
    ds = test_text_loader.dataset
    extractor = ds.np_extractor
    import nltk
    import collections
    num_nps_per_sent = collections.defaultdict(int)
    num_nps = collections.defaultdict(int)
    for cap in tqdm(ds.captions):
        nps = extractor.sent_parse(cap)
        num_nps_per_sent[len(nps)] += 1
        for np in nps:
            num_nps[len(np.split())] += 1

    all_cnts = []
    for num in num_nps_per_sent:
        all_cnts += [num] * num_nps_per_sent[num]
    all_cnts = sorted(all_cnts)
    print(all_cnts[int(len(all_cnts)*0.98)])

    all_cnts_np = []
    for num in num_nps:
        all_cnts_np += [num] * num_nps[num]
    all_cnts_np = sorted(all_cnts_np)
    print(all_cnts_np[int(len(all_cnts_np)*0.98)])


In [6]:
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim 
import os

import torch.nn.functional as F
from models.encoder import Model, MLP
from loss.loss import triplet_cos_loss, crossmodal_triplet_loss, triplet_cos_loss_inner

from attentions.rga_attention import RGA_attend_one_to_many_batch, RGA_attend_one_to_many
     
from manager import Manager, regional_alignment_text


def triplet_cos_loss_attention(fulls, parts, n2cs, pids):
    """
    fulls: N x E
    parts: N x Z x E
    pids: N
    """
    loss = 0.0
    N, E = fulls.size()
    Z = parts.size(1)
    for i, (pid, n2c) in enumerate(zip(pids, n2cs)):
        curr_part = parts[i:i+1].expand(N, Z, E)
        curr_full = fulls[i:i+1]
        cum_parts = RGA_attend_one_to_many_batch(fulls, curr_part, 'cosine')
         # 1 x Z x E
        neg_part = cum_parts[pids != pid]
        pos_part = cum_parts[i:i+1].expand_as(neg_part)
        
        neg_full = fulls[pids != pid]
        pos_full = fulls[i:i+1].expand_as(neg_full)
        
        loss = loss + triplet_cos_loss_inner(pos_part, pos_full, neg_full)
        loss = loss + triplet_cos_loss_inner(pos_full, pos_part, neg_part)
        
    return loss / N
        


def regional_alignment_image(fulls, parts, dist_fn_opt):
    return RGA_attend_one_to_many_batch(fulls, parts, dist_fn_opt)

class AiyuManager(Manager):
    def __init__(self, args, logger):
        super(AiyuManager, self).__init__(args, logger)
        
    def train_epoch_regional(self, train_data, optimizer, epoch, note="train"):
        self.model.train(); self.rga_img_mlp.train(); self.rga_cap_mlp.train(); 

        cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss, cum_id_loss = 0.0, 0.0, 0.0, 0.0
        for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
            # load data
            data = self.todevice(data)
            (img, cap, nps, n2c, pid) = data
            import pdb;pdb.set_trace()


            img, img_part = self.model(img)
            cap = self.model(cap)
            
            N, M, T = nps.size()
            #nps = self.rga_cap_mlp(self.np_encoder(nps))
            nps = self.rga_cap_mlp(self.model(nps.view(-1, T))).view(N, M, -1)
            
            # part
            img_part = self.rga_img_mlp(img_part)

            img_part1 = RGA_attend_one_to_many_batch(cap, img_part, self.cfg.dist_fn_opt)
            cap_part1 = regional_alignment_text(img, nps, n2c, self.cfg.dist_fn_opt)
            # cap_part = RGA_attend_one_to_many_batch(img, nps, self.cfg.dist_fn_opt)

            # loss
            tri_loss =  self.triplet_loss(img, cap, pid) 
            tri_text_regional_loss = self.triplet_loss(cap_part1, img, pid) #triplet_cos_loss_attention(img, nps, n2c, pid) ##s
            tri_image_regional_loss = self.triplet_loss(img_part1, cap, pid) 
            id_loss = self.cls_loss(self.id_cls(img), pid) +  self.cls_loss(self.id_cls(cap), pid)
            # id_loss = id_loss + self.cls_loss(self.id_cls(img_part), pid) +  self.cls_loss(self.id_cls(cap_part), pid)


            loss = tri_loss + tri_image_regional_loss  + tri_text_regional_loss + id_loss

            # backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()


            # log
            cum_tri_loss += tri_loss.item()
            cum_tri_image_regional_loss += tri_image_regional_loss.item()
            cum_tri_text_regional_loss += tri_text_regional_loss.item()
            cum_id_loss += id_loss.item()
            
            if (i+1) % self.cfg.print_freq == 0:
                out_string = "[ep-%d, bs-%d] " % (epoch, i)
                out_string += "[id-loss] %.6f, " % (cum_id_loss / self.cfg.print_freq)
                out_string += "[tri-loss] %.6f, " % (cum_tri_loss / self.cfg.print_freq)
                out_string += "[img_rga] %.6f, " %  (cum_tri_image_regional_loss / self.cfg.print_freq)
                out_string += "[cap_rga] %.6f " % (cum_tri_text_regional_loss / self.cfg.print_freq)
                self.log(out_string)
                cum_tri_loss, cum_tri_image_regional_loss, cum_tri_text_regional_loss, cum_id_loss = 0.0, 0.0, 0.0, 0.0       
    


In [7]:
cfg.num_ids = len(train_loader.dataset.person2label.values())
manager = AiyuManager(cfg, logger)

[Trainer][init] load pre-trained model from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0001_captype_sent_img_meltlayer_2_cos_margin_0.2_np_False/stage_1_id_last.pt.
[Trainer][init] load pre-trained id_cls from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0001_captype_sent_img_meltlayer_2_cos_margin_0.2_np_False/stage_1_id_last.pt.
[Trainer][init] load pre-trained rga_img_mlp from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0001_captype_sent_img_meltlayer_2_cos_margin_0.2_np_False/stage_1_id_last.pt.
[Trainer][init] load pre-trained rga_cap_mlp from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0001_captype_sent_img_meltlayer_2_cos_mar

In [8]:
if False:
    if cfg.np:
        acc = evaluator.evaluate(manager.model, manager.rga_img_mlp, manager.rga_cap_mlp)
    else:
        acc = evaluator.evaluate(manager.model)
    logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))


## Stage 1: ID Loss only

In [9]:
if False:
    logger.log("======== [Stage 1] ============")
    manager.melt_img_layer(num_layer_to_melt=1)
    param_to_optimize = build_graph_optimizer([manager.model, manager.id_cls])
    optimizer = optim.Adam(param_to_optimize, lr=1e-3)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
    
    for epoch in range(0):
        manager.train_epoch_id(train_loader, optimizer, epoch, "train-stage-1")
        acc = evaluator.evaluate(manager.model)
        logger.log('[euclidean][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
        acc = cos_evaluator.evaluate(manager.model)
        logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (acc['top-1'], acc['top-5'], acc['top-10']))
        scheduler.step()
        manager.save_ckpt(epoch, acc, 'stage_1_id_last.pt')
    manager.save_ckpt(epoch, acc, 'id_initialized.pt')

## Stage 2: Matching + ID Loss

In [None]:
from tqdm import tqdm_notebook as tqdm
manager.melt_img_layer(num_layer_to_melt=2)
if cfg.np:
    param_to_optimize = build_graph_optimizer([manager.model, manager.id_cls, manager.rga_img_mlp, manager.rga_cap_mlp]) 
else:
    param_to_optimize = build_graph_optimizer([manager.model, manager.id_cls])  
    
optimizer = optim.Adam(param_to_optimize, lr=2e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10)
train_epoch = manager.train_epoch_regional if cfg.np else manager.train_epoch_global
for epoch in range(15):
    train_epoch(train_loader, optimizer, epoch, "train-stage-2")
    if cfg.np:
        cos_acc = evaluator.evaluate(manager.model, manager.rga_img_mlp, manager.rga_cap_mlp)
    else:
        cos_acc = evaluator.evaluate(manager.model)
    logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (cos_acc['top-1'], cos_acc['top-5'], cos_acc['top-10']))
    scheduler.step()

HBox(children=(IntProgress(value=1, bar_style='info', description='train-stage-2, epoch0', max=1, style=Progre…

> <ipython-input-6-35cb9463ce4a>(61)train_epoch_regional()
-> img, img_part = self.model(img)
(Pdb) n
> <ipython-input-6-35cb9463ce4a>(62)train_epoch_regional()
-> cap = self.model(cap)
(Pdb) img_part.size()
torch.Size([96, 6, 256])
(Pdb) n
> <ipython-input-6-35cb9463ce4a>(64)train_epoch_regional()
-> N, M, T = nps.size()
(Pdb) n
> <ipython-input-6-35cb9463ce4a>(66)train_epoch_regional()
-> nps = self.rga_cap_mlp(self.model(nps.view(-1, T))).view(N, M, -1)
(Pdb) nps.size()
torch.Size([96, 10, 6])
(Pdb) n
> <ipython-input-6-35cb9463ce4a>(69)train_epoch_regional()
-> img_part = self.rga_img_mlp(img_part)
(Pdb) n
> <ipython-input-6-35cb9463ce4a>(71)train_epoch_regional()
-> img_part1 = RGA_attend_one_to_many_batch(cap, img_part, self.cfg.dist_fn_opt)
(Pdb) img_part.size()
torch.Size([96, 6, 1024])
(Pdb) q


In [None]:
!nvidia-smi

In [None]:
if cfg.np:
    cos_acc = cos_evaluator.evaluate(manager.model, manager.rga_img_mlp, manager.rga_cap_mlp)
else:
    cos_acc = cos_evaluator.evaluate(manager.model)
logger.log('[cosine   ][global] R@1: %.4f | R@5: %.4f | R@10: %.4f' % (cos_acc['top-1'], cos_acc['top-5'], cos_acc['top-10']))
