In [1]:
from datasets.wider_part_dataset import build_wider_dataloader
from datasets.text_test_datasets import build_text_test_loader
from datasets.image_test_datasets import build_image_test_loader
from models.encoder import Model, MLP
from evaluators.global_evaluator import GlobalEvaluator
from evaluators.np_evaluator import NPEvaluator
from loss.loss import crossmodal_triplet_loss, cos_distance, triplet_cos_loss
from loggers.logger import Logger
from manager import build_graph_optimizer
from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import DistanceMetric

from attentions.rga_attention import RGA_attend_one_to_many_batch, RGA_attend_one_to_many
import os

import torch.nn as nn
import torch.optim as optim

from configs.args import load_arg_parser

In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Parse with an empty argument list so only the parser defaults are used
# (the notebook has no command line of its own).
parser = load_arg_parser()
cfg = parser.parse_args([])
cfg.data_root = "/shared/rsaas/aiyucui2/wider_person/"
root = cfg.data_root

# Every path option is resolved against the data root.
for attr in (
    # data path
    "anno_path", "img_dir", "val_anno_path", "val_img_dir", "gt_file_fn",
    # meta data path
    "cheap_candidate_fn", "vocab_path",
    # sys path
    "model_path", "output_path",
):
    setattr(cfg, attr, os.path.join(root, getattr(cfg, attr)))
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Checkpoint to restore.
ckpt_root = os.path.join(root, "checkpoints", "reID", "baseline")
load_exp_name = "dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0002_step_size_10_captype_sent_img_meltlayer_6_np_True_sent_60_cap_10_6_both_fc_mute"
cfg.load_ckpt_fn = os.path.join(ckpt_root, load_exp_name, "stage_2_id_match_epoch14.pt")

# Model / data options.
cfg.debug = False
cfg.embed_size = 1024
cfg.batch_size = 96
cfg.img_backbone_opt = "resnet50"
cfg.num_gpus = 1
cfg.cap_backbone_opt = "bigru"
cfg.dim = (384,128)
cfg.dist_fn_opt = "cosine"
cfg.np = True
# Six image cuts when the noun-phrase branch is enabled, a single cut otherwise.
cfg.img_num_cut = 6 if cfg.np else 1
cfg.sent_token_length = 60
cfg.np_token_length = 6
cfg.num_np_per_sent = 10

cfg.cap_embed_type='sent'
# exp_name
cfg.exp_name = 'debug'
# model_path / output_path are already absolute at this point, so the
# experiment name is simply appended to them.
cfg.model_path = os.path.join(cfg.model_path, cfg.exp_name)
cfg.output_path = os.path.join(cfg.output_path, cfg.exp_name)

# makedirs(exist_ok=True) also creates missing parents and does not fail when
# the directory already exists (no check-then-create race).
os.makedirs(cfg.model_path, exist_ok=True)
os.makedirs(cfg.output_path, exist_ok=True)
# logger
logger = Logger("test_np.txt") #os.path.join(cfg.output_path, cfg.exp_name+".txt"))
print(cfg.exp_name)

debug


In [27]:
from evaluators.evaluator import *
import torch
import numpy as np
# from attentions.rga_attention import RGA_attend_one_to_many_batch
from sklearn.neighbors import DistanceMetric
from loss.loss import cos_distance
import torch.nn.functional as F


class NPEvaluator(Evaluator):
    """Caption-to-image retrieval evaluator with global and part-level scores.

    Three score matrices are produced for a query caption (see ``retrieval``):
    a global caption/image distance, a distance between the caption and the
    image parts it attends to, and a distance between each image and the
    caption noun phrases it attends to. ``inference`` blends the three with
    user-supplied weights and returns the best-ranked images.
    """

    def __init__(self, img_loader, cap_loader, gt_file_path,  embed_size, logger, dist_fn_opt, device):
        """Store the device and select the pairwise distance function.

        Args:
            img_loader, cap_loader, gt_file_path, embed_size, logger:
                forwarded unchanged to the ``Evaluator`` base class.
            dist_fn_opt: ``'euclidean'`` selects sklearn's euclidean pairwise
                metric; any other value selects ``cos_distance``.
            device: device the image batches and caption tokens are moved to.
        """
        super(NPEvaluator, self).__init__(img_loader, cap_loader, gt_file_path, embed_size, logger)
        self.device = device
        # dist fn
        self.dist_fn_opt = dist_fn_opt
        if dist_fn_opt == 'euclidean':
            self.dist = DistanceMetric.get_metric('euclidean').pairwise
        else:
            self.dist = cos_distance

    def populate_img_db(self):
        """Encode every batch of ``self.img_loader`` and cache the results.

        Fills ``self.global_imgs`` with the concatenated first output of the
        encoder and ``self.img_parts`` with the concatenated second output
        passed through ``self.mlp_img``.

        Returns:
            The concatenated global image embeddings (``self.global_imgs``).
        """
        global_imgs = []
        img_parts = []
        self.encoder.eval()
        self.mlp_img.eval()
        with torch.no_grad():
            # Iterating the loader directly lets tqdm show the total when the
            # loader defines a length.
            for img, _file_names in tqdm(self.img_loader, desc='build db global imgs'):
                img_em, img_part = self.encoder(img.to(self.device))
                img_parts.append(self.mlp_img(img_part))
                global_imgs.append(img_em)
        self.global_imgs = torch.cat(global_imgs)
        self.img_parts = torch.cat(img_parts)
        return self.global_imgs

    def populate_cap_db(self):
        """Map each annotation's ``file_path`` to the parse of its first caption.

        The parse comes from the dataset's ``np_extractor.sent_parse``; only
        ``captions[0]`` of every annotation is used.
        """
        self.captions = {}
        sent_parse = self.cap_loader.dataset.np_extractor.sent_parse
        for ann in tqdm(self.cap_loader.dataset.anns, 'popluate cap db'):
            caps = ann['captions']
            self.captions[ann['file_path']] = sent_parse(caps[0])# + sent_parse(caps[1])

    def regional_alignment_image(self, caps, img_parts, dist_fn_opt):
        """Score each caption against the image parts it attends to.

        For every caption embedding, ``RGA_attend_one_to_many_batch`` produces
        one attended vector per image; the score is the distance between the
        caption and that vector.

        Args:
            caps: iterable of caption embeddings (one row per query).
            img_parts: part embeddings of all images
                (assumed ``(num_images, num_parts, dim)`` -- TODO confirm).
            dist_fn_opt: ``"cosine"`` uses ``1 - cosine_similarity``; anything
                else uses ``F.pairwise_distance``.

        Returns:
            ``np.ndarray`` with one row of scores per caption.
        """
        scoremats = []
        with torch.no_grad():
            for cap in caps:
                query = cap[None]
                parts = RGA_attend_one_to_many_batch(query, img_parts, dist_fn_opt)
                if dist_fn_opt == "cosine":
                    scores = 1 - F.cosine_similarity(query, parts)
                else:
                    scores = F.pairwise_distance(query, parts)
                scoremats.append(scores.detach().cpu().numpy())
        return np.array(scoremats)

    def regional_alignment_text(self, imgs, cap_parts, n2cs, dist_fn_opt):
        """Score each image against the caption noun phrases it attends to.

        Args:
            imgs: global image embeddings, one row per image.
            cap_parts: noun-phrase embeddings per caption
                (assumed ``(num_captions, max_phrases, dim)`` -- TODO confirm).
            n2cs: per caption, the number of leading rows of ``cap_parts``
                that are used; the remaining rows are ignored.
            dist_fn_opt: ``"cosine"`` uses ``1 - cosine_similarity``; anything
                else uses ``F.pairwise_distance``.

        Returns:
            ``np.ndarray`` with one row of scores per caption.
        """
        scoremats = []
        num_imgs, dim = imgs.size(0), imgs.size(1)
        with torch.no_grad():
            for cap_part, n2c in zip(cap_parts, n2cs):
                # Share the caption's valid phrases across all images
                # (expand creates a view, not a copy).
                phrases = cap_part[None, :n2c, :].expand(num_imgs, n2c, dim)
                parts = RGA_attend_one_to_many_batch(imgs, phrases, dist_fn_opt)
                if dist_fn_opt == "cosine":
                    scores = 1 - F.cosine_similarity(imgs, parts)
                else:
                    scores = F.pairwise_distance(imgs, parts)
                scoremats.append(scores.detach().cpu().numpy())
        return np.array(scoremats)

    def set_model(self, encoder, mlp_img, mlp_text):
        """Attach the networks and immediately rebuild the image and caption databases."""
        self.encoder = encoder
        self.mlp_img = mlp_img
        self.mlp_text = mlp_text
        self.populate_img_db()
        self.populate_cap_db()

    def single_cap_encode(self, query):
        """Encode one free-form caption and cache its embeddings on ``self``.

        Sets ``self.global_caps`` (encoder output for the whole caption),
        ``self.nps`` (encoder output for its noun phrases), ``self.cap_parts``
        (``self.nps`` passed through ``self.mlp_text``, reshaped to
        ``(1, num_phrase_slots, -1)``) and ``self.n2cs`` (a one-element list
        holding the phrase count returned by the dataset's ``_load_cap``).
        """
        self.encoder.eval()
        self.mlp_text.eval()
        cap_token, nps, num_nps = self.cap_loader.dataset._load_cap(query)
        caps = torch.LongTensor(cap_token)[None].to(self.device)
        nps = torch.LongTensor(nps)[None].to(self.device)
        N, M, T = nps.size()
        # Inference only: skip autograd bookkeeping. The tensors already live
        # on self.device, so no hard-coded .cuda() is needed.
        with torch.no_grad():
            self.global_caps = self.encoder(caps)
            self.nps = self.encoder(nps.reshape(N * M, T))
            self.cap_parts = self.mlp_text(self.nps).reshape(N, M, -1)
        self.n2cs = [num_nps]

    def inference(self, query, a=1, b=0, c=0, K=10):
        '''
        Return the K best-matching images for a caption.

        Assumes the image database has been populated already (``set_model``).

        Args:
            query: caption text.
            a, b, c: weights of the three score matrices returned by
                ``retrieval`` (global, caption-to-image-parts,
                image-to-caption-phrases).
            K: number of images to return.

        Returns:
            List of ``self.idx2img`` entries for the K lowest combined scores,
            best first. ``idx2img`` is not defined in this class -- presumably
            provided by the ``Evaluator`` base class (TODO confirm).
        '''
        # encode query
        self.single_cap_encode(query)

        # get scoremat
        scoremat, scoremat2, scoremat3 = self.retrieval()
        final_scoremat = a*scoremat + b*scoremat2 + c*scoremat3

        # return images: smaller score = closer match, so an ascending sort
        # puts the best candidates first.
        topk_images = np.argsort(final_scoremat[0, :])[:K]
        return [self.idx2img[i] for i in topk_images]

    def retrieval(self):
        """Compute the three score matrices for the currently encoded caption(s).

        Returns:
            Tuple ``(scoremat, scoremat2, scoremat3)``: ``self.dist`` between
            the global caption and global image embeddings,
            ``regional_alignment_image`` and ``regional_alignment_text``.
        """
        querys = self.global_caps.cpu().detach().numpy()
        candidates = self.global_imgs.cpu().detach().numpy()
        scoremat = self.dist(querys, candidates)
        scoremat2 = self.regional_alignment_image(self.global_caps, self.img_parts, self.dist_fn_opt)
        scoremat3 = self.regional_alignment_text(self.global_imgs, self.cap_parts, self.n2cs, self.dist_fn_opt)
        return scoremat, scoremat2, scoremat3

In [28]:
# Test loaders: images and captions are loaded separately.
test_text_loader = build_text_test_loader(cfg)
test_image_loader = build_image_test_loader(cfg)

# Evaluator
# NOTE: this rebinds the name `Evaluator` (also the base class pulled in by the
# star import) to the selected class; the name is kept for compatibility.
Evaluator = NPEvaluator if cfg.np else GlobalEvaluator
evaluator = Evaluator(
    img_loader=test_image_loader,
    cap_loader=test_text_loader,
    gt_file_path=cfg.gt_file_fn,
    embed_size=cfg.embed_size,
    logger=logger,
    # Take the metric from the config instead of repeating the literal, so the
    # evaluator cannot drift from the rest of the experiment setup.
    dist_fn_opt=cfg.dist_fn_opt,
    device='cuda',
)

In [5]:
# One-off corpus statistics, disabled by default: prints the 98th-percentile
# number of noun phrases per caption and of words per noun phrase.
if False:
    import collections

    ds = test_text_loader.dataset
    extractor = ds.np_extractor
    num_nps_per_sent = collections.defaultdict(int)  # phrases per caption -> count
    num_nps = collections.defaultdict(int)           # words per phrase -> count
    for cap in tqdm(ds.captions):
        nps = extractor.sent_parse(cap)
        num_nps_per_sent[len(nps)] += 1
        # The loop variable must not be called `np`: at notebook scope that
        # would overwrite the numpy alias used by the evaluator.
        for phrase in nps:
            num_nps[len(phrase.split())] += 1

    # Expand each histogram back into a sorted list of observations and read
    # off the value at the 98% position.
    all_cnts = sorted(
        num for num, count in num_nps_per_sent.items() for _ in range(count)
    )
    print(all_cnts[int(len(all_cnts)*0.98)])

    all_cnts_np = sorted(
        num for num, count in num_nps.items() for _ in range(count)
    )
    print(all_cnts_np[int(len(all_cnts_np)*0.98)])


In [29]:
from manager import Manager, regional_alignment_text
# Number of identities handed to the Manager through the config.
# NOTE(review): presumably this has to match the identity classifier stored in
# the checkpoint at cfg.load_ckpt_fn -- confirm before changing.
cfg.num_ids = 12003
# Build the manager from the config; the log below shows it restoring the
# pre-trained weights.
manager = Manager(cfg, logger)

[Trainer][init] load pre-trained model from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0002_step_size_10_captype_sent_img_meltlayer_6_np_True_sent_60_cap_10_6_both_fc_mute/stage_2_id_match_epoch14.pt.
[Trainer][init] load pre-trained id_cls from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0002_step_size_10_captype_sent_img_meltlayer_6_np_True_sent_60_cap_10_6_both_fc_mute/stage_2_id_match_epoch14.pt.
[Trainer][init] load pre-trained rga_img_mlp from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_fn_cosine_imgbb_resnet50_capbb_bigru_embed_size_1024_batch_96_lr_0.0002_step_size_10_captype_sent_img_meltlayer_6_np_True_sent_60_cap_10_6_both_fc_mute/stage_2_id_match_epoch14.pt.
[Trainer][init] load pre-trained rga_cap_mlp from /shared/rsaas/aiyucui2/wider_person/checkpoints/reID/baseline/dist_

In [31]:
# Attach the encoder and the two part-level MLPs; set_model also (re)builds the
# image embedding database and the caption parse database.
evaluator.set_model(manager.model, manager.rga_img_mlp, manager.rga_cap_mlp)

build db global imgs: 33it [00:05,  7.18it/s]
popluate cap db: 100%|██████████| 3074/3074 [00:05<00:00, 582.99it/s]


## Visualize

In [109]:
from IPython.display import Image
from IPython.core.display import HTML 

# Alternating background colours for noun-phrase cells in the result table.
COLOR = ['white', 'yellow']

# Free-form caption used as the retrieval query.
query = 'A man holds a tote.'

# Top-10 images using only the global score (default weights a=1, b=0, c=0).
recalls = evaluator.inference(query, K=10)
# Top-10 images using only the two part-level scores, equally weighted.
recalls_part = evaluator.inference(query, a=0, b=0.5, c=0.5)
def display_infer(query, recalls, caps, note='', img_root='../http/img', show_caps=False):
    """Render a retrieval result as an HTML table of ranked thumbnails.

    Args:
        query: caption text used for retrieval; shown in the heading.
        recalls: image file names (relative to ``img_root``), best match first.
        caps: dict mapping an image file name to its noun phrases; only read
            when ``show_caps`` is true.
        note: optional label (e.g. which scoring variant produced the
            ranking) shown under the heading; omitted when empty.
        img_root: directory / URL prefix joined in front of every recall.
        show_caps: when true, add a row listing each image's noun phrases on
            alternating ``COLOR`` backgrounds. Off by default.

    Returns:
        ``IPython.display.HTML`` wrapping the generated markup.
    """
    from html import escape  # local import keeps the cell self-contained

    recalls = list(recalls)
    pieces = [' <h2>query: %s</h2>' % escape(str(query))]
    if note:
        pieces.append('<p>%s</p>' % escape(str(note)))
    pieces.append('<table style="align:center">')

    # Header row. Cells are closed explicitly: an unclosed <th>/<td> makes the
    # browser insert an extra, empty column.
    pieces.append('<tr>' + '\n')
    pieces.append('<th> rank </th>')
    pieces.extend('<th align="center">No. %d</th>' % rank for rank in range(len(recalls)))
    pieces.append('</tr>' + '\n')

    # Thumbnail row.
    pieces.append('<tr>' + '\n')
    pieces.append('<td align="center"> %s </td>' % "returned images")
    for recall in recalls:
        img_path = os.path.join(img_root, recall)
        pieces.append(
            '<td align="right" ><img src="%s" height=128 width=48 /></td>' % escape(img_path)
            + '\n'
        )
    pieces.append('</tr>' + '\n')

    # Optional noun-phrase row.
    if show_caps:
        pieces.append('<tr>' + '\n')
        pieces.append('<td align="right"> %s </td>' % "associated noun phrases")
        for recall in recalls:
            pieces.append('<td>')
            for i, phrase in enumerate(caps.get(recall, ())):
                color = COLOR[i % len(COLOR)]
                pieces.append(
                    "<div style='background:%s;width:128px'>%s</div>"
                    % (color, escape(str(phrase)))
                )
            pieces.append('</td>')
        pieces.append('</tr>' + '\n')

    pieces.append('</table>')
    return HTML(''.join(pieces))
    
# Build one table per ranking (global score vs. part-level scores) and show
# them one after the other.
a = display_infer(query, recalls, evaluator.captions, note='global')
b = display_infer(query, recalls_part, evaluator.captions, note='part')
display(a, b)

rank,Unnamed: 1,No. 0,No. 1,No. 2,No. 3,No. 4,No. 5,No. 6,No. 7,No. 8,No. 9
returned images,,,,,,,,,,,


rank,Unnamed: 1,No. 0,No. 1,No. 2,No. 3,No. 4,No. 5,No. 6,No. 7,No. 8,No. 9
returned images,,,,,,,,,,,


## Stage 2: Matching + ID Loss