# Active FULL Training Code (Basic - Global)

In [1]:
from datasets.wider_global_dataset import build_wider_dataloader
from datasets.wider_global_test_dataset import build_wider_test_dataloader
from models.texts.gru_backbone import BiGRUBackbone as CaptionBackbone
from models.images.image_backbones import ImageBackbone
from evaluators.global_evaluator import GlobalEvaluator
from loss.loss import crossmodal_triplet_loss

from tqdm import tqdm_notebook as tqdm
from sklearn.neighbors import DistanceMetric
import os

import torch.nn as nn
import torch.optim as optim

from configs.args import load_arg_parser

## config

In [2]:
parser = load_arg_parser()
cfg = parser.parse_args("")

root = cfg.data_root

# data path
cfg.anno_path = os.path.join(root, cfg.anno_path)
cfg.img_dir = os.path.join(root, cfg.img_dir)
cfg.val_anno_path = os.path.join(root, cfg.val_anno_path)
cfg.val_img_dir = os.path.join(root, cfg.val_img_dir)
cfg.gt_file_fn = os.path.join(root, cfg.gt_file_fn)

# meta data path
cfg.cheap_candidate_fn = os.path.join(root, cfg.cheap_candidate_fn)
cfg.vocab_path = os.path.join(root, cfg.vocab_path)

# sys path
cfg.model_path = os.path.join(root, cfg.model_path)
cfg.output_path = os.path.join(root, cfg.output_path)


cfg.debug = False
cfg.embed_size = 512
cfg.batch_size = 128
cfg.img_backbone_opt = "resnet18"
cfg.cap_backbone_opt = "bigru"
cfg.dim = (384,128)
cfg.dist_fn_opt = "cosine"

## Loading data

In [3]:
# train loader
train_loader = build_wider_dataloader(anno_path=cfg.anno_path,
                                    img_dir=cfg.img_dir,
                                    vocab_fn=cfg.vocab_path, 
                                    dim=cfg.dim,
                                    token_length=40,
                                    train=True,
                                    batch_size=cfg.batch_size,
                                    num_workers=8,
                                    debug=cfg.debug)

# test loader (loading image and text separately)
test_text_loader, test_image_loader = build_wider_test_dataloader(anno_path=cfg.val_anno_path,
                                                              img_dir=cfg.val_img_dir,
                                                              vocab_fn=cfg.vocab_path, 
                                                              dim=cfg.dim,
                                                              batch_size=cfg.batch_size,
                                                              num_workers=8,
                                                              debug=cfg.debug)

[ds] load annotations from /shared/rsaas/aiyucui2/wider_person/wider/train/train_anns_train.json
size of dataset: 37132


In [4]:
print(len(train_loader.dataset.person2label.values()))

12003


In [5]:
import matplotlib.pyplot as plt
import numpy as np
import torchvision
# functions to show an image
print(test_text_loader.dataset[1])
print(train_loader.dataset[1][5])
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()
dataiter = iter(test_image_loader)
images, labels = dataiter.next()

# show images
imshow(images[0])

(tensor([  9,  14,   6,   8,   2,  24,   4,  10,  97, 144,  58,   4,  59,  17,
          3,  19,  16,  50, 494, 123,  30, 141,   3,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]), 'train_query/p8848_s17661.jpg')
tensor([   9,   21,    6,    8,    2,   33,    7,    0,    7,  128,    7,  637,
          38,    4,  130,    3,   22,    6,    8,   35, 1111,   99,    4,    2,
          43,    3,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1])


Clipping input data to the valid range for imshow with RGB data ([0..1] for floats or [0..255] for integers).


<Figure size 640x480 with 1 Axes>

## Define Model

In [6]:
class Model(nn.Module):
    def __init__(self, embed_size, image_opt="resnet50", caption_opt="bigru"):
        super(Model, self).__init__()
        self.img_backbone = ImageBackbone(embed_size, image_opt)
        self.cap_backbone = CaptionBackbone(embed_size=embed_size, caption_opt=caption_opt)
        
    def forward(self,x):
        if len(x.size()) == 4:
            x = self.img_backbone(x)
        elif len(x.size()) == 2 or len(x.size()) == 3:
            x = self.cap_backbone(x)
        else:
            assert False
        return x        



In [7]:
model = Model(embed_size=cfg.embed_size, 
              image_opt=cfg.img_backbone_opt, 
              caption_opt=cfg.cap_backbone_opt).cuda()

#classifier = nn.Sequential(
#    nn.Linear(cfg.embed_size, 12003),
#    nn.Softmax()
#).cuda()

### Distance Metrics

In [8]:
def cos_distance(x,y):
    eps = 1e-12
    dot_prod = np.dot(x,y.transpose(1,0))
    x_norm = np.linalg.norm(x, axis=1, keepdims=True)
    y_norm = np.linalg.norm(y, axis=1, keepdims=True)
    tmp = dot_prod / (np.dot(x_norm, y_norm.transpose(1,0)) + eps)
    return 1 - tmp

def triplet_loss_cosine(x, pos, neg, margin=1):
    #import pdb; pdb.set_trace()
    pos_dist = F.cosine_similarity(x, pos).diag()
    neg_dist = F.cosine_similarity(x, neg).diag()
    return torch.clamp(margin-pos_dist+neg_dist, min=0).sum()

    
if cfg.dist_fn_opt == "euclidean":
    dist_fn = DistanceMetric.get_metric('euclidean').pairwise
    triplet_loss = nn.TripletMarginLoss()
elif cfg.dist_fn_opt == "cosine":
    dist_fn = cos_distance
    triplet_loss = nn.CosineEmbeddingLoss()

### Train Misc Setup

In [9]:
print(dist_fn)
evaluator = GlobalEvaluator(img_loader=test_image_loader, 
                          cap_loader=test_text_loader, 
                          gt_file_path=cfg.gt_file_fn,
                          embed_size=cfg.embed_size,
                          dist_fn=dist_fn)
#acc = evaluator.evaluate(model)

def build_graph_optimizer(models):
    if not isinstance(models, list):
        models = [models]
    params_to_optimize = []
    for model in models:
        if model and hasattr(model, '_parameters'):
            for param in model.parameters():
                if param.requires_grad == True:
                    params_to_optimize.append(param)
    return params_to_optimize

<function cos_distance at 0x2b5a2a007400>


In [10]:
def train_epoch_stage1(train_data, model, classifier, optimizer, cls_loss, note="train"):
    model.train()
    cum_loss = 0.0
    for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
        # load data
        (img,pos_img,neg_img, cap, pos_cap, neg_cap, pid, pos_pid, neg_pid) = data
        img, pos_img, neg_img = model(img.cuda()), model(pos_img.cuda()), model(neg_img.cuda())
        cap, pos_cap, neg_cap = model(cap.cuda()), model(pos_cap.cuda()), model(neg_cap.cuda())
        
        # loss
        loss = 0.0
        loss = loss + cls_loss(classifier(img), pid.cuda()) +  cls_loss(classifier(cap), pid.cuda())
        
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cum_loss += loss.item()
        
        # log
        if (i+1) % 64 == 0:
            print("batch %d, loss %.6f" % (i, cum_loss/64))
            cum_loss = 0.0
    return model


# stage 1 - image channel forzen
cls_loss = nn.CrossEntropyLoss()
model.img_backbone.melt_layer(8)
param_to_optimize = build_graph_optimizer([model, classifier])
optimizer = optim.Adam(param_to_optimize, lr=1e-3, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20)
for epoch in range(10):
    model = train_epoch_stage1(train_loader, model, classifier, optimizer, cls_loss, "train-stage-1")
    acc = evaluator.evaluate(model)
    scheduler.step()
    

NameError: name 'classifier' is not defined

In [None]:
import torch

def crossmodal_triplet_loss(img, pos_img, neg_img, cap, pos_cap, neg_cap, triplet_loss, dist_fn_opt):
    loss = 0.0
    if dist_fn_opt == "euclidean":
        loss = loss + triplet_loss(img, pos_img, neg_img)
        loss = loss + triplet_loss(cap, pos_cap, neg_cap)

        loss = loss + triplet_loss(img, pos_cap, neg_cap)
        loss = loss + triplet_loss(img, pos_cap, neg_img)
        loss = loss + triplet_loss(img, pos_img, neg_cap)
        loss = loss + triplet_loss(cap, pos_img, neg_img)
        loss = loss + triplet_loss(cap, pos_img, neg_cap)
        loss = loss + triplet_loss(cap, pos_cap, neg_img)

        loss = loss + triplet_loss(pos_cap, pos_img, neg_cap)
        loss = loss + triplet_loss(pos_cap, pos_img, neg_img)
        loss = loss + triplet_loss(cap, img, neg_cap)
        loss = loss + triplet_loss(cap, img, neg_img)
    elif dist_fn_opt == "cosine":
        same = torch.Tensor(img.size(0)).fill_(1).cuda()
        diff = torch.Tensor(img.size(0)).fill_(-1).cuda()
        
        loss = loss + triplet_loss(img, cap, same)
        loss = loss + triplet_loss(img, img, same)
        loss = loss + triplet_loss(pos_img, pos_cap, same)

        loss = loss + triplet_loss(img, neg_img, diff)
        loss = loss + triplet_loss(cap, neg_img, diff)
        loss = loss + triplet_loss(img, neg_cap, diff)
        loss = loss + triplet_loss(cap, neg_cap, diff)

        loss = loss + triplet_loss(pos_img, neg_img, diff)
        loss = loss + triplet_loss(pos_cap, neg_img, diff)
        loss = loss + triplet_loss(pos_img, neg_cap, diff)
        loss = loss + triplet_loss(pos_cap, neg_cap, diff)
    return loss

def train_epoch(train_data, model, optimizer, triplet_loss, note="train"):
    model.train()
    cum_tri_loss, cum_id_loss = 0.0, 0.0
    for i, data in tqdm(enumerate(train_data), "%s, epoch%d" % (note,epoch)):
        # load data
        (img,pos_img,neg_img, cap, pos_cap, neg_cap, pid, pos_pid, neg_pid) = data
        img, pos_img, neg_img = model(img.cuda()), model(pos_img.cuda()), model(neg_img.cuda())
        cap, pos_cap, neg_cap = model(cap.cuda()), model(pos_cap.cuda()), model(neg_cap.cuda())
        
        # loss
        tri_loss =  crossmodal_triplet_loss(img,pos_img,neg_img, 
                                              cap, pos_cap, neg_cap, 
                                              triplet_loss, cfg.dist_fn_opt)  
        loss = tri_loss
        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        cum_tri_loss += tri_loss.item()
        
        # log
        if (i+1) % 64 == 0:
            print("batch %d, [tri-loss] %.6f" % (i, cum_tri_loss/64))
            cum_tri_loss = 0.0
    return model

# stage 1 - image channel forzen
model.img_backbone.melt_layer(8)
param_to_optimize = build_graph_optimizer([model])
optimizer = optim.Adam(param_to_optimize, lr=1e-3, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20)
for epoch in range(10):
    model = train_epoch(train_loader, model, optimizer, triplet_loss, "train-stage-1")
    acc = evaluator.evaluate(model)
    scheduler.step()
    
# stage 2 - train all
model.img_backbone.melt_layer(7)
param_to_optimize = build_graph_optimizer([model])
optimizer = optim.Adam(param_to_optimize, lr=2e-4, weight_decay=1e-5)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20)
for epoch in range(60):
    model = train_epoch(train_loader, model, optimizer, triplet_loss, "train-stage-2")
    acc = evaluator.evaluate(model)
    scheduler.step()

HBox(children=(IntProgress(value=1, bar_style='info', description='train-stage-1, epoch0', max=1, style=Progre…

batch 63, [tri-loss] 2.987790
batch 127, [tri-loss] 2.450860
