# M2177.003100 Deep Learning <br> Final Project: Text-Guided Image Manipulation

### Submitting your work:
<font color=red>**DO NOT clear the MP score**</font> so that TAs can grade the result.

In [1]:
import torch
import sys, os
import numpy as np
from torchvision import transforms

sys.path.append('..')
from evaluation.model import CNN_ENCODER, RNN_ENCODER
from utils.data_utils import CUBDataset


# Make only GPU index 0 visible to CUDA for this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Prefer the GPU when one is available; otherwise run everything on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the bird-evaluation settings; `cfg` is read by the cells below
# (cfg.DATA_DIR, cfg.BATCH_SIZE, cfg.WORKERS, cfg.MP_FILE).
from miscc.config import cfg, cfg_from_file
cfg_from_file('../cfg/eval_birds.yml')

  yaml_cfg = edict(yaml.load(f))


In [2]:
def cosine_similarity(x1, x2, dim=1, eps=1e-8):
    """
    Cosine similarity of x1 and x2, reduced along `dim`.

    Args:
        x1, x2: tensors whose elementwise product is defined.
        dim: dimension over which the dot product and the L2 norms are taken.
        eps: lower bound applied to the product of the two norms, so that
            zero-length vectors do not cause a division by zero.

    Returns:
        Tensor of similarities with `dim` reduced away.
    """
    dot_product = (x1 * x2).sum(dim)
    norm_product = x1.norm(2, dim) * x2.norm(2, dim)
    # Clamp the *product* of the norms (not each norm separately) at eps.
    safe_denominator = norm_product.clamp(min=eps)
    return dot_product / safe_denominator

In [3]:
def _load_frozen_encoder(encoder, weights_path, label):
    """Load pretrained weights into `encoder` and prepare it for inference.

    Args:
        encoder: module instance whose state-dict layout matches the checkpoint.
        weights_path: path of the `.pth` file holding the saved state dict.
        label: human-readable name printed once the weights have been loaded.

    Returns:
        The same `encoder`, with every parameter frozen and in eval mode.
    """
    # The identity map_location keeps the deserialized tensors where they were
    # read (CPU, per the torch.load docs), so the checkpoint does not need the
    # GPU it was saved from; the module is moved to `device` afterwards.
    state_dict = torch.load(weights_path, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(state_dict)
    # The encoders are only used for scoring here, never trained.
    for p in encoder.parameters():
        p.requires_grad = False
    print(f'Load {label}')
    return encoder.eval()


# Image encoder: embeds the real and the generated images in the loop below.
# (256 is presumably the embedding size -- confirm against CNN_ENCODER.)
image_encoder = _load_frozen_encoder(
    CNN_ENCODER(256), './sim_models/bird/image_encoder.pth', 'image encoder')

# Text encoder: produces the sentence embedding of each caption.
# (5450 is presumably the vocabulary size -- confirm against RNN_ENCODER.)
text_encoder = _load_frozen_encoder(
    RNN_ENCODER(5450, nhidden=256), './sim_models/bird/text_encoder.pth', 'text encoder')

image_encoder = image_encoder.to(device)
text_encoder = text_encoder.to(device)

Load image encoder
Load text encoder


In [4]:
# Resize transform (128x128) handed to the dataset.
transform = transforms.Compose([
    transforms.Resize((128, 128))
])

test_dataset = CUBDataset(cfg.DATA_DIR, transform=transform, split='test', eval_mode=True)

# Quick sanity report of what the dataset object loaded.
print(f'\ttest data directory:\n{test_dataset.split_dir}\n')
print(f'\t# of test filenames:{test_dataset.filenames.shape}\n')
print(f'\texample of filename of test image:{test_dataset.filenames[0]}\n')
print(f'\texample of caption and its ids:\n{test_dataset.captions[0]}\n{test_dataset.captions_ids[0]}\n')
# dtype=object: if the captions have different lengths, building an array
# without an explicit dtype warns on older NumPy and raises ValueError on
# NumPy >= 1.24. The reported shape is unchanged.
print(f'\t# of test captions:{np.asarray(test_dataset.captions, dtype=object).shape}\n')
print(f'\t# of test caption ids:{np.asarray(test_dataset.captions_ids, dtype=object).shape}\n')

# shuffle=False and drop_last=False: every test sample is visited exactly once,
# in dataset order.
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=cfg.BATCH_SIZE,
                                              drop_last=False, shuffle=False, num_workers=int(cfg.WORKERS))

self.current_dir:
/home/ccw/DL_final/final-project-deep-learning-20-pt/evaluation

self.data_dir:
/home/ccw/DL_final/final-project-deep-learning-20-pt/data/birds

self.image_dir:
/home/ccw/DL_final/final-project-deep-learning-20-pt/data/birds/CUB-200-2011/images

filepath /home/ccw/DL_final/final-project-deep-learning-20-pt/data/birds/captions.pickle
Load from:  /home/ccw/DL_final/final-project-deep-learning-20-pt/data/birds/captions.pickle
	test data directory:
/home/ccw/DL_final/final-project-deep-learning-20-pt/data/birds/test

	# of test filenames:(2933,)

	example of filename of test image:001.Black_footed_Albatross/Black_Footed_Albatross_0046_18

	example of caption and its ids:
['this', 'is', 'a', 'small', 'bird', 'that', 'has', 'a', 'brilliant', 'blue', 'color', 'on', 'it', 's', 'body', 'a', 'slightly', 'darker', 'blue', 'on', 'it', 's', 'head', 'a', 'teal', 'color', 'on', 'it', 's', 'wings', 'and', 'a', 'light', 'colored', 'beak']
[18, 19, 1, 250, 2, 33, 13, 1, 853, 50, 37, 86

  return array(a, dtype, copy=False, order=order)


In [5]:
# Per-sample results accumulated over the whole test set.
MP_list = []
DIFF_list = []
SIM_list = []

# Inference only: disable autograd bookkeeping for the whole evaluation pass.
with torch.no_grad():
    for data in test_dataloader:
        # The last entry of the image lists is the one that gets scored.
        imgs = data['img'][-1].to(device)
        gen_imgs = data['gen_img'][-1].to(device)
        captions = data['caps']
        captions_lens = data['cap_len']
        class_ids = data['cls_id']
        keys = data['key']
        sentence_idx = data['sent_ix']

        # Order the captions by decreasing length before encoding them.
        sorted_cap_lens, sorted_cap_indices = torch.sort(captions_lens, 0, True)
        captions = captions[sorted_cap_indices].squeeze()
        # squeeze() also drops the batch dimension when the batch holds a
        # single caption; restore it.
        if data['caps'].size(0) == 1:
            captions = captions.unsqueeze(0)
        class_ids = class_ids[sorted_cap_indices].numpy()
        keys = [keys[i] for i in sorted_cap_indices.numpy()]

        # NOTE(review): captions (and class_ids / keys) are reordered by
        # length, but imgs / gen_imgs keep the loader order. Unless
        # text_encoder restores the original order internally, `sim` below
        # pairs each image with a different sample's caption -- confirm
        # against RNN_ENCODER.

        # Always follow `device`: the encoders and images are placed on it
        # unconditionally, so gating this on cfg.CUDA could leave the text
        # encoder and its input on different devices.
        captions = captions.to(device)
        sorted_cap_lens = sorted_cap_lens.to(device)

        hidden = text_encoder.init_hidden(captions.size(0))
        _, sent_emb = text_encoder(captions, sorted_cap_lens, hidden)

        _, sent_code = image_encoder(imgs)
        _, gen_sent_code = image_encoder(gen_imgs)

        # Text relevance: similarity between generated-image and caption codes.
        sim = cosine_similarity(gen_sent_code, sent_emb)
        # Pixel difference: mean absolute error per sample between the real
        # and the generated image.
        l1 = torch.abs(imgs - gen_imgs)
        diff = torch.mean(l1.view(l1.size(0), -1), dim=1)
        # Manipulative precision: high only when the image stays close to the
        # original AND matches the text.
        mp = (1 - diff) * sim

        MP_list.append(mp.detach().cpu().numpy())
        DIFF_list.append(diff.detach().cpu().numpy())
        SIM_list.append(sim.detach().cpu().numpy())

MP_array = np.concatenate(MP_list, axis=0)
DIFF_array = np.concatenate(DIFF_list, axis=0)
SIM_array = np.concatenate(SIM_list, axis=0)
print('# images for evaluation:', len(MP_array))
print('mean:', "%.6f" % np.mean(MP_array))

# Persist the per-sample scores to the file named by cfg.MP_FILE.
np.savez(cfg.MP_FILE, mp=MP_array, diff=DIFF_array, sim=SIM_array)

  "please use transforms.Resize instead.")
  "See the documentation of nn.Upsample for details.".format(mode))


# images for evaluation: 29330
mean: 0.031076
