In [33]:
import json
import os

import torch
from torchvision import transforms
from dotted_dict import DottedDict

import sys

# Extend the module search path *before* importing from the project-local
# `utils` package; appending it afterwards cannot help that import resolve.
sys.path.append('.')

from utils.dataset import CaptionDataset

In [34]:
# Central configuration object shared by the prediction helpers below.
config = DottedDict()

# model parameters
# config.captions_per_image = 9 (never used)

config.beam_size = 5
config.nb_heads = 8  # number of attention heads on IMAGE used in the model -> important for figuring out visual word/sentence size

config.data_folder = '/home/users/sadler/data/blockworld_pre'  # folder with data files saved by create_input_files.py
config.data_name = 'blocks2D_logos_9_cap_per_img_pair_1_min_word_freq'  # base name shared by data files

# Load word map (word2ix) and derive the reverse lookup (ix2word) and vocabulary size
word_map_file = os.path.join(config.data_folder, 'WORDMAP_' + config.data_name + '.json')
with open(word_map_file, 'r') as j:
    config.word_map = json.load(j)
config.rev_word_map = {v: k for k, v in config.word_map.items()}
config.vocab_size = len(config.word_map)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CPU isn't really practical here

# Load model
config.checkpoint_name = "diff_att_8_MODEL_FINAL"

save_dir = "/home/users/sadler/cache/052_block_instruct_transformer/models/blocks"
transformer_checkpoint = os.path.join(save_dir, "BEST_{}.pth.tar".format(config.checkpoint_name))
# Map the stored tensors onto the device selected above. A hard-coded 'cuda'
# target makes loading fail on machines without a GPU even though `device`
# already falls back to the CPU.
# SECURITY: torch.load unpickles arbitrary Python objects - only load
# checkpoints from a trusted source.
# NOTE(review): the checkpoint holds whole pickled modules; recent PyTorch
# releases may need `weights_only=False` for this call - confirm against the
# installed version.
checkpoint = torch.load(transformer_checkpoint, map_location=device)

# Both networks are only used for inference here, so switch them to eval mode.
model = checkpoint['model'].to(device)
model.eval()
image_feature_encoder = checkpoint['image_encoder'].to(device)
image_feature_encoder.eval()

# Per-channel normalisation applied to every image tensor before prediction.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

In [35]:
from utils import eval_utils

class Predictor():
    """Turns a before/after image pair into a caption hypothesis.

    The instance bundles everything ``eval_utils.translate`` needs (decoding
    context, caption model, image encoder, device) so callers only pass images.
    """

    def __init__(self, context, model, image_encoder, device):
        """Store the collaborators and collect the special-token entries.

        :param context: object exposing ``word_map`` with the keys
            '<start>', '<end>' and '<pad>'
        :param model: caption model handed to ``eval_utils.translate``
        :param image_encoder: image encoder handed to ``eval_utils.translate``
        :param device: device handed to ``eval_utils.translate``
        """
        self.context, self.model = context, model
        self.image_encoder, self.device = image_encoder, device
        word_map = context.word_map
        self.special_tokens = {word_map[token] for token in ('<start>', '<end>', '<pad>')}

    def predict(self, image_before, image_after):
        """Return the best hypothesis for the image pair without special tokens.

        :param image_before: image showing the state before the action
        :param image_after: image showing the state after the action
        :return: list with the elements of the best hypothesis that are not
            contained in ``self.special_tokens``
        """
        best_hypothesis, _, _, _ = eval_utils.translate(
            self.context, self.model, self.image_encoder, self.device,
            image_before, image_after,
            length_norm_coefficient=0.6)
        # NOTE(review): special_tokens holds word_map *values*; if translate
        # already returns words instead of ids this filter never matches -
        # confirm against eval_utils.translate.
        kept = []
        for token in best_hypothesis:
            if token in self.special_tokens:
                continue
            kept.append(token)
        return kept
    
# Wire the loaded checkpoint modules and the config into one predictor instance.
predictor = Predictor(
    context=config,
    model=model,
    image_encoder=image_feature_encoder,
    device=device,
)

In [66]:
from skimage.transform import resize as imresize
from skimage.io import imread
from skimage import color
import numpy as np

class Sampler():
    """Loads before/after image pairs from a folder and prepares them as tensors.

    Images are read by file name relative to ``data_folder``, brought to three
    channels, resized to ``image_size`` and returned channel-first.
    """

    def __init__(self, data_folder, transform, image_size=(360, 480)):
        """
        :param data_folder: directory the image names are resolved against
        :param transform: callable applied to each image tensor in
            ``get_sample_by_name``; ``None`` disables the step
        :param image_size: target ``(height, width)`` for resizing; the default
            keeps the previously hard-coded 360x480
        """
        self.data_folder = data_folder
        self.transform = transform
        self.image_size = tuple(image_size)

    def get_image_by_name(self, name):
        """Read one image and return it as a ``(3, height, width)`` array.

        :param name: file name relative to ``data_folder``
        :return: channel-first image array
        :raises AssertionError: if the resized image does not have the expected shape
        """
        img = imread(os.path.join(self.data_folder, name))
        if img.ndim == 2:
            # Grayscale files have no channel axis; replicate the single
            # channel so they do not fail on the shape[2] lookup below.
            img = np.stack([img] * 3, axis=-1)
        elif img.shape[2] == 4:
            # Drop the alpha channel of RGBA files.
            img = color.rgba2rgb(img)
        # NOTE(review): skimage's resize rescales integer images to floats in
        # [0, 1] unless preserve_range=True, so the division by 255 in
        # get_image_pair_by_name may scale the values a second time - confirm
        # this matches the preprocessing used at training time before changing it.
        img = imresize(img, self.image_size)
        height, width = self.image_size
        assert img.shape == (height, width, 3), "But is {}".format(img.shape)
        img = img.transpose(2, 0, 1)  # HWC -> CHW
        assert img.shape == (3, height, width), "But is {}".format(img.shape)
        assert np.max(img) <= 255
        return img

    def get_image_pair_by_name(self, name_before, name_after):
        """Load both images and return them as float tensors divided by 255.

        :param name_before: file name of the "before" image
        :param name_after: file name of the "after" image
        :return: tuple ``(img_before, img_after)`` of ``torch.FloatTensor``
        """
        img_before = self.get_image_by_name(name_before)
        img_after = self.get_image_by_name(name_after)
        img_before = torch.FloatTensor(img_before / 255.)
        img_after = torch.FloatTensor(img_after / 255.)
        return img_before, img_after

    def get_sample_by_name(self, name_before, name_after):
        """Load an image pair and apply ``transform`` to each tensor, if set.

        :param name_before: file name of the "before" image
        :param name_after: file name of the "after" image
        :return: tuple ``(img_before, img_after)``
        """
        img_before, img_after = self.get_image_pair_by_name(name_before, name_after)
        if self.transform is not None:
            img_before = self.transform(img_before)
            img_after = self.transform(img_after)
        return img_before, img_after
    

In [None]:
# Get some more logo image pairs from the test set.

In [1]:
import os
# Show which files the annotations folder contains.
os.listdir("/data/ImageCorpora/blockworld/MNIST/annotations")

['testset.json', 'trainset.json', 'devset.json']

In [6]:
import json
# The file is parsed line by line: every non-empty line is one JSON document.
# Iterating the handle avoids the intermediate list built by readlines(), and
# blank lines are skipped because json.loads would raise on them.
with open("/data/ImageCorpora/blockworld/MNIST/annotations/testset.json", "r") as f:
    testdata = [json.loads(line) for line in f if line.strip()]

In [7]:
# Inspect the fields of the first test-set record.
testdata[0].keys()

dict_keys(['shape_params', 'decoration', 'notes', 'filename', 'states', 'images', 'side_length'])

In [8]:
# Keep only the records whose "decoration" field equals "logo".
logodata = list(filter(lambda record: record["decoration"] == "logo", testdata))

In [17]:
# Flatten the records into (note, images) pairs, one per note of type "A0".
# Each pair carries the image list of the record the note belongs to.
# (The former enumerate() index was never used and has been dropped.)
flatlogodata = [
    (note, record["images"])
    for record in logodata
    for note in record["notes"]
    if note["type"] == "A0"
]

In [18]:
# Number of (note, images) pairs available for sampling.
len(flatlogodata)

181

In [21]:
import random
# Seed the global generator so that the shuffle - and every later draw from
# `random` in a fresh top-to-bottom run - yields the same sample selection.
# The shuffle is in place, so re-running only this cell reorders the list again.
random.seed(0)
random.shuffle(flatlogodata)

In [29]:
# Take the leading pairs of the shuffled list as the ground-truth sample.
N_GROUND_TRUTH = 10
gts = flatlogodata[:N_GROUND_TRUTH]

In [30]:
# For every sampled pair, draw one entry of the note's "notes" list at random
# and look up the image entries addressed by its "start" and "finish" fields.
selection = []
for note, image_names in gts:
    instruction = random.choice(note["notes"])
    selection.append((instruction, image_names[note["start"]], image_names[note["finish"]]))

In [49]:
from shutil import copyfile
sourcedir = "/data/ImageCorpora/blockworld/MNIST/images/testset/"
targetdir = "/data/blockworld_inspect/"
# copyfile does not create missing directories, so make sure the target exists.
os.makedirs(targetdir, exist_ok=True)
# Copy the before and after image of every selected sample for inspection.
for _, before_name, after_name in selection:
    for image_name in (before_name, after_name):
        copyfile(os.path.join(sourcedir, image_name), os.path.join(targetdir, image_name))

In [31]:
# Print each ground-truth instruction followed by its before/after image names.
for instruction, before_name, after_name in selection:
    print("gt:", instruction)
    print(before_name, after_name)
    print()

gt: Slide the BMW vertically below the Adidas block.
19_num1_06.png 19_num1_07.png

gt: position the target block so that its aligned with the McDonald block and aligned with the center and top right of the Pepsi block.
59_num5_01.png 59_num5_02.png

gt: Move Burger King so it is below BMW
99_num9_09.png 99_num9_10.png

gt: Put the McDonalds block in the same row as the SRI block, horizontally equidistant between the SRI and Adidas blocks.
48_num4_00.png 48_num4_01.png

gt: Place the Nvidia block south of the Mercedes block.
48_num4_16.png 48_num4_17.png

gt: Move the Twitter block below the Toyota block
68_num6_03.png 68_num6_04.png

gt: Place the Burger King block in the first open space above the Coca Cola block.
8_num0_17.png 8_num0_18.png

gt: Place Twitter so its right edge is flush against the left edge of UPS.
28_num2_01.png 28_num2_02.png

gt: Move the Burger King block to the same vertical column as the Texaco block, and half a row above the McDonalds block.
88_num8_00.png 88

In [None]:
#sampler = Sampler("/data/ImageCorpora/blockworld/MNIST/images/trainset", normalize)
#name_before = "73_num7_03.png"
#name_after = "73_num7_04.png"

In [37]:
# Sampler over the test-set image folder; `normalize` is passed as its transform.
sampler = Sampler("/data/ImageCorpora/blockworld/MNIST/images/testset", normalize)
# Names of one before/after image pair.
# NOTE(review): not referenced by the cells visible below - confirm they are still needed.
name_before = "48_num4_05.png"
name_after = "48_num4_06.png"

In [45]:
# Predict one hypothesis per selected sample. A leading batch axis is added to
# each image tensor before it is passed to the predictor.
hyps = []
for _instruction, before_name, after_name in selection:
    before_img, after_img = sampler.get_sample_by_name(before_name, after_name)
    hyps.append(predictor.predict(before_img.unsqueeze(dim=0), after_img.unsqueeze(dim=0)))

In [48]:
# Show the image names, the lower-cased ground truth and the predicted
# sentence for every sample.
for sample, predicted in zip(selection, hyps):
    instruction, before_name, after_name = sample
    print(before_name, after_name)
    print(instruction.lower())
    print(" ".join(predicted))
    print()

19_num1_06.png 19_num1_07.png
slide the bmw vertically below the adidas block.
move the bmw block below the adidas block.

59_num5_01.png 59_num5_02.png
position the target block so that its aligned with the mcdonald block and aligned with the center and top right of the pepsi block.
move the target block to the left of the twitter block.

99_num9_09.png 99_num9_10.png
move burger king so it is below bmw
move the burger king block below the bmw block.

48_num4_00.png 48_num4_01.png
put the mcdonalds block in the same row as the sri block, horizontally equidistant between the sri and adidas blocks.
put the mcdonalds block in the first open space to the right of the bmw block.

48_num4_16.png 48_num4_17.png
place the nvidia block south of the mercedes block.
move the nvidia block below the mercedes block.

68_num6_03.png 68_num6_04.png
move the twitter block below the toyota block
move the twitter block below the toyota block.

8_num0_17.png 8_num0_18.png
place the burger king block in t

In [67]:
# Repeat the prediction with images of the same names read from the "masked"
# folder (presumably masked variants of the selected images - TODO confirm).
msampler = Sampler("/data/blockworld_inspect/masked", normalize)
mhyps = []
for _instruction, before_name, after_name in selection:
    masked_before, masked_after = msampler.get_sample_by_name(before_name, after_name)
    mhyps.append(predictor.predict(masked_before.unsqueeze(dim=0), masked_after.unsqueeze(dim=0)))

In [69]:
# Compare, per sample, the hypothesis for the test-set images with the one for
# the images from the "masked" folder.
for sample, predicted, masked_predicted in zip(selection, hyps, mhyps):
    _instruction, before_name, after_name = sample
    print(before_name, after_name)
    print(predicted)
    print(masked_predicted)
    print()

19_num1_06.png 19_num1_07.png
['move', 'the', 'bmw', 'block', 'below', 'the', 'adidas', 'block.']
['move', 'the', 'nvidia', 'block', 'below', 'and', 'to', 'the', 'left', 'of', 'the', 'sri', 'block.']

59_num5_01.png 59_num5_02.png
['move', 'the', 'target', 'block', 'to', 'the', 'left', 'of', 'the', 'twitter', 'block.']
['move', 'the', 'target', 'block', 'to', 'the', 'left', 'of', 'the', 'sri', 'block.']

99_num9_09.png 99_num9_10.png
['move', 'the', 'burger', 'king', 'block', 'below', 'the', 'bmw', 'block.']
['move', 'the', 'burger', 'king', 'block', 'to', 'the', 'left', 'of', 'the', 'sri', 'block.']

48_num4_00.png 48_num4_01.png
['put', 'the', 'mcdonalds', 'block', 'in', 'the', 'first', 'open', 'space', 'to', 'the', 'right', 'of', 'the', 'bmw', 'block.']
['put', 'the', 'mcdonalds', 'block', 'in', 'the', 'first', 'open', 'space', 'to', 'the', 'right', 'of', 'the', 'bmw', 'block.']

48_num4_16.png 48_num4_17.png
['move', 'the', 'nvidia', 'block', 'below', 'the', 'mercedes', 'block.']
[