In [1]:
pwd

'/home/dfried/projects/ImageCaptioning.pytorch/notebooks'

In [2]:
import os

In [3]:
# Expose only GPU index 7 to CUDA for this kernel (machine-specific; adjust as needed).
# This is set before `torch` is imported in a later cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'

In [4]:
cd /home/dfried/projects/ImageCaptioning.pytorch

/home/dfried/projects/ImageCaptioning.pytorch


In [5]:
# `display`/`HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width so image grids fit.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
import argparse
import captioning.utils.opts as opts
import captioning.utils.misc as utils
import captioning.models as models
from captioning.utils import eval_utils

In [7]:
from captioning.data.dataloader import NearestNeighborIndex
from captioning.data.dataloader import DataLoader

In [8]:
# Build an argument parser and hand it to the project's `opts` helpers
# (presumably they register the evaluation and diversity options — confirm in captioning.utils.opts).
parser = argparse.ArgumentParser()
opts.add_eval_options(parser)
opts.add_diversity_opts(parser)

In [9]:
# Parse an explicit empty argument list so the kernel's own argv is not consumed;
# every option therefore takes its parser default.
opt = parser.parse_args([])

In [10]:
# Relative paths to the checkpoint and its pickled training metadata.
# NOTE(review): `model_fname` is not referenced in the cells visible here.
model_fname = 'models/updown/model-best.pth'
infos_fname = 'models/updown/infos_tds-best.pkl'

In [11]:
# Read the training metadata through the project's `pickle_load` helper.
# NOTE(review): presumably this unpickles the file — only load checkpoints you trust.
with open(infos_fname, 'rb') as f:
    infos = utils.pickle_load(f)
# Last expression of the cell: show which top-level entries the metadata holds.
infos.keys()

dict_keys(['vocab', 'opt', 'best_val_score', 'iter', 'iterators', 'epoch', 'split_ix'])

In [12]:
# Merge the options stored with the checkpoint into the notebook's `opt`:
#   * keys in `replace`: keep a truthy value already on `opt`, else use the checkpoint's
#   * keys in `ignore`:  never copied
#   * anything else:     copied only when `opt` does not define it yet
replace = [
    'input_fc_dir', 'input_att_dir', 'input_box_dir', 'input_label_h5',
    'input_json', 'batch_size', 'id',
]
ignore = ['start_from']

for k in vars(infos['opt']):
    if k in replace:
        setattr(opt, k, getattr(opt, k) or getattr(infos['opt'], k, ''))
        continue
    if k in ignore or k in vars(opt):
        continue
    # copy over options from model
    setattr(opt, k, vars(infos['opt'])[k])

In [13]:
# Notebook-specific overrides applied after the checkpoint options were merged in.
opt.max_images_per_split = None  # presumably "no cap per split" — confirm against DataLoader
opt.num_workers = 0  # presumably keeps data loading in the main process — confirm

In [14]:
import torch

In [15]:
from clip import clip

In [16]:
from PIL import Image

In [17]:
# Device the CLIP model, image tensors and tokenized text are moved to below.
device = 'cuda'

In [18]:
# Load the "ViT-B/32" CLIP model together with its matching image preprocessing
# transform, then move the model to `device`.
clip_model, clip_transform = clip.load("ViT-B/32")
clip_model = clip_model.to(device)

In [19]:
# Build the project DataLoader from the merged options.
# NOTE(review): the two overrides presumably disable shuffling and wrap-around so
# batches come out in a fixed order — confirm in captioning.data.dataloader.
loader = DataLoader(opt, shuffle_override=False, wrap_override=False)

DataLoader loading json file:  data/cocotalk.json
vocab size is  9487
DataLoader loading h5 file:  data/cocobu_fc data/cocobu_att data/cocotalk_box data/cocotalk_label.h5
max sequence length in data is 16
read 123287 image features
assigned 113287 images to split train
assigned 5000 images to split val
assigned 5000 images to split test


In [20]:
# Vocabulary used by get_captions_from_batch, which indexes it with stringified token ids.
vocab = loader.get_vocab()

In [21]:
def wrap_tag(tag, inner):
    """Return ``inner`` enclosed in an opening and a closing ``tag`` element."""
    return '<{0}>{1}</{0}>'.format(tag, inner)

def image_html(image_path, width=300, border=False):
    """Build an ``<img>`` tag for ``image_path``.

    Args:
        image_path: value placed in the ``src`` attribute.
        width: value placed in the ``width`` attribute.
        border: when truthy, an inline 5px ``#0FF`` border style is added.

    Returns:
        The tag as a string.
    """
    style = ' style="border: 5px solid #0FF" ' if border else ''
    return f'<img width={width} src="{image_path}" {style}></img>'

def captions_html(captions):
    """Render ``captions`` as an HTML ordered list with one ``<li>`` per caption."""
    list_items = [wrap_tag('li', caption) for caption in captions]
    return wrap_tag('ol', ''.join(list_items))

def images_html(image_paths, width=300, num_per_row=5, target=None, captions=None):
    """Lay images out in an HTML table, optionally with a caption row under each image row.

    Args:
        image_paths: sequence of image paths, one table cell each.
        width: width attribute forwarded to every ``<img>`` tag.
        num_per_row: number of images per table row.
        target: index of the image to draw with a highlight border, if any.
        captions: optional sequence aligned with ``image_paths``; each entry is the
            list of captions rendered beneath the corresponding image.

    Returns:
        The ``<table>`` markup as a string.
    """
    num_images = len(image_paths)
    table_rows = []
    for row_start in range(0, num_images, num_per_row):
        row_indices = range(row_start, min(row_start + num_per_row, num_images))

        image_cells = []
        for image_ix in row_indices:
            img_tag = image_html(image_paths[image_ix], width=width, border=image_ix == target)
            image_cells.append(wrap_tag('td', img_tag))
        table_rows.append(wrap_tag('tr', ''.join(image_cells)))

        if captions is None:
            continue
        caption_cells = []
        for image_ix in row_indices:
            caption_cells.append(wrap_tag('td', captions_html(captions[image_ix])))
        table_rows.append(wrap_tag('tr', ''.join(caption_cells)))
    return wrap_tag('table', ''.join(table_rows))

def display_images(image_paths, width=300, num_per_row=5, target=None, captions=None):
    """Render the table produced by ``images_html`` inline in the notebook.

    All arguments are forwarded unchanged to ``images_html``.
    """
    table_markup = images_html(
        image_paths,
        width=width,
        num_per_row=num_per_row,
        target=target,
        captions=captions,
    )
    display(HTML(table_markup))

In [22]:
def get_captions_from_batch(data, ix_to_word=None):
    """Decode the label ids of a batch into caption strings.

    Args:
        data: batch dict. ``data['labels']`` is iterated as images -> captions ->
            token ids, and every id must expose ``.item()``.
        ix_to_word: optional mapping from the *stringified* token id to its word.
            When omitted, the notebook-level ``vocab`` is used, so existing
            single-argument calls behave as before.

    Returns:
        One list per image, each holding one space-joined string per caption.
        Ids equal to 0 are skipped (presumably padding — confirm against the loader).
    """
    # Resolve the lookup once so the function no longer depends on hidden global
    # state when a mapping is supplied explicitly.
    lookup = vocab if ix_to_word is None else ix_to_word
    return [
        [
            ' '.join(lookup[str(ix.item())] for ix in img_labels if ix != 0)
            for img_labels in batch_labels
        ]
        for batch_labels in data['labels']
    ]

In [23]:
def score_clip(image, text):
    """Score every text candidate against the image(s) with CLIP.

    Args:
        image: preprocessed image batch passed to ``clip_model``.
        text: tokenized text batch passed to ``clip_model``.

    Returns:
        ``logits_per_image`` softmaxed over the last (text) dimension, so each
        image's scores sum to 1 across the supplied candidates.
    """
    with torch.no_grad():
        # The model's forward pass already encodes both inputs; the earlier separate
        # encode_image/encode_text calls produced values that were never used and
        # only doubled the compute.
        logits_per_image, _ = clip_model(image, text)
        return logits_per_image.softmax(dim=-1)

In [24]:
import random

In [25]:
def shuffled(lst):
    """Return a randomly permuted copy of ``lst``; the input itself is left untouched."""
    permuted = lst.copy()
    random.shuffle(permuted)
    return permuted

In [26]:
# Pull the next batch from the loader's 'val' iterator.
batch = next(loader.iters['val'])

In [27]:
# Peek at the stored (relative) file path of the first item in the batch.
batch['infos'][0]['file_path']

'val2014/COCO_val2014_000000184613.jpg'

In [30]:
from itertools import groupby

In [42]:
def display_stuff(batch, ix, distractors=('PERSON', 'WOMAN', 'MAN', 'CAMERA', 'TV', 'DOG', 'BANANA', 'LEFT'), group_length=False):
    """Show one batch image with its captions and print CLIP scores for caption prefixes.

    For every reference caption the candidate set is: the full caption, every
    proper prefix of it (including the empty prefix), and each prefix extended by
    one distractor word. All candidates of a caption are scored jointly, so the
    printed values are softmax probabilities over that caption's candidate set.

    Args:
        batch: loader batch; ``batch['infos'][ix]['file_path']`` locates the image.
        ix: index of the item inside the batch.
        distractors: words appended to each prefix as competing continuations.
            Any iterable of strings works; the default is a tuple so no mutable
            object is shared between calls.
        group_length: when true, candidates are printed in groups of equal word
            count (scores are still normalised over the whole candidate set).

    Returns:
        None. Output is displayed / printed, lowest score first within each group.
    """
    fpath = batch['infos'][ix]['file_path']
    captions = get_captions_from_batch(batch)[ix]

    # Close the underlying file as soon as the preprocessed tensor has been built.
    with Image.open(fpath) as pil_image:
        image = clip_transform(pil_image).unsqueeze(0).to(device)
    display_images([fpath], captions=[captions])

    def num_words(scored_pair):
        # Word count of the candidate text in a (candidate, score) pair.
        return len(scored_pair[0].split())

    for cap in captions:
        cap_split = cap.split()
        candidates = [cap]
        # shuffle (disabled ablation)
        # for _ in range(10):
        #     candidates.append(' '.join(shuffled(cap_split)))
        # `prefix_len` is deliberately not called `ix`, to avoid shadowing the parameter.
        for prefix_len in range(len(cap_split)):
            prefix = cap_split[:prefix_len]
            candidates.append(' '.join(prefix))
            for distractor in distractors:
                candidates.append(' '.join(prefix + [distractor]))
        text = clip.tokenize([candidate.lower() for candidate in candidates]).to(device)
        probs = score_clip(image, text).detach().cpu().flatten().numpy()
        scored = list(zip(candidates, probs))
        if group_length:
            # groupby only merges adjacent items, hence the sort by the same key first.
            groups = [list(g) for _, g in groupby(sorted(scored, key=num_words), key=num_words)]
        else:
            groups = [scored]
        for group in groups:
            for candidate, score in sorted(group, key=lambda pair: pair[1]):
                print(f'{score:.4f} {candidate}')
            print()
        print()
        print("----")
        print()

In [35]:
pdb on

Automatic pdb calling has been turned ON


In [43]:
# Probe the first item of the batch, printing candidates grouped by word count.
display_stuff(batch, 0, group_length=True)

0
a child holding a flowered umbrella and petting a yaka young man holding an umbrella next to a herd of cattlea young boy barefoot holding an umbrella touching the horn of a cowa young boy with an umbrella who is touching the horn of a cowa boy holding an umbrella while standing next to livestock


0.0000 

0.0000 TV
0.0000 DOG
0.0000 MAN
0.0000 a
0.0000 CAMERA
0.0000 LEFT
0.0000 WOMAN
0.0000 PERSON
0.0000 BANANA

0.0000 a WOMAN
0.0000 a MAN
0.0000 a DOG
0.0000 a TV
0.0000 a BANANA
0.0000 a LEFT
0.0000 a PERSON
0.0000 a CAMERA
0.0000 a child

0.0000 a child TV
0.0000 a child BANANA
0.0000 a child DOG
0.0000 a child CAMERA
0.0000 a child LEFT
0.0000 a child WOMAN
0.0001 a child PERSON
0.0001 a child MAN
0.0007 a child holding

0.0001 a child holding TV
0.0001 a child holding BANANA
0.0001 a child holding WOMAN
0.0001 a child holding DOG
0.0006 a child holding a
0.0006 a child holding MAN
0.0008 a child holding PERSON
0.0008 a child holding LEFT
0.0012 a child holding CAMERA

0.0000 a child holding a TV
0.0001 a child holding a BANANA
0.0001 a child holding a WOMAN
0.0001 a child holding a DOG
0.0002 a child holding a flowered
0.0004 a child holding a LEFT
0.0005 a child holding a PERSON
0.0005 a child holding a MAN
0.0007 a child holding a CAMERA

0.0000 a child holding a flowered

0.0000 

0.0000 TV
0.0000 DOG
0.0000 MAN
0.0000 LEFT
0.0000 a
0.0000 CAMERA
0.0000 WOMAN
0.0000 PERSON
0.0000 BANANA

0.0000 a WOMAN
0.0000 a MAN
0.0000 a TV
0.0000 a DOG
0.0000 a BANANA
0.0000 a LEFT
0.0000 a PERSON
0.0000 a CAMERA
0.0000 a young

0.0000 a young TV
0.0000 a young DOG
0.0000 a young BANANA
0.0000 a young WOMAN
0.0000 a young LEFT
0.0000 a young MAN
0.0000 a young CAMERA
0.0000 a young PERSON
0.0000 a young boy

0.0000 a young boy DOG
0.0000 a young boy TV
0.0000 a young boy BANANA
0.0000 a young boy MAN
0.0000 a young boy LEFT
0.0000 a young boy WOMAN
0.0000 a young boy with
0.0000 a young boy PERSON
0.0000 a young boy CAMERA

0.0000 a young boy with an
0.0000 a young boy with WOMAN
0.0000 a young boy with TV
0.0000 a young boy with DOG
0.0000 a young boy with LEFT
0.0000 a young boy with MAN
0.0001 a young boy with PERSON
0.0001 a young boy with BANANA
0.0001 a young boy with CAMERA

0.0000 a young boy with an WOMAN
0.0000 a young boy with an TV
0.0000 a young boy wit

In [36]:
# Probe batch item 8 with all candidates ranked together.
# NOTE(review): this cell's execution count (36) is lower than that of the current
# display_stuff definition (42), so the stored output presumably comes from an
# earlier version of the function — re-run to refresh.
display_stuff(batch, 8)

0
an elegant bathroom features a tub sink mirror and decorationsan old fashion above ground tub is shown with gold feeta lovely vintage styled bathroom with a great claw footed tubbathroom with a pedestal sink and claw foot bathtuba claw foot tub is in a large bathroom near a pedestal sink


0.1388 an elegant bathroom features a tub WOMAN
0.0869 an elegant bathroom features
0.0816 an elegant bathroom features LEFT
0.0720 an elegant bathroom features a LEFT
0.0676 an elegant bathroom features a tub PERSON
0.0676 an elegant bathroom features a tub LEFT
0.0597 an elegant bathroom features a tub
0.0330 an elegant bathroom features a
0.0310 an elegant bathroom features a tub sink WOMAN
0.0265 an elegant bathroom features a WOMAN
0.0206 an elegant bathroom features a tub sink mirror and WOMAN
0.0188 an elegant bathroom features a tub sink PERSON
0.0166 an elegant bathroom features a MAN
0.0161 an elegant bathroom features a PERSON
0.0156 an elegant bathroom features a tub TV
0.0151 an elegant bathroom features a tub MAN
0.0137 an elegant bathroom
0.0133 an elegant bathroom features a tub BANANA
0.0131 an elegant bathroom features WOMAN
0.0125 an elegant bathroom features a tub sink
0.0125 an elegant bathroom features a tub sink LEFT
0.0112 an elegant bathroom features a CAMERA
0

0.0984 bathroom with a pedestal sink and claw foot bathtub
0.0510 bathroom with a pedestal LEFT
0.0457 bathroom with a pedestal sink and CAMERA
0.0404 bathroom with a pedestal sink and LEFT
0.0351 bathroom with a pedestal sink LEFT
0.0330 bathroom with a pedestal sink
0.0330 bathroom with a pedestal sink and PERSON
0.0286 bathroom with a pedestal sink and BANANA
0.0282 bathroom with a pedestal sink and WOMAN
0.0269 bathroom with a pedestal sink and claw
0.0269 bathroom with a pedestal sink and claw foot
0.0265 bathroom with a pedestal sink and claw TV
0.0265 bathroom with a pedestal sink and claw foot LEFT
0.0234 bathroom with a pedestal
0.0226 bathroom with a pedestal sink and claw LEFT
0.0226 bathroom with a pedestal sink and claw foot TV
0.0223 bathroom with a pedestal sink and TV
0.0216 bathroom with a pedestal sink and claw WOMAN
0.0210 bathroom with a pedestal sink TV
0.0210 bathroom with a pedestal sink and MAN
0.0166 bathroom with a pedestal sink BANANA
0.0156 bathroom with a p