# Import

In [1]:
import re
#import regex as re
import json
import pickle
import random
import datetime
import numpy as np
from scipy import spatial
import tensorflow as tf
from gensim.models.word2vec import Word2Vec

# Define functions

In [2]:
def sum_caption(caption, model=None, dim=300):
    """Return the sum of the word vectors of all known words in a caption.

    The caption is split on every run of characters that is not a (Latin,
    possibly accented) letter, a typographic apostrophe or a hyphen. Each
    resulting token is looked up in ``model``; tokens that raise ``KeyError``
    (out-of-vocabulary words) are skipped. No case folding is done here, so
    lookups use the caption's original casing.

    Args:
        caption: The caption text.
        model: Mapping-like object supporting ``model[token]`` and raising
            ``KeyError`` for unknown tokens. Defaults to the notebook-level
            ``w2v`` model.
        dim: Dimensionality of the word vectors (300 by default).

    Returns:
        A float ``numpy`` array of length ``dim``; all zeros when no token of
        the caption is known to the model.
    """
    if model is None:
        # Fall back to the globally loaded word2vec model.
        model = w2v
    # Raw string: "\-" in a plain string literal is an invalid escape sequence.
    tokens = re.split(r"[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+", caption)
    # Float accumulator so the result dtype does not depend on whether any
    # token was found.
    caption_vector = np.zeros(dim, dtype=np.float64)
    for token in tokens:
        if not token:
            # re.split yields empty strings at the edges of the caption.
            continue
        try:
            caption_vector = caption_vector + model[token]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum.
            pass
    return caption_vector

# Load data

In [3]:
# NOTE: despite the "train_" prefix this is the 2014 *validation* caption file;
# the names are kept because later cells refer to them.
train_path = "mscoco/captions_val2014.json"
# JSON is UTF-8 by specification; pass the encoding explicitly so the
# platform's locale encoding is never used for decoding.
with open(train_path, 'r', encoding="utf-8") as train_file:
    train_dict = json.load(train_file)

# Preview the first few records of each section. This runs after the file has
# been closed, since only the parsed dict is needed.
print("Number of images: {}".format(len(train_dict["images"])))
for inst in train_dict["images"][:3]:
    print(inst)
print()

print("Number of annotations: {}".format(len(train_dict["annotations"])))
for inst in train_dict["annotations"][:3]:
    print(inst)

Number of images: 40504
{'license': 3, 'file_name': 'COCO_val2014_000000391895.jpg', 'coco_url': 'http://mscoco.org/images/391895', 'height': 360, 'width': 640, 'date_captured': '2013-11-14 11:18:45', 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', 'id': 391895}
{'license': 4, 'file_name': 'COCO_val2014_000000522418.jpg', 'coco_url': 'http://mscoco.org/images/522418', 'height': 480, 'width': 640, 'date_captured': '2013-11-14 11:38:44', 'flickr_url': 'http://farm1.staticflickr.com/1/127244861_ab0c0381e7_z.jpg', 'id': 522418}
{'license': 3, 'file_name': 'COCO_val2014_000000184613.jpg', 'coco_url': 'http://mscoco.org/images/184613', 'height': 336, 'width': 500, 'date_captured': '2013-11-14 12:36:29', 'flickr_url': 'http://farm3.staticflickr.com/2169/2118578392_1193aa04a0_z.jpg', 'id': 184613}

Number of annotations: 202654
{'image_id': 203564, 'id': 37, 'caption': 'A bicycle replica with a clock as the front wheel.'}
{'image_id': 179765, 'id': 38, 'caption'

### Re-structure

In [4]:
# Maybe have dict keys as strings
# Index the images by id. Every entry starts with an empty caption list so
# that an image without any annotation still has a "captions" key for the
# cells further down.
own_dict = {
    im["id"]: {"url": im["flickr_url"], "captions": []}
    for im in train_dict["images"]
}
# Attach each caption to its image. An annotation that points at an unknown
# image id raises KeyError here, as it did before.
for cap in train_dict["annotations"]:
    own_dict[cap["image_id"]]["captions"].append(cap["caption"])

In [5]:
# Sanity-check the restructured index: number of entries plus one sample.
# A bare `len(own_dict)` is only displayed when it is the last expression of
# the cell, so print it explicitly.
print(len(own_dict))
first_entry = next(iter(own_dict.items()), None)
if first_entry is not None:
    print(first_entry)

(391895, {'url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', 'captions': ['A man with a red helmet on a small moped on a dirt road. ', 'Man riding a motor bike on a dirt road on the countryside.', 'A man riding on the back of a motorcycle.', 'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', 'A man in a red shirt and a red hat is on a motorcycle on a hill side.']})


# Build vocabulary

In [11]:
#word_set = set()
#for inst in train_dict["annotations"][:3]:
#    word_set = word_set.union(re.split("[^a-zåàâäæçéèêëîïôöœßùûüÿA-ZÅÀÂÄÆÇÉÈÊËÎÏÔÖŒÙÛÜŸ’\-]+",inst["caption"].lower()))
#word_set.remove('')
#print(word_set)

from gensim import corpora

# Flatten the captions of all images into one list of lower-cased documents,
# keeping the iteration order of own_dict.
documents = [
    caption.lower()
    for record in own_dict.values()
    for caption in record["captions"]
]

### Preview structure of document list

In [12]:
# Show the total number of documents, then the first ten of them.
n_documents = len(documents)
print(n_documents)
print(documents[:10])

202654
['a man with a red helmet on a small moped on a dirt road. ', 'man riding a motor bike on a dirt road on the countryside.', 'a man riding on the back of a motorcycle.', 'a dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', 'a man in a red shirt and a red hat is on a motorcycle on a hill side.', 'a woman wearing a net on her head cutting a cake. ', 'a woman cutting a large white sheet cake.', 'a woman wearing a hair net cutting a large sheet cake.', 'there is a woman that is cutting a white cake', "a woman marking a cake with the back of a chef's knife. "]


# Load word2vec

In [6]:
# Load the pre-trained GoogleNews vectors (binary word2vec format).
# Newer gensim releases moved the loader from Word2Vec to KeyedVectors; prefer
# that one and fall back to the old location on installations without it.
try:
    from gensim.models import KeyedVectors
    _load_w2v = KeyedVectors.load_word2vec_format
except ImportError:
    _load_w2v = Word2Vec.load_word2vec_format
w2v = _load_w2v('GoogleNews-vectors-negative300.bin', binary=True)
# replace=True asks gensim to normalise the vectors in place rather than
# keeping a second, normalised copy.
w2v.init_sims(replace=True)

### Try similarity

In [8]:
query_image = random.choice(list(own_dict.keys()))
# Only one caption of the query image is used so far (index 1).
# TODO: use all captions for query image
query_cap_nr = 1
query_vector = sum_caption(own_dict[query_image]["captions"][query_cap_nr])

# Linear scan over every caption of every other image, keeping the caption
# whose summed word vector has the smallest cosine distance to the query.
curr_best = {"im": 0, "dist": float("inf"), "cap_nr": -1}
for im_id, im_data in own_dict.items():
    if im_id == query_image:
        continue
    for j, caption in enumerate(im_data["captions"]):
        temp_vector = sum_caption(caption)
        # Compute the distance once per candidate. If it comes out as NaN
        # (which may happen for an all-zero vector), the comparison below is
        # False and the candidate is skipped.
        dist = spatial.distance.cosine(query_vector, temp_vector)
        if dist < curr_best["dist"]:
            curr_best["dist"] = dist
            curr_best["im"] = im_id
            curr_best["cap_nr"] = j

# Show images
import webbrowser
if curr_best["cap_nr"] < 0:
    # No candidate produced a comparable distance; "im" still holds the
    # placeholder 0, which must not be used as a key into own_dict.
    print("No similar caption found for image {}".format(query_image))
else:
    webbrowser.open_new(own_dict[query_image]["url"])
    webbrowser.open_new_tab(own_dict[curr_best["im"]]["url"])
    print(own_dict[query_image]["url"])
    print(own_dict[query_image]["captions"][query_cap_nr])
    print(own_dict[curr_best["im"]]["url"])
    print(own_dict[curr_best["im"]]["captions"][curr_best["cap_nr"]])

http://farm4.staticflickr.com/3398/4630652911_38107d0cb9_z.jpg
A fire hydrant is standing in the middle of a road.
http://farm1.staticflickr.com/25/44927440_42e2e35a60_z.jpg
A fire hydrant is standing in the middle of a parking lot.
