In [1]:
from __future__ import division
import json
from itertools import chain, izip, permutations, combinations
from collections import Counter, defaultdict
import configparser
import os
import random
from textwrap import fill
import scipy
import sys
from copy import deepcopy
from nltk.parse import CoreNLPParser
import nltk
import pandas as pd
import numpy as np

from annoy import AnnoyIndex

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load up config file (needs path; adapt if necessary); local imports
import io

# Default config location; the DSG-VIS-CONF environment variable, when set,
# overrides it. (`dict.has_key` no longer exists on Python 3; `.get` with a
# default works on both major versions.)
my_config = os.environ.get('DSG-VIS-CONF', '../Config/default.cfg')

config = configparser.ConfigParser()
# io.open accepts `encoding` on both Python 2 and Python 3, whereas the
# Python 2 builtin open() rejects that keyword with a TypeError.
with io.open(my_config, 'r', encoding='utf-8') as f:
    config.read_file(f)

corpora_base = config.get('DEFAULT', 'corpora_base')

# Project helper modules are not installed as packages; each directory is
# appended to sys.path immediately before the import that needs it.
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')
sys.path.append(dsgv_home + '/Utils')
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b
sys.path.append(dsgv_home + '/WACs/WAC_Utils')
from wac_utils import create_word2den, is_relational
sys.path.append(dsgv_home + '/Preproc')
from sim_preproc import load_imsim, n_most_sim

# Directory prefix (with trailing slash) passed to load_dfs as `path`.
preproc_path = dsgv_home + '/Preproc/PreprocOut/'

In [3]:
def load_dfs(path, inlist):
    """Read a set of gzip-compressed, split-oriented JSON frames.

    Each name in ``inlist`` is resolved to ``path + name + '.json.gz'``
    (plain string concatenation, so ``path`` must already end with a path
    separator) and parsed with ``pandas.read_json``.

    Returns a dict mapping each name in ``inlist`` to its DataFrame.
    """
    read_opts = dict(typ='frame', orient='split', compression='gzip')
    return {
        name: pd.read_json(path + name + '.json.gz', **read_opts)
        for name in inlist
    }


In [4]:
# Load the single preprocessed frame named 'cococapdf' from preproc_path;
# `df` is the name -> DataFrame dict returned by load_dfs.
df_names = ['cococapdf']
df = load_dfs(preproc_path, df_names)

In [5]:
# Bind the loaded frame to its own name for direct access in later cells.
cococapdf = df['cococapdf']

In [6]:
# Preview the first two rows to check which columns the frame carries.
cococapdf.head(2)

Unnamed: 0,caption,id,image_id,i_corpus
0,A very clean and well decorated empty bathroom,48,318556,1
1,A panoramic view of a kitchen and all of its a...,67,116100,1


In [7]:
# Total number of rows in the caption frame.
len(cococapdf)

414113

In [8]:
# Number of rows left after keeping one row per image_id, i.e. how many
# distinct image_id values the frame contains.
len(cococapdf.drop_duplicates('image_id'))

82783

In [14]:
# Write only the caption column (without the index) to captions.csv in the
# current working directory.
# NOTE(review): presumably this file is the input to the external embedding
# notebook — confirm.
cococapdf['caption'].to_csv('captions.csv', index=False)

The actual embedding is computed on Google Colab (see `embed_captions_colab.ipynb`, which must be run on Colab's servers). The resulting embeddings are loaded here.

In [2]:
# Load the caption embeddings from the .npz archive. np.load on an .npz
# returns a lazy NpzFile that keeps the archive open, so the array is read
# inside a `with` block and the file handle is closed once the data is in
# memory. 'arr_0' is the key under which the array is stored in the archive.
with np.load('../PreprocOut/cap_embeds.npz') as cap_embeds_npz:
    capemb = cap_embeds_npz['arr_0']

In [3]:
# Memory footprint of the embedding array in MiB (bytes / 1024 / 1024).
capemb.nbytes / 1024 / 1024

808.814453125

In [4]:
# Array dimensions of the loaded embeddings.
capemb.shape

(414113, 512)

In [7]:
# Length of the first axis, i.e. how many embedding rows were loaded.
len(capemb)

414113

In [5]:
# Create an empty Annoy index whose vector length is the size of the second
# axis of capemb, using Euclidean distance as the metric.
ind = AnnoyIndex(capemb.shape[1], metric='euclidean')

In [8]:
%%time
# Register every embedding row under its row position, so the ids stored in
# the index are the positional indices into capemb.
for item_id, embedding in enumerate(capemb):
    ind.add_item(item_id, embedding)

CPU times: user 15.4 s, sys: 488 ms, total: 15.9 s
Wall time: 15.7 s


In [9]:
%%time
# Build the index; the argument 50 is Annoy's n_trees (number of trees in
# the forest).
ind.build(50)

CPU times: user 58.3 s, sys: 450 ms, total: 58.8 s
Wall time: 58.8 s


True

In [99]:
# Sanity check: print the 10 nearest neighbours returned by the index for one
# item, each paired with the caption at the same row position in cococapdf.
this_row = 100002
neighbour_ids = ind.get_nns_by_item(this_row, 10)
# One column lookup for all neighbours instead of building a full row per id.
neighbour_captions = cococapdf['caption'].iloc[neighbour_ids]
for this_pair in zip(neighbour_ids, neighbour_captions):
    # print(...) with a single tuple argument produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print(this_pair)

(100002, u'An old looking boat with a bird perched on top of it. ')
(130481, u'some old boat that is on the ground with a bird flying in front of it')
(121632, u'an old boat on a body of water ')
(175569, u'An old boat on a field next to a fence.')
(137716, u'an old photo of a boat in a body of water')
(141599, u'A large old boat docked at an old port.')
(153593, u'a big old boat that is sitting a field')
(193963, u'THIS IS A PHOTO OF A OLD BOAT THAT IS RESTING ON LAND')
(194858, u'a picture of an old boat with a lot of people around it')
(163645, u'An old boat lying on the side of a road.')


In [112]:
# Distance between stored items 1 and 4, as reported by the index.
ind.get_distance(1,4)

1.2413090467453003

In [10]:
# Write the built Annoy index to disk.
# NOTE(review): hard-coded relative path rather than one derived from
# preproc_path — confirm both refer to the intended output directory.
ind.save('../PreprocOut/caps.ann')

True