# Create Test Datasets for Entailments (CapObj, CapReg)

Create the caption / object and caption / region entailment data for testing the model building approach.

In [1]:
# imports
import configparser
import os
import random
import sys
import pandas as pd
import numpy as np

from annoy import AnnoyIndex

In [2]:
# Load up config file (needs path; adapt env var if necessary); local imports

# Locate the config file: the VISCONF environment variable takes precedence,
# otherwise fall back to the default location relative to this notebook.
config_path = os.environ.get('VISCONF')
if not config_path:
    # try default location, if not in environment
    default_path_to_config = '../../clp-vision/Config/default.cfg'
    if os.path.isfile(default_path_to_config):
        config_path = default_path_to_config

# Test truthiness rather than `is not None`: an empty VISCONF with no default
# file present must fail here, not later inside open('') with an unrelated error.
assert config_path, 'You need to specify the path to the config file via environment variable VISCONF.'

config = configparser.ConfigParser()
with open(config_path, 'r', encoding='utf-8') as f:
    config.read_file(f)

# Paths read from the config file.
corpora_base = config.get('DEFAULT', 'corpora_base')
preproc_path = config.get('DSGV-PATHS', 'preproc_path')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')


# The project-local helper modules are imported by bare module name, so their
# directories have to be put on sys.path first.
sys.path.append(dsgv_home + '/Utils')
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b

sys.path.append('../../Common')
from data_utils import load_dfs

In [3]:
# Load up preprocessed DataFrames. Slow!
# The frames are the result of pre-processing the original corpus data,
# as per dsg-vision/Preprocessing/preproc.py

df_names = [
    'mscoco_bbdf', 'refcoco_refdf', 'refcocoplus_refdf',
    'vgregdf', 'vgimgdf', 'vgobjdf', 'vgreldf', 'vgattdf',
    'cococapdf',
]
df = load_dfs(preproc_path, df_names)

# Derived frame: only the region descriptions that could be resolved,
# i.e. whose 'pphrase' is neither null nor the empty string.
df['vgpregdf'] = df['vgregdf'].loc[
    lambda regions: regions['pphrase'].notnull() & (regions['pphrase'] != '')
]

In [4]:
# Intersect Visual Genome with the COCO captions.
caption_coco_iids = list(set(df['cococapdf']['image_id'].tolist()))

# Regions of only those images for which we also have COCO captions
# (inner join on 'coco_id', the single column of the right-hand frame).
visgencocap_regdf = df['vgpregdf'].merge(
    pd.DataFrame({'coco_id': caption_coco_iids}), on='coco_id', how='inner')

# COCO image ids / Visual Genome image ids of images with both caption and region
vgcap_coco_iids = list(set(visgencocap_regdf['coco_id'].tolist()))
vgcap_vg_iids = list(set(visgencocap_regdf['image_id'].tolist()))

# Lookup tables between the two id spaces, in both directions.
coco2vg = dict(visgencocap_regdf[['coco_id', 'image_id']].values)
vg2coco = {vg_id: coco_id for coco_id, vg_id in coco2vg.items()}

# Objects restricted to the same set of images.
visgencocap_objdf = df['vgobjdf'].merge(
    pd.DataFrame({'image_id': vgcap_vg_iids}), on='image_id', how='inner')

In [5]:
# objects that carry a synset annotation (non-null 'syn')
vgobjdf_syned = df['vgobjdf'][df['vgobjdf']['syn'].notnull()]

In [6]:
# Nearest-neighbour index of the captions in embedding space:
# built for 512-dimensional vectors, compared by Euclidean distance.
caps_index_file = preproc_path + '/caps.ann'
ind = AnnoyIndex(512, metric='euclidean')
ind.load(caps_index_file)

True

In [7]:
# the embedding index works via the row number in cococapdf, so need mapping

# N.B.: this mapping is lossy. dict() keeps the value seen last for a repeated
#  key, so each image_id maps to the row of only *one* of its captions (the
#  last one in cococapdf); the other caption rows of that image are dropped.
#  It should really be a one-to-many mapping, but for the purposes here it is ok.
coco2row = dict(zip(df['cococapdf']['image_id'].tolist(), df['cococapdf'].index.tolist()))
# This is also the reason why the following is not the inverse of the above:
row2coco = dict(zip(df['cococapdf'].index.tolist(), df['cococapdf']['image_id'].tolist()))

**Seed the random generators**

For reproducibility, seed the random number generators so that the datasets are created identically on every run.

In [8]:
# Seed both generators available in this notebook. pandas' .sample() draws
# from NumPy's global state when no random_state is passed, so
# np.random.seed() also fixes the un-seeded .sample() calls further down.
random.seed(42)
np.random.seed(42)
# Sanity check only: random_state=42 pins this single call, not the global state.
df['vgobjdf'].sample(random_state=42)

Unnamed: 0,i_corpus,image_id,obj_id,syn,name,bb
3875508,5,2417469,3136701,flower.n.01,purple flower,"[253, 113, 66, 43]"


# Caption / Object

Format: `caption, object that is indeed in image (name + synset), object from some other image (name + synset)`. Note that no check is made whether the "negative" hypothesis object is or isn't in target image.

This data makes two ways of evaluation possible: *Choice* (which one is the correct hypothesis), and *Prediction* (is this hypothesis correct?). The latter style means that there are two predictions for each row (one for the positive hypothesis, one for the negative one).

In [9]:
%%time
n_samples = 10000
selected_capcocoiids = np.random.choice(vgcap_coco_iids, n_samples)
sampled_names = vgobjdf_syned.sample(n_samples)[['name', 'syn']].values

triples = []
for this_cocoii, nhyp in zip(selected_capcocoiids, sampled_names):
    prem = df['cococapdf'][df['cococapdf']['image_id'] == this_cocoii].sample()['caption'].values[0]
    all_p_names = visgencocap_objdf[visgencocap_objdf['image_id'] == coco2vg[this_cocoii]]
    if len(all_p_names) == 0:
        continue
    phyp, phyp_syn = all_p_names.sample()[['name', 'syn']].values[0]
    triples.append((int(this_cocoii), prem, phyp, phyp_syn, 1))
    triples.append((int(this_cocoii), prem, nhyp[0], nhyp[1], 0))

CPU times: user 32.6 s, sys: 103 ms, total: 32.7 s
Wall time: 32.8 s


In [10]:
# inspect the first generated row: (image_id, premise, hypothesis, hypothesis_syn, label)
triples[0]

(370121,
 u'A bench sitting out in a field next to a tree.',
 u'table',
 u'table.n.02',
 1)

In [11]:
# One row per (premise, hypothesis) pair; label 1 marks the positive hypothesis, 0 the negative one.
capobj_columns = ['image_id', 'premise', 'hypothesis', 'hypothesis_syn', 'label']
capobjdf = pd.DataFrame(triples, columns=capobj_columns)

In [12]:
# Shuffle the rows (frac=1 draws every row exactly once) and write them out.
# Create the output directory first so that a fresh checkout does not fail here.
os.makedirs('EntailOut', exist_ok=True)
capobjdf.sample(frac=1).to_csv('EntailOut/capobj.csv', index=False)

# Caption / Region

In [13]:
%%time

selected_capcocoiids = np.random.choice(vgcap_coco_iids, n_samples)

sampled_regions = df['vgpregdf'].sample(n_samples)['phrase rels'.split()].values
#sampled_regions = [(e[0], e[1][0]) for e in sampled_regions]

reg_triples = []
for this_cocoii, nregion in zip(selected_capcocoiids, sampled_regions):
    npreg, nrel = nregion
    prem = df['cococapdf'][df['cococapdf']['image_id'] == this_cocoii].sample()['caption'].values[0]
    all_p_regions = df['vgpregdf'][df['vgpregdf']['image_id'] == coco2vg[this_cocoii]]
    if len(all_p_regions) == 0:
        continue
    ppreg, prel = all_p_regions.sample()[['phrase', 'rels']].values[0]
    reg_triples.append((int(this_cocoii), prem, ppreg, prel, 1))
    reg_triples.append((int(this_cocoii), prem, npreg, nrel, 0))

CPU times: user 31.6 s, sys: 79.1 ms, total: 31.6 s
Wall time: 31.7 s


In [14]:
# preview the first two rows (a positive/negative pair); columns are still unnamed here
pd.DataFrame(reg_triples).head(2)

Unnamed: 0,0,1,2,3,4
0,246444,A long row of motor scooters on a city street.,windows in a building,"[[2493630, IN, in.r.01, 3688836]]",1
1,246444,A long row of motor scooters on a city street.,one cat is in the car,"[[297622, IN, in.r.01, 297623]]",0


In [15]:
# One row per (premise, hypothesis) pair; label 1 marks the positive hypothesis, 0 the negative one.
capreg_columns = ['image_id', 'premise', 'hypothesis', 'hyp_rel', 'label']
capregdf = pd.DataFrame(reg_triples, columns=capreg_columns)

In [16]:
# Shuffle the rows (frac=1 draws every row exactly once) and write them out.
# Create the output directory first so that a fresh checkout does not fail here.
os.makedirs('EntailOut', exist_ok=True)
capregdf.sample(frac=1).to_csv('EntailOut/capreg.csv', index=False)