In [2]:
# imports

import configparser
import os
from collections import defaultdict
import random
from textwrap import fill
import sys
import os
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

from annoy import AnnoyIndex

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [3]:
# load config file, set up paths, make project-specific imports
config_path = os.environ.get('VISCONF')
if not config_path:
    # try default location, if not in environment
    default_path_to_config = '../../clp-vision/Config/default.cfg'
    if os.path.isfile(default_path_to_config):
        config_path = default_path_to_config

assert config_path is not None, 'You need to specify the path to the config file via environment variable VISCONF.'        

config = configparser.ConfigParser()
with open(config_path, 'r', encoding='utf-8') as f:
    config.read_file(f)

corpora_base = config.get('DEFAULT', 'corpora_base')
preproc_path = config.get('DSGV-PATHS', 'preproc_path')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

sys.path.append(dsgv_home + '/Utils')
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b


sys.path.append('../../Common')
from data_utils import load_dfs

In [4]:
# Load up preprocessed DataFrames. Slow!
# These DataFrames are the result of pre-processing the original corpus data,
# as per dsg-vision/Preprocessing/preproc.py

df_names = ['mscoco_bbdf', 'refcoco_refdf', 'refcocoplus_refdf',
            'vgregdf', 'vgimgdf', 'vgobjdf', 'vgreldf', 'vgattdf', 'cococapdf']
df = load_dfs(preproc_path, df_names)

# a derived DF, containing only those region descriptions which I was able to resolve
df['vgpregdf'] = df['vgregdf'][df['vgregdf']['pphrase'].notnull() & 
                               (df['vgregdf']['pphrase'] != '')]

In [5]:
# intersecting visual genome and coco captions. Slow-ish.
caption_coco_iids = list(set(df['cococapdf']['image_id'].tolist()))
# regions for only those image for which we also have coco captions
visgencocap_regdf = df['vgpregdf'].merge(pd.DataFrame(caption_coco_iids, columns=['coco_id']))
# coco_image_ids for images with both caption and region
vgcap_coco_iids = list(set(visgencocap_regdf['coco_id'].tolist()))
# visgen_image_ids for images with both caption and region
vgcap_vg_iids = list(set(visgencocap_regdf['image_id'].tolist()))

# map coco_ids to visgen_ids, and back
coco2vg = dict(visgencocap_regdf[['coco_id', 'image_id']].values)
vg2coco = dict([(v,k) for k,v in coco2vg.items()])

visgencocap_objdf = df['vgobjdf'].merge(pd.DataFrame(vgcap_vg_iids, columns=['image_id']))

vgobjdf_syned = df['vgobjdf'][~df['vgobjdf']['syn'].isnull()]

In [6]:
# load up the nearest neighbour index of captions in embedding space
ind = AnnoyIndex(512, metric='euclidean')
ind.load(preproc_path + '/caps.ann')

True

In [7]:
# the embedding index works via the row number in cococapdf, so need mapping

# N.B.: this is actually lossy. This is the row of the *first* caption for that
#  image. But there will also be other rows (captions) for same image.
#  So, should really one to many mapping. But for purposes here, is ok.
coco2row = dict(zip(df['cococapdf']['image_id'].tolist(), df['cococapdf'].index.tolist()))
# This is also the reason which this here is not inverse of above:
row2coco = dict(zip(df['cococapdf'].index.tolist(), df['cococapdf']['image_id'].tolist()))

# Caption / Object

Given the situation described by the caption, could this object be present?

In [8]:
capobj_df = pd.read_csv('EntailOut/capobj.csv')

In [9]:
capobj_df.head(2)

Unnamed: 0,image_id,premise,hypothesis,hypothesis_syn,label
0,348816,A small white dog sitting on a piece of luggage.,scratch,abrasion.n.01,1
1,261196,A table has a meal placed on top of it.,cardboard,cardboard.n.01,0


## Predict via retrieved models

The idea is that given the premise, related images (= models) are retrieved, based on the similarity of their caption to that premise. That is, we first do an image retrieval task via captions. Then we check whether the object mentioned in the hypothesis is present in the retrieved images, and via that, answer the question whether it is also likely to be present in the situation described by the premise.

A hyperparameter is the threshold above which we say yes. If set to 0.2, this means that we are happy if it is in 20% of the retrieved models.

In [10]:
def retrieve_captions(background_data, this_cocoii, nns_max_try=50, nns_max=10):
    '''Retrieve some nearest captions, ensuring that they are from different image
    '''
    ind, coco2row, coco2vg, vgcap_coco_iids = background_data
    # this works on row numbers in cococap
    this_row = coco2row[this_cocoii]
    nns = ind.get_nns_by_item(this_row, nns_max_try)
    # translate back to coco image_ids
    nns = [row2coco[nn] for nn in nns]
    # it's quite likely that the other captions of the same image
    # ended up very similar. Mapped to coco image_ids, that means
    # that we may have same id multiple times in this list. 
    # To filter these out & ensure that we really retrieve different
    # images, and do this w/o changing order (as list(set(.)) would do)
    seen = set()
    seen.add(this_cocoii) # make sure that seed image is filtered out
    nns_filtered = []
    for nn in nns:
        if nn not in seen:
            nns_filtered.append(nn)
            seen.add(nn)
    # filter so that only coco ids in visual genome remain
    nns_filtered = [nn for nn in nns_filtered if nn in vgcap_coco_iids]
    # at most nns_max 
    nns_filtered = nns_filtered[:min(nns_max, len(nns_filtered))]
    # finally, translate into vg image_ids
    nns_filtered = [int(coco2vg[e]) for e in nns_filtered]
    # returns vg image_ids 
    return nns_filtered

def extract_model(objdf, pregdf, ii):
    ic = icorpus_code['visual_genome']
    individuals = query_by_id(objdf, (ic, ii), 'obj_id syn name'.split()).values
    D = [e[0] for e in individuals]
    I_indv = defaultdict(list)
    _ = [I_indv[osyn].append(oid) for oid, osyn, _ in individuals if osyn is not None]
    
    relations = [r for rs in query_by_id(pregdf, (ic, ii), 'rels') for r in rs]
    I_rels = defaultdict(list)
    _ = [I_rels[rsyn].append((sid, oid)) for sid, _, rsyn, oid in relations if rsyn is not None]
    return D, I_indv, I_rels

In [11]:
background_data = (ind, coco2row, coco2vg, vgcap_coco_iids)

An example. Retrieve a related model (type). Given a target caption, retrieve similar captions, and from the image linked to one of these, retrieve all object and relation types.

In [12]:
this_ii = capobj_df.iloc[0,0]
nns_filtered = retrieve_captions(background_data, this_ii, nns_max_try=50, nns_max=10)

D, I_indv, I_rels = extract_model(df['vgobjdf'], df['vgpregdf'], nns_filtered[3])
print("size of D:", len(D))
print("object types:", I_indv.keys())
print("relation types:", I_rels.keys())

size of D: 61
object types: [u'backpack.n.01', u'mouth.n.01', u'head.n.01', u'woman.n.01', u'leash.n.01', u'skirt.n.01', u'dog.n.01', u'eye.n.01', u'ear.n.01', u'nose.n.01', u'handle.n.01', u'slide_fastener.n.01', u'bag.n.01', u'tongue.n.01', u'pocket.n.01', u'vest.n.01', u'battalion.n.02', u'television.n.01', u'man.n.01', u'jacket.n.01', u'sign.n.02', u'hair.n.01']
relation types: [u'have.v.01', u'along.r.01', u'wear.v.01', u'transport.v.02']


In [13]:
def debug_print(message, debug):
    if debug:
        print(message)
    
def predict_via_retrieval(background_data, this_row, debug=False,
                          threshold=0.2, nns_max=6):
    ind, coco2row, coco2vg, vgcap_coco_iids = background_data
    
    coco_ii, cap, hyp, hyp_syn, label = this_row
    
    debug_print("=" * 60, debug)
    debug_print(this_row, debug)
    
    nns_filtered = retrieve_captions(background_data, coco_ii, nns_max=nns_max)
    
    score = 0
    debug_print("retrieved:", debug)
    for this_nn in nns_filtered:
        debug_print('  ' + str(this_nn) + ' ' + df['cococapdf'].iloc[coco2row[vg2coco[this_nn]]]['caption'], debug)
        D, I_indv, I_rels = extract_model(df['vgobjdf'], df['vgpregdf'], this_nn)
        if hyp_syn in I_indv.keys():
            debug_print( '       match', debug)
            score += 1
    score /= nns_max
    debug_print( "score: {}".format(score), debug)
    
    return 1 if ((score > threshold and label == 1) or (score <= threshold and label == 0)) else 0

In [15]:
pd.set_option('display.max_colwidth', 200)

In [16]:
predict_via_retrieval(background_data, capobj_df.iloc[19], debug=True)

image_id                                                                             519374
premise           He is expertly using his skateboard to go in and out of the street cones.
hypothesis                                                                              man
hypothesis_syn                                                                     man.n.01
label                                                                                     1
Name: 19, dtype: object
retrieved:
  2317174 Two skateboarders make there way through a cone obstacle course.
       match
  2392316 Skate boarder keeping balance while navigating green cones
       match
  286083 A man in blue jeans and a helmet rides a skateboard near two green cones.
  2318884 A young man is skateboarding down a hill.
       match
  2368945 The young man is going around the cone on his skateboard.
       match
  2384481 A skateboarder doing a trick on a ramp in the evening.
       match
score: 0.833333333333


1

The following is very slow, so am not rerunning this here... See the (less commented) original notebook `model_building_orig.ipynb`.

In [14]:
%%time
capobj_scores = capobj_df.apply(lambda x: predict_via_retrieval(background_data, x), axis=1)

CPU times: user 6h 35min 14s, sys: 44min 46s, total: 7h 20min
Wall time: 1h 42min 22s


In [20]:
def evaluate(scores):
    return scores.sum() / len(scores)

In [21]:
evaluate(capobj_scores)

NameError: name 'capobj_scores' is not defined

## Predict via string matching

Baseline 1: Is the object name mentioned in the premise?

In [17]:
def predict_via_string(this_row, mode='prediction'):
    _coco_ii, cap, hyp, _hyp_syn, label = this_row
    return ( 1 if ((hyp in cap) and (label == 1)) or 
                   ((hyp not in cap) and (label == 0)) else 0)

In [18]:
scores_bsln = capobj_df.apply(lambda x: predict_via_string(x), axis=1)

In [22]:
evaluate(scores_bsln)

0.57505

## Predict via embedding distance

Baseline 2: Predict via euclidean distance in the embedding space, between the caption (premise) and the object name (hypothesis). We choose the threshold distance for making the decision so that it partitions the set (as we know that the test set is balanced). That is, we choose it in such a way that it assigns "yes" to 10k instances and "no" to the 10k others.

In [24]:
objvecs = np.load('EntailOut/capobj.npz')['arr_0']

In [26]:
capvecs = np.load(preproc_path + '/cap_embeds.npz')['arr_0']

In [27]:
row_n = 17
coco_ii, cap, hyp, hyp_syn, label = capobj_df.iloc[row_n]
objvec = objvecs[row_n]
capvec = capvecs[coco2row[coco_ii]]
euclidean(objvec, capvec)

1.3174902200698853

In [28]:
capvec_indices = [coco2row[e] for e in capobj_df['image_id'].tolist()]

capvec_matrix = capvecs[capvec_indices]
# subset of caption embedding matrix, in order of test data set

distances = np.sqrt(np.sum((capvec_matrix - objvecs)**2, axis=1))
# vector of distances between caption and object name

In [29]:
# set threshold so that it partitions set
threshold = 1.27
(distances < threshold).sum()

10191

In [30]:
((capobj_df[distances < threshold]['label'] == 1).sum() + 
 (capobj_df[distances >= threshold]['label'] == 0).sum()) / len(capobj_df)

0.64105

# Caption / Region

Now we do the same with pairs of caption (hypothesis) and region (premise).

In [31]:
capreg_df = pd.read_csv('EntailOut/capreg.csv')

In [32]:
# the relations as read from csv are strings; must be turned into objects
capreg_df['hyp_rel'] = capreg_df['hyp_rel'].apply(lambda x: eval(x))

In [33]:
capreg_df.head(2)

Unnamed: 0,image_id,premise,hypothesis,hyp_rel,label
0,421813,A young boy holding a donut with pink sprinkles on it.,boats in the water,"[[544272, IN, in.r.01, 544263]]",0
1,290935,The young man is riding his skateboard down the street.,A man in a green shirt.,"[[2555677, IN, in.r.01, 3904012]]",0


## Predict via retrieved models

In [34]:
def get_obj_syn(df, obj_id, key='syn'):
    return df[df['obj_id'] == obj_id][key].values[0]

def rels2model_type(objdf, rels):
    ind_types = []
    rel_types = []
    for sid, _, rsyn, oid in rels:
        [ind_types.append(get_obj_syn(objdf, this_id)) for this_id in [sid, oid]]
        rel_types.append(rsyn)
    return set(ind_types), set(rel_types)

In [35]:
coco_ii, cap, hyp, hyp_rel, label = capreg_df.iloc[10]
rels2model_type(df['vgobjdf'], hyp_rel)

({u'curtain.n.01', u'piano.n.01'}, {u'behind.r.01'})

In [36]:
def score_model(I_indiv, I_rels, hyp_indiv, hyp_rels):
    score_ind = len(set(I_indiv.keys()).intersection(hyp_indiv)) / len(hyp_indiv)
    score_rel = len(set(I_rels.keys()).intersection(hyp_rels)) / len(hyp_rels)
    return (score_ind + score_rel) / 2

def predict_via_retrieval_region(background_data, this_row, debug=False,
                                 threshold=0.2, nns_max=6):
    ind, coco2row, coco2vg, vgcap_coco_iids = background_data
    
    coco_ii, cap, hyp, hyp_rel, label = this_row
    debug_print("=" * 60, debug)
    debug_print((cap, hyp, label), debug)
    
    nns_filtered = retrieve_captions(background_data, coco_ii, nns_max=nns_max)
    
    hyp_indiv, hyp_rels = rels2model_type(df['vgobjdf'], hyp_rel)
    debug_print('hyp_ind: {}  hyp_rels: {}'.format(hyp_indiv, hyp_rels), debug)

    score = 0
    debug_print("retrieved:", debug)
    for this_nn in nns_filtered:
        debug_print('  ' + str(this_nn) + ' ' + df['cococapdf'].iloc[coco2row[vg2coco[this_nn]]]['caption'], debug)
        D, I_indv, I_rels = extract_model(df['vgobjdf'], df['vgpregdf'], this_nn)
        
        score += score_model(I_indv, I_rels, hyp_indiv, hyp_rels)

    score /= len(nns_filtered)
    debug_print( "score: {}".format(score), debug)
    
    return 1 if ((score > threshold and label == 1) or (score <= threshold and label == 0)) else 0

In [37]:
predict_via_retrieval_region(background_data, capreg_df.iloc[4], debug=True)

('A piece of cake with icing on a plate. ', 'a girl wearing a white shirt', 0)
hyp_ind: set([u'girl.n.01', u'shirt.n.01'])  hyp_rels: set([u'wear.v.01'])
retrieved:
  2365845 Apples and dessert are plated on a table.
  2392234 A white plate with a piece of cake next to a puff of whipped cream.
  2391081 A small piece of cake sits beside a fork on a tiny plate.
  2384101 A piece of pastry is speared by a fork.
  2329260 A plate with a slice of dessert with whipped cream.
  2323507 A table with a white plate and a cake.
score: 0.0


1

In [33]:
%%time
capreg_scores = capreg_df.apply(lambda x: predict_via_retrieval_region(background_data, x),
                               axis=1)

CPU times: user 6h 39min 32s, sys: 44min 57s, total: 7h 24min 30s
Wall time: 1h 43min 24s


In [34]:
evaluate(capreg_scores)

0.64875

## Predict via string matching

In [38]:
def jaccard_sim(phrase_A, phrase_B):
    set_A = set(phrase_A.split())
    set_B = set(phrase_B.split())
    return len(set_A.intersection(set_B)) / len(set_A.union(set_B))

In [39]:
def predict_via_jaccard(this_row, threshold=0.2):
    coco_ii, cap, hyp, hyp_rel, label = this_row
    return 1 if (((jaccard_sim(cap, hyp) >= threshold) and label == 1) or
                 ((jaccard_sim(cap, hyp) < threshold and label == 0))) else 0

In [40]:
scores_bsln_reg = capreg_df.apply(lambda x: predict_via_jaccard(x), axis=1)

In [41]:
evaluate(scores_bsln_reg)

0.54435

## Predict via embedding distance

In [42]:
regvecs = np.load('EntailOut/capreg.npz')['arr_0']

In [43]:
distances_reg = np.sqrt(np.sum((capvec_matrix - regvecs)**2, axis=1))
# vector of distances between caption and region embeddings

In [44]:
# set threshold so that it partitions set
threshold_reg = 1.245
(distances_reg < threshold_reg).sum()

10307

In [45]:
((capreg_df[distances_reg < threshold_reg]['label'] == 1).sum() + 
 (capreg_df[distances_reg >= threshold_reg]['label'] == 0).sum()) / len(capreg_df)

0.50255