# Create "Entailment" / Implicature Data Sets

Create the entailment datasets to be tested via Amazon Mechanical Turk.

In [1]:
# imports

from __future__ import division
import codecs
import json
from itertools import chain, izip, permutations, combinations
from collections import Counter, defaultdict
import ConfigParser
import os
import random
from textwrap import fill
import scipy
import sys
from copy import deepcopy
from nltk.parse import CoreNLPParser
import nltk
import pandas as pd
import numpy as np

pd.set_option('max_colwidth', 250)

In [2]:
# Load up config file (needs path; adapt env var if necessary); local imports

# load config file, set up paths, make project-specific imports
config_path = os.environ.get('VISCONF')
if not config_path:
    # try default location, if not in environment
    default_path_to_config = '../Config/default.cfg'
    if os.path.isfile(default_path_to_config):
        config_path = default_path_to_config

assert config_path is not None, 'You need to specify the path to the config file via environment variable VISCONF.'        

config = ConfigParser.SafeConfigParser()
with codecs.open(config_path, 'r', encoding='utf-8') as f:
    config.readfp(f)

corpora_base = config.get('DEFAULT', 'corpora_base')
preproc_path = config.get('DSGV-PATHS', 'preproc_path')
dsgv_home = config.get('DSGV-PATHS', 'dsgv_home')

sys.path.append(dsgv_home + '/Utils')
from utils import icorpus_code, plot_labelled_bb, get_image_filename, query_by_id
from utils import plot_img_cropped, plot_img_ax, invert_dict, get_a_by_b
sys.path.append(dsgv_home + '/Preproc')
from sim_preproc import load_imsim, n_most_sim

In [3]:
# more local imports

import utils
reload(utils)

sys.path.append('../../Common')
import data_utils
reload(data_utils)
from data_utils import load_dfs, plot_rel_by_relid, get_obj_bb, compute_distance_objs
from data_utils import get_obj_key, compute_relpos_relargs_row, get_all_predicate
from data_utils import compute_distance_relargs_row, get_rel_type, get_rel_instances
from data_utils import compute_obj_sizes_row

In [5]:
# Load up preprocessed DataFrames. Slow!
# These DataFrames are the result of pre-processing the original corpus data,
# as per dsg-vision/Preprocessing/preproc.py

df_names = ['refcoco_refdf', #'refcocoplus_refdf', 'grex_refdf',
            'vgregdf', 'vgimgdf', 'vgobjdf', 'vgreldf', 'vgattdf', 'vgvqadf',
            'vgpardf', 'cococapdf']
df = load_dfs(preproc_path, df_names)

# a derived DF, containing only those region descriptions which I was able to resolve
df['vgpregdf'] = df['vgregdf'][df['vgregdf']['pphrase'].notnull() & 
                               (df['vgregdf']['pphrase'] != '')]

# load up pre-computed similarities
coco_sem_sim, coco_sem_map = load_imsim(preproc_path + '/mscoco_sim.npz')
visg_sem_sim, visg_sem_map = load_imsim(preproc_path + '/visgen_sim.npz')
coco_id2semsim = invert_dict(coco_sem_map)
visg_id2semsim = invert_dict(visg_sem_map)

coco_vis_sim, coco_vis_map = load_imsim(preproc_path + '/mscoco_vis_sim.npz')
visg_vis_sim, visg_vis_map = load_imsim(preproc_path + '/visgen_vis_sim.npz')
coco_id2vissim = invert_dict(coco_vis_map)
visg_id2vissim = invert_dict(visg_vis_map)

## Parameters

How many triples to sample of each. (Number of rows will be twice that, as this samples both positive and negative hypotheses.)

In [6]:
n = 30

## refexp / refexp

In [7]:
np.random.seed(42)
df['vgobjdf'].sample(random_state=42)

Unnamed: 0,i_corpus,image_id,obj_id,syn,name,bb
3875508,5,2417469,3136701,flower.n.01,purple flower,"[253, 113, 66, 43]"


In [8]:
# pairing premise and hypotheses
# n = 30 # how many to do
triples = []

this_df = df['refcoco_refdf']

for _ in range(n):
    # seed image
    ic, ii, ri, rexi = this_df.sample()['i_corpus image_id region_id rex_id'.split()].values[0]
    premise, phyp =  np.random.choice(query_by_id(this_df, (ic, ii, ri), 'refexp'), 2, replace=False)

    # negative hypothesis
    nhyp = this_df.sample()['refexp'].values[0]
    triples.append((premise, phyp, nhyp))

In [9]:
out_rows = []
for premise, phyp, nhyp in triples:
    out_rows.append((premise, phyp, 1))
    out_rows.append((premise, nhyp, 0))

In [10]:
rexrexdf = pd.DataFrame(out_rows, columns='premise hypothesis label'.split())

In [11]:
rexrexdf['prompt'] = 'Assume Text 1 is meant to refer to something (which could be a person, an animal, or a thing) in a picture. Could Text 2 refer to the same thing? (That is, could Text 2 be a clarification of Text 1, in some situation?)'

In [12]:
rexrexdf.head(2)

Unnamed: 0,premise,hypothesis,label,prompt
0,middle giraffe,giraffe in middle,1,"Assume Text 1 is meant to refer to something (which could be a person, an animal, or a thing) in a picture. Could Text 2 refer to the same thing? (That is, could Text 2 be a clarification of Text 1, in some situation?)"
1,middle giraffe,right bus orange and white,0,"Assume Text 1 is meant to refer to something (which could be a person, an animal, or a thing) in a picture. Could Text 2 refer to the same thing? (That is, could Text 2 be a clarification of Text 1, in some situation?)"


In [13]:
rexrexdf.sample(frac=1).to_csv('To_Test/rexrex.csv', index=False)

## caption / caption

In [14]:
# pairing premise and hypotheses
#n = 50 # how many to do
tuples = []

this_df = df['cococapdf']
ic = icorpus_code['mscoco']

for _ in range(n):
    # seed image
    ii = np.random.choice(coco_id2semsim.keys())
    premise, phyp =  np.random.choice(query_by_id(this_df, (ic, ii), 'caption'), 2, replace=False)

    # negative hypothesis
    sim_ids = n_most_sim(coco_sem_sim, coco_sem_map, coco_id2semsim[ii], n=5)
    n_ii = np.random.choice(sim_ids[1:])
    nhyp = this_df[this_df['image_id'] == n_ii]['caption'].values[0]
    
    tuples.append((premise, phyp, nhyp))

In [15]:
len(tuples)

30

In [16]:
out_rows = []
for premise, phyp, nhyp in tuples:
    out_rows.append((premise, phyp, 1))
    out_rows.append((premise, nhyp, 0))

In [17]:
capcapdf = pd.DataFrame(out_rows, columns='premise hypothesis label'.split())

In [18]:
prompt = '''Is Text 2 likely to be describing the same situation as Text 1?'''

In [19]:
capcapdf['prompt'] = prompt

In [20]:
capcapdf.head(2)

Unnamed: 0,premise,hypothesis,label,prompt
0,Salmon and egg yolks on toast on a plate with eggs and tomatoes on toast.,The sandwich has many slices of hard boiled eggs.,1,Is Text 2 likely to be describing the same situation as Text 1?
1,Salmon and egg yolks on toast on a plate with eggs and tomatoes on toast.,An editorial sandwich on a plate and newspaper.,0,Is Text 2 likely to be describing the same situation as Text 1?


In [21]:
capcapdf.sample(frac=1).to_csv('To_Test/capcap.csv', index=False)

## Caption / There is

In [22]:
# intersecting visual genome and coco captions. Slow-ish.
caption_coco_iids = list(set(df['cococapdf']['image_id'].tolist()))
# regions for only those image for which we also have coco captions
visgencocap_regdf = df['vgregdf'].merge(pd.DataFrame(caption_coco_iids, columns=['coco_id']))
# coco_image_ids for images with both caption and region
vgcap_coco_iids = list(set(visgencocap_regdf['coco_id'].tolist()))
# visgen_image_ids for images with both caption and region
vgcap_vg_iids = list(set(visgencocap_regdf['image_id'].tolist()))

# map coco_ids to visgen_ids, and back
coco2vg = dict(visgencocap_regdf[['coco_id', 'image_id']].values)
vg2coco = dict([(v,k) for k,v in coco2vg.items()])

df['vgpardf']['coco_image_id'] = df['vgpardf']['image_id'].apply(lambda x: vg2coco.get(x, None))
df['cocoparcapdf'] = df['cococapdf'].merge(df['vgpardf'],
                                           left_on='image_id', right_on='coco_image_id')

In [23]:
# captions and objects (slotted into "there is __" frame)
tuples = []

#n = 50

i = 0
while i < n:
    vgii, cocoii = visgencocap_regdf.sample()['image_id coco_id'.split()].values[0]
    prem = df['cococapdf'][df['cococapdf']['image_id'] == cocoii].sample()['caption'].values[0]
    try:
        phyp = df['vgobjdf'][df['vgobjdf']['image_id'] == vgii].sample()['name'].values[0]
        i += 1
    except:
        continue
    nhyp = df['vgobjdf'].sample()['name'].values[0]
    tuples.append((prem, phyp, nhyp))

In [24]:
len(tuples)

30

In [25]:
def add_existential_prefix(instring):
    prefix = 'There are ' if instring.endswith('s') else 'There is (a) '
    return prefix + instring

In [26]:
out_rows = []
for premise, phyp, nhyp in tuples:
    out_rows.append((premise, add_existential_prefix(phyp), 1))
    out_rows.append((premise, add_existential_prefix(nhyp), 0))

In [27]:
capobjdf = pd.DataFrame(out_rows, columns='premise hypothesis label'.split())

In [28]:
capobjdf.head(10)

Unnamed: 0,premise,hypothesis,label
0,Bunch of people out in the water on surfboards waiting for a wave,There is (a) person,1
1,Bunch of people out in the water on surfboards waiting for a wave,There is (a) hair,0
2,"The plate has a fork, english muffins, and sauce on it.",There is (a) lettuce,1
3,"The plate has a fork, english muffins, and sauce on it.",There are writings,0
4,THERE IS AN IMAGE OF A BATHROOM WITH A TOILET IN THERE,There is (a) floor,1
5,THERE IS AN IMAGE OF A BATHROOM WITH A TOILET IN THERE,There is (a) road,0
6,a white plate a fork and a pizza with black olives,There is (a) pizza,1
7,a white plate a fork and a pizza with black olives,There is (a) bag,0
8,A couple of young women standing near a counter preparing food.,There is (a) woman,1
9,A couple of young women standing near a counter preparing food.,There is (a) sticker,0


In [29]:
prompt = '''Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?'''

In [30]:
capobjdf['prompt'] = prompt

In [31]:
capobjdf.head(2)

Unnamed: 0,premise,hypothesis,label,prompt
0,Bunch of people out in the water on surfboards waiting for a wave,There is (a) person,1,"Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?"
1,Bunch of people out in the water on surfboards waiting for a wave,There is (a) hair,0,"Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?"


In [32]:
capobjdf.sample(frac=1).to_csv('To_Test/capobj.csv', index=False)

## Caption / Region Description

In [33]:
#n = 50

tuples = []

i = 0
# caption + region description
while i < n:
    try:
        p_caps = []
        while len(p_caps) == 0:
            coco_ii = np.nan
            while np.isnan(coco_ii):
                ic, vg_ii, coco_ii = df['vgimgdf'].sample()[['i_corpus', 'image_id', 'coco_id']].values[0]
            p_caps = query_by_id(df['cococapdf'], (icorpus_code['mscoco'], coco_ii))
        p_cap_ind = random.choice(range(len(p_caps)))
        p_cap = p_caps.iloc[p_cap_ind]['caption']
        p_row = p_caps.index[p_cap_ind]

        p_hyp_regions = query_by_id(df['vgpregdf'], (ic, vg_ii))
        p_hyp_regions = p_hyp_regions[~p_hyp_regions['rels'].isnull()]
        p_hyp_reg, p_hyp_relids, p_hyp_rels, p_hyp_pphrase = \
            p_hyp_regions.sample()['phrase rel_ids rels pphrase'.split()].values[0]

        n_hyp_reg, n_hyp_relids, n_hyp_rels, n_hyp_pphrase = \
            df['vgpregdf'].sample()['phrase rel_ids rels pphrase'.split()].values[0]

        tuples.append((p_cap, p_hyp_reg, n_hyp_reg))
        
        i += 1
    except:
        continue

In [34]:
def add_existential_prefix_b(instring):
    if ' is ' in instring:
        outstring = instring
    else:
        outstring = 'There are / is (a) ' + instring
    return outstring

In [35]:
out_rows = []
for premise, phyp, nhyp in tuples:
    out_rows.append((premise, add_existential_prefix_b(phyp), 1))
    out_rows.append((premise, add_existential_prefix_b(nhyp), 0))

In [36]:
capregdf = pd.DataFrame(out_rows, columns='premise hypothesis label'.split())

In [37]:
capregdf.head(20)

Unnamed: 0,premise,hypothesis,label
0,A woman wearing a very colorful costume while checking her phone.,the woman`s stomach is showing,1
1,A woman wearing a very colorful costume while checking her phone.,There are / is (a) the wires are over the buses,0
2,A very large cruise ship docked next to a blue train.,There are / is (a) British flag on side of train,1
3,A very large cruise ship docked next to a blue train.,The striped shirt the man on the right is wearing.,0
4,A man holding an umbrella while standing next to people at a bus stop.,A person is wearing nice eyeglasses,1
5,A man holding an umbrella while standing next to people at a bus stop.,There are / is (a) kite flying in the sky,0
6,a group of people sitting around a dinner table.,There are / is (a) bottle on the table,1
7,a group of people sitting around a dinner table.,There are / is (a) words on white tag,0
8,A kid doing a trick on a skateboard on a ramp.,There are / is (a) a man on a skateboard,1
9,A kid doing a trick on a skateboard on a ramp.,There are / is (a) left hand of man in ocean,0


In [38]:
prompt = '''Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?'''

In [39]:
capregdf['prompt'] = prompt

In [40]:
capregdf.head(2)

Unnamed: 0,premise,hypothesis,label,prompt
0,A woman wearing a very colorful costume while checking her phone.,the woman`s stomach is showing,1,"Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?"
1,A woman wearing a very colorful costume while checking her phone.,There are / is (a) the wires are over the buses,0,"Using what you know about the world, in the situation described by Text 1, is Text 2 likely to be true?"


In [41]:
capregdf.sample(frac=1).to_csv('To_Test/capreg.csv', index=False)

## Caption / Deep Caption

In [42]:
# caption, paragraph for same image, paragraph for different by similar image
#n = 50

available_iis_cappar = df['cocoparcapdf']['image_id_x']
available_iis_sim = coco_id2semsim.keys()
available_iis = set(available_iis_cappar).intersection(available_iis_sim)
# len(available_iis)    # Only 1503 available...

tuples = []

for _ in range(n):
    ii = np.random.choice(list(available_iis))
    cap, ppar = df['cocoparcapdf'][df['cocoparcapdf']['image_id_x'] == ii]['caption paragraph'.split()].values[0]
    all_sim = n_most_sim(coco_sem_sim, coco_sem_map, coco_id2semsim[ii], n=200)
    all_neg = set(available_iis).intersection(all_sim)
    nii = np.random.choice(list(all_neg))
    npar = df['cocoparcapdf'][df['cocoparcapdf']['image_id_x'] == nii]['paragraph'].values[0]

    tuples.append((cap, ppar, npar))

In [43]:
out_rows = []
for premise, phyp, nhyp in tuples:
    out_rows.append((premise, phyp, 1))
    out_rows.append((premise, nhyp, 0))

In [44]:
capdeepdf = pd.DataFrame(out_rows, columns='premise hypothesis label'.split())

In [45]:
capdeepdf.head(5)

Unnamed: 0,premise,hypothesis,label
0,a female tennis player in a black top playing tennis,A woman in a black tennis outfit is standing on a blue tennis court. She is holding a tennis racket and hitting a green tennis ball. She is wearing a red hat on her head.,1
1,a female tennis player in a black top playing tennis,Four children are standing on a tennis court. All of them are holding rackets in their hands. They are standing in a partial line behind each other. A green tennis ball is soaring in the air towards the first child. The tennis court has a pink ha...,0
2,A dog walking next to cows in a field and behind a fence.,A brown and white dog runs up to some cows. The cows are assorted colors. The dark brown cow is by a white wooden fence. A white cow stand by the dark brown cow. There is another brown cow by the fence. The fence is made of metal. There is a lot ...,1
3,A dog walking next to cows in a field and behind a fence.,Two light brown cows stand on a muddy path. They have horns on their heads. There are several cows on the ground behind them. There is a thatched roofed building in the back ground. The building behind it had a blue tarp on part of the roof. ...,0
4,Two young men sitting on a touch reading a book together.,"Two men are sitting together on a couch. The man on the left has dark pants, a dark shirt, and dark shoes on his feet. The shoes have gold metal on them. The man has dark brown hair and is holding an e-reader. The man on the right is wearing gray...",1


In [46]:
prompt = '''Is Text 2 likely to be a longer description of the situation described by Text 1?'''

In [47]:
capdeepdf['prompt'] = prompt

In [48]:
capdeepdf.head(2)

Unnamed: 0,premise,hypothesis,label,prompt
0,a female tennis player in a black top playing tennis,A woman in a black tennis outfit is standing on a blue tennis court. She is holding a tennis racket and hitting a green tennis ball. She is wearing a red hat on her head.,1,Is Text 2 likely to be a longer description of the situation described by Text 1?
1,a female tennis player in a black top playing tennis,Four children are standing on a tennis court. All of them are holding rackets in their hands. They are standing in a partial line behind each other. A green tennis ball is soaring in the air towards the first child. The tennis court has a pink ha...,0,Is Text 2 likely to be a longer description of the situation described by Text 1?


In [49]:
capdeepdf.sample(frac=1).to_csv('To_Test/capdeep.csv', index=False)