# Evaluation of Hu et al Paper

In [1]:
%matplotlib inline
import sys
import os
import numpy
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
import operator
from scipy import stats

# Make sure that google_refexp_py_lib is on your Python library search path
# before you use the API in this toolbox. You can do so as follows:
sys.path.append('./google_refexp_py_lib')
from refexp_eval import RefexpEvalComprehension
from refexp_eval import RefexpEvalGeneration
from common_utils import draw_bbox

# Set coco_data_path and Google Refexp dataset validation set path
coco_data_path = './external/coco/annotations/instances_train2014.json'
refexp_dataset_path = './google_refexp_dataset_release/google_refexp_val_201511_coco_aligned.json'

In [2]:
# Load the Google Refexp dataset and the MS COCO annotations (takes some time).
# The resulting evaluator object is reused by later cells for scoring,
# visualization and per-category statistics.
eval_compreh = RefexpEvalComprehension(refexp_dataset_path, coco_data_path)

Loading Google Refexp dataset file for the comprehension task.
loading annotations into memory...
Done (t=28.43s)
creating index...
index created!
Dataset loaded.


In [3]:
# Evaluate the comprehension-task predictions stored in the result file below.
# NOTE(review): the earlier comment here described a naive baseline that emits
# ground-truth boxes in random order; the path instead points at a
# 'refgoog_attbilstm' result file, so that description looked like a
# leftover -- confirm.
# NOTE(review): absolute, machine-specific path; adjust for your checkout.
pred_results_path = ('/home/dwright/repos/papers/Hu et al 2017/exp-refgoog/results/'
                     'refgoog_attbilstm_iter_150000_val.txt')

# evaluate() returns a (prec, eval_results) pair; later cells read
# 'best_iou', 'annotation_id' and the '*_prob' fields from eval_results.
(prec, eval_results) = eval_compreh.evaluate(pred_results_path)

Loading predicted result file for the comprehension task.
Missing 9536 refexps in the refexp dataset file in the predicted file
The average prec@1 score is 0.679


## Helper functions for plotting

In [4]:
from matplotlib.colors import LinearSegmentedColormap

# Two-stop colormap running from white (1, 1, 1) to red (1, 0, 0); used to
# shade the word cells of the attention heatmaps.
colors = [(1, 1, 1), (1, 0, 0)]
cm = LinearSegmentedColormap.from_list('text_cmap', colors)


def makeHeatmap(attention, words, ax, title=""):
    """Draw an attention heatmap on a new wide figure with words overlaid.

    Args:
        attention: 2-D sequence handed to ``plt.pcolor``; the length of its
            first row decides how many word labels are drawn. Callers in this
            notebook pass a single row.
        words: Labels indexed by column; must have at least as many entries
            as ``attention[0]``.
        ax: Accepted for call compatibility but not used -- the function
            always opens its own figure via ``plt.figure``.
        title: Title placed above the heatmap.
    """
    plt.figure(figsize=(18, 1.5))
    plt.pcolor(attention, cmap=cm)

    # Centre each word inside its cell: cells are one unit wide and the text
    # is placed at height 0.5.
    for col in range(len(attention[0])):
        plt.text(col + 0.5, 0.5, words[col],
                 horizontalalignment='center',
                 verticalalignment='center')

    current_axes = plt.gca()
    current_axes.get_xaxis().set_visible(False)
    plt.title(title)

In [5]:
from matplotlib.backends.backend_pdf import PdfPages

def multipage(filename, figs=None, dpi=200):
    """Save figures into one multi-page PDF, one figure per page.

    Args:
        filename: Path of the PDF file to write.
        figs: Iterable of figures to save. When None, every figure currently
            registered with pyplot (``plt.get_fignums()``) is saved.
        dpi: Resolution forwarded to ``Figure.savefig``. It was previously
            accepted but silently ignored.
    """
    if figs is None:
        figs = [plt.figure(n) for n in plt.get_fignums()]
    # The context manager closes the PDF even when a savefig() call raises,
    # so a failure no longer leaves the file handle open.
    with PdfPages(filename) as pp:
        for fig in figs:
            fig.savefig(pp, format='pdf', dpi=dpi)

In [6]:
def getRefExpFromEval(sample):
    """Return the referring-expression text of one evaluation sample.

    Args:
        sample: Dict-like evaluation result. If it has a 'refexp' key, that
            value is returned directly. Otherwise, if it has a 'refexp_id'
            key, the expression is loaded from the notebook-level evaluator's
            dataset and its 'raw' field is returned.

    Returns:
        The expression text, or None when the sample has neither key.
    """
    if 'refexp' in sample:
        return sample['refexp']
    if 'refexp_id' in sample:
        # This is a module-level function, so there is no `self`; the dataset
        # is reached through the notebook's `eval_compreh` evaluator instead.
        refexp_tmp = eval_compreh.refexp_dataset.loadRefexps(ids=sample['refexp_id'])[0]
        return refexp_tmp['raw']
    return None

In [7]:
import re

SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')


def split(sentence):
    """Tokenize a referring expression into at most 20 lowercase tokens.

    The sentence (decoded first if given as bytes) is stripped and split on
    runs of non-word characters. The separators themselves are kept as
    tokens unless they are whitespace-only. One trailing token that is
    exactly '.' or '?' is dropped, and the result is truncated to 20 tokens.

    Args:
        sentence: Text or bytes to tokenize.

    Returns:
        List of lowercase tokens.
    """
    if isinstance(sentence, bytes):
        sentence = sentence.decode()

    pieces = SENTENCE_SPLIT_REGEX.split(sentence.strip())
    tokens = [piece.lower() for piece in pieces if piece.strip()]

    # Drop a single sentence terminator at the end, if present.
    if tokens and tokens[-1] in ('.', '?'):
        tokens = tokens[:-1]

    # Keep only the first 20 tokens.
    return tokens[:20]


In [11]:
def printCategoryStats(results):
    """Print, per COCO category, how many of `results` fall into it.

    For every result the annotation is looked up through the notebook-level
    `eval_compreh.refexp_dataset`, and annotations are tallied by category
    name. Each category is printed with its tally, the number of dataset
    image ids returned for that category, and the ratio of the two, ordered
    by tally (largest first).

    NOTE(review): the tally counts annotations while the total is
    ``len(getImgIds(catIds=[id]))`` -- presumably an image count -- so the
    ratio may mix units; confirm this is intended.

    Args:
        results: Iterable of evaluation results, each with an
            'annotation_id' key.
    """
    dataset = eval_compreh.refexp_dataset
    coco = dataset.coco
    cococats = {c['id']: c for c in coco.loadCats(coco.getCatIds())}

    anns = dataset.loadAnns(ids=[r['annotation_id'] for r in results])
    cats = {}
    for ann in anns:
        cat = cococats[ann['category_id']]['name']
        cats[cat] = cats.get(cat, 0) + 1

    # Get all images that contain each category. (This explanation used to
    # sit in the body as bare text, which is a syntax error.)
    totals = {}
    for catId in dataset.getCatIds():
        imgIds = dataset.getImgIds(catIds=[catId])
        totals[cococats[catId]['name']] = len(imgIds)

    print('Missed bboxes by category')
    for name, count in sorted(cats.items(), key=operator.itemgetter(1), reverse=True):
        total = totals.get(name, 0)
        # Avoid ZeroDivisionError for a category with no images on record.
        ratio = float(count) / total if total else float('nan')
        print(str(name) + " -- missed: " + str(count) + ", total: " + str(total) + ", %: " + str(ratio))

## Negative examples, generate statistics and plots

In [9]:
# Visualize cases where the method performs badly: results whose best IoU
# with the ground truth is below the threshold.
BAD_IOU_THRESHOLD = 0.05
bad_results = [res for res in eval_results if res['best_iou'] < BAD_IOU_THRESHOLD]
# Report the threshold that is actually applied (the message used to say 0.1
# while the filter used 0.05).
print('%d top 1 predicted bounding boxes have IoU '
      'with GT less than %g' % (len(bad_results), BAD_IOU_THRESHOLD))
print('')

coco_image_dir = './external/coco/images/train2014'

# Randomly draw (with replacement) 100 bad examples and save one PDF per draw.
# Nothing is drawn when there are no bad results, because randint(0, 0) would
# raise.
num_bad_draws = 100 if bad_results else 0
for i in range(num_bad_draws):
    bad_sample = bad_results[numpy.random.randint(0, len(bad_results))]
    eval_compreh.visualize_top_predicted_bbox(bad_sample, coco_image_dir)

    ref = getRefExpFromEval(bad_sample)
    ref = split(ref)
    # Keep only the last len(ref) attention entries so that there is one
    # weight per token.
    sub = [s[0] for s in bad_sample['obj1_prob']][-len(ref):]
    makeHeatmap([sub], ref, plt.gca(), title='Subject attn')
    rel = [s[0] for s in bad_sample['rel_prob']][-len(ref):]
    makeHeatmap([rel], ref, plt.gca(), title='Relation attn')
    obj = [s[0] for s in bad_sample['obj2_prob']][-len(ref):]
    makeHeatmap([obj], ref, plt.gca(), title='Object attn')
    # Save every open figure into one PDF, then release them all.
    multipage('/hdd/dustin/data/hu_et_al_eval/negative_samples/%.3d.pdf' % i)
    plt.close('all')

1741 top 1 predicted bounding boxes have IoU with GT less than 0.1



In [12]:
# Per-category breakdown of the low-IoU (bad) results.
printCategoryStats(bad_results)

Missed bboxes by category
person -- missed: 74, total: 108, %: 0.685185185185


In [7]:
import json

# Persist the low-IoU samples as JSON under a single top-level "results" key.
bad_results_json_path = '/home/dwright/repos/papers/Hu et al 2017/exp-refgoog/results/bad_results.json'
res_out = dict(results=bad_results)
with open(bad_results_json_path, 'w') as f:
    json.dump(res_out, f)

In [8]:
# Write the referring expression of every low-IoU sample, one per line.
with open('/hdd/dustin/data/hu_et_al_eval/negative_samples/bad_refexps.txt', 'w') as f:
    for sample in bad_results:
        # Use the shared lookup helper rather than an inline copy of it; the
        # inline copy referenced `self`, which does not exist at notebook
        # level.
        ref = getRefExpFromEval(sample)
        # Samples with neither 'refexp' nor 'refexp_id' yield None and are
        # skipped, as before.
        if ref is not None:
            f.write(ref + "\n")

## Positive examples

In [11]:
# Visualize cases where the method performs well: results whose best IoU
# with the ground truth reaches the threshold.
GOOD_IOU_THRESHOLD = 0.5
good_results = [res for res in eval_results if res['best_iou'] >= GOOD_IOU_THRESHOLD]
# The message used to be a copy of the negative-example one ("less than
# 0.1"); report the condition that is actually applied.
print('%d top 1 predicted bounding boxes have IoU '
      'with GT of at least %g' % (len(good_results), GOOD_IOU_THRESHOLD))
print('')

coco_image_dir = './external/coco/images/train2014'

# Randomly draw (with replacement) 100 good examples and save one PDF per
# draw. Nothing is drawn when there are no good results, because
# randint(0, 0) would raise.
num_good_draws = 100 if good_results else 0
for i in range(num_good_draws):
    good_sample = good_results[numpy.random.randint(0, len(good_results))]
    eval_compreh.visualize_top_predicted_bbox(good_sample, coco_image_dir)

    ref = getRefExpFromEval(good_sample)
    ref = split(ref)
    # Keep only the last len(ref) attention entries so that there is one
    # weight per token.
    sub = [s[0] for s in good_sample['obj1_prob']][-len(ref):]
    makeHeatmap([sub], ref, plt.gca(), title='Subject attn')
    rel = [s[0] for s in good_sample['rel_prob']][-len(ref):]
    makeHeatmap([rel], ref, plt.gca(), title='Relation attn')
    obj = [s[0] for s in good_sample['obj2_prob']][-len(ref):]
    makeHeatmap([obj], ref, plt.gca(), title='Object attn')
    # Save every open figure into one PDF, then release them all.
    multipage('/hdd/dustin/data/hu_et_al_eval/gref_multiobject_positive_samples/%.3d.pdf' % i)
    plt.close('all')

565 top 1 predicted bounding boxes have IoU with GT less than 0.1

The Referring expression input to the model is:
  the hand of someone holding scissors
The Referring expression input to the model is:
  There is a whole orange on the upper right side of the picture.
The Referring expression input to the model is:
  man skating in the right side of the image
The Referring expression input to the model is:
  girl in white top
The Referring expression input to the model is:
  the man in the right
The Referring expression input to the model is:
  the chair on the bottom right
The Referring expression input to the model is:
  A small white teddy bear sitting in front of a large bear.
The Referring expression input to the model is:
  A man in a black suit wearing safety glasses and a green tie.
The Referring expression input to the model is:
  The horse on the left.
The Referring expression input to the model is:
  cow in front
The Referring expression input to the model is:
  A SPARROW WIT

The Referring expression input to the model is:
  A woman in a green jacket with skis and poles.
The Referring expression input to the model is:
  Catcher behind the batter
The Referring expression input to the model is:
  The giraffe whose head is not visible.
The Referring expression input to the model is:
  Wine glass directly infront of white cheese with blue streak.
The Referring expression input to the model is:
  Duck being hit with other duck's wing.
The Referring expression input to the model is:
  hat in the right most side of the image
The Referring expression input to the model is:
  the man holding a beer
The Referring expression input to the model is:
  White guy with mole on left side of face.
The Referring expression input to the model is:
  front most hot pocket piece
The Referring expression input to the model is:
  a black leather dinning chair


In [15]:
# Fraction of all evaluated samples that ended up in good_results.
len(good_results) / float(len(eval_results))

0.6166281755196305

In [145]:
# Per-category breakdown of the good results.
# NOTE(review): printCategoryStats labels its counts as "missed" in the
# printed text, even though the samples passed in here are the good ones.
printCategoryStats(good_results)

Missed bboxes by category
person -- missed: 1486, total: 2936, %: 0.506130790191
chair -- missed: 178, total: 870, %: 0.204597701149
giraffe -- missed: 135, total: 166, %: 0.813253012048
dining table -- missed: 102, total: 929, %: 0.10979547901
zebra -- missed: 101, total: 128, %: 0.7890625
couch -- missed: 96, total: 380, %: 0.252631578947
bowl -- missed: 94, total: 478, %: 0.196652719665
car -- missed: 90, total: 441, %: 0.204081632653
elephant -- missed: 79, total: 113, %: 0.699115044248
horse -- missed: 64, total: 129, %: 0.496124031008
truck -- missed: 61, total: 240, %: 0.254166666667
cow -- missed: 60, total: 86, %: 0.697674418605
bus -- missed: 59, total: 134, %: 0.440298507463
cup -- missed: 54, total: 652, %: 0.0828220858896
sandwich -- missed: 53, total: 204, %: 0.259803921569
pizza -- missed: 52, total: 219, %: 0.237442922374
motorcycle -- missed: 51, total: 160, %: 0.31875
airplane -- missed: 46, total: 63, %: 0.730158730159
cat -- missed: 45, total: 113, %: 0.398230088496

In [146]:
import json

# Persist the good samples as JSON under a single top-level "results" key.
good_results_json_path = '/home/dwright/repos/papers/Hu et al 2017/exp-refgoog/results/good_results.json'
res_out = dict(results=good_results)
with open(good_results_json_path, 'w') as f:
    json.dump(res_out, f)

In [147]:
# Write the referring expression of every good sample, one per line.
with open('/hdd/dustin/data/hu_et_al_eval/positive_samples/good_refexps.txt', 'w') as f:
    for sample in good_results:
        ref = getRefExpFromEval(sample)
        # getRefExpFromEval returns None for samples without an expression;
        # skip those instead of failing on `None + "\n"`.
        if ref is not None:
            f.write(ref + "\n")
        

## Evaluate KL divergence between the attention distributions

In [15]:
# One list of per-sample values for each ordered pair of attention vectors
# (s = subject, r = relation, o = object).
sr_kl, rs_kl = [], []
so_kl, os_kl = [], []
ro_kl, or_kl = [], []

for res in eval_results:
    ref = split(getRefExpFromEval(res))
    # Keep only the last len(ref) attention entries so that there is one
    # weight per token.
    sub = [s[0] for s in res['obj1_prob']][-len(ref):]
    rel = [s[0] for s in res['rel_prob']][-len(ref):]
    obj = [s[0] for s in res['obj2_prob']][-len(ref):]

    # stats.entropy(pk, qk) with two arguments gives the relative entropy
    # (KL divergence) of pk from qk.
    for kl_values, pk, qk in ((sr_kl, sub, rel), (rs_kl, rel, sub),
                              (so_kl, sub, obj), (os_kl, obj, sub),
                              (ro_kl, rel, obj), (or_kl, obj, rel)):
        kl_values.append(stats.entropy(pk, qk))

# Report the mean of every list, in the same order as they were filled.
for label, kl_values in (('Avg K(s||r): ', sr_kl), ('Avg K(r||s): ', rs_kl),
                         ('Avg K(s||o): ', so_kl), ('Avg K(o||s): ', os_kl),
                         ('Avg K(r||o): ', ro_kl), ('Avg K(o||r): ', or_kl)):
    print(label, numpy.mean(kl_values))

('Avg K(s||r): ', 0.48298778563366879)
('Avg K(r||s): ', 0.52495721460993305)
('Avg K(s||o): ', 0.41107716988187898)
('Avg K(o||s): ', 0.48697174823757983)
('Avg K(r||o): ', 0.015733561862567511)
('Avg K(o||r): ', 0.015772716419188097)


## Visualize attention

In [8]:
from gensim.models.keyedvectors import KeyedVectors

def getWeightedEmbedding(tokens, weights, wv, unk):
    """Return the weighted sum of the word vectors of `tokens`.

    Args:
        tokens: Sequence of words.
        weights: Sequence with one scalar weight per token (indexed in step
            with `tokens`).
        wv: Word-vector lookup supporting ``wv[word]`` and a ``vocab``
            container used for membership tests.
        unk: Vector substituted for tokens that are not in ``wv.vocab``.

    Returns:
        ``numpy.sum`` over the per-token weighted vectors along axis 0.
    """
    weighted_vectors = []
    for idx in range(len(tokens)):
        token = tokens[idx]
        # Fall back to the shared unknown-word vector for OOV tokens.
        vector = wv[token] if token in wv.vocab else unk
        weighted_vectors.append(weights[idx] * vector)
    return numpy.sum(weighted_vectors, axis=0)

# Load pretrained word vectors from a text file in word2vec format.
# NOTE(review): the file name suggests 100-dimensional GloVe 6B vectors --
# confirm. The path is absolute and machine-specific.
wv = KeyedVectors.load_word2vec_format('/hdd/dustin/data/word_vectors/glove.6B.100d.w2vformat.txt', binary=False)
# Random stand-in vector for out-of-vocabulary tokens, with the same size as
# the loaded vectors.
# NOTE(review): drawn without a fixed seed, so it changes between kernel runs.
unk = numpy.random.normal(loc=0., scale=1., size=(wv.vector_size,))

In [9]:
# Attention-weighted embeddings for every evaluation result: three
# consecutive entries per result, in subject / relation / object order.
embeddings = []
for res in eval_results:
    ref = split(getRefExpFromEval(res))
    # Keep only the last len(ref) attention entries so that there is one
    # weight per token.
    sub = [s[0] for s in res['obj1_prob']][-len(ref):]
    rel = [s[0] for s in res['rel_prob']][-len(ref):]
    obj = [s[0] for s in res['obj2_prob']][-len(ref):]

    embeddings.extend(
        getWeightedEmbedding(ref, attn, wv, unk) for attn in (sub, rel, obj)
    )

embeddings = numpy.asarray(embeddings)


In [10]:
#PCA to 2 components
from sklearn.decomposition import PCA

# Fit a 2-component PCA on all attention-weighted embeddings. The fitted
# model is reused further down to project sampled embeddings for plotting.
pca = PCA(n_components=2)
pca.fit(embeddings)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [None]:
import matplotlib.patches as mpatches
#Plot
def plot(data, labs=None, colors=None, handles=None, title="Plot of data after PCA", pointSize=0.5):
    """Scatter-plot 2-D points with optional per-point text labels.

    Prints the number of points, draws them on a new figure, shows the
    figure and returns it.

    Args:
        data: Array indexed as ``data[:, 0]`` / ``data[:, 1]`` for the x and
            y coordinates.
        labs: Optional sequence with one label per point. When omitted or
            empty, no annotations are drawn (the old empty-list default
            raised IndexError for any non-empty `data`).
        colors: Optional sequence with one text colour per label. When
            omitted or empty, matplotlib's default text colour is used.
        handles: Optional legend handles; a legend is added when given.
        title: Axes title.
        pointSize: Marker size passed to ``scatter`` as ``s``.

    Returns:
        The created figure.
    """
    print(len(data))
    fig = plt.figure(figsize=(18, 16), dpi=80, facecolor='w', edgecolor='k')
    ax = fig.add_subplot(111)
    ax.scatter(data[:, 0], data[:, 1], s=pointSize)
    ax.set_title(title)

    has_labels = labs is not None and len(labs) > 0
    has_colors = colors is not None and len(colors) > 0
    if has_labels:
        for i in range(len(data)):
            text_kwargs = {'fontsize': 14}
            if has_colors:
                text_kwargs['color'] = colors[i]
            ax.annotate(labs[i], (data[i, 0], data[i, 1]), **text_kwargs)

    if handles is not None:
        ax.legend(handles=handles)
    plt.show()

    return fig

# Pick 20 evaluation results at random (indices drawn with replacement).
samples = numpy.asarray(eval_results)[numpy.random.randint(len(eval_results), size=20)]

# Attention-weighted embeddings of the sampled expressions, one list per
# attention type, plus the original expression text used as the plot label.
ems_sub = []
ems_obj = []
ems_rel = []
labs = []
for sam in samples:
    ref_orig = getRefExpFromEval(sam)
    ref = split(ref_orig)
    # Read the attention weights from the current sample. The loop used to
    # read them from `res`, a stale variable left over from an earlier cell,
    # so every sampled expression was weighted with another result's
    # attention.
    sub = [s[0] for s in sam['obj1_prob']][-len(ref):]
    rel = [s[0] for s in sam['rel_prob']][-len(ref):]
    obj = [s[0] for s in sam['obj2_prob']][-len(ref):]

    ems_sub.append(getWeightedEmbedding(ref, sub, wv, unk))
    ems_rel.append(getWeightedEmbedding(ref, rel, wv, unk))
    ems_obj.append(getWeightedEmbedding(ref, obj, wv, unk))
    labs.append(ref_orig)

# Project the sampled embeddings onto the two fitted PCA components.
spoints, rpoints, opoints = [pca.transform(ems) for ems in (ems_sub, ems_rel, ems_obj)]

# One label colour per point, by attention type.
scolors = ['red'] * len(spoints)
rcolors = ['blue'] * len(rpoints)
ocolors = ['green'] * len(opoints)

# Legend entries matching the label colours.
red_patch = mpatches.Patch(color='red', label='Subject attention')
blue_patch = mpatches.Patch(color='blue', label='Relation attention')
green_patch = mpatches.Patch(color='green', label='Object attention')

# Each entry describes one pairwise comparison: the two point sets, their
# colours, the legend handles, the title and the output file.
pair_plots = (
    (spoints, rpoints, scolors, rcolors, [red_patch, blue_patch],
     '2D projection of BOW with subject and relation attention',
     '/hdd/dustin/data/hu_et_al_eval/sub_rel_attention.pdf'),
    (rpoints, opoints, rcolors, ocolors, [blue_patch, green_patch],
     '2D projection of BOW with relation and object attention',
     '/hdd/dustin/data/hu_et_al_eval/rel_obj_attention.pdf'),
    (spoints, opoints, scolors, ocolors, [red_patch, green_patch],
     '2D projection of BOW with subject and object attention',
     '/hdd/dustin/data/hu_et_al_eval/sub_obj_attention.pdf'),
)

for first_pts, second_pts, first_cols, second_cols, legend, plot_title, out_path in pair_plots:
    # Every expression appears twice per plot (once per attention type), so
    # the label list is doubled to match the concatenated points.
    fig = plot(numpy.concatenate([first_pts, second_pts]), labs + labs,
               first_cols + second_cols, title=plot_title, handles=legend)
    fig.savefig(out_path)
    plt.close('all')