In [1]:
import json
import os.path
import numpy as np
from collections import defaultdict

In [2]:
# Concept pairs, grouped into the categories named by each constant.
ADJECTIVES_COLORS_ANIMATE = {
    "black_cat", "brown_dog", "white_horse", "black_bird",
}
ADJECTIVES_COLORS_INANIMATE = {
    "red_bus", "white_truck", "blue_bus", "white_boat",
}

ADJECTIVES_SIZES_ANIMATE = {
    "big_bird", "small_cat", "big_cat", "small_dog",
}
ADJECTIVES_SIZES_INANIMATE = {
    "small_plane", "big_plane", "small_table", "big_truck",
}

VERBS_TRANSITIVE = {
    "eat_man", "ride_woman", "hold_child", "eat_horse",
}
VERBS_INTRANSITIVE = {
    "lie_woman", "fly_bird", "stand_bird", "stand_child",
}

# Union of every category above.
all_pairs = (
    ADJECTIVES_COLORS_ANIMATE
    | ADJECTIVES_COLORS_INANIMATE
    | ADJECTIVES_SIZES_ANIMATE
    | ADJECTIVES_SIZES_INANIMATE
    | VERBS_TRANSITIVE
    | VERBS_INTRANSITIVE
)

In [3]:
def read_coco_metrics(model_dir, split='test', beam=5, rr=False):
    """Read the COCO metric scores stored for one model run.

    The file ``coco.beam_<beam>.<split>`` (or
    ``coco.re_ranking.beam_<beam>.<split>`` when ``rr`` is true) inside
    ``model_dir`` is parsed as one ``<metric>: <score>`` entry per line.
    Whitespace-only lines (e.g. a trailing empty line) are ignored.

    Args:
        model_dir: Directory that contains the metrics file.
        split: Split name used as the file-name suffix.
        beam: Beam size encoded in the file name.
        rr: If true, read the re-ranking variant of the file.

    Returns:
        dict mapping each metric name to its score multiplied by 100.

    Raises:
        OSError: If the metrics file cannot be opened.
        ValueError: If a non-blank line is not of the form
            ``<metric>: <float>``.
    """
    fn = ('coco.re_ranking.beam_%d.%s' if rr else 'coco.beam_%d.%s') % (beam, split)
    results = dict()
    with open(os.path.join(model_dir, fn)) as f:
        for line in f:
            # A blank line carries no metric; skipping it avoids a spurious
            # unpacking error on files that end with an empty line.
            if not line.strip():
                continue
            m, s = line.split(': ')
            results[m] = 100*float(s)
    return results

def avg_coco_metrics(metrics_dicts):
    """Average metric scores over several runs.

    Args:
        metrics_dicts: Mapping from a run identifier to a
            ``{metric: score}`` dict.

    Returns:
        defaultdict mapping every metric seen in any run to the sum of its
        scores divided by ``len(metrics_dicts)``. The divisor is the total
        number of runs even for a metric that is absent from some of them.
    """
    n_runs = len(metrics_dicts)
    totals = defaultdict(int)
    for run_metrics in metrics_dicts.values():
        for metric, score in run_metrics.items():
            totals[metric] += score
    # Turn the accumulated sums into means (values only; keys are untouched).
    for metric in totals:
        totals[metric] /= n_runs
    return totals

def read_pair_recalls(mdir, concept_pairs, at=5, split='test', beam=5, rr=False):
    """Load the stored recall results of every concept pair that has a file.

    For each pair the file ``recall_<at>.<pair>.beam_<beam>.<split>`` (or
    ``recall_<at>.<pair>.re_ranking.beam_<beam>.<split>`` when ``rr`` is
    true) is looked up inside ``mdir``. Pairs without such a file are
    silently skipped.

    Args:
        mdir: Directory that contains the recall files.
        concept_pairs: Iterable of concept-pair names.
        at: The ``k`` of recall@k encoded in the file name.
        split: Split name used as the file-name suffix.
        beam: Beam size encoded in the file name.
        rr: If true, read the re-ranking variant of the files.

    Returns:
        dict mapping each pair that has a file to the parsed JSON content.

    Raises:
        json.JSONDecodeError: If an existing file is not valid JSON.
    """
    suffix = ('re_ranking.beam_%d.%s' if rr else 'beam_%d.%s') % (beam, split)
    prefix = 'recall_%d' % at
    results = dict()
    for pair in concept_pairs:
        # Only the file name is formatted; the directory is joined afterwards
        # so a '%' inside ``mdir`` (or ``split``) is never treated as a
        # format specifier.
        path = os.path.join(mdir, '%s.%s.%s' % (prefix, pair, suffix))
        if os.path.isfile(path):
            with open(path) as f:
                results[pair] = json.load(f)
    return results

def average_recall(recall_scores, min_importance=1):
    """Macro-average the recall over concept pairs, as a percentage.

    For every pair, the values of its ``"true_positives"`` and ``"numbers"``
    dicts are taken in dict order from position ``min_importance - 1``
    onwards, summed, and divided. Pairs whose ratio is NaN are left out.

    Args:
        recall_scores: Mapping from pair name to a dict with the keys
            ``"true_positives"`` and ``"numbers"``, each itself a dict.
        min_importance: 1-based position of the first entry to include.

    Returns:
        100 times the mean of the per-pair ratios.

    Raises:
        ZeroDivisionError: If no pair contributes a non-NaN ratio.
    """
    start = min_importance - 1
    pair_ratios = []
    for pair_scores in recall_scores.values():
        hits = np.sum(list(pair_scores["true_positives"].values())[start:])
        totals = np.sum(list(pair_scores["numbers"].values())[start:])
        ratio = hits / totals
        if np.isnan(ratio):
            continue
        pair_ratios.append(ratio)
    return 100 * sum(pair_ratios) / len(pair_ratios)

def agg_bertscores(model_fns):
    """Pool the scores of several BERTScore output files into one mean.

    The first five lines of every file are skipped (presumably a header;
    confirm against the file format). On each remaining line the last
    whitespace-separated token is parsed as a float.

    Args:
        model_fns: Iterable of paths to the score files.

    Returns:
        100 times the mean over the scores of all files taken together.
    """
    all_scores = []
    for path in model_fns:
        with open(path) as f:
            for line_ix, row in enumerate(f):
                if line_ix < 5:
                    continue
                all_scores.append(float(row.split()[-1]))
    return 100 * np.mean(all_scores)

def get_scores(model2dirs, split='test', beam=100, recall_at=5):
    """Collect average recall, COCO metrics and BERTScore for every model.

    Args:
        model2dirs: Mapping from model name to a list of run directories.
            Results are read from ``<dir>/results`` and
            ``<dir>/bertscore/<split>.out``. A model name containing
            ``'+rr'`` selects the re-ranking result files.
        split: Split name forwarded to the readers.
        beam: Beam size forwarded to the readers.
        recall_at: The ``k`` of recall@k forwarded to ``read_pair_recalls``.

    Returns:
        Tuple ``(model2avg_recall, model2avg_coco_metrics, model2bs)``, each
        a dict keyed by model name.
    """
    # Recall: first load the per-pair files of every run of every model.
    model2recalls = dict()
    for model, mdirs in model2dirs.items():
        run2recalls = dict()
        for run_ix, mdir in enumerate(mdirs, start=1):
            run2recalls[run_ix] = read_pair_recalls(os.path.join(mdir, 'results'), all_pairs,
                                                    recall_at, split, beam, rr=('+rr' in model))
        model2recalls[model] = run2recalls
    # Merge the runs of a model into one pair -> metrics dict; a pair found
    # in several runs keeps the metrics of the last run that contains it.
    model2pair2metrics = dict()
    for model, run2recalls in model2recalls.items():
        merged = dict()
        for pair2metrics in run2recalls.values():
            merged.update(pair2metrics)
        model2pair2metrics[model] = merged
    model2avg_recall = dict()
    for model, pair2metrics in model2pair2metrics.items():
        model2avg_recall[model] = average_recall(pair2metrics)

    # COCO metrics: load every run, then average per model.
    model2coco_metrics = dict()
    for model, mdirs in model2dirs.items():
        run2metrics = dict()
        for run_ix, mdir in enumerate(mdirs, start=1):
            run2metrics[run_ix] = read_coco_metrics(os.path.join(mdir, 'results'), split,
                                                    beam=beam, rr=('+rr' in model))
        model2coco_metrics[model] = run2metrics
    model2avg_coco_metrics = dict()
    for model, run2metrics in model2coco_metrics.items():
        model2avg_coco_metrics[model] = avg_coco_metrics(run2metrics)

    # BERTScore: one pooled score per model over all of its runs.
    model2bs = dict()
    for model, mdirs in model2dirs.items():
        score_files = [os.path.join(mdir, 'bertscore/%s.out' % split) for mdir in mdirs]
        model2bs[model] = agg_bertscores(score_files)
    return model2avg_recall, model2avg_coco_metrics, model2bs

In [4]:
def latex_print(model2avg_recall, model2avg_coco_metrics, model2bs):
    """Print the scores as the rows of a LaTeX table.

    A header row is printed first, followed by one row per model in the
    iteration order of ``model2avg_recall``. Each row holds the upper-cased
    model name, the recall, the METEOR, SPICE, CIDEr and Bleu_4 scores and
    the BERTScore, all formatted with one decimal.

    Args:
        model2avg_recall: Mapping from model name to its recall score.
        model2avg_coco_metrics: Mapping from model name to a dict that has
            the keys ``'METEOR'``, ``'SPICE'``, ``'CIDEr'`` and ``'Bleu_4'``.
        model2bs: Mapping from model name to its BERTScore.
    """
    print('\\textbf{Model} & \\textbf{R@5} & \\textbf{M} & \\textbf{S} & \\textbf{C} & \\textbf{B} & \\textbf{BS} \\\\')
    for model, score in model2avg_recall.items():
        cells = [model.upper(), " & %.1f " % score]
        coco_scores = model2avg_coco_metrics[model]
        for metric in ('METEOR', 'SPICE', 'CIDEr', 'Bleu_4'):
            cells.append("& %.1f " % coco_scores[metric])
        cells.append("& %.1f " % model2bs[model])
        cells.append("\\\\")
        print("".join(cells))

In [5]:
# Beam size passed to get_scores(); it is part of the result file names that are read.
beam = 100
# The k of recall@k passed to get_scores(); it is part of the recall file names.
recall_at = 5

# Syntax Awareness (with BUTD)

In [6]:
# Split whose result files are read by the get_scores() calls in this section.
split = 'val'

## Sequential

In [7]:
approach = 'seq'
basedir = "../experiments/"

# Model name -> directory template of the syntax-augmented variants; the
# placeholders are filled with d in 1..4 and the current `approach`.
# NOTE(review): the 'inter' and 'multi' cells repeat this cell with only
# `approach` changed; a shared helper taking `approach` would remove the copy.
syntax_dir_templates = {
    'butd+idle': 'coco_heldout_%d_idle_%s/butd/',
    'butd+chunk': 'coco_heldout_%d_chunk_%s/butd/',
    'butd+pos': 'coco_heldout_%d_pos_%s/butd/',
    'butd+dep': 'coco_heldout_%d_dep_%s/butd/',
    'butd+ccg': 'coco_heldout_%d_ccg_%s/butd/',
}

# The baseline is inserted first so that it is the first table row.
model2dirs = {'butd': [basedir + 'coco_heldout_%d/butd/' % d for d in range(1, 4+1)]}
model2dirs.update({
    name: [basedir + template % (d, approach) for d in range(1, 4+1)]
    for name, template in syntax_dir_templates.items()
})

model2avg_recall, model2avg_coco_metrics, model2bertscore = get_scores(
    model2dirs, split=split, beam=beam, recall_at=recall_at)
latex_print(model2avg_recall, model2avg_coco_metrics, model2bertscore)

\textbf{Model} & \textbf{R@5} & \textbf{M} & \textbf{S} & \textbf{C} & \textbf{B} & \textbf{BS} \\
BUTD & 9.5 & 25.2 & 18.6 & 92.7 & 32.3 & 41.7 \\
BUTD+IDLE & 8.7 & 23.7 & 17.8 & 87.6 & 30.0 & 38.8 \\
BUTD+CHUNK & 10.9 & 24.7 & 18.2 & 89.2 & 31.2 & 41.2 \\
BUTD+POS & 9.5 & 24.1 & 17.5 & 86.1 & 30.1 & 40.7 \\
BUTD+DEP & 11.1 & 24.6 & 17.8 & 89.7 & 30.8 & 41.0 \\
BUTD+CCG & 10.6 & 24.5 & 18.0 & 88.4 & 30.4 & 41.0 \\


## Interleaved

In [8]:
approach = 'inter'
basedir = "../experiments/"

# Model name -> directory template of the syntax-augmented variants; the
# placeholders are filled with d in 1..4 and the current `approach`.
syntax_dir_templates = {
    'butd+idle': 'coco_heldout_%d_idle_%s/butd/',
    'butd+chunk': 'coco_heldout_%d_chunk_%s/butd/',
    'butd+pos': 'coco_heldout_%d_pos_%s/butd/',
    'butd+dep': 'coco_heldout_%d_dep_%s/butd/',
    'butd+ccg': 'coco_heldout_%d_ccg_%s/butd/',
}

# The baseline is inserted first so that it is the first table row.
model2dirs = {'butd': [basedir + 'coco_heldout_%d/butd/' % d for d in range(1, 4+1)]}
model2dirs.update({
    name: [basedir + template % (d, approach) for d in range(1, 4+1)]
    for name, template in syntax_dir_templates.items()
})

model2avg_recall, model2avg_coco_metrics, model2bertscore = get_scores(
    model2dirs, split=split, beam=beam, recall_at=recall_at)
latex_print(model2avg_recall, model2avg_coco_metrics, model2bertscore)

\textbf{Model} & \textbf{R@5} & \textbf{M} & \textbf{S} & \textbf{C} & \textbf{B} & \textbf{BS} \\
BUTD & 9.5 & 25.2 & 18.6 & 92.7 & 32.3 & 41.7 \\
BUTD+IDLE & 10.5 & 25.3 & 18.8 & 94.3 & 32.3 & 41.7 \\
BUTD+CHUNK & 9.7 & 25.2 & 18.7 & 93.4 & 32.5 & 41.7 \\
BUTD+POS & 11.8 & 25.4 & 18.8 & 94.4 & 32.7 & 41.7 \\
BUTD+DEP & 10.8 & 25.2 & 18.7 & 93.0 & 31.9 & 41.6 \\
BUTD+CCG & 10.5 & 25.4 & 19.0 & 94.6 & 32.7 & 41.9 \\


## Multi-task

In [9]:
approach = 'multi'
basedir = "../experiments/"

# Model name -> directory template of the syntax-augmented variants; the
# placeholders are filled with d in 1..4 and the current `approach`.
syntax_dir_templates = {
    'butd+idle': 'coco_heldout_%d_idle_%s/butd/',
    'butd+chunk': 'coco_heldout_%d_chunk_%s/butd/',
    'butd+pos': 'coco_heldout_%d_pos_%s/butd/',
    'butd+dep': 'coco_heldout_%d_dep_%s/butd/',
    'butd+ccg': 'coco_heldout_%d_ccg_%s/butd/',
}

# The baseline is inserted first so that it is the first table row.
model2dirs = {'butd': [basedir + 'coco_heldout_%d/butd/' % d for d in range(1, 4+1)]}
model2dirs.update({
    name: [basedir + template % (d, approach) for d in range(1, 4+1)]
    for name, template in syntax_dir_templates.items()
})

model2avg_recall, model2avg_coco_metrics, model2bertscore = get_scores(
    model2dirs, split=split, beam=beam, recall_at=recall_at)
latex_print(model2avg_recall, model2avg_coco_metrics, model2bertscore)

\textbf{Model} & \textbf{R@5} & \textbf{M} & \textbf{S} & \textbf{C} & \textbf{B} & \textbf{BS} \\
BUTD & 9.5 & 25.2 & 18.6 & 92.7 & 32.3 & 41.7 \\
BUTD+IDLE & 9.8 & 25.5 & 18.7 & 94.5 & 32.7 & 41.8 \\
BUTD+CHUNK & 10.3 & 25.5 & 19.0 & 94.5 & 32.4 & 41.9 \\
BUTD+POS & 10.3 & 25.4 & 18.8 & 93.8 & 32.6 & 41.8 \\
BUTD+DEP & 11.4 & 25.5 & 18.9 & 93.9 & 32.7 & 41.9 \\
BUTD+CCG & 10.8 & 25.7 & 19.0 & 95.6 & 32.7 & 42.0 \\


# Final Results (standard & interleaved POS)

## Validation

In [11]:
split = 'val'
basedir = "../experiments/"

# Model name -> directory template; '%d' is filled with 1..4 below.
# get_scores() reads the re-ranking files for names that contain '+rr'.
final_dir_templates = {
    'butd': 'coco_heldout_%d/butd/',
    'butd+pos': 'coco_heldout_%d_pos_inter/butd/',
    'butr+rr': 'coco_heldout_%d/butr/',
    'butr+rr+pos': 'coco_heldout_%d_pos_inter/butr/',
    'butr_mean+rr+pos': 'coco_heldout_%d_pos_inter/butr_mean/',
    'butr_weight+rr': 'coco_heldout_%d/butr_weight/',
    'butr_weight+rr+pos': 'coco_heldout_%d_pos_inter/butr_weight/',
    'm2': 'coco_heldout_%d/m2/',
    'm2+pos': 'coco_heldout_%d_pos_inter/m2/',
}
model2dirs = {
    name: [basedir + template % d for d in range(1, 4+1)]
    for name, template in final_dir_templates.items()
}

model2avg_recall, model2avg_coco_metrics, model2bertscore = get_scores(
    model2dirs, split=split, beam=beam, recall_at=recall_at)
latex_print(model2avg_recall, model2avg_coco_metrics, model2bertscore)

\textbf{Model} & \textbf{R@5} & \textbf{M} & \textbf{S} & \textbf{C} & \textbf{B} & \textbf{BS} \\
BUTD & 9.5 & 25.2 & 18.6 & 92.7 & 32.3 & 41.7 \\
BUTD+POS & 11.8 & 25.4 & 18.8 & 94.4 & 32.7 & 41.7 \\
BUTR+RR & 15.0 & 26.2 & 19.9 & 88.6 & 28.9 & 41.8 \\
BUTR+RR+POS & 12.0 & 25.7 & 19.4 & 85.4 & 27.4 & 41.4 \\
BUTR_MEAN+RR+POS & 14.2 & 25.9 & 19.7 & 87.4 & 28.3 & 42.9 \\
BUTR_WEIGHT+RR & 14.9 & 26.4 & 20.2 & 88.8 & 28.5 & 43.2 \\
BUTR_WEIGHT+RR+POS & 16.4 & 26.4 & 20.0 & 89.8 & 29.1 & 43.1 \\
M2 & 10.6 & 27.9 & 21.6 & 114.0 & 37.2 & 44.4 \\
M2+POS & 13.2 & 28.0 & 21.7 & 113.8 & 35.4 & 44.9 \\


## Test

In [12]:
split = 'test'
basedir = "../experiments/"

# Model name -> directory template; '%d' is filled with 1..4 below.
# get_scores() reads the re-ranking files for names that contain '+rr'.
final_dir_templates = {
    'butd': 'coco_heldout_%d/butd/',
    'butd+pos': 'coco_heldout_%d_pos_inter/butd/',
    'butr+rr': 'coco_heldout_%d/butr/',
    'butr+rr+pos': 'coco_heldout_%d_pos_inter/butr/',
    'butr_mean+rr+pos': 'coco_heldout_%d_pos_inter/butr_mean/',
    'butr_weight+rr': 'coco_heldout_%d/butr_weight/',
    'butr_weight+rr+pos': 'coco_heldout_%d_pos_inter/butr_weight/',
    'm2': 'coco_heldout_%d/m2/',
    'm2+pos': 'coco_heldout_%d_pos_inter/m2/',
}
model2dirs = {
    name: [basedir + template % d for d in range(1, 4+1)]
    for name, template in final_dir_templates.items()
}

model2avg_recall, model2avg_coco_metrics, model2bertscore = get_scores(
    model2dirs, split=split, beam=beam, recall_at=recall_at)
latex_print(model2avg_recall, model2avg_coco_metrics, model2bertscore)

\textbf{Model} & \textbf{R@5} & \textbf{M} & \textbf{S} & \textbf{C} & \textbf{B} & \textbf{BS} \\
BUTD & 9.2 & 25.4 & 18.6 & 94.4 & 32.4 & 41.8 \\
BUTD+POS & 11.1 & 25.4 & 18.7 & 96.3 & 32.9 & 41.8 \\
BUTR+RR & 13.7 & 26.1 & 19.7 & 89.8 & 28.4 & 42.0 \\
BUTR+RR+POS & 11.5 & 25.6 & 19.2 & 87.9 & 27.6 & 41.5 \\
BUTR_MEAN+RR+POS & 13.4 & 25.9 & 19.7 & 88.9 & 27.9 & 43.0 \\
BUTR_WEIGHT+RR & 13.5 & 26.4 & 20.1 & 91.0 & 28.6 & 43.3 \\
BUTR_WEIGHT+RR+POS & 15.4 & 26.3 & 20.0 & 91.0 & 28.7 & 43.2 \\
M2 & 10.1 & 27.8 & 21.5 & 115.7 & 36.5 & 44.5 \\
M2+POS & 12.1 & 28.0 & 21.6 & 115.7 & 35.0 & 44.9 \\
