In [1]:
import jsonlines
import numpy as np
from collections import defaultdict

In [2]:
# Root folder of the SVO results; each model's file is read from
# {basedir}/{model}/{model}.jsonl by the cells below.
basedir = "../outputs/SVO"

# Result-folder name -> LaTeX label used in the printed table.
# Insertion order matters: it determines the order of the table rows.
models = dict([
    ("albef_4m", "ALBEF$_{4M}$"),
    ("albef_14m", "ALBEF$_{14M}$"),
    ("blip_14m", "BLIP$_{14M}$"),
    ("blip_129m", "BLIP$_{129M}$"),
    ("blip_capfiltl_129m", "BLIP$_{129M}$+CapFilt-L"),
    ("blip_vitl_129m", "BLIP-ViT/L$_{129M}$"),
    ("pevl_pretraining", "PEVL$_{pretrain}$"),
    ("pevl_grounding", "PEVL$_{grounding}$"),
    ("pevl_vrd", "PEVL$_{vrd}$"),
    ("x-vlm_4m", "X-VLM$_{4M}$"),
    ("x-vlm_16m", "X-VLM$_{16M}$"),
    ("clipcap_cc", "clipcap$_{CC3M}$"),
    ("clipcap_coco", "clipcap$_{COCO}$"),
    ("blip2_itm_pretrain", "BLIP-2"),
    ("clip_b32", "CLIP (ViT-B/32)"),
])

In [3]:
# Load the records of the first configured model and display the first two.
model = next(iter(models))
with jsonlines.open(f'{basedir}/{model}/{model}.jsonl') as reader:
    lines = list(reader)

lines[:2]

[{'label': '0_p',
  'score': 0.5716244578361511,
  'pos_triplet': 'girl,stand,grass',
  'neg_triplet': 'dog,stand,grass',
  'subj_neg': True,
  'verb_neg': False,
  'obj_neg': False},
 {'label': '0_n',
  'score': 2.2030102627468295e-05,
  'pos_triplet': 'girl,stand,grass',
  'neg_triplet': 'dog,stand,grass',
  'subj_neg': True,
  'verb_neg': False,
  'obj_neg': False}]

In [4]:
def pair_outcomes(records):
    """Score consecutive (`<id>_p`, `<id>_n`) record pairs.

    Each record is a dict with at least the keys 'label', 'score',
    'subj_neg' and 'verb_neg'. Records are expected in consecutive pairs
    that share the same id prefix in 'label', the first tagged 'p' and the
    second tagged 'n'. A pair counts as correct when the 'p' score is
    strictly greater than the 'n' score (ties count as incorrect).

    Returns:
        defaultdict(list) mapping 'overall' and one of 'subj' / 'verb' /
        'obj' to the list of per-pair boolean outcomes.

    Raises:
        ValueError: if the number of records is odd, so pairing is impossible.
        AssertionError: if a pair's labels do not follow the p/n convention.
    """
    if len(records) % 2:
        raise ValueError(
            f'expected an even number of records (p/n pairs), got {len(records)}'
        )

    outcomes = defaultdict(list)
    for lc, lf in zip(records[::2], records[1::2]):
        tag_c = lc['label'].split('_')
        tag_f = lf['label'].split('_')
        assert tag_c[0] == tag_f[0], f"pair id mismatch: {lc['label']} vs {lf['label']}"
        assert tag_c[1] == 'p', f"expected a '_p' record first, got {lc['label']}"
        assert tag_f[1] == 'n', f"expected a '_n' record second, got {lf['label']}"

        # The flags of the first record decide the bucket; anything that is
        # neither a subject nor a verb negative is treated as an object negative.
        # NOTE(review): 'obj_neg' itself is never checked - confirm every pair
        # sets exactly one of the three flags.
        if lc['subj_neg']:
            pos = 'subj'
        elif lc['verb_neg']:
            pos = 'verb'
        else:
            pos = 'obj'

        res = lc['score'] > lf['score']
        outcomes['overall'].append(res)
        outcomes[pos].append(res)
    return outcomes


def accuracy_pct(outcomes):
    """Return the mean of boolean outcomes as a percentage; NaN when empty."""
    # Explicit NaN avoids numpy's "Mean of empty slice" RuntimeWarning while
    # still printing 'nan' for a bucket that has no pairs.
    return np.mean(outcomes) * 100 if outcomes else float('nan')


# Print one LaTeX table row per model: subj / verb / obj / overall accuracy.
for model in models:
    with jsonlines.open(f'{basedir}/{model}/{model}.jsonl') as reader:
        lines = [obj for obj in reader]
    res_dict = pair_outcomes(lines)

    cells = [f'{accuracy_pct(res_dict[k]):.1f}' for k in ['subj', 'verb', 'obj', 'overall']]
    print(f'{models[model]} & ' + ' & '.join(cells) + ' \\\\')

ALBEF$_{4M}$ & 88.5 & 85.4 & 93.7 & 87.6 \\
ALBEF$_{14M}$ & 89.4 & 86.4 & 94.7 & 88.6 \\
BLIP$_{14M}$ & 49.8 & 48.8 & 47.5 & 48.7 \\
BLIP$_{129M}$ & 50.8 & 51.4 & 51.8 & 51.4 \\
BLIP$_{129M}$+CapFilt-L & 49.4 & 51.3 & 52.5 & 51.2 \\
BLIP-ViT/L$_{129M}$ & 50.0 & 50.9 & 50.9 & 50.8 \\
PEVL$_{pretrain}$ & 89.4 & 82.9 & 93.9 & 86.2 \\
PEVL$_{grounding}$ & 91.2 & 85.9 & 94.6 & 88.5 \\
PEVL$_{vrd}$ & 90.1 & 81.1 & 92.3 & 84.8 \\
X-VLM$_{4M}$ & 89.3 & 87.1 & 94.5 & 88.9 \\
X-VLM$_{16M}$ & 90.3 & 88.4 & 94.6 & 90.0 \\
clipcap$_{CC3M}$ & 84.2 & 80.5 & 90.2 & 83.1 \\
clipcap$_{COCO}$ & 87.3 & 81.5 & 89.8 & 84.1 \\
BLIP-2 & 87.6 & 84.6 & 91.7 & 86.5 \\
CLIP (ViT-B/32) & 83.6 & 79.0 & 88.1 & 81.6 \\
