In [1]:
import jsonlines
import numpy as np
from collections import defaultdict

In [2]:
# Root directory of the VALSE results; each model's scores are read from
# {basedir}/{model}/{model}.jsonl.
basedir = "../outputs/VALSE"

# Checkpoint identifier -> LaTeX-formatted display name used as the row header
# of the results table. Insertion order is the row order of the table.
models = dict([
    # ALBEF
    ("albef_4m", "ALBEF$_{4M}$"),
    ("albef_14m", "ALBEF$_{14M}$"),
    # BLIP
    ("blip_14m", "BLIP$_{14M}$"),
    ("blip_129m", "BLIP$_{129M}$"),
    ("blip_capfiltl_129m", "BLIP$_{129M}$+CapFilt-L"),
    ("blip_vitl_129m", "BLIP-ViT/L$_{129M}$"),
    # PEVL
    ("pevl_pretraining", "PEVL$_{pretrain}$"),
    ("pevl_grounding", "PEVL$_{grounding}$"),
    ("pevl_vrd", "PEVL$_{vrd}$"),
    # X-VLM
    ("x-vlm_4m", "X-VLM$_{4M}$"),
    ("x-vlm_16m", "X-VLM$_{16M}$"),
    # ClipCap
    ("clipcap_cc", "clipcap$_{CC3M}$"),
    ("clipcap_coco", "clipcap$_{COCO}$"),
    # BLIP-2
    ("blip2_itm_pretrain", "BLIP-2"),
])

In [3]:
# Peek at the raw records of the first configured model to see the file layout.
model = next(iter(models))
preview_path = f'{basedir}/{model}/{model}.jsonl'
with jsonlines.open(preview_path) as reader:
    lines = list(reader)

# Records alternate caption / foil; show the first pair.
lines[:2]

[{'label': '0_c',
  'score': 0.053006917238235474,
  'instrument': 'actant-swap',
  'piece': 'actions',
  'type': 'caption',
  'id': 0},
 {'label': '0_f',
  'score': 0.05906842276453972,
  'instrument': 'actant-swap',
  'piece': 'actions',
  'type': 'foil',
  'id': 0}]

In [4]:
# Column order of the LaTeX results table: one column per instrument,
# followed by the accuracy over all pairs.
INSTRUMENTS = [
    'existence', 'plurals', 'counting-hard', 'counting-small-quant', 'counting-adversarial', 'relations',
    'action-replacement', 'actant-swap', 'coreference-standard', 'coreference-hard', 'foil-it',
]


def pairwise_results(records):
    """Compare each caption record with the foil record that follows it.

    Args:
        records: list of dicts laid out as caption, foil, caption, foil, ...
            Each dict needs a 'label' of the form '<id>_c' (caption) or
            '<id>_f' (foil), a numeric 'score', and (on the caption) an
            'instrument' name.

    Returns:
        defaultdict(list) mapping every instrument seen, plus 'overall', to the
        list of per-pair booleans. A pair is True when the caption score is
        strictly greater than the foil score, so ties count as failures.

    Raises:
        ValueError: if the number of records is odd (a caption without a foil).
        AssertionError: if a pair's labels do not share an id or are not in
            caption-then-foil order.
    """
    # Fail up front with a clear message instead of an IndexError on the last pair.
    if len(records) % 2 != 0:
        raise ValueError(
            f'expected caption/foil pairs but got an odd number of records ({len(records)})'
        )

    results = defaultdict(list)
    for lc, lf in zip(records[0::2], records[1::2]):
        lc_parts, lf_parts = lc['label'].split('_'), lf['label'].split('_')
        assert lc_parts[0] == lf_parts[0], f"mismatched pair: {lc['label']!r} / {lf['label']!r}"
        assert lc_parts[1] == 'c', f"expected a caption record, got {lc['label']!r}"
        assert lf_parts[1] == 'f', f"expected a foil record, got {lf['label']!r}"
        res = lc['score'] > lf['score']
        results['overall'].append(res)
        results[lc['instrument']].append(res)
    return results


def accuracy_pct(flags):
    """Return the percentage of True values in ``flags``, or NaN when it is empty."""
    # np.mean([]) also yields NaN but emits a RuntimeWarning; skip the warning.
    if len(flags) == 0:
        return float('nan')
    return np.mean(flags) * 100


# Print one LaTeX table row per model: label, per-instrument accuracy, overall accuracy.
for model, label in models.items():
    with jsonlines.open(f'{basedir}/{model}/{model}.jsonl') as reader:
        lines = list(reader)
    res_dict = pairwise_results(lines)

    cells = [f'{accuracy_pct(res_dict[k]):.1f}' for k in INSTRUMENTS]
    cells.append(f'{accuracy_pct(res_dict["overall"]):.1f}')
    print(f'{label} & ' + ' & '.join(cells) + ' \\\\')

ALBEF$_{4M}$ & 71.3 & 78.8 & 62.2 & 65.1 & 59.8 & 73.1 & 73.6 & 58.4 & 52.4 & 55.8 & 95.5 & 69.1 \\
ALBEF$_{14M}$ & 69.5 & 76.0 & 61.5 & 61.0 & 64.5 & 70.7 & 77.6 & 60.5 & 55.9 & 61.5 & 96.1 & 69.4 \\
BLIP$_{14M}$ & 82.4 & 73.8 & 61.8 & 62.6 & 63.7 & 65.2 & 74.7 & 55.2 & 52.3 & 42.3 & 92.3 & 67.8 \\
BLIP$_{129M}$ & 78.2 & 75.9 & 63.4 & 63.4 & 58.5 & 66.2 & 75.2 & 59.0 & 56.4 & 52.9 & 93.2 & 68.8 \\
BLIP$_{129M}$+CapFilt-L & 75.4 & 75.0 & 64.7 & 68.8 & 53.0 & 66.7 & 73.0 & 60.6 & 48.2 & 51.0 & 93.8 & 68.2 \\
BLIP-ViT/L$_{129M}$ & 73.3 & 77.7 & 68.2 & 67.6 & 61.2 & 71.8 & 75.3 & 60.8 & 51.1 & 45.2 & 96.1 & 70.3 \\
PEVL$_{pretrain}$ & 89.7 & 65.5 & 66.0 & 66.2 & 57.3 & 67.9 & 73.5 & 59.4 & 58.2 & 56.7 & 90.9 & 68.9 \\
PEVL$_{grounding}$ & 91.1 & 63.9 & 70.0 & 70.9 & 63.2 & 62.4 & 74.4 & 57.1 & 53.8 & 49.0 & 92.6 & 69.5 \\
PEVL$_{vrd}$ & 83.8 & 61.8 & 62.8 & 70.3 & 40.4 & 64.5 & 68.1 & 53.2 & 47.7 & 42.3 & 94.1 & 64.5 \\
X-VLM$_{4M}$ & 80.0 & 77.8 & 69.0 & 68.4 & 72.5 & 74.8 & 77.3 & 65.0 