This repository has been archived by the owner on Jun 15, 2023. It is now read-only.

add grounding eval script for generated sentences (F1_all and F1_loc)
LuoweiZhou committed Apr 14, 2019
1 parent ddaeb29 commit ef96c3b
Showing 4 changed files with 190 additions and 29 deletions.
6 changes: 5 additions & 1 deletion README.md
@@ -21,7 +21,11 @@ Under the `scripts` directory, we include:

- `attr_prep_tag_NP.py`: The preprocessing scripts to obtain the NP/object annotation files.
- `anet_entities_np_stats.py`, `anet_entities_object_stats.py`: The scripts that print the dataset stats.
- `eval_grd_anet_entities.py`: The evaluation script for object grounding on GT captions. [PyTorch](https://pytorch.org/get-started/locally/) is required. To evaluate your results, simply run:
- `eval_gt_grd_anet_entities.py`: The evaluation script for object grounding on GT captions (metrics in paper: Attn., Grd.). [PyTorch](https://pytorch.org/get-started/locally/) is required. To evaluate your results, simply run:
```
python scripts/eval_gt_grd_anet_entities.py -s YOUR_SUBMISSION_FILE.JSON
```
- `eval_grd_anet_entities.py`: The evaluation script for object grounding on generated captions (metrics in paper: F1<sub>all</sub>, F1<sub>loc</sub>). [PyTorch](https://pytorch.org/get-started/locally/), [Stanford CoreNLP 3.9.1](https://stanfordnlp.github.io/CoreNLP/history.html) and the [Python wrapper](https://github.com/Lynten/stanford-corenlp) are required. To evaluate, similarly run:
```
python scripts/eval_grd_anet_entities.py -s YOUR_SUBMISSION_FILE.JSON
```
16 changes: 9 additions & 7 deletions data/anet_entities_skeleton.txt
@@ -6,7 +6,7 @@ Format of JSON ActivityNet-Entities annotation files
- rwidth: resized width of video, will be 720px
- rheight: resized height of video, maintains aspect ratio
-> segments
-> [segment number]: segment from video with bounding box annotations
-> [segment id]: segment from video with bounding box annotations
-> objects
-> [object number]: annotated object from segment
-> noun_phrases: a list of noun phrase (NP) annotations of the object, both the text and the index of the word in the sentence
@@ -23,7 +23,7 @@ Format of JSON ActivityNet-Entities annotation files
-> database
-> [video name]: identifier of video
-> segments
-> [segment number]: segment from video with bounding box annotations
-> [segment id]: segment from video with bounding box annotations
-> process_clss: object class of all the bounding boxes
-> tokens: tokenized sentence
-> frame_ind: frame index of all the bounding boxes
@@ -36,13 +36,15 @@ Format of JSON ActivityNet-Entities annotation files
{
"results": {
"v_QOlSCBRmfWY": {
"clss": ["room", "woman", "she"], # object class
"idx_in_sent": [8, 2, 12], # index of object in the sentence
"bbox_for_all_frames": [[[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]]] # predicted bbox on all 10 uniformly sampled frames
"0": { # segment id
"clss": ["room", "woman", "she"], # object class
"idx_in_sent": [8, 2, 12], # index of object in the sentence
"bbox_for_all_frames": [[[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]]] # predicted bbox on all 10 uniformly sampled frames
}
}
}
},
"external_data": {
"used": True, # Boolean flag
"used": true, # Boolean flag
"details": "Object detector pre-trained on Visual Genome on object detection task."
}
}
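
For reference, a minimal sketch of assembling a submission file in this format. All ids, classes, word indices, and box coordinates below are placeholder values, not real predictions; a real submission needs one `[x1, y1, x2, y2]` box per object for each of the 10 uniformly sampled frames.

```
import json

# Minimal sketch of a submission dict matching the skeleton above (placeholder values).
submission = {
    "results": {
        "v_QOlSCBRmfWY": {                       # video id
            "0": {                               # segment id
                "clss": ["room", "woman"],       # predicted object classes
                "idx_in_sent": [8, 2],           # word index of each object in the sentence
                # one [x1, y1, x2, y2] box per object for each of the 10 sampled frames
                "bbox_for_all_frames": [
                    [[1, 2, 3, 4]] * 10,
                    [[1, 2, 3, 4]] * 10,
                ],
            }
        }
    },
    "external_data": {
        "used": True,  # serialized as true in the JSON output
        "details": "Object detector pre-trained on Visual Genome on object detection task."
    },
}

with open("YOUR_SUBMISSION_FILE.JSON", "w") as f:
    json.dump(submission, f)
```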
106 changes: 85 additions & 21 deletions scripts/eval_grd_anet_entities.py
@@ -5,7 +5,7 @@
# LICENSE file in the root directory of this source tree.
#

# Evaluation script for object grounding over GT sentences
# Evaluation script for object grounding over generated sentences

import json
import argparse
@@ -15,7 +15,14 @@
from collections import defaultdict
from utils import bbox_overlaps_batch, get_frm_mask

from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm

def main(args):

nlp = StanfordCoreNLP('tools/stanford-corenlp-full-2018-02-27')
props={'annotators': 'lemma','pipelineLanguage':'en', 'outputFormat':'json'}

with open(args.reference) as f:
ref = json.load(f)['annotations']
with open(args.split_file) as f:
@@ -30,7 +37,52 @@ def main(args):

print('Number of videos in the reference: {}, number of videos in the submission: {}'.format(len(ref), len(pred)))

results = defaultdict(list)
vocab_in_split = set()

# precision
prec = defaultdict(list)
for vid, anns in tqdm(ref.items()):
for seg, ann in anns['segments'].items():
if len(ann['frame_ind']) == 0 or vid not in pred or seg not in pred[vid]:
continue # do not penalize if sentence not annotated

ref_bbox_all = torch.cat((torch.Tensor(ann['process_bnd_box']), \
torch.Tensor(ann['frame_ind']).unsqueeze(-1)), dim=1) # 5-D coordinates

idx_in_sent = {}
for box_idx, cls_lst in enumerate(ann['process_clss']):
vocab_in_split.update(set(cls_lst))
for cls_idx, cls in enumerate(cls_lst):
idx_in_sent[cls] = idx_in_sent.get(cls, []) + [ann['process_idx'][box_idx][cls_idx]]

sent_idx = set(itertools.chain.from_iterable(ann['process_idx'])) # index of gt object words
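# Lemmatize every non-object word in the GT sentence; a predicted object whose lemma
# matches one of these words is treated as simply un-annotated rather than hallucinated.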
exclude_obj = {json.loads(nlp.annotate(token.encode('utf-8'), properties=props) \
)['sentences'][0]['tokens'][0]['lemma']:1 for token_idx, token in enumerate(ann['tokens'] \
) if (token_idx not in sent_idx and token != '')}

for pred_idx, class_name in enumerate(pred[vid][seg]['clss']):
if class_name in idx_in_sent:
gt_idx = min(idx_in_sent[class_name]) # always consider the first match...
sel_idx = [idx for idx, i in enumerate(ann['process_idx']) if gt_idx in i]
ref_bbox = ref_bbox_all[sel_idx] # select matched boxes
assert(ref_bbox.size(0) > 0)

pred_bbox = torch.cat((torch.Tensor(pred[vid][seg]['bbox_for_all_frames'][pred_idx])[:,:4], \
torch.Tensor(range(10)).unsqueeze(-1)), dim=1)

frm_mask = torch.from_numpy(get_frm_mask(pred_bbox[:, 4].numpy(), \
ref_bbox[:, 4].numpy()).astype('uint8'))
overlap = bbox_overlaps_batch(pred_bbox[:, :5].unsqueeze(0), \
ref_bbox[:, :5].unsqueeze(0), frm_mask.unsqueeze(0))
prec[class_name].append(1 if torch.max(overlap) > args.iou else 0)
elif json.loads(nlp.annotate(class_name.encode('utf-8'), properties=props))['sentences'][0]['tokens'][0]['lemma'] in exclude_obj:
pass # do not penalize if gt object word not annotated (missed)
else:
if args.mode == 'all':
prec[class_name].append(0) # hallucinated object

# recall
recall = defaultdict(list)
for vid, anns in ref.items():
for seg, ann in anns['segments'].items():
if len(ann['frame_ind']) == 0:
@@ -39,51 +91,63 @@

ref_bbox_all = torch.cat((torch.Tensor(ann['process_bnd_box']), \
torch.Tensor(ann['frame_ind']).unsqueeze(-1)), dim=1) # 5-D coordinates
sent_idx = set(itertools.chain.from_iterable(ann['process_idx'])) # index of word in sentence to evaluate
for idx in sent_idx:
sel_idx = [ind for ind, i in enumerate(ann['process_idx']) if idx in i]
sent_idx = set(itertools.chain.from_iterable(ann['process_idx'])) # index of gt object words

for gt_idx in sent_idx:
sel_idx = [idx for idx, i in enumerate(ann['process_idx']) if gt_idx in i]
ref_bbox = ref_bbox_all[sel_idx] # select matched boxes
# Note that, although discouraged, a single word can be annotated across multiple boxes/frames

assert(ref_bbox.size(0) > 0)
class_name = ann['process_clss'][sel_idx[0]][ann['process_idx'][sel_idx[0]].index(idx)]

class_name = ann['process_clss'][sel_idx[0]][ann['process_idx'][sel_idx[0]].index(gt_idx)]
if vid not in pred:
results[class_name].append(0) # video not grounded
recall[class_name].append(0) # video not grounded
elif seg not in pred[vid]:
results[class_name].append(0) # segment not grounded
elif idx not in pred[vid][seg]['idx_in_sent']:
results[class_name].append(0) # object not grounded
else:
pred_ind = pred[vid][seg]['idx_in_sent'].index(idx)
pred_bbox = torch.cat((torch.Tensor(pred[vid][seg]['bbox_for_all_frames'][pred_ind])[:,:4], \
recall[class_name].append(0) # segment not grounded
elif class_name in pred[vid][seg]['clss']:
pred_idx = pred[vid][seg]['clss'].index(class_name) # always consider the first match...
pred_bbox = torch.cat((torch.Tensor(pred[vid][seg]['bbox_for_all_frames'][pred_idx])[:,:4], \
torch.Tensor(range(10)).unsqueeze(-1)), dim=1)

frm_mask = torch.from_numpy(get_frm_mask(pred_bbox[:, 4].numpy(), \
ref_bbox[:, 4].numpy()).astype('uint8'))
overlap = bbox_overlaps_batch(pred_bbox[:, :5].unsqueeze(0), \
ref_bbox[:, :5].unsqueeze(0), frm_mask.unsqueeze(0))
results[class_name].append(1 if torch.max(overlap) > args.iou else 0)
recall[class_name].append(1 if torch.max(overlap) > args.iou else 0)
else:
if args.mode == 'all':
recall[class_name].append(0) # object not grounded

print('Number of groundable objects in this split: {}'.format(len(results)))
grd_accu = np.mean([sum(hm)*1./len(hm) for i,hm in results.items()])
num_vocab = len(vocab_in_split)
print('Number of groundable objects in this split: {}'.format(num_vocab))
print('Number of objects in prec and recall: {}, {}'.format(len(prec), len(recall)))
prec_accu = np.sum([sum(hm)*1./len(hm) for i,hm in prec.items()])*1./num_vocab
recall_accu = np.sum([sum(hm)*1./len(hm) for i,hm in recall.items()])*1./num_vocab
f1 = 2. * prec_accu * recall_accu / (prec_accu + recall_accu)

print('-' * 80)
print('The overall grounding accuracy is {}'.format(grd_accu))
print('The overall precision / recall / F1 are {:.4f} / {:.4f} / {:.4f}'.format(prec_accu, recall_accu, f1))
print('-' * 80)
if args.verbose:
print('Object frequency and grounding accuracy per class (descending by object frequency):')
accu_per_clss = {(i, sum(hm)*1./len(hm)):len(hm) for i,hm in results.items()}
accu_per_clss = sorted(accu_per_clss.items(), key=lambda x:x[1], reverse=True)
accu_per_clss = {}
for i in vocab_in_split:
prec_clss = sum(prec[i])*1./len(prec[i]) if i in prec else 0
recall_clss = sum(recall[i])*1./len(recall[i]) if i in recall else 0
accu_per_clss[(i, prec_clss, recall_clss)] = (len(prec[i]), len(recall[i]))
accu_per_clss = sorted(accu_per_clss.items(), key=lambda x:x[1][1], reverse=True)
for accu in accu_per_clss:
print('{} ({}): {:.4f}'.format(accu[0][0], accu[1], accu[0][1]))
print('{} ({} / {}): {:.4f} / {:.4f}'.format(accu[0][0], accu[1][0], accu[1][1], accu[0][1], accu[0][2]))

nlp.close()

if __name__=='__main__':
parser = argparse.ArgumentParser(description='ActivityNet-Entities object grounding evaluation script.')
parser.add_argument('-s', '--submission', type=str, default='', help='submission grounding result file')
parser.add_argument('-r', '--reference', type=str, default='data/anet_entities_cleaned_class_thresh50_trainval.json', help='reference file')
parser.add_argument('--split_file', type=str, default='data/split_ids_anet_entities.json', help='path to the split file')
parser.add_argument('--split', type=str, nargs='+', default=['validation'], help='which split(s) to evaluate')
parser.add_argument('--mode', type=str, default='all', help='all | loc: penalize language errors (F1_all) or evaluate localization only (F1_loc)')
parser.add_argument('-iou', type=float, default=0.5, help='the iou threshold for grounding correctness')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()
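
The final scores are macro-averaged over the full vocabulary of groundable classes in the split: each class's hit rate is averaged, summed over classes, and divided by `num_vocab`, so classes the model never predicts (or never grounds correctly) pull both averages down. A small, self-contained sketch of that aggregation, using hypothetical per-class hit lists:

```
from collections import defaultdict
import numpy as np

def aggregate(prec, recall, num_vocab):
    # Macro-average per-class hit rates over the full vocabulary size, then
    # combine into F1, mirroring the aggregation in eval_grd_anet_entities.py.
    prec_accu = np.sum([sum(hm) * 1. / len(hm) for _, hm in prec.items()]) * 1. / num_vocab
    recall_accu = np.sum([sum(hm) * 1. / len(hm) for _, hm in recall.items()]) * 1. / num_vocab
    f1 = 2. * prec_accu * recall_accu / (prec_accu + recall_accu)
    return prec_accu, recall_accu, f1

# Hypothetical hit lists: 1 = correctly localized, 0 = miss.
prec = defaultdict(list, {"woman": [1, 0, 1], "room": [1]})
recall = defaultdict(list, {"woman": [1, 1], "room": [0], "dog": [0]})
print(aggregate(prec, recall, num_vocab=4))  # assumes a 4-class vocabulary in the split
```

In `loc` mode, hallucinated and missed objects are skipped rather than counted as zeros, which yields F1_loc instead of F1_all.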
91 changes: 91 additions & 0 deletions scripts/eval_gt_grd_anet_entities.py
@@ -0,0 +1,91 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

# Evaluation script for object grounding over GT sentences

import json
import argparse
import torch
import itertools
import numpy as np
from collections import defaultdict
from utils import bbox_overlaps_batch, get_frm_mask

def main(args):

with open(args.reference) as f:
ref = json.load(f)['annotations']
with open(args.split_file) as f:
split_file = json.load(f)
split = {}
for s in args.split:
split.update({i:i for i in split_file[s]})
ref = {k:v for k,v in ref.items() if k in split}

with open(args.submission) as f:
pred = json.load(f)['results']

print('Number of videos in the reference: {}, number of videos in the submission: {}'.format(len(ref), len(pred)))

results = defaultdict(list)
for vid, anns in ref.items():
for seg, ann in anns['segments'].items():
if len(ann['frame_ind']) == 0:
continue # annotation not available

ref_bbox_all = torch.cat((torch.Tensor(ann['process_bnd_box']), \
torch.Tensor(ann['frame_ind']).unsqueeze(-1)), dim=1) # 5-D coordinates
sent_idx = set(itertools.chain.from_iterable(ann['process_idx'])) # index of word in sentence to evaluate
for idx in sent_idx:
sel_idx = [ind for ind, i in enumerate(ann['process_idx']) if idx in i]
ref_bbox = ref_bbox_all[sel_idx] # select matched boxes
# Note that, although discouraged, a single word can be annotated across multiple boxes/frames
assert(ref_bbox.size(0) > 0)

class_name = ann['process_clss'][sel_idx[0]][ann['process_idx'][sel_idx[0]].index(idx)]
if vid not in pred:
results[class_name].append(0) # video not grounded
elif seg not in pred[vid]:
results[class_name].append(0) # segment not grounded
elif idx not in pred[vid][seg]['idx_in_sent']:
results[class_name].append(0) # object not grounded
else:
pred_ind = pred[vid][seg]['idx_in_sent'].index(idx)
pred_bbox = torch.cat((torch.Tensor(pred[vid][seg]['bbox_for_all_frames'][pred_ind])[:,:4], \
torch.Tensor(range(10)).unsqueeze(-1)), dim=1)

frm_mask = torch.from_numpy(get_frm_mask(pred_bbox[:, 4].numpy(), \
ref_bbox[:, 4].numpy()).astype('uint8'))
overlap = bbox_overlaps_batch(pred_bbox[:, :5].unsqueeze(0), \
ref_bbox[:, :5].unsqueeze(0), frm_mask.unsqueeze(0))
results[class_name].append(1 if torch.max(overlap) > args.iou else 0)

print('Number of groundable objects in this split: {}'.format(len(results)))
grd_accu = np.mean([sum(hm)*1./len(hm) for i,hm in results.items()])

print('-' * 80)
print('The overall grounding accuracy is {:.4f}'.format(grd_accu))
print('-' * 80)
if args.verbose:
print('Object frequency and grounding accuracy per class (descending by object frequency):')
accu_per_clss = {(i, sum(hm)*1./len(hm)):len(hm) for i,hm in results.items()}
accu_per_clss = sorted(accu_per_clss.items(), key=lambda x:x[1], reverse=True)
for accu in accu_per_clss:
print('{} ({}): {:.4f}'.format(accu[0][0], accu[1], accu[0][1]))


if __name__=='__main__':
parser = argparse.ArgumentParser(description='ActivityNet-Entities object grounding evaluation script.')
parser.add_argument('-s', '--submission', type=str, default='', help='submission grounding result file')
parser.add_argument('-r', '--reference', type=str, default='data/anet_entities_cleaned_class_thresh50_trainval.json', help='reference file')
parser.add_argument('--split_file', type=str, default='data/split_ids_anet_entities.json', help='path to the split file')
parser.add_argument('--split', type=str, nargs='+', default=['validation'], help='which split(s) to evaluate')
parser.add_argument('-iou', type=float, default=0.5, help='the iou threshold for grounding correctness')
parser.add_argument('-v', '--verbose', action='store_true')
args = parser.parse_args()

main(args)
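
Both evaluation scripts reduce each object to a hit or a miss with the same test: the grounding counts as correct if any predicted box overlaps a GT box for that object in the same sampled frame with IoU above the threshold (0.5 by default). A simplified, self-contained sketch of that check (the actual scripts do it in batch with `bbox_overlaps_batch` and `get_frm_mask` from `utils`):

```
def iou(box_a, box_b):
    # boxes are [x1, y1, x2, y2]
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    iw, ih = max(ix2 - ix1, 0.), max(iy2 - iy1, 0.)
    inter = iw * ih
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter) if inter > 0 else 0.

def correctly_grounded(pred_boxes, ref_boxes, thresh=0.5):
    # pred_boxes: list of (frame_idx, [x1, y1, x2, y2]) over the 10 sampled frames
    # ref_boxes:  list of (frame_idx, [x1, y1, x2, y2]) GT annotations for the object
    return any(iou(pb, rb) > thresh
               for pf, pb in pred_boxes
               for rf, rb in ref_boxes
               if pf == rf)  # only compare boxes from the same frame

# e.g. one predicted box and one GT box, both in frame 3:
# correctly_grounded([(3, [10, 10, 50, 50])], [(3, [12, 12, 48, 52])])  -> True
```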
