initial commit

facebookresearch · Jan 25, 2019 · 34e83c3 · 34e83c3
commit 34e83c3
Show file tree

Hide file tree

Showing 15 changed files with 941 additions and 0 deletions.
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
@@ -0,0 +1,5 @@
+# Code of Conduct
+
+Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
+Please read the [full text](https://code.fb.com/codeofconduct/)
+so that you can understand what actions will and will not be tolerated.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,39 @@
+# Contributing to ActivityNet-Entities
+We want to make contributing to this project as easy and transparent as possible.
+
+## Our Development Process
+Minor changes and improvements will be released on an ongoing basis.
+Larger changes (e.g., changesets implementing a new paper) will be released
+on a more periodic basis.
+
+
+## Pull Requests
+We actively welcome your pull requests.
+
+1. Fork the repo and create your branch from `master`.
+2. If you've added code that should be tested, add tests.
+3. If you've changed APIs, update the documentation.
+4. Ensure the test suite passes.
+5. Make sure your code lints.
+6. If you haven't already, complete the Contributor License Agreement ("CLA").
+
+## Contributor License Agreement ("CLA")
+In order to accept your pull request, we need you to submit a CLA. You only need
+to do this once to work on any of Facebook's open source projects.
+
+Complete your CLA here: <https://code.facebook.com/cla>
+
+## Issues
+We use GitHub issues to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
+disclosure of security bugs. In those cases, please go through the process
+outlined on that page and do not file a public issue.
+
+## Coding Style  
+* 4 spaces for indentation rather than tabs
+
+## License
+By contributing to ActivityNet-Entities, you agree that your contributions will
+be licensed under the LICENSE file in the root directory of this source tree.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,81 @@
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+=============================================================================
+
+For the following file(s):
+ActivityNet-Entities/scripts/utils.py
+
+MIT License
+
+Copyright (c) 2017 Jiasen Lu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+=============================================================================
+
+For the following file(s):
+ActivityNet-Entities/scripts/utils.py
+
+Fast R-CNN
+
+Copyright (c) Microsoft Corporation
+
+All rights reserved.
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+=============================================================================
diff --git a/README.md b/README.md
@@ -0,0 +1,47 @@
+# ActivityNet Entities dataset
+This repo hosts the dataset used in our paper [Grounded Video Description](https://arxiv.org/abs/1812.06587).
+
+ActivityNet-Entities is based on the video description dataset [ActivityNet Captions](https://cs.stanford.edu/people/ranjaykrishna/densevid/) and augments it with 158k bounding box annotations, each grounding a noun phrase (NP). Here we release the complete set of NP-based annotations as well as the pre-processed object-based annotations.
+
+<img src='demo/dataset_teaser.png' alt="dataset teaser" width="80%"/>
+
+### Data
+We have the following dataset files under the `data` directory:
+
+- `anet_entities_trainval.json`: The raw dataset file with noun phrase and bounding box annotations. We only release the training and the validation splits for now.
+
+- `anet_entities_cleaned_class_thresh50_trainval.json`: Pre-processed dataset file with object class and bounding box annotations. For training and validation splits only.
+
+- `anet_entities_skeleton.txt`: Specifies the expected structure of the JSON annotation files.
+
+- `split_ids_anet_entities.json`: Video IDs included in the training/validation/testing splits.
+
+- `anet_entities_cleaned_class_thresh50_test_skeleton.json`: Object class annotations for the testing split. This file is intended for the evaluation server, so the bounding box annotations are not included. See below for more details.
+
+Note: Both the raw dataset file and the pre-processed dataset file contain all 12469 videos in the original training and validation splits (as in ActivityNet Captions, which is based on [ActivityNet 1.3](http://activity-net.org/download.html)). This includes 626 videos without box annotations.
+
+### Evaluation
+Under the `scripts` directory, we include:
+- `attr_prep_tag_NP.py`: The preprocessing script used to obtain the NP/object annotation files.
+- The scripts that print the dataset stats.
+- The evaluation script for object grounding. [PyTorch](https://pytorch.org/get-started/locally/) is required. To evaluate your results, run:
+```
+python scripts/eval_grd_anet_entities.py -s YOUR_SUBMISSION_FILE.JSON
+```
+Please follow the example in `data/anet_entities_skeleton.txt` to format your submission file.
+
+
+### Others
+Please contact <luozhou@umich.edu> if you have any trouble running the code. Please cite the following paper if you use the dataset.
+```
+@article{zhou2018grounded,
+  title={Grounded Video Description},
+  author={Zhou, Luowei and Kalantidis, Yannis and Chen, Xinlei and Corso, Jason J and Rohrbach, Marcus},
+  journal={arXiv preprint arXiv:1812.06587},
+  year={2018}
+}
+```
+### License
+This project is licensed under the license found in the LICENSE file in the root directory of this source tree.
+
+The noun phrases in these annotations are based on [ActivityNet Captions](https://cs.stanford.edu/people/ranjaykrishna/densevid/), which are linked to videos in [ActivityNet 1.3](http://activity-net.org/download.html) 
diff --git a/data/anet_entities_cleaned_class_thresh50_test_skeleton.json b/data/anet_entities_cleaned_class_thresh50_test_skeleton.json
diff --git a/data/anet_entities_cleaned_class_thresh50_trainval.json b/data/anet_entities_cleaned_class_thresh50_trainval.json
diff --git a/data/anet_entities_skeleton.txt b/data/anet_entities_skeleton.txt
@@ -0,0 +1,49 @@
+Format of JSON ActivityNet-Entities annotation files
+
+### for anet_entities_trainval.json
+-> database
+  -> [video name]: identifier of video
+    - rwidth: resized width of video, will be 720px
+    - rheight: resized height of video, maintains aspect ratio
+    -> segments
+      -> [segment number]: segment from video with bounding box annotations
+        -> objects
+          -> [object number]: annotated object from segment
+            -> noun_phrases: a list of noun phrase (NP) annotations of the object, both the text and the index of the word in the sentence
+            - frame_ind: frame index (0-9, among the 10 sampled frames)
+            - ybr: y coordinate of bottom right corner of bounding box
+            - ytl: y coordinate of top left corner of bounding box
+            - xbr: x coordinate of bottom right corner of bounding box
+            - xtl: x coordinate of top left corner of bounding box
+            - crowds: whether the box represents a group of objects
+
+
+### for anet_entities_cleaned_class_thresh50_trainval.json
+-> vocab: the 431 object classes (not including the background class)
+-> database
+  -> [video name]: identifier of video
+    -> segments
+      -> [segment number]: segment from video with bounding box annotations
+        -> process_clss: object class of all the bounding boxes
+        -> tokens: tokenized sentence
+        -> frame_ind: frame index of all the bounding boxes
+        -> process_idx: the index of the object class in the sentence
+        -> process_bnd_box: coordinate of all the bounding boxes
+        -> crowds: whether the box represents a group of objects
+
+### an example of a grounding evaluation submission file
+```
+{
+  "results": {
+    "v_QOlSCBRmfWY": {
+      "clss": ["room", "woman", "she"], # object class
+      "idx_in_sent": [8, 2, 12], # index of object in the sentence
+      "bbox_for_all_frames": [[[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]], [[1,2,3,4], …, [1,2,3,4]]] # predicted bbox on all 10 uniformly sampled frames 
+    }
+  },
+  "external_data": {
+    "used": true, # Boolean flag
+    "details": "Object detector pre-trained on Visual Genome on object detection task."
+  }
+}
+```
diff --git a/data/anet_entities_trainval.json b/data/anet_entities_trainval.json
diff --git a/data/split_ids_anet_entities.json b/data/split_ids_anet_entities.json
diff --git a/demo/dataset_teaser.png b/demo/dataset_teaser.png
diff --git a/scripts/anet_entities_np_stats.py b/scripts/anet_entities_np_stats.py
@@ -0,0 +1,46 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+# Script to print stats on the NP annotation file: video and segment counts, per-split segment and box totals, and total segment duration
+
+import numpy as np
+import json
+import csv
+import sys
+
+src_file = sys.argv[1] # NP annotation JSON, e.g. 'anet_entities.json'; its 'database' entry is read below
+dataset_file = sys.argv[2] # caption JSON, e.g. 'anet_captions_all_splits.json'; read for the per-video 'timestamps'
+split_file = sys.argv[3] # split JSON, e.g. 'split_ids_anet_entities.json'; iterated as split name -> list of video ids
+
+with open(src_file) as f:
+    data = json.load(f)['database'] # video id -> annotation dict holding a 'segments' mapping
+
+with open(dataset_file) as f:
+    raw_data = json.load(f) # indexed below as raw_data[vid]['timestamps'][segment index]
+
+split_dict = {} # inverted split file: video id -> split name
+with open(split_file) as f:
+    split = json.load(f)
+    for s,ids in split.items():
+        split_dict.update({i:s for i in ids}) # a video id listed under several splits keeps the last one seen
+
+num_seg = np.sum([len(dat['segments']) for vid, dat in data.items()]) # total number of segments over all videos
+
+total_box = {} # split name -> number of annotated objects (boxes)
+total_dur = [] # one duration per segment, end timestamp minus start timestamp
+seg_splits = {} # split name -> number of segments
+for vid, dat in data.items():
+    for seg, ann in dat['segments'].items():
+        total_box[split_dict[vid]] = total_box.get(split_dict[vid], 0)+len(ann['objects']) # raises KeyError if vid is absent from the split file
+        total_dur.append(float(raw_data[vid]['timestamps'][int(seg)][1]-raw_data[vid]['timestamps'][int(seg)][0])) # segment keys are strings, converted to list indices here
+        seg_splits[split_dict[vid]] = seg_splits.get(split_dict[vid], 0)+1
+
+print('number of annotated video: {}'.format(len(data)))
+print('number of annotated video segments: {}'.format(num_seg))
+print('number of segments in each split: {}'.format(seg_splits))
+print('total duration in hr: {}'.format(np.sum(total_dur)/3600)) # presumably timestamps are in seconds, hence /3600 — confirm against the caption file
+print('total number of noun phrase boxes: {}'.format(total_box))
diff --git a/scripts/anet_entities_object_stats.py b/scripts/anet_entities_object_stats.py
@@ -0,0 +1,73 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+
+# Script to print stats on the object annotation file: video and segment counts, per-split totals, boxes per segment, labels per box, and the most common labels
+
+import numpy as np
+import json
+import csv
+import visdom
+import sys
+from collections import Counter
+
+src_file = sys.argv[1] # object annotation JSON, e.g. 'anet_entities_cleaned_class_thresh50_trainval.json'
+dataset_file = sys.argv[2] # caption JSON, e.g. 'anet_captions_all_splits.json'; read for the per-video 'timestamps'
+split_file = sys.argv[3] # split JSON, e.g. 'split_ids_anet_entities.json'; iterated as split name -> list of video ids
+
+with open(src_file) as f:
+    data = json.load(f)['annotations'] # NOTE(review): data/anet_entities_skeleton.txt lists 'database' as the top-level key for this file — confirm which key the JSON actually uses
+
+with open(dataset_file) as f:
+    raw_data = json.load(f) # indexed below as raw_data[vid]['timestamps'][segment index]
+
+split_dict = {} # inverted split file: video id -> split name
+with open(split_file) as f:
+    split = json.load(f)
+    for s,ids in split.items():
+        split_dict.update({i:s for i in ids}) # a video id listed under several splits keeps the last one seen
+
+num_seg = np.sum([len(dat['segments']) for vid, dat in data.items()]) # total number of segments over all videos
+
+total_box = {} # split name -> number of entries in 'process_bnd_box'
+total_dur = [] # one duration per segment, end timestamp minus start timestamp
+seg_splits = {} # split name -> number of segments
+
+box_per_seg = [] # number of boxes in each segment
+obj_per_box = [] # length of each 'process_clss' entry
+count_obj = [] # every element of every 'process_clss' entry, flattened for counting
+
+for vid, dat in data.items():
+    for seg, ann in dat['segments'].items():
+        total_box[split_dict[vid]] = total_box.get(split_dict[vid], 0)+len(ann['process_bnd_box']) # raises KeyError if vid is absent from the split file
+        total_dur.append(float(raw_data[vid]['timestamps'][int(seg)][1]-raw_data[vid]['timestamps'][int(seg)][0])) # segment keys are strings, converted to list indices here
+        seg_splits[split_dict[vid]] = seg_splits.get(split_dict[vid], 0)+1
+        box_per_seg.append(len(ann['process_bnd_box']))
+        for c in ann['process_clss']:
+            obj_per_box.append(len(c)) # assumes each entry is the list of class labels for one box — TODO confirm
+            count_obj.extend(c)
+
+print('number of annotated video: {}'.format(len(data)))
+print('number of annotated video segments: {}'.format(num_seg))
+print('number of segments in each split: {}'.format(seg_splits))
+print('total duration in hr: {}'.format(np.sum(total_dur)/3600)) # presumably timestamps are in seconds, hence /3600 — confirm against the caption file
+print('total number of phrase (not object) boxes: {}'.format(total_box))
+
+print('box per segment, mean {}, std {}, count {}'.format(np.mean(box_per_seg), np.std(box_per_seg), Counter(box_per_seg))) # Counter gives the histogram of boxes-per-segment values
+print('object per box, mean {}, std {}, count {}'.format(np.mean(obj_per_box), np.std(obj_per_box), Counter(obj_per_box))) # Counter gives the histogram of labels-per-box values
+
+print('Top 10 object labels: {}'.format(Counter(count_obj).most_common(10)))
+
+"""
+vis = visdom.Visdom()
+vis.histogram(X=[i for i in box_per_seg if i < 20],
+              opts={'numbins': 20, 'xtickmax':20, 'xtickmin':0, 'xmax':20, 'xmin':0, 'title':'Distribution of number of boxes per segment', 'xtickfont':{'size':14}, \
+                    'ytickfont':{'size':14}, 'xlabel':'Number of boxes', 'ylabel': 'Counts'})
+
+vis.histogram(X=[i for i in obj_per_box if i < 100],
+              opts={'numbins': 100, 'xtickmax':100, 'xtickmin':0, 'xmax':100, 'xmin':0, 'title':'Distribution of number of object labels per box', 'xtickfont':{'size':14}, \
+                    'ytickfont':{'size':14}, 'xlabel':'Number of object labels', 'ylabel': 'Counts'})
+""" # the string literal above holds disabled visdom histogram plots; it is never executed