In [1]:
# Augment the Ref-COCO dataset by flipping all images left-to-right and update
# referring expressions accordingly: "zebra on the left" --> "zebra on the right"
import os
from tqdm import tqdm

import matplotlib.pyplot as plt
import numpy as np

import json
import pickle
import copy

In [2]:
# The path for the Ref-COCO dataset images.
# The flipped copies are written into this same directory (with a
# "COCO_train2014_1" file-name prefix), so it must be writable.
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
images_dir = '/root/refer/data/images/mscoco/images/train2014'

In [20]:
# Collect the paths of all original (not yet flipped) images and count the
# flipped copies that already exist in the directory.
#   original image path contains "COCO_train2014_0"
#   flipped image path contains  "COCO_train2014_1"
image_list = []
flipped_image_count = 0

for entry in tqdm(os.listdir(images_dir)):
    # Only JPEG files are of interest.
    if not entry.endswith(".jpg"):
        continue

    full_path = os.path.join(images_dir, entry)
    if 'COCO_train2014_0' in full_path:
        image_list.append(full_path)
    elif 'COCO_train2014_1' in full_path:
        flipped_image_count += 1

100%|██████████| 165566/165566 [00:00<00:00, 339720.29it/s]


In [16]:
# Flip all images left-to-right (keep originals).
# Images that cannot be read or written do not abort the (multi-hour) run;
# their paths are collected in `problematic_image_list` and the reason for each
# failure is kept in `problematic_image_errors` for later inspection.
problematic_image_list = []
problematic_image_errors = {}

for image_path in tqdm(image_list):
    flipped_image_path = image_path.replace('COCO_train2014_0', 'COCO_train2014_1')
    try:
        flipped_im = np.fliplr(plt.imread(image_path))

        if flipped_im.ndim == 2:
            # Grayscale image: without an explicit colormap and value range,
            # plt.imsave would apply the default (colored) colormap and
            # rescale the intensities to the image's own min/max.
            # Assumes 8-bit intensities, as plt.imread returns for JPEG files.
            plt.imsave(flipped_image_path, flipped_im, cmap='gray', vmin=0, vmax=255)
        else:
            plt.imsave(flipped_image_path, flipped_im)
    except Exception as error:
        # KeyboardInterrupt / SystemExit are not Exception subclasses, so
        # interrupting the kernel still stops the loop.
        problematic_image_list.append(image_path)
        problematic_image_errors[image_path] = repr(error)

100%|██████████| 82783/82783 [4:29:22<00:00,  5.12it/s]  


In [79]:
# Load referring expressions:
# "/root/refer/data/refcoco_original/" is the original refcoco data path
# "/root/refer/data/refcoco/" is where the new augmented refcoco data will be saved
# NOTE(review): pickle.load can execute arbitrary code -- only load a refs file
# obtained from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('/root/refer/data/refcoco_original/refs(unc).p', 'rb') as refs_file:
    refs = pickle.load(refs_file)

In [80]:
# Build the referring expressions that belong to the flipped images.
# Only refs of the 'train' split are augmented. Every id of a flipped ref is
# shifted by `big_const`, and direction words in its sentences are mirrored.

filter_non_left_right_sentences = False # Set to True in order to filter out sentences with no
                                        # "left/right/leftmost/rightmost" keywords

big_const = 100000000000 # Used to not conflict with original id's

# Each direction word mapped to its mirrored counterpart.
direction_swaps = {
    'right': 'left',
    'left': 'right',
    'rightmost': 'leftmost',
    'leftmost': 'rightmost',
}

new_refs = []
new_image_ids = set()

for ref in tqdm(refs):
    if ref['split'] != 'train':
        continue

    new_ref = copy.deepcopy(ref)
    new_ref['sent_ids'] = [big_const + sent_id for sent_id in new_ref['sent_ids']]
    new_ref['file_name'] = new_ref['file_name'].replace('COCO_train2014_0', 'COCO_train2014_1')
    for id_key in ('ann_id', 'ref_id', 'image_id'):
        new_ref[id_key] = big_const + new_ref[id_key]

    for sentence in new_ref['sentences']:
        # Mirror the direction words; all other tokens are kept unchanged.
        mirrored_tokens = [direction_swaps.get(word, word) for word in sentence['tokens']]
        sentence['tokens'] = mirrored_tokens
        sentence['raw'] = ' '.join(mirrored_tokens)
        sentence['sent'] = sentence['raw']
        sentence['sent_id'] = big_const + sentence['sent_id']

    # Filter out non-{left/right} sentences:
    if filter_non_left_right_sentences:
        for sentence in new_ref['sentences']:
            if direction_swaps.keys().isdisjoint(sentence['tokens']):
                new_ref['sent_ids'].remove(sentence['sent_id'])
        if not new_ref['sent_ids']:
            # No sentence of this ref mentions a direction -> drop the ref.
            continue
        new_ref['sentences'] = [kept for kept in new_ref['sentences']
                                if kept['sent_id'] in new_ref['sent_ids']]
    #End - Filter

    new_refs.append(new_ref)
    new_image_ids.add(new_ref['image_id'])

100%|██████████| 50000/50000 [00:04<00:00, 10231.87it/s]


In [82]:
# Append the flipped refs to the original ones (in place).
# NOTE: not idempotent -- running this cell twice appends the flipped refs twice.
refs += new_refs

In [102]:
# Number of flipped refs that were created for the augmented dataset.
len(new_refs)

26469

# Filter out non left-right sentences in dataset:

In [85]:
# Build the final list of refs (original + flipped). When
# `filter_non_left_right_sentences` is set, sentences without a direction word
# are removed and refs left without any sentence are dropped. The ids of the
# images and annotations that are still referenced are collected along the way.
direction_words = ('right', 'left', 'rightmost', 'leftmost')

filtered_refs = []
filtered_image_ids = set()
filtered_ann_ids = set()

for ref in tqdm(refs):
    new_ref = copy.deepcopy(ref)
    # Filter out non-{left/right} sentences:
    if filter_non_left_right_sentences:
        for sentence in new_ref['sentences']:
            if not any(word in sentence['tokens'] for word in direction_words):
                new_ref['sent_ids'].remove(sentence['sent_id'])
        if not new_ref['sent_ids']:
            # Nothing left to refer to this object -> skip the ref entirely.
            continue
        new_ref['sentences'] = [kept for kept in new_ref['sentences']
                                if kept['sent_id'] in new_ref['sent_ids']]
    #End - Filter

    filtered_refs.append(new_ref)
    filtered_image_ids.add(new_ref['image_id'])
    filtered_ann_ids.add(new_ref['ann_id'])

100%|██████████| 76469/76469 [00:07<00:00, 10435.65it/s]


In [101]:
# Number of refs (original + flipped) kept after the filtering step.
len(filtered_refs)

57730

In [87]:
# Load annotations:
# The context manager makes sure the file handle is closed after loading.
with open('/root/refer/data/refcoco_original/instances.json', 'r') as instances_file:
    instances = json.load(instances_file)

In [88]:
# Inspect the top-level sections of the loaded annotation file.
instances.keys()

dict_keys(['info', 'images', 'licenses', 'annotations', 'categories'])

In [89]:
#instances['annotations']

In [90]:
# Create the image records of the flipped images (file name and id adjusted)
# for every image that is still referenced after filtering, and remember each
# flipped image's width -- it is needed to mirror the x-coordinates later.

new_images = []
image_widths = {}

for original_image in tqdm(instances['images']):
    flipped_image = copy.deepcopy(original_image)
    flipped_image['file_name'] = flipped_image['file_name'].replace('COCO_train2014_0', 'COCO_train2014_1')
    flipped_image['id'] += big_const

    # Skip images that no kept referring expression points to.
    if flipped_image['id'] not in filtered_image_ids:
        continue

    new_images.append(flipped_image)
    image_widths[flipped_image['id']] = flipped_image['width']


100%|██████████| 19994/19994 [00:00<00:00, 31094.70it/s]


In [91]:
# Create new annotations (flipped bounding boxes and polygon segmentations)
# for every annotation that belongs to a flipped image:
new_annotations = []

for annotation in tqdm(instances['annotations']):
    flipped_image_id = big_const + annotation['image_id']

    # Check membership before copying, so that annotations of images without
    # a flipped counterpart are not deep-copied for nothing.
    if flipped_image_id not in new_image_ids:
        continue

    new_annotation = copy.deepcopy(annotation)
    new_annotation['id'] = big_const + new_annotation['id']
    new_annotation['image_id'] = flipped_image_id

    image_width = image_widths[flipped_image_id]

    # Flip bbox x-coord: new_x = image_width - (old_x + bbox_width):
    bbox = new_annotation['bbox']
    bbox[0] = image_width - (bbox[0] + bbox[2])

    segmentation = new_annotation['segmentation']
    if isinstance(segmentation, list):
        for polygon in segmentation:
            if isinstance(polygon, list):
                # Polygons are flat [x0, y0, x1, y1, ...] lists: mirror the
                # x-coords (even indices), keep the y-coords.
                polygon[0::2] = [image_width - x for x in polygon[0::2]]
    # NOTE(review): segmentations that are not a list of polygons (e.g. a dict)
    # are copied unflipped, exactly as before -- presumably such annotations
    # are not referenced by any ref and get filtered out later; confirm.

    new_annotations.append(new_annotation)

100%|██████████| 196771/196771 [00:23<00:00, 8499.04it/s]


# Merge the flipped images/annotations into the dataset, then keep only the images and annotations referenced by the filtered refs:

In [92]:
# Add the flipped image records and annotations to the original ones (in place).
# NOTE: not idempotent -- running this cell twice adds the flipped entries twice.
instances['images'] += new_images
instances['annotations'] += new_annotations

In [93]:
# Keep only the images that a kept referring expression points to.
kept_images = [img for img in instances['images'] if img['id'] in filtered_image_ids]
instances['images'] = kept_images

In [94]:
# Keep only the annotations that a kept referring expression points to.
kept_annotations = [entry for entry in instances['annotations'] if entry['id'] in filtered_ann_ids]
instances['annotations'] = kept_annotations

In [None]:
# Save the augmented dataset data files:

In [95]:
# Write the augmented refs; the context manager guarantees the file is
# flushed and closed once the dump has finished.
with open('/root/refer/data/refcoco/refs(unc).p', 'wb') as refs_out:
    pickle.dump(filtered_refs, refs_out)

In [96]:
# Write the augmented annotations; the context manager guarantees the file is
# flushed and closed once the dump has finished.
with open('/root/refer/data/refcoco/instances.json', 'w') as instances_out:
    json.dump(instances, instances_out)