In [1]:
import glob
import json
import os

import numpy as np
from pycocotools.coco import COCO

# Get Image ID

## Get Image ID - SALICON

In [2]:
def get_image_ids(path: str) -> list:
    """Return the sorted numeric image ids of the files matching a glob pattern.

    The id is parsed from the file name only: the text after the last
    underscore (or the whole name when there is no underscore), up to the
    first dot, e.g. ``COCO_train2014_000000000009.jpg`` -> ``9``.

    Args:
        path: Glob pattern selecting the image files.

    Returns:
        The image ids as ints in ascending order; empty when nothing matches.

    Raises:
        ValueError: If a matched file name does not yield an integer id.
    """
    ids = []
    for image_path in glob.glob(path):
        # Parse the base name only, so "." or "_" inside directory components
        # (e.g. "../../data") can never leak into the id.
        file_name = os.path.basename(image_path)
        ids.append(int(file_name.split("_")[-1].split(".")[0]))
    ids.sort()
    return ids

# Collect the numeric ids of the SALICON images in each split.
train_ids = get_image_ids("../../data/images/train/*.jpg")
val_ids = get_image_ids("../../data/images/val/*.jpg")
# Display the number of ids found per split as (train, val).
len(train_ids), len(val_ids)

(10000, 5000)

## Get Image ID - COCO 2014

In [3]:
# Build a COCO index for the train2014 and val2014 instance annotations.
# NOTE(review): coco_data_dir is not referenced by any cell shown in this notebook.
coco_data_dir='../../data/annotations/'
coco_json='../../data/annotations/instances_train2014.json'
train_coco = COCO(coco_json)
# coco_json is rebound to the val file before the second index is built.
coco_json='../../data/annotations/instances_val2014.json'
val_coco = COCO(coco_json)

loading annotations into memory...
Done (t=18.47s)
creating index...
index created!
loading annotations into memory...
Done (t=10.71s)
creating index...
index created!


In [4]:
# Image ids listed in each COCO annotation index, in ascending order.
train_ids_coco = sorted(train_coco.getImgIds())
val_ids_coco = sorted(val_coco.getImgIds())
# Display the number of image ids per index as (train, val).
len(train_ids_coco), len(val_ids_coco)

(82783, 40504)

## Get Common ID

In [5]:
# SALICON train ids, split by the COCO annotation file that lists them.
train_train_found = sorted(set(train_ids_coco) & set(train_ids))
train_val_found = sorted(set(val_ids_coco) & set(train_ids))
len(train_train_found), len(train_val_found)

(10000, 0)

In [6]:
# SALICON val ids, split by the COCO annotation file that lists them.
val_train_found = sorted(set(train_ids_coco).intersection(val_ids))
val_val_found = sorted(set(val_ids_coco).intersection(val_ids))

# fmt="%d" writes one plain integer id per line; np.savetxt's default format
# would store the ids as floats in scientific notation.
np.savetxt("val_in_train2017.txt", val_train_found, fmt="%d")
np.savetxt("val_in_val2017.txt", val_val_found, fmt="%d")
len(val_train_found), len(val_val_found)

(0, 5000)

# Filter COCO Json

In [12]:
def to_full_img_jpg_train(idx: int) -> str:
    """Build the COCO train2014 image file name for a numeric image id.

    The id is zero-padded to a minimum width of 12, e.g.
    ``9`` -> ``COCO_train2014_000000000009.jpg``.
    """
    padded_id = format(idx, "012")
    return "COCO_train2014" + "_" + padded_id + ".jpg"

def to_full_img_jpg_val(idx: int) -> str:
    """Build the COCO val2014 image file name for a numeric image id.

    The id is zero-padded to a minimum width of 12, e.g.
    ``391895`` -> ``COCO_val2014_000000391895.jpg``.
    """
    padded_id = format(idx, "012")
    return "COCO_val2014" + "_" + padded_id + ".jpg"

def load_json(path: str) -> dict:
    """Read a JSON file and return its decoded contents.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)

def filter_annoataion(in_json_path: str, out_json_path: str, target_ids: list, train: bool) -> None:
    """Write a copy of a COCO annotation file restricted to the given image ids.

    Entries of ``images`` are kept when their ``file_name`` equals the
    COCO 2014 file name built from one of the target ids; entries of
    ``annotations`` are kept when their ``image_id`` is one of the target ids.
    Every other top-level key is written out unchanged. The number of entries
    before and after each filtering step is printed.

    Args:
        in_json_path: Annotation JSON file to read.
        out_json_path: Destination of the filtered JSON; overwritten if present.
        target_ids: Numeric image ids to keep.
        train: Truthy to build file names with the ``COCO_train2014`` prefix,
            falsy for ``COCO_val2014``.
    """
    # A set gives constant-time membership tests; scanning a list for every
    # annotation makes the filter quadratic.
    target_id_set = set(target_ids)
    to_file_name = to_full_img_jpg_train if train else to_full_img_jpg_val
    target_full_ids = set(map(to_file_name, target_id_set))

    contents = load_json(in_json_path)

    print("Cleaning Images")
    print("Before", len(contents['images']))
    filtered_img_json = [j for j in contents['images'] if j['file_name'] in target_full_ids]
    print("After", len(filtered_img_json))
    contents['images'] = filtered_img_json

    print("Cleaning Annotations")
    print("Before", len(contents['annotations']))
    filtered_ann_json = [j for j in contents['annotations'] if j['image_id'] in target_id_set]
    print("After", len(filtered_ann_json))
    contents['annotations'] = filtered_ann_json

    # Close the output file deterministically so the JSON is fully flushed
    # before any later cell reads it back.
    with open(out_json_path, 'w') as out_file:
        json.dump(contents, out_file)


In [9]:
# Load the raw val2014 annotation file to inspect its structure by hand.
cocoval_json = '../../data/annotations/instances_val2014.json'
contents = load_json(cocoval_json)

In [11]:
# Show one image record to see which fields are available ('file_name', 'id', ...).
contents['images'][0]

{'license': 3,
 'file_name': 'COCO_val2014_000000391895.jpg',
 'coco_url': 'http://images.cocodataset.org/val2014/COCO_val2014_000000391895.jpg',
 'height': 360,
 'width': 640,
 'date_captured': '2013-11-14 11:18:45',
 'flickr_url': 'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg',
 'id': 391895}

## Write Filtered Annotations - SALICON

In [None]:
# cocotrain_json = '../../data/annotations/instances_train2017.json'
# out_json = "../../data/val_train.json"
# filter_annoataion(cocotrain_json, out_json, val_train_found)

In [14]:
# Write the val2014 annotations restricted to the SALICON val images.
cocoval_json = '../../data/annotations/instances_val2014.json'
out_json = "../../data/salicon_val_coco.json"
# train=False selects the "COCO_val2014" file-name prefix when matching images.
filter_annoataion(cocoval_json, out_json, val_val_found, train=False)

Cleaning Images
Before 40504
After 5000
Cleaning Annotations
Before 291875
After 56872


In [None]:
# ##Combine Val
# val_val_json = "../../data/val_val.json"
# val_train_json = "../../data/val_train.json"
# val_val_data = load_json(val_val_json)
# val_train_data = load_json(val_train_json)
# out_json = "../../data/salicon_val_coco.json"
# print(type(val_train_data['images']), type(val_train_data['annotations']))
# val_train_data['images'] = val_train_data['images'] + val_val_data['images']
# val_train_data['annotations'] = val_train_data['annotations'] + val_val_data['annotations']
# json.dump(val_train_data, open(out_json, 'w+'))


In [15]:
## No Train Data in Val Set 2017
cocotrain_json = '../../data/annotations/instances_train2014.json'
out_json = "../../data/salicon_train_coco.json"
filter_annoataion(cocotrain_json, out_json, train_train_found, train=True)

Cleaning Images
Before 82783
After 10000
Cleaning Annotations
Before 604907
After 114833


# Check

In [2]:
# Re-open the two filtered annotation files to confirm that they load with the COCO API.
# train_coco / val_coco are rebound here to the filtered data.
coco_json='../../data/salicon_train_coco.json'
train_coco = COCO(coco_json)
coco_json='../../data/salicon_val_coco.json'
val_coco = COCO(coco_json)

loading annotations into memory...
Done (t=2.47s)
creating index...
index created!
loading annotations into memory...
Done (t=1.20s)
creating index...
index created!


In [3]:
# Image ids present in the filtered files; the counts should equal the number
# of SALICON images per split (10000 train, 5000 val).
train_ids_coco = sorted(train_coco.getImgIds())
val_ids_coco = sorted(val_coco.getImgIds())
len(train_ids_coco), len(val_ids_coco)

(10000, 5000)