In [1]:
# !pip install funcy
# !pip install scikit-multilearn

# source: https://github.com/akarazniewicz/cocosplit

In [2]:
import os
import json
import shutil
import argparse
import funcy
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm



In [3]:
def save_coco(file, info, licenses, images, annotations, categories):
    """Serialize one COCO dataset to ``file`` as JSON.

    The five arguments after ``file`` are stored under the top-level keys
    ``info``, ``licenses``, ``images``, ``annotations`` and ``categories``.
    The document is written with a two-space indent and alphabetically
    sorted keys, using UTF-8 encoding; an existing file is overwritten.
    """
    payload = dict(
        info=info,
        licenses=licenses,
        images=images,
        annotations=annotations,
        categories=categories,
    )
    with open(file, mode='wt', encoding='UTF-8') as out:
        json.dump(payload, out, indent=2, sort_keys=True)

def filter_annotations(annotations, images):
    """Return the annotations that belong to one of the given images.

    Args:
        annotations: iterable of dicts, each with an ``image_id`` entry.
        images: iterable of dicts, each with an ``id`` entry.

    Returns:
        A new list with the annotations (in their original order) whose
        ``image_id`` matches the ``id`` of some image. Both sides are passed
        through ``int()`` before comparison, so ``"2"`` and ``2`` match.
    """
    # A set gives O(1) membership tests; scanning a list for every
    # annotation is quadratic on large datasets.
    image_ids = {int(image['id']) for image in images}
    return [a for a in annotations if int(a['image_id']) in image_ids]


def filter_images(images, annotations):
    """Return the images that are referenced by at least one annotation.

    Args:
        images: iterable of dicts, each with an ``id`` entry.
        annotations: iterable of dicts, each with an ``image_id`` entry.

    Returns:
        A new list with the images (in their original order) whose ``id``
        appears as the ``image_id`` of some annotation. Both sides are passed
        through ``int()`` before comparison.
    """
    # These are image ids collected from the annotations; a set keeps the
    # membership test O(1) instead of scanning a list for every image.
    annotated_image_ids = {int(a['image_id']) for a in annotations}
    return [image for image in images if int(image['id']) in annotated_image_ids]

In [4]:
# Input/output locations and split settings read by the cells below.
annotations = "./ppes-custom-dataset/all/_annotations.coco.json"  # path to the source COCO annotations file
train = "./ppes-custom-dataset/train/_annotations.coco.json"  # where to store the COCO training annotations
test = "./ppes-custom-dataset/valid/_annotations.coco.json"  # where to store the COCO validation (held-out) annotations
train_ratio = 0.8  # fraction of the data assigned to the training split; a number in (0, 1)
multi_class = True  # split a multi-class dataset while preserving class distributions in train and test sets
# NOTE(review): multi_class is not read by any cell visible here - confirm whether it is still needed.

src_dir = "./ppes-custom-dataset/all/"  # directory with all images

In [5]:
# Stratified train/validation split of the COCO dataset.
#
# Every image is reduced to a single "representative" category (its rarest
# one) so that an ordinary stratified split can be used even though an image
# may carry boxes of several categories.

split_seed = 42  # fixed seed: re-running the cell reproduces the same split

# Load the COCO file. The handle and the annotation list get their own names:
# binding them to `annotations` would overwrite the path defined in the
# config cell and make this cell fail when it is run a second time.
with open(annotations, 'rt', encoding='UTF-8') as coco_file:
    coco = json.load(coco_file)

info = coco['info']
licenses = coco['licenses']
images = coco['images']
coco_annotations = coco['annotations']
categories = coco['categories']

annot_df = pd.DataFrame(coco_annotations)
img_df = pd.DataFrame(images)

# checking the number of boxes of individual classes and setting priorities
dist_df = (
    annot_df['category_id']
    .value_counts()
    .rename('count')
    .rename_axis('category_id')
    .to_frame()
)
# Explicit rank (rarest category -> 0). np.argsort returns sort indices, not
# ranks; it only coincided with the rank because value_counts() is sorted
# in descending order.
dist_df['priority'] = dist_df['count'].rank(method='first').astype(int) - 1
display(dist_df)

# creating a dictionary, in which the key is the id of the category and the value is priority
label_prio = dict(zip(dist_df.index.tolist(), dist_df['priority'].tolist()))
print(label_prio)

# getting rid of many-to-many relationships - exactly 1 assignment to a category
# is kept for each image according to priorities (lowest priority value wins).
# The helper column lives only on a temporary copy, so annot_df stays untouched.
prio_annot_df = (
    annot_df
    .assign(**{'__prio': annot_df['category_id'].map(label_prio)})
    .sort_values(by=['image_id', '__prio'])
    .drop_duplicates(subset='image_id', keep='first')
    [['image_id', 'category_id']]
)

assert prio_annot_df.shape[0] == annot_df['image_id'].nunique()

# split images and annotations into train & test subsets
img_all, cat_all = prio_annot_df[['image_id']], prio_annot_df[['category_id']]
img_train, img_test, cat_train, cat_test = train_test_split(
    img_all,
    cat_all,
    stratify=cat_all,
    test_size=1 - train_ratio,
    random_state=split_seed,
)

train_ids = set(img_train['image_id'].tolist())
test_ids = set(img_test['image_id'].tolist())

# DataFrame views of both splits (used for the statistics shown further down)
annot_train_df = annot_df.loc[annot_df['image_id'].isin(img_train['image_id'])]
annot_test_df = annot_df.loc[annot_df['image_id'].isin(img_test['image_id'])]
img_train_df = img_df.loc[img_df['id'].isin(img_train['image_id'])]
img_test_df = img_df.loc[img_df['id'].isin(img_test['image_id'])]

# assert that the intersection is empty
assert train_ids.isdisjoint(test_ids)
assert set(annot_train_df.image_id).isdisjoint(set(annot_test_df.image_id))
assert set(img_train_df.id).isdisjoint(set(img_test_df.id))

# Records to save are taken from the original COCO dicts rather than from the
# DataFrames: a DataFrame round-trip can upcast ints to floats and fills keys
# missing from some records with NaN, which is not valid JSON.
# NOTE: images without any annotation never enter the split, so they end up
# in neither output file.
anns_train = [ann for ann in coco_annotations if ann['image_id'] in train_ids]
anns_test = [ann for ann in coco_annotations if ann['image_id'] in test_ids]
imgs_train = [img for img in images if img['id'] in train_ids]
imgs_test = [img for img in images if img['id'] in test_ids]

assert len(anns_train) == annot_train_df.shape[0]
assert len(anns_test) == annot_test_df.shape[0]

# save as coco files
save_coco(train, info, licenses, imgs_train, anns_train, categories)
save_coco(test, info, licenses, imgs_test, anns_test, categories)

Unnamed: 0_level_0,count,priority
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,90377,3
3,41042,2
4,29421,1
1,21790,0


{2: 3, 3: 2, 4: 1, 1: 0}


In [6]:
# Sizes reported here are annotation row counts of each split, not image counts.
print(f"Train size: {annot_train_df.shape[0]}, test size: {annot_test_df.shape[0]}")

Train size: 146235, test size: 36395


In [7]:
# Relative frequency of each category among the training annotations
# (compare with the test split to check that the distributions match).
annot_train_df["category_id"].value_counts(normalize=True)

2    0.494847
3    0.225227
4    0.161124
1    0.118802
Name: category_id, dtype: float64

In [8]:
# Relative frequency of each category among the test annotations
# (compare with the training split to check that the distributions match).
annot_test_df["category_id"].value_counts(normalize=True)

2    0.494931
3    0.222723
4    0.160984
1    0.121363
Name: category_id, dtype: float64

In [9]:
# copy the training images from src_dir into the directory that holds the
# training annotations file; the image list is read back from that saved file
with open(train, 'rt', encoding='UTF-8') as f:
    anns_train = json.load(f)
# no explicit f.close(): the `with` block has already closed the file

train_dir = os.path.dirname(train)  # destination is the same for every file
img_train_df = pd.DataFrame(anns_train['images'])
for fname in tqdm(img_train_df["file_name"].unique()):
    src = os.path.join(src_dir, fname)
    dst = os.path.join(train_dir, fname)
    shutil.copy(src, dst)

  0%|          | 0/31895 [00:00<?, ?it/s]

In [10]:
# copy the held-out images from src_dir into the directory that holds the
# test annotations file; the image list is read back from that saved file
with open(test, 'rt', encoding='UTF-8') as f:
    anns_test = json.load(f)
# no explicit f.close(): the `with` block has already closed the file

test_dir = os.path.dirname(test)  # destination is the same for every file
img_test_df = pd.DataFrame(anns_test['images'])
for fname in tqdm(img_test_df["file_name"].unique()):
    src = os.path.join(src_dir, fname)
    dst = os.path.join(test_dir, fname)
    shutil.copy(src, dst)

  0%|          | 0/7923 [00:00<?, ?it/s]