In [1]:
# !pip install funcy
# !pip install scikit-multilearn

# source: https://github.com/akarazniewicz/cocosplit

In [2]:
import os
import json
import shutil
import argparse
import funcy
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
def save_coco(file, info, licenses, images, annotations, categories):
    with open(file, 'wt', encoding='UTF-8') as coco:
        json.dump({ 'info': info, 'licenses': licenses, 'images': images, 
            'annotations': annotations, 'categories': categories}, coco, indent=2, sort_keys=True)

def filter_annotations(annotations, images):
    image_ids = funcy.lmap(lambda i: int(i['id']), images)
    return funcy.lfilter(lambda a: int(a['image_id']) in image_ids, annotations)


def filter_images(images, annotations):
    annotation_ids = funcy.lmap(lambda i: int(i['image_id']), annotations)
    return funcy.lfilter(lambda a: int(a['id']) in annotation_ids, images)

In [4]:
annotations = "./ppes-custom-dataset/all/_annotations.coco.json"  # path to COCO annotations file
train = "./ppes-custom-dataset/train/_annotations.coco.json"  # here to store COCO training annotations
test = "./ppes-custom-dataset/valid/_annotations.coco.json"  # here to store COCO training annotations
train_ratio = 0.8  # a percentage of a split; a number in (0, 1)
multi_class = True  # split a multi-class dataset while preserving class distributions in train and test sets

src_dir = "./ppes-custom-dataset/all/"  # directory with all images

In [5]:
with open(annotations, 'rt', encoding='UTF-8') as annotations:
    coco = json.load(annotations)
    info = coco['info']
    licenses = coco['licenses']
    images = coco['images']
    annotations = coco['annotations']
    categories = coco['categories']

    number_of_images = len(images)

    images_with_annotations = funcy.lmap(lambda a: int(a['image_id']), annotations)


    if multi_class:

        annotation_categories = funcy.lmap(lambda a: int(a['category_id']), annotations)

        X_train, y_train, X_test, y_test = iterative_train_test_split(np.array([annotations]).T,np.array([annotation_categories]).T, test_size = 1-train_ratio)

        save_coco(train, info, licenses, filter_images(images, X_train.reshape(-1)), X_train.reshape(-1).tolist(), categories)
        save_coco(test, info, licenses,  filter_images(images, X_test.reshape(-1)), X_test.reshape(-1).tolist(), categories)

        print("Saved {} entries in {} and {} in {}".format(len(X_train), train, len(X_test), test))

    else:

        X_train, X_test = train_test_split(images, train_size=train_ratio)

        anns_train = filter_annotations(annotations, X_train)
        anns_test=filter_annotations(annotations, X_test)

        save_coco(train, info, licenses, X_train, anns_train, categories)
        save_coco(test, info, licenses, X_test, anns_test, categories)

        print("Saved {} entries in {} and {} in {}".format(len(anns_train), train, len(anns_test), test))

Saved 146104 entries in ./ppes-custom-dataset/train/_annotations.coco.json and 36526 in ./ppes-custom-dataset/valid/_annotations.coco.json


In [9]:
# copy images between locations (from all/ subdirectory to train/ and valid/ respectively)
with open(train, 'rt', encoding='UTF-8') as f:
    anns_train = json.load(f)
f.close()

img_train_df = pd.DataFrame(anns_train['images'])
for fname in tqdm(img_train_df["file_name"].unique()):
    src = os.path.join(src_dir, fname)
    dst = os.path.join(os.path.dirname(train), fname)
    shutil.copy(src, dst)

  0%|          | 0/36524 [00:00<?, ?it/s]

In [10]:
anns_train_df = pd.DataFrame(anns_train['annotations'])
anns_train_df["category_id"].value_counts(normalize=True)

2    0.466339
3    0.258015
4    0.174431
1    0.101216
Name: category_id, dtype: float64

In [12]:
with open(test, 'rt', encoding='UTF-8') as f:
    anns_test = json.load(f)
f.close()

img_test_df = pd.DataFrame(anns_test['images'])
for fname in tqdm(img_test_df["file_name"].unique()):
    src = os.path.join(src_dir, fname)
    dst = os.path.join(os.path.dirname(test), fname)
    shutil.copy(src, dst)

  0%|          | 0/20958 [00:00<?, ?it/s]

In [13]:
anns_test_df = pd.DataFrame(anns_test['annotations'])
anns_test_df["category_id"].value_counts(normalize=True)

2    0.608963
1    0.191699
4    0.107759
3    0.091579
Name: category_id, dtype: float64