In [1]:
import os
from collections import defaultdict, Counter
import random

import numpy as np
import torch
import pandas as pd
import json

In [4]:
folder_path = './data/data/'
with open(os.path.join(folder_path,'train_all.json'), 'r') as outfile:
    ann = json.load(outfile)

In [8]:
raw_data = {'image_id': [],
           'class_name': [],
           'class_id': [],
           'x_min': [],
           'y_min': [],
           'w': [],
           'h': []}
df = pd.DataFrame(raw_data)

classes = ["UNKNOWN", "General trash", "Paper", "Paper pack", "Metal", "Glass",
           "Plastic", "Styrofoam", "Plastic bag", "Battery", "Clothing"]

for ann_dict in ann['annotations']:
    file_name = ann['images'][ann_dict['image_id']]['file_name']
    row = [file_name, classes[ann_dict['category_id']], int(ann_dict['category_id']), 
           ann_dict['bbox'][0], ann_dict['bbox'][1], ann_dict['bbox'][2], ann_dict['bbox'][3]]
    df = df.append(pd.Series(row, index=df.columns), ignore_index=True)
df.to_csv('data.csv', index=False)

In [9]:
class CFG:
    n_folds = 5
    seed = 42

In [11]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()

    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices


def seed_everything(seed=42):
    """Seed All

    Args:
        seed: seed number
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def get_folds(df, config):
    df_folds = df[['image_id']].copy()
    df_folds.loc[:, 'bbox_count'] = 1
    df_folds = df_folds.groupby('image_id').count()
    df_folds['fold'] = 0

    for fold, (trn_idx, val_idx) in enumerate(
            stratified_group_k_fold(df, df['class_id'], df['image_id'], config.n_folds, config.seed)):
        trn_ids = df.loc[trn_idx, 'image_id'].unique()
        val_ids = df.loc[val_idx, 'image_id'].unique()
        assert len(set(trn_ids).intersection(set(val_ids))) == 0

        df_folds.loc[val_ids, 'fold'] = fold
    return df_folds


def load_all_data():
    meta_df = pd.read_csv(f"data.csv")
    meta_df = meta_df.reset_index(drop=True)

    return meta_df

In [35]:
meta_df = load_all_data()
meta_df['class_id'] = meta_df['class_id'].astype(int)

seed_everything()
f_folds = get_folds(meta_df, CFG)

In [38]:
f_folds = f_folds.reset_index()

In [40]:
import os
import json
import pandas as pd
from tqdm import tqdm

In [42]:
train_file_names = f_folds['image_id'].unique()

In [60]:
dataset_path = './data/data'

anns_file_path = dataset_path + '/' + 'train.json'
# Read annotations
with open(anns_file_path, 'r') as f:
    dataset = json.loads(f.read())

categories = dataset['categories']

In [61]:
for idx in tqdm(range(5)): 
    images = []
    annotations = []
    train_file_names = f_folds[f_folds['fold'] != idx]['image_id'].unique()
    valid_file_names = f_folds[f_folds['fold'] == idx]['image_id'].unique()
    for i, train_file_name in (enumerate(train_file_names)):
        images.append(dict(
                license=0,
                url=None,
                file_name=train_file_name,
                height=512,
                width=512,
                date_captured=None,
                id=i
            ))

        image_id = list(filter(lambda x: x['file_name'] == train_file_name, ann['images']))[0]['id']
        for x in list(filter(lambda x: x['image_id'] == image_id, ann['annotations'])):
            annotations.append(dict(id=len(annotations), 
                                image_id=i, 
                                category_id=x['category_id'], 
                                segmentation=x['segmentation'],
                                area=x['area'], 
                                bbox=x['bbox'], 
                                iscrowd=x['iscrowd']))
    
    train_ann = {}
    train_ann['images'] =  images
    train_ann['annotations'] = annotations
    train_ann['categories'] = categories
    
    with open(f'train_data{idx}.json', 'w') as f:
        json.dump(train_ann, f, indent=4)
        
    images = []
    annotations = []
    for i, valid_file_name in (enumerate(valid_file_names)):
        images.append(dict(
                license=0,
                url=None,
                file_name=valid_file_name,
                height=512,
                width=512,
                date_captured=None,
                id=i
            ))

        image_id = list(filter(lambda x: x['file_name'] == valid_file_name, ann['images']))[0]['id']
        for x in list(filter(lambda x: x['image_id'] == image_id, ann['annotations'])):
            annotations.append(dict(id=len(annotations), 
                                image_id=i, 
                                category_id=x['category_id'], 
                                segmentation=x['segmentation'],
                                area=x['area'], 
                                bbox=x['bbox'], 
                                iscrowd=x['iscrowd']))
    
    valid_ann = {}
    valid_ann['images'] =  images
    valid_ann['annotations'] = annotations
    valid_ann['categories'] = categories
        
    with open(f'valid_data{idx}.json', 'w') as f:
        json.dump(valid_ann, f, indent=4)



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:56<03:47, 56.82s/it][A[A

 40%|████      | 2/5 [01:56<02:52, 57.58s/it][A[A

 60%|██████    | 3/5 [02:54<01:55, 57.70s/it][A[A

 80%|████████  | 4/5 [03:53<00:58, 58.07s/it][A[A

100%|██████████| 5/5 [04:51<00:00, 58.31s/it][A[A
