In [1]:
import sys
import os
import json
import pathlib
from pathlib import Path
import shutil
from tqdm import tqdm

In [2]:
def copy_json_files(source_folder:pathlib.Path, target_folder:pathlib.Path):
    """Copy every ``*.json`` file found under ``source_folder`` into ``target_folder``.

    The search is recursive.  Each copy is named after the directory that
    directly contains the original file (``<directory name>.json``), so two
    JSON files sharing a parent directory name end up at the same destination.
    ``target_folder`` is created if it does not exist yet.
    """
    target_folder.mkdir(parents=True, exist_ok=True)
    for src_path in source_folder.rglob("*.json"):
        # The containing directory, not the file itself, provides the new name.
        dataset_name = os.path.basename(src_path.parent)
        destination = target_folder.joinpath(dataset_name + ".json")
        shutil.copy(src_path, destination)

def copy_json_files_with_cat(detect_f, source_folder:pathlib.Path, target_folder:pathlib.Path):
    """Copy the dataset JSON files that contain at least one wanted category.

    ``detect_f`` is a text file with one category name per line.  Every
    ``*.json`` file under ``source_folder`` (searched recursively) is loaded,
    and a file is selected when the ``name`` of any entry in its
    ``categories`` list, with its first six characters removed, equals one of
    those lines.
    NOTE(review): the six characters are presumably a ``"weed: "``-style
    prefix -- confirm against the datasets.

    Selected files are copied to ``target_folder`` as
    ``<parent directory name>.json``; the folder is created if needed.

    :return: set with the parent-directory names of the copied files
    """
    target_folder.mkdir(parents=True, exist_ok=True)

    with open(detect_f) as file:
        # Blank lines are dropped: an empty entry would otherwise match every
        # category whose name is six characters long or shorter.
        wanted = {line.strip() for line in file} - {""}

    cat_set = set()
    for json_file in source_folder.rglob("*.json"):
        with open(json_file) as f:
            data = json.load(f)
        # One matching category is enough, so the file is copied a single time
        # no matter how many of its categories are wanted.
        if any(cat['name'][6:] in wanted for cat in data['categories']):
            fname = os.path.basename(json_file.parent)
            cat_set.add(fname)
            shutil.copy(json_file, target_folder.joinpath(fname + ".json"))
    return cat_set

In [3]:
def remove_extra_cat(f):
    """Drop the categories of a COCO-style JSON file that no annotation uses.

    A category is kept only when its ``id`` appears as the ``category_id`` of
    at least one entry in ``annotations``.  The file is rewritten in place
    with a four-space indent.

    :param f: path of the JSON file to clean
    :return: number of categories that were removed
    """
    with open(f, 'r') as json_file:
        data = json.load(json_file)
    used_ids = {ann['category_id'] for ann in data['annotations']}
    orig_len = len(data['categories'])
    data['categories'][:] = [cat for cat in data['categories'] if cat['id'] in used_ids]
    # Serialize before opening the file for writing: mode 'w' already replaces
    # the old content, and the file is never deleted or truncated unless the
    # new content is ready.
    serialized = json.dumps(data, indent=4)
    with open(f, 'w') as out_file:
        out_file.write(serialized)
    return orig_len - len(data['categories'])

def remove_extra_cat_dir(d):
    """Run ``remove_extra_cat`` on every ``*.json`` file below directory ``d``.

    The search is recursive.  The total number of removed categories is
    printed once all files have been processed.
    """
    total_removed = sum(
        remove_extra_cat(json_path) for json_path in Path(d).rglob("*.json")
    )
    print(f"{total_removed} unused categories removed")

In [4]:
def build_dict(folder):
    """Assign a zero-based id to every distinct category name found in ``folder``.

    Only regular ``*.json`` files directly inside ``folder`` are read (no
    recursion).  They are visited in sorted file-name order, and each new
    category name receives the next free id, so the mapping is reproducible
    from run to run.

    :param folder: directory holding the COCO-style JSON files
    :return: dict mapping category name to its new id
    """
    cat_dict = dict()
    # os.listdir returns entries in arbitrary order; sorting pins the ids.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        # Sub-directories (e.g. an images folder) and non-JSON files cannot be
        # parsed and are skipped instead of aborting the whole scan.
        if not (filename.endswith(".json") and os.path.isfile(path)):
            continue
        with open(path, 'r') as f:
            data = json.load(f)
        for cat in data['categories']:
            # len(cat_dict) is the next unused id; setdefault leaves names
            # that were already seen untouched.
            cat_dict.setdefault(cat["name"], len(cat_dict))
    return cat_dict

In [5]:
def change_ids(f, cat_dict):
    """Renumber the category ids of one COCO-style JSON file.

    Each category of the file is looked up by name in ``cat_dict`` to obtain
    its new id.  Every annotation's ``category_id`` and every category's own
    ``id`` are rewritten with that value, so the file stays self-consistent
    and running the function again leaves it unchanged.  The file is
    rewritten in place with a four-space indent.

    :param f: path of the JSON file to update
    :param cat_dict: mapping of category name to new id
    :raises KeyError: if a category name is missing from ``cat_dict`` or an
        annotation refers to a category id the file does not declare
    """
    with open(f) as json_file:
        data = json.load(json_file)
    # old id (as stored in this file) -> new id (from cat_dict)
    id_dict = {cat['id']: cat_dict[cat['name']] for cat in data['categories']}
    for ann in data['annotations']:
        ann['category_id'] = id_dict[ann['category_id']]
    # Keep the category table in step with the annotations; otherwise a
    # second run would translate the already-new ids a second time.
    for cat in data['categories']:
        cat['id'] = cat_dict[cat['name']]
    # Serialize first so the file is only touched once the content is ready.
    serialized = json.dumps(data, indent=4)
    with open(f, 'w') as out_file:
        out_file.write(serialized)

def change_ids_dir(d, cat_dict):
    """Apply ``change_ids`` with ``cat_dict`` to every ``*.json`` file below ``d``.

    The directory is searched recursively; a confirmation line is printed
    once all files have been handled.
    """
    json_paths = Path(d).rglob("*.json")
    for json_path in json_paths:
        change_ids(json_path, cat_dict)
    print("ids changed")

In [6]:
def sync_cat(f, cat_dict):
    """Replace the ``categories`` list of one JSON file with the content of ``cat_dict``.

    The new list holds one ``{"id": ..., "name": ...}`` entry per item of
    ``cat_dict``, in the dict's iteration order; any other field the old
    category entries carried is not kept.  The file is rewritten in place
    with a four-space indent.

    :param f: path of the JSON file to update
    :param cat_dict: mapping of category name to id
    """
    with open(f) as json_file:
        data = json.load(json_file)
    data['categories'] = [{"id": v, "name": k} for (k, v) in cat_dict.items()]
    # Serialize first; mode 'w' then replaces the old content, so the file
    # never has to be deleted beforehand.
    serialized = json.dumps(data, indent=4)
    with open(f, 'w') as out_file:
        out_file.write(serialized)
    
def sync_cat_dir(d, cat_dict):
    """Apply ``sync_cat`` with ``cat_dict`` to the ``.json`` entries directly inside ``d``.

    Sub-directories are not descended into.  A confirmation line is printed
    once all files have been handled.
    """
    json_names = [name for name in os.listdir(d) if name.endswith(".json")]
    for name in json_names:
        sync_cat(os.path.join(d, name), cat_dict)
    print("categories synced")

In [7]:
# Working folder: the selected dataset JSON files are copied here, cleaned in
# place, and later merged.
target = "combined"

In [8]:
# Copy the dataset annotation files into the target folder (the originals
# under WeedCOCO are left untouched).

# Alternative (disabled): copy every JSON file under the WeedCOCO folder to the target folder.
# copy_json_files(pathlib.Path('WeedCOCO'), pathlib.Path(target))

# Copy only the JSON files of datasets that contain a category listed in detect.txt.
# cat_set receives the folder names of those datasets; it is used further down
# to pick the matching images.
cat_set = copy_json_files_with_cat("detect.txt", pathlib.Path('WeedCOCO'), pathlib.Path(target))
    

In [9]:
# The four steps below depend on each other and must run in this order.

# 1. Remove the categories no annotation uses from all JSON files under the target folder.
remove_extra_cat_dir(target)

# 2. Build the shared category table (name -> new id) from the cleaned files.
cat_dict = build_dict(target)

# 3. Rewrite the category id of every annotation according to cat_dict (name: id).
change_ids_dir(target, cat_dict)

# 4. Replace the ['categories'] list of every file with the shared table.
sync_cat_dir(target, cat_dict)

# Show the resulting table and the number of categories for inspection.
print(cat_dict)
print(len(cat_dict))

0 unused categories removed
ids changed
categories synced
{'weed: lolium rigidum': 0, 'weed: rapistrum rugosum': 1, 'weed: daucus carota': 2, 'weed: lupinus cosentinii': 3, 'weed: sonchus oleraceus': 4, 'weed: amsinckia (rosette)': 5, 'weed: poaceae': 6, 'weed: trifolium incarnatum': 7, 'weed: erigeron canadensis': 8, 'weed: vigna unguiculata': 9, 'weed: helianthus annuus': 10, 'weed: chenopodium album': 11, 'weed: abutilon theophrasti': 12, 'weed: brassicaceae': 13}
14


In [10]:
# credit https://github.com/mohamadmansourX/Merge_COCO_FILES

def combine(tt1,tt2,output_file):
    """ Combine two COCO annotated files and save them into a new file

    Images and annotations of both files are renumbered: the entries of the
    first file get ids ``0 .. n1-1`` and those of the second file continue at
    ``n1``, so ids are unique and contiguous in the result.  Every
    annotation's ``image_id`` is updated to the new id of its image.
    All other top-level keys are taken from the first file, except
    ``categories``, which is taken from the second file.
    NOTE(review): this assumes both files already share one category list --
    confirm that the categories were synchronized beforehand.

    ``output_file`` may be the same path as ``tt1`` or ``tt2`` (both inputs
    are fully read before anything is written) and does not have to exist.

    :param tt1: 1st COCO file path
    :param tt2: 2nd COCO file path
    :param output_file: output file path
    :raises KeyError: if an annotation refers to an image id that its file
        does not declare
    """
    with open(tt1) as json_file:
        d1 = json.load(json_file)
    with open(tt2) as json_file:
        d2 = json.load(json_file)

    # Old image id -> new image id.  The second file's offset is the number
    # of images in the first file; unlike the largest old id, that also works
    # for empty lists and for ids that are not plain integers.
    image_offset = len(d1['images'])
    b1 = {img['id']: i for i, img in enumerate(d1['images'])}
    b2 = {img['id']: i + image_offset for i, img in enumerate(d2['images'])}

    # Reset File 1 and 2 images ids
    for img in d1['images']:
        img['id'] = b1[img['id']]
    for img in d2['images']:
        img['id'] = b2[img['id']]

    # Reset File 1 and 2 annotations ids and re-point them at the new image ids
    ann_offset = len(d1['annotations'])
    for i, ann in enumerate(d1['annotations']):
        ann['id'] = i
        ann['image_id'] = b1[ann['image_id']]
    for i, ann in enumerate(d2['annotations']):
        ann['id'] = ann_offset + i
        ann['image_id'] = b2[ann['image_id']]

    merged = dict(d1)
    merged['images'].extend(d2['images'])
    merged['annotations'].extend(d2['annotations'])
    merged['categories'] = d2['categories']

    # Mode 'w' creates the file or replaces its content; removing it first
    # would only fail when the output does not exist yet.
    with open(output_file, 'w') as f:
        json.dump(merged, f)

In [11]:
output_file = "combined.json"
input_dir = target

# Only regular *.json files are merge inputs: the folder may also hold other
# entries (for example an images sub-folder).  Sorting makes the merge order,
# and with it the ids in the merged file, reproducible.
json_names = sorted(
    name for name in os.listdir(input_dir)
    if name.endswith(".json") and os.path.isfile(os.path.join(input_dir, name))
)
if not json_names:
    raise FileNotFoundError("no .json files to combine in " + input_dir)

# Seed the accumulator file with the first dataset ...
temp_file = os.path.join(input_dir, json_names[0])
shutil.move(temp_file, output_file)

# ... then fold every remaining dataset into it, deleting each one once merged.
for name in tqdm(json_names[1:]):
    dataset_path = os.path.join(input_dir, name)
    combine(output_file, dataset_path, output_file)
    os.remove(dataset_path)

# Put the merged dataset back into the working folder under its final name.
shutil.move(output_file, "combined/weedcoco.json")

100%|███████████████████████████████████████████| 10/10 [00:07<00:00,  1.36it/s]


'combined/weedcoco.json'

In [22]:
def format_filename(f):
    """Make every image ``file_name`` in a COCO-style JSON file start with ``images/``.

    Names that already start with ``images/`` are left alone, so the function
    can be run repeatedly.  The file is rewritten in place with a four-space
    indent, and a summary of how many names were changed is printed.

    :param f: path of the JSON file to update
    """
    with open(f, 'r') as json_file:
        data = json.load(json_file)
    count = 0
    for image in data['images']:
        # Prefix test rather than substring test: a name that merely contains
        # 'images/' somewhere (e.g. 'old_images/x.jpg') still needs the prefix.
        if not image['file_name'].startswith('images/'):
            image['file_name'] = 'images/' + image['file_name']
            count += 1
    # Serialize first; mode 'w' then replaces the old content, so the file
    # never has to be deleted beforehand.
    serialized = json.dumps(data, indent=4)
    with open(f, 'w') as out_file:
        out_file.write(serialized)
    print(str(count) + " out of " + str(len(data['images'])) + " filename cleand")


In [23]:
# Prefix the image file names stored in the merged dataset with 'images/'.
format_filename('combined/weedcoco.json')

0 out of 2780 filename cleand


In [None]:
def move_files(source_folder:pathlib.Path, target_folder:pathlib.Path, extensions=(".jpg", ".png")):
    """Move every image file found under ``source_folder`` into ``target_folder``.

    The search is recursive and the files land directly in ``target_folder``
    under their own file name (the source directory structure is not kept,
    so equally named files end up at the same destination).
    ``target_folder`` is created if needed.

    :param source_folder: directory tree to search
    :param target_folder: flat destination directory
    :param extensions: file suffixes (with leading dot) to treat as images;
        compared case-insensitively, so ``.JPG`` and ``.PNG`` match as well
    """
    target_folder.mkdir(parents=True, exist_ok=True)
    wanted = {ext.lower() for ext in extensions}
    # Collect the matches before moving anything, so the tree is not modified
    # while it is still being walked.
    image_files = [
        path for path in source_folder.rglob("*")
        if path.is_file() and path.suffix.lower() in wanted
    ]
    for image_file in image_files:
        # shutil.move also works when source and target are on different
        # filesystems, where a plain rename fails.
        shutil.move(str(image_file), str(target_folder.joinpath(image_file.name)))

# Move all images found under WeedCOCO into combined/images.
# NOTE(review): this cell has no execution count; it looks like an unused
# alternative to the category-filtered copy in the next cell -- confirm.
move_files(pathlib.Path('WeedCOCO'), pathlib.Path('combined/images'))

In [24]:
def move_files_with_cat(source_folder:pathlib.Path, target_folder:pathlib.Path, cat_set, extensions=(".jpg", ".png")):
    """Copy the images of the selected datasets into ``target_folder``.

    Despite the name, files are copied, not moved.  ``source_folder`` is
    searched recursively; an image is selected when the name of the directory
    two levels above it (the parent of its parent) is in ``cat_set``.
    NOTE(review): this presumably relies on a ``<dataset>/<subfolder>/<image>``
    layout -- confirm against the data.
    Copies land directly in ``target_folder`` under the image's own file
    name; the folder is created if needed.

    :param source_folder: directory tree to search
    :param target_folder: flat destination directory
    :param cat_set: collection of dataset directory names to include
    :param extensions: file suffixes (with leading dot) to treat as images;
        compared case-insensitively, so ``.JPG`` and ``.PNG`` match as well
    """
    target_folder.mkdir(parents=True, exist_ok=True)
    wanted = {ext.lower() for ext in extensions}
    # A single walk with a case-insensitive suffix test covers all spellings
    # of the extensions and visits every file exactly once.
    for image_file in source_folder.rglob("*"):
        if not (image_file.is_file() and image_file.suffix.lower() in wanted):
            continue
        dname = os.path.basename(image_file.parent.parent)
        if dname in cat_set:
            shutil.copy(image_file, target_folder.joinpath(image_file.name))
        
# Copy into combined/images the images of the datasets whose folder names are
# in cat_set (the set returned by copy_json_files_with_cat earlier).
move_files_with_cat(pathlib.Path('WeedCOCO'), pathlib.Path('combined/images'), cat_set)