## This file helps merge multiple annotation files

### The JSON format has two important lists
1) list of images with ids incrementing from 1
2) list of annotations with ids incrementing from 1, and references to the image ids in #1

This makes combining files a little tricky, because every id has to be reindexed so that it stays
unique across the merged dataset. That is what this script does.

In [67]:
import pandas as pd
import numpy as np
import json
import os
import pprint
import random

## read in all json files in a directory

In [94]:
# Directory that holds the annotation files to merge.
path = "D://projects_working_directories//imagery_analysis//3_coco_files//"
key_constants = ['licenses', 'info', 'categories']  # keys whose content is the same in every file
key_vars = ['images', 'annotations']  # keys whose content varies from file to file
# Select on the real extension: splitting on the first '.' raised IndexError for
# names without a dot and skipped names such as "batch.v2.json".
# os.listdir returns entries in arbitrary order, so sort them to make the
# reassigned ids reproducible from run to run.
files = [
    f"{path}{name}"
    for name in sorted(os.listdir(path))
    if os.path.splitext(name)[1] == '.json'
]


In [95]:
def read_file(f):
    """Load one JSON file and return its parsed content.

    Args:
        f: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict for the
        annotation files this notebook merges).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # JSON text is UTF-8; without an explicit encoding, open() falls back to
    # the platform default and mis-decodes non-ASCII characters.
    with open(f, 'r', encoding='utf-8') as file:
        return json.load(file)

def map_orig_imageid_to_new(orig_id):
    """Return the new image id stored for one original compound image key.

    Args:
        orig_id: compound key (original image id followed by the source file
            name) as stored in the 'orig_image_key' column.

    Returns:
        The 'id' value of the first row of the module-level ``df_images``
        whose 'orig_image_key' equals ``orig_id``.

    Raises:
        IndexError: if no row of ``df_images`` carries that key.
    """
    matching_ids = df_images.loc[df_images['orig_image_key'] == orig_id, 'id']
    return matching_ids.tolist()[0]


## build a dataframe for images and annotations respectively

In [71]:
# Build one dataframe of images and one of annotations across all input files.
# Every row is tagged with the file it came from in a 'source' column.
if not files:
    # Fail here with a clear message instead of a NameError further down.
    raise FileNotFoundError(f"no .json annotation files found in {path}")

image_frames = []
annotation_frames = []
# Field names of the records as they appear in the inputs; used later to
# strip the helper columns again when the json is rebuilt.
original_image_fields = []
original_annotation_fields = []
for f in files:
    coco = read_file(f)  # parse each file once, not once per key

    # images
    data_i = coco['images']
    temp_df = pd.DataFrame(data_i)
    temp_df['source'] = f
    image_frames.append(temp_df)
    if data_i:
        # taken from the most recent file that has records, so a trailing
        # file with an empty list no longer raises IndexError
        original_image_fields = list(data_i[0].keys())

    # annotations
    data_a = coco['annotations']
    temp_df = pd.DataFrame(data_a)
    temp_df['source'] = f
    annotation_frames.append(temp_df)
    if data_a:
        original_annotation_fields = list(data_a[0].keys())

# Concatenate once: growing a dataframe with pd.concat inside the loop copies
# all previously collected rows on every iteration.
df_images = pd.concat(image_frames, ignore_index=True)
df_annotations = pd.concat(annotation_frames, ignore_index=True)

## ID manipulations
1) preserve original ids
2) create a compound key for image ids to make them unique across the whole dataset
3) reset the ids so they are an int unique across the dataset

In [97]:
# "back up" original ids
# Images: keep the id each file shipped with, then derive a key that is unique
# across files by appending the source file name to that id.
df_images['orig_id'] = df_images['id']
image_id_text = df_images['orig_id'].astype(str)
df_images['orig_image_key'] = image_id_text + df_images['source']

# Annotations: keep their own id and the image id they point at, then build
# the same compound key so each annotation can be matched to its image row.
df_annotations['orig_id'] = df_annotations['id']
df_annotations['orig_image_id'] = df_annotations['image_id']
referenced_id_text = df_annotations['orig_image_id'].astype(str)
df_annotations['orig_image_key'] = referenced_id_text + df_annotations['source']

In [75]:
# reset the ids so there are no duplicates, starting at 1
# Renumber both tables from their row index, shifted by one so that the ids
# start at 1 rather than 0.
for frame in (df_images, df_annotations):
    frame['id'] = frame.index + 1

In [84]:
# given the original compound key, look up the new, unique id
# Build the compound-key -> new-id lookup once and map every annotation through
# it, instead of rescanning df_images for each annotation row.
image_id_by_key = (
    df_images.drop_duplicates(subset='orig_image_key')  # first match wins, as with the per-row lookup
    .set_index('orig_image_key')['id']
)
remapped_image_ids = df_annotations['orig_image_key'].map(image_id_by_key)
unmatched = remapped_image_ids.isna()
if unmatched.any():
    # An annotation points at an image that is missing from its own file.
    # IndexError is the exception type the per-row lookup raised in this case.
    orphan_keys = df_annotations.loc[unmatched, 'orig_image_key'].unique().tolist()
    raise IndexError(
        f"{len(orphan_keys)} annotation image key(s) have no matching image, e.g. {orphan_keys[:5]}"
    )
# map() may hand back floats; restore the dtype of the image id column
df_annotations['image_id'] = remapped_image_ids.astype(df_images['id'].dtype)

In [87]:
# write dfs to file for qa
# One workbook per table, written next to the input files, in the order
# annotations then images; the row index is left out of the sheets.
qa_exports = {
    f"{path}//annotations.xlsx": df_annotations,
    f"{path}//images.xlsx": df_images,
}
for qa_file, qa_frame in qa_exports.items():
    qa_frame.to_excel(qa_file, index=False)

## rebuild the json

In [89]:
# remove the 'task' part of the filename
def scrub_image_filename(original_image_file):
    """Drop the leading directory component from an image file name.

    Args:
        original_image_file: file name as stored in the annotation file,
            e.g. ``"task_1/img_001.jpg"``.

    Returns:
        Everything after the first ``'/'``. A name that contains no ``'/'``
        is returned unchanged, so applying the function to an already
        scrubbed name (for example when the cell is re-run) is harmless.
    """
    # partition() never raises, unlike split('/')[1] which failed on names
    # without a '/' and returned a directory name for nested paths.
    _, separator, remainder = original_image_file.partition('/')
    return remainder if separator else original_image_file

# Run every stored image file name through scrub_image_filename and write the
# results back to the same column.
scrubbed_names = df_images['file_name'].map(scrub_image_filename)
df_images['file_name'] = scrubbed_names

In [90]:
# Keep only the columns that existed in the input records (dropping the helper
# columns added above) and turn each row back into a plain dict.
images = df_images.loc[:, original_image_fields].to_dict(orient='records')
annotations = df_annotations.loc[:, original_annotation_fields].to_dict(orient='records')

In [91]:
# The constant sections have the same content in every input file, so they are
# copied from the first one; the file is read once rather than once per key.
data = read_file(files[0])
output_json = {k: data[k] for k in key_constants}

# The variable sections come from the reindexed tables.
output_json['images'] = images
output_json['annotations'] = annotations
    

In [93]:
# Serialise the merged structure to text first, then write it out in one go.
combined_text = json.dumps(output_json)
with open("c://temp//combined_annotations.json", "w") as outfile:
    outfile.write(combined_text)
