In [2]:
import os, json, cv2, random, glob
import numpy as np
from PIL import Image
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split

In [5]:
# Load the two source COCO exports. The "right" export is used as the train
# side and the "left" export as the val side throughout this notebook.
# Context managers make sure the file handles are closed after parsing.
with open('coco_format_12_4/project_building_properties_right-coco-1/annotations/instances_default.json') as train_file:
    train_json = json.load(train_file)
with open('coco_format_12_4/project_building_properties_left-coco-1/annotations/instances_default.json') as val_file:
    val_json = json.load(val_file)

In [7]:
# Counts in the order: right images, left images, right annotations, left annotations.
tuple(
    len(coco[section])
    for section in ('images', 'annotations')
    for coco in (train_json, val_json)
)

(13476, 13549, 5549, 6484)

In [8]:
# Flatten the COCO records into data frames and, for each frame, keep an
# untouched integer copy of the ids it was loaded with so that the ids can be
# renumbered later without losing the originals.

# Right side (train)
train_anns_df = pd.json_normalize(train_json['annotations']).assign(
    ann_id=lambda frame: frame['id'],
    original_ann_id_right=lambda frame: frame['ann_id'].astype(int),
)
train_images_df = pd.json_normalize(train_json['images']).assign(
    image_id=lambda frame: frame['id'],
    original_image_id_right=lambda frame: frame['image_id'].astype(int),
)

# Left side (val)
val_anns_df = pd.json_normalize(val_json['annotations']).assign(
    ann_id=lambda frame: frame['id'],
    original_ann_id_left=lambda frame: frame['ann_id'].astype(int),
    original_image_id_left=lambda frame: frame['image_id'].astype(int),  # Original val image id
)
val_images_df = pd.json_normalize(val_json['images']).assign(
    image_id=lambda frame: frame['id'],
    original_image_id_left=lambda frame: frame['image_id'].astype(int),  # Original val image id
)

In [9]:
# Number of annotation rows on the right and left side respectively.
tuple(len(frame) for frame in (train_anns_df, val_anns_df))

(5549, 6484)

In [10]:
# Keep independent snapshots of the image frames. Plain assignment would only
# create a second name for the same object, so later in-place edits would also
# change the "copy"; .copy() gives a separate frame.
train_images_df_p = train_images_df.copy() # Make a copy of right side images
val_images_df_p = val_images_df.copy() # Make a copy of left side images

In [None]:
# sort left side images
# sort_values returns a new frame and leaves the original untouched, so the
# result has to be assigned back for the sort to take effect.
val_images_df = val_images_df.sort_values(by='image_id', ascending=True)

In [12]:
# Renumber the left side images so their ids continue after the largest right
# side image id. The offset is added to the preserved 'image_id' column.
# NOTE(review): assumes left ids start at 1; an id of 0 would collide with the
# largest right side id - confirm against the export.
right_image_id_offset = train_images_df['id'].max()
val_images_df['id'] = val_images_df['image_id'] + right_image_id_offset

In [13]:
# Keep independent snapshots of the annotation frames. Plain assignment would
# only alias the same object, so later in-place edits would leak into the
# "copy"; .copy() gives a separate frame.
train_anns_df_p = train_anns_df.copy() # Make a copy of right side annos
val_anns_df_p = val_anns_df.copy() # Make a copy of left side annos

In [14]:
# Make the left side annos an "extension" of the number of right side annos
# The offset is added to the preserved original id ('ann_id') instead of to
# 'id' itself, so running this cell more than once does not shift the ids
# again. This mirrors how the left side image ids are derived from 'image_id'.
val_anns_df['id'] = val_anns_df['ann_id'] + train_anns_df['id'].max()

In [15]:
# Point every left side annotation at the extended id of its image.
# The lookup is keyed on the preserved original image id, so the cell gives the
# same result when re-run, and one vectorised map replaces the per-row scan of
# the image frame.
left_image_id_lookup = val_images_df.set_index('original_image_id_left')['id']
remapped_image_ids = val_anns_df['original_image_id_left'].map(left_image_id_lookup)

# An annotation whose image is missing from the image frame would otherwise
# end up with a NaN image_id; fail loudly instead.
if remapped_image_ids.isna().any():
    raise ValueError('Some left side annotations reference an image id that is not in val_images_df')

val_anns_df['image_id'] = remapped_image_ids.astype('int64') # re-assign the left side annos image id with the extended image id


In [16]:
# Seed an 'attributes' column with the (already extended) annotation ids.
# The later update_attributes() pass overwrites every value in this column
# with a dict of the individual attribute values.
val_anns_df['attributes'] = val_anns_df['id'] # The left side label id

In [17]:
def update_attributes(row):
    """Gather the flattened ``attributes.*`` values of ``row`` into one dict.

    The dict is stored under ``row['attributes']`` (the row is modified in
    place) and the same row object is returned, which makes the function
    usable with ``DataFrame.apply(..., axis=1)``.

    Raises ``KeyError`` if any of the expected ``attributes.*`` entries is
    missing from ``row``.
    """
    attribute_names = (
        'building_completeness',
        'building_condition',
        'building_material',
        'building_security',
        'building_use',
        'occluded',
        'rotation',
    )
    row['attributes'] = {
        attribute_name: row['attributes.' + attribute_name]
        for attribute_name in attribute_names
    }
    return row

In [18]:
# Collapse the per-attribute columns of every left side annotation into a
# single dict stored in the 'attributes' column (row-wise apply).
val_anns_df = val_anns_df.apply(update_attributes, axis='columns')


In [19]:
# The individual attribute columns now live inside the 'attributes' dict,
# so drop the flattened versions from the left side annotations.
flattened_attribute_columns = [
    'attributes.building_completeness',
    'attributes.building_condition',
    'attributes.building_material',
    'attributes.building_security',
    'attributes.building_use',
    'attributes.occluded',
    'attributes.rotation',
]
val_anns_df = val_anns_df.drop(columns=flattened_attribute_columns)

In [20]:
# Right side: fold the per-attribute columns into one 'attributes' dict per
# annotation, then drop the flattened columns.
train_anns_df = (
    train_anns_df
    .apply(update_attributes, axis='columns')
    .drop(columns=[
        'attributes.building_completeness',
        'attributes.building_condition',
        'attributes.building_material',
        'attributes.building_security',
        'attributes.building_use',
        'attributes.occluded',
        'attributes.rotation',
    ])
)

In [21]:
# Stack the right side rows on top of the left side rows (row-wise concat;
# the original row index of each frame is kept as is).
total_df_images = pd.concat([train_images_df, val_images_df])  # all images
total_df_anns = pd.concat([train_anns_df, val_anns_df])  # all annotations

In [22]:
# Serialise the combined frames as JSON record lists first ...
total_images_json_data = total_df_images.to_json(orient='records')
total_anns_json_data = total_df_anns.to_json(orient='records')

# ... then write each list to its own intermediate file.
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_images_total.json', 'w') as file:
    file.write(total_images_json_data)
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_total.json', 'w') as file:
    file.write(total_anns_json_data)

In [23]:
# Read the intermediate files back as plain Python lists of dicts so they can
# be embedded in COCO-style dictionaries. Context managers close the handles
# once the content has been parsed.
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_images_total.json') as total_images_json_data_:
    total_images_ = json.load(total_images_json_data_)
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_total.json') as total_anns_json_data_:
    total_anns_ = json.load(total_anns_json_data_)

In [24]:
# COCO-style dictionary holding every image and every annotation, with the
# single 'building_properties' category.
comb_dict = {
    'images': total_images_,
    'annotations': total_anns_,
    'categories': [{'id': 1, 'name': 'building_properties', 'supercategory': ''}],
}

In [25]:
# Persist the combined (images + annotations + categories) dictionary.
combined_json_path = 'coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_total.json'
with open(combined_json_path, 'w') as file:
    json.dump(comb_dict, file)

In [26]:
# Shuffle all annotations once with a fixed seed so the split is random but
# reproducible.
total_df = total_df_anns.sample(frac=1, random_state=42) # shuffle all labels

# Calculate the sizes for train, val, and test sets (70 % / 20 % / remainder)
total_rows = len(total_df)
train_size = int(0.7 * total_rows) # get training amount
val_size = int(0.2 * total_rows) # get val amount

# Split the SHUFFLED frame by position. Slicing the unshuffled total_df_anns
# here would put every right side row into train and leave val/test with left
# side rows only.
train_df = total_df.iloc[:train_size] # Get rows up until 70%
val_df = total_df.iloc[train_size:train_size + val_size] # Get rows between 70% and 90%
test_df = total_df.iloc[train_size + val_size:] # Get rows between 90% and total

# Check the lengths of each set
print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))

# Write each split's annotations as a JSON list of records.
train_json_data = train_df.to_json(orient='records')
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_train.json', 'w') as file:
    file.write(train_json_data)
val_json_data = val_df.to_json(orient='records')
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_val.json', 'w') as file:
    file.write(val_json_data)
test_json_data = test_df.to_json(orient='records')
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_test.json', 'w') as file:
    file.write(test_json_data)

Train size: 8423
Validation size: 2406
Test size: 1204


In [27]:
# Read the per-split annotation files back as plain lists of dicts.
# Context managers close each handle once the content has been parsed.
with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_train.json') as train_anns_json_data_:
    train_anns_ = json.load(train_anns_json_data_)

with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_val.json') as val_anns_json_data_:
    val_anns_ = json.load(val_anns_json_data_)

with open('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_annotations_test.json') as test_anns_json_data_:
    test_anns_ = json.load(test_anns_json_data_)

In [28]:
# One COCO-style dictionary per split. Every split carries the full image
# list; only the annotation list differs.
# NOTE(review): images without an annotation in a given split are still
# listed in that split - confirm the downstream loader handles that as intended.
dict_train = {
    'images': total_images_,
    'annotations': train_anns_,
    'categories': [{'id': 1, 'name': 'building_properties', 'supercategory': ''}],
}

dict_val = {
    'images': total_images_,
    'annotations': val_anns_,
    'categories': [{'id': 1, 'name': 'building_properties', 'supercategory': ''}],
}

dict_test = {
    'images': total_images_,
    'annotations': test_anns_,
    'categories': [{'id': 1, 'name': 'building_properties', 'supercategory': ''}],
}

In [29]:
# Write the train, val and test dictionaries to their output files, in that order.
split_outputs = (
    ('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_train.json', dict_train),
    ('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_val.json', dict_val),
    ('coco_format_12_4/project_building_properties_combined-coco-1_annotations/instances_default_test.json', dict_test),
)
for split_path, split_payload in split_outputs:
    with open(split_path, 'w') as file:
        json.dump(split_payload, file)

In [30]:
# Annotation counts of the train, val and test dictionaries.
tuple(len(split['annotations']) for split in (dict_train, dict_val, dict_test))

(8423, 2406, 1204)