# Create Metadataframe of Images & Labels

This notebook traverses the directory of image, json and label data to create an index / metadataframe to be used for EDA

In [1]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import requests
import os
import json
import boto3
from loguru import logger
from typing import Literal
from PIL import Image
from smart_open import open
from tqdm.autonotebook import tqdm
from IPython.display import display
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Notebook has to be run for each of these three params separately to get all data
CHALLENGE_TYPE: Literal["train", "test", "hold"] = "test"

In [3]:
LOCAL_DATA_DIR: str = "/home/jupyter-jhub-admin/data"

BASE_DIR: str = LOCAL_DATA_DIR + "/xview2/challenge"
PROCESS_DIR: str = LOCAL_DATA_DIR + "/xview2/processed"

print(f"{BASE_DIR=}")
print(f"{PROCESS_DIR=}")

assert all([BASE_DIR]), "Download Destination not Set"

os.makedirs(BASE_DIR, exist_ok=True)
os.makedirs(PROCESS_DIR, exist_ok=True)

BASE_DIR='/home/jupyter-jhub-admin/data/xview2/challenge'
PROCESS_DIR='/home/jupyter-jhub-admin/data/xview2/processed'


In [4]:
image_path, label_path, target_path = (
    f"{BASE_DIR}/{CHALLENGE_TYPE}/images",
    f"{BASE_DIR}/{CHALLENGE_TYPE}/labels",
    f"{BASE_DIR}/{CHALLENGE_TYPE}/targets",
)

image_dir, label_dir, target_dir = (
    [f"{image_path}/{f}" for f in os.listdir(image_path)],
    [f"{label_path}/{f}" for f in os.listdir(label_path)],
    [f"{target_path}/{f}" for f in os.listdir(target_path)],
)

print(f"{len(image_dir)=}")
print(f"{len(label_dir)=}")
print(f"{len(target_dir)=}")

with open(label_dir[0]) as f:
    x = json.load(f)
    assert x, "Unable to read label data"

y = Image.open(image_dir[0])
assert y, "Unable to read image data"

z = Image.open(target_dir[0])
assert z, "Unable to read target data"

len(image_dir)=1866
len(label_dir)=1866
len(target_dir)=1866


## Process Labels

In [5]:
label_json_data: list[dict] = []


def read_and_store_label_json(label_json_path: str):
    """A thread-safe function that reads a json as a dictionary and writes to a global list"""
    with open(label_json_path) as f:
        label_json_data.append(json.load(f))


with ThreadPoolExecutor(max_workers=20) as executor:
    executor.map(read_and_store_label_json, label_dir)

label_json_series: pd.Series = pd.Series(label_json_data)
label_df_original: pd.DataFrame = pd.json_normalize(label_json_series)

print(label_df_original.shape)
label_df_original.head()

(1866, 20)


Unnamed: 0,features.lng_lat,features.xy,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name
0,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png
1,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW02,WORLDVIEW02,2.080973,2019-01-26T17:16:23.000Z,20.197157,0.519204,157.14372,32.06165,245.92258,midwest-flooding,flooding,103001008CBA2300,1024,1024,1024,1024,MjQ4MzM2Mw.xZmgS5i7AGCfqm7fS-zY_4X-888,midwest-flooding_00000169_pre_disaster.png
2,[],[],WORLDVIEW02,WORLDVIEW02,1.900348,2018-10-23T18:50:43.000Z,8.8854,0.474044,163.84906,42.730503,302.2995,socal-fire,fire,1030010085017B00,1024,1024,1024,1024,MjU3ODA2Nw.crP2H47Q9GAdO4ewsYCDvQb1004,socal-fire_00001358_pre_disaster.png
3,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.59235,2016-10-10T16:04:56.646Z,29.839882,0.398145,158.565308,63.009785,54.715256,hurricane-matthew,wind,104001002437EF00,1024,1024,1024,1024,MjU4NjIwNQ.6gm5_9RWfgfBpylM4UUdWBnlgRQ,hurricane-matthew_00000111_post_disaster.png
4,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",GEOEYE01,GEOEYE01,1.739208,2019-05-31T16:54:31.000Z,15.354399,0.434699,123.605255,69.66407,265.26758,midwest-flooding,flooding,1050010016812F00,1024,1024,1024,1024,MjU1NDc3NA.19HAP_wITfNl-2FwbSZv7HEX3Bo,midwest-flooding_00000071_post_disaster.png


In [6]:
label_df: pd.DataFrame = label_df_original.copy()

In [7]:
label_df_lng_lat: pd.DataFrame = (
    label_df.drop(columns=["features.xy", "features.lng_lat"])
    .join(label_df["features.lng_lat"].explode())
    .reset_index(drop=True)
)

print(label_df_lng_lat.shape)
label_df_lng_lat.head()

(110086, 19)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,features.lng_lat
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."


In [8]:
label_df_features: pd.DataFrame = (
    label_df.drop(columns=["features.xy", "features.lng_lat"])
    .join(label_df["features.xy"].explode())
    .reset_index(drop=True)
)

print(label_df_features.shape)
label_df_features.head()

(110086, 19)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,features.xy
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,flooding,104001004BC65000,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"{'properties': {'feature_type': 'building', 's..."


In [9]:
lng_lat_normalized: pd.DataFrame = pd.json_normalize(
    label_df_lng_lat["features.lng_lat"]
).rename(
    columns={
        "wkt": "map_polygon",
        "properties.feature_type": "map_feature_type",
        "properties.subtype": "map_damage",
        "properties.uid": "building_id",
    }
)

print(lng_lat_normalized.shape)
lng_lat_normalized.head()

(110086, 4)


Unnamed: 0,map_polygon,map_feature_type,map_damage,building_id
0,POLYGON ((-96.34565341279105 36.14011351356439...,building,no-damage,f45a40a0-ef58-488f-a465-86871155d557
1,POLYGON ((-96.34575539169343 36.14000158867371...,building,no-damage,3c1f5674-1c92-41c6-860b-eb981012b622
2,POLYGON ((-96.34596890780844 36.13974220228118...,building,no-damage,d3a92910-f1b0-4477-8f02-870288708359
3,POLYGON ((-96.34590689848071 36.13968797219659...,building,no-damage,24613850-6dcf-4900-8d90-f5960d44a67e
4,POLYGON ((-96.34579426478086 36.13954335028814...,building,no-damage,7f557aa3-0e34-4738-ad2b-cf7924bec599


In [10]:
features_normalized: pd.DataFrame = pd.json_normalize(
    label_df_features["features.xy"]
).rename(
    columns={
        "wkt": "image_polygon",
        "properties.feature_type": "image_feature_type",
        "properties.subtype": "image_damage",
        "properties.uid": "building_id",
    }
)

print(features_normalized.shape)
features_normalized.head()

(110086, 4)


Unnamed: 0,image_polygon,image_feature_type,image_damage,building_id
0,"POLYGON ((89.85022663845265 16.57266964621184,...",building,no-damage,f45a40a0-ef58-488f-a465-86871155d557
1,"POLYGON ((70.35982870380325 44.51145504934903,...",building,no-damage,3c1f5674-1c92-41c6-860b-eb981012b622
2,"POLYGON ((29.7195729266486 109.1358004320701, ...",building,no-damage,d3a92910-f1b0-4477-8f02-870288708359
3,"POLYGON ((42.38899833718492 122.0681969213222,...",building,no-damage,24613850-6dcf-4900-8d90-f5960d44a67e
4,"POLYGON ((65.7102826076016 156.8426802930912, ...",building,no-damage,7f557aa3-0e34-4738-ad2b-cf7924bec599


In [11]:
label_df_lng_lat_normalized = label_df_lng_lat.drop(columns=["features.lng_lat"]).join(
    lng_lat_normalized
)

label_df_features_normalized = label_df_features.drop(columns=["features.xy"]).join(
    features_normalized
)

print(label_df_lng_lat_normalized.shape)
print(label_df_features_normalized.shape)

display(label_df_lng_lat_normalized.head())
display(label_df_features_normalized.head())

(110086, 22)
(110086, 22)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,...,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,map_polygon,map_feature_type,map_damage,building_id
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34565341279105 36.14011351356439...,building,no-damage,f45a40a0-ef58-488f-a465-86871155d557
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34575539169343 36.14000158867371...,building,no-damage,3c1f5674-1c92-41c6-860b-eb981012b622
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34596890780844 36.13974220228118...,building,no-damage,d3a92910-f1b0-4477-8f02-870288708359
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34590689848071 36.13968797219659...,building,no-damage,24613850-6dcf-4900-8d90-f5960d44a67e
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34579426478086 36.13954335028814...,building,no-damage,7f557aa3-0e34-4738-ad2b-cf7924bec599


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,...,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,image_polygon,image_feature_type,image_damage,building_id
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"POLYGON ((89.85022663845265 16.57266964621184,...",building,no-damage,f45a40a0-ef58-488f-a465-86871155d557
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"POLYGON ((70.35982870380325 44.51145504934903,...",building,no-damage,3c1f5674-1c92-41c6-860b-eb981012b622
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"POLYGON ((29.7195729266486 109.1358004320701, ...",building,no-damage,d3a92910-f1b0-4477-8f02-870288708359
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"POLYGON ((42.38899833718492 122.0681969213222,...",building,no-damage,24613850-6dcf-4900-8d90-f5960d44a67e
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,1024,1024,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,"POLYGON ((65.7102826076016 156.8426802930912, ...",building,no-damage,7f557aa3-0e34-4738-ad2b-cf7924bec599


In [12]:
# Checking if building id is common between pre and post images
agg_building_id_img = label_df_lng_lat_normalized.groupby(["building_id"]).agg(
    {"metadata.img_name": "nunique"}
)
assert (
    max(agg_building_id_img["metadata.img_name"])
    == min(agg_building_id_img["metadata.img_name"])
    == 2
), "Buildings that appear in pre do not appear in post or vice versa by building ID"

# Checking id `metadata.id` can be used to uniquely identify a pre-image from a post-image
agg_id_img = label_df_lng_lat_normalized.groupby(["metadata.id"]).agg(
    {"metadata.img_name": "nunique"}
)
assert (
    max(agg_id_img["metadata.img_name"]) == min(agg_id_img["metadata.img_name"]) == 1
), "Unique Images not represented with unique IDs"

In [13]:
label_df_final: pd.DataFrame = label_df_lng_lat_normalized.merge(
    label_df_features_normalized[
        [
            "metadata.id",
            "image_polygon",
            "image_feature_type",
            "image_damage",
            "building_id",
        ]
    ],
    "left",
    ["metadata.id", "building_id"],
)

print(label_df_final.shape)
label_df_final.head()

(110086, 25)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,...,metadata.height,metadata.id,metadata.img_name,map_polygon,map_feature_type,map_damage,building_id,image_polygon,image_feature_type,image_damage
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34565341279105 36.14011351356439...,building,no-damage,f45a40a0-ef58-488f-a465-86871155d557,"POLYGON ((89.85022663845265 16.57266964621184,...",building,no-damage
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34575539169343 36.14000158867371...,building,no-damage,3c1f5674-1c92-41c6-860b-eb981012b622,"POLYGON ((70.35982870380325 44.51145504934903,...",building,no-damage
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34596890780844 36.13974220228118...,building,no-damage,d3a92910-f1b0-4477-8f02-870288708359,"POLYGON ((29.7195729266486 109.1358004320701, ...",building,no-damage
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34590689848071 36.13968797219659...,building,no-damage,24613850-6dcf-4900-8d90-f5960d44a67e,"POLYGON ((42.38899833718492 122.0681969213222,...",building,no-damage
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30T17:35:04.000Z,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,1024,MjU1NDk1Mw.cfeYzidrkWPN8dQa3pkn_0EJrXI,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34579426478086 36.13954335028814...,building,no-damage,7f557aa3-0e34-4738-ad2b-cf7924bec599,"POLYGON ((65.7102826076016 156.8426802930912, ...",building,no-damage


In [14]:
assert (
    label_df_final["image_damage"].to_list() == label_df_final["map_damage"].to_list()
), "Damage Classification Between Image Data and Map Data Differ - Possibly Wrong Source Data or Incorrect Joins"

In [15]:
label_df_final = (
    label_df_final.rename(
        columns={
            c: c.replace("metadata.", "")
            for c in label_df_final.columns
            if c.startswith("metadata.")
        }
    )
    .drop(
        columns=[
            "map_feature_type",
            "map_damage",
        ]
    )
    .rename(
        columns={
            "image_feature_type": "feature_type",
            "image_damage": "damage",
        }
    )
)

In [16]:
label_df_final["dataset"] = CHALLENGE_TYPE
label_df_final["capture_date"] = pd.to_datetime(label_df_final["capture_date"])

label_df_final["image_id"] = (
    label_df_final["img_name"]
    .dropna()
    .apply(lambda cell: "_".join(cell.split("_")[0:2]))
)
label_df_final["is_pre_image"] = (
    label_df_final["img_name"].dropna().apply(lambda cell: "_pre_disaster" in cell)
)
label_df_final["is_post_image"] = (
    label_df_final["img_name"].dropna().apply(lambda cell: "_post_disaster" in cell)
)

In [17]:
print(label_df_final.shape)
label_df_final.head()

(110086, 27)


Unnamed: 0,sensor,provider_asset_type,gsd,capture_date,off_nadir_angle,pan_resolution,sun_azimuth,sun_elevation,target_azimuth,disaster,...,img_name,map_polygon,building_id,image_polygon,feature_type,damage,dataset,image_id,is_pre_image,is_post_image
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30 17:35:04+00:00,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34565341279105 36.14011351356439...,f45a40a0-ef58-488f-a465-86871155d557,"POLYGON ((89.85022663845265 16.57266964621184,...",building,no-damage,test,midwest-flooding_00000248,False,True
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30 17:35:04+00:00,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34575539169343 36.14000158867371...,3c1f5674-1c92-41c6-860b-eb981012b622,"POLYGON ((70.35982870380325 44.51145504934903,...",building,no-damage,test,midwest-flooding_00000248,False,True
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30 17:35:04+00:00,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34596890780844 36.13974220228118...,d3a92910-f1b0-4477-8f02-870288708359,"POLYGON ((29.7195729266486 109.1358004320701, ...",building,no-damage,test,midwest-flooding_00000248,False,True
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30 17:35:04+00:00,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34590689848071 36.13968797219659...,24613850-6dcf-4900-8d90-f5960d44a67e,"POLYGON ((42.38899833718492 122.0681969213222,...",building,no-damage,test,midwest-flooding_00000248,False,True
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.25246,2019-05-30 17:35:04+00:00,7.110581,0.313106,140.67555,72.39723,2.891011,midwest-flooding,...,midwest-flooding_00000248_post_disaster.png,POLYGON ((-96.34579426478086 36.13954335028814...,7f557aa3-0e34-4738-ad2b-cf7924bec599,"POLYGON ((65.7102826076016 156.8426802930912, ...",building,no-damage,test,midwest-flooding_00000248,False,True


In [18]:
label_df_final.dtypes

sensor                              object
provider_asset_type                 object
gsd                                float64
capture_date           datetime64[ns, UTC]
off_nadir_angle                    float64
pan_resolution                     float64
sun_azimuth                        float64
sun_elevation                      float64
target_azimuth                     float64
disaster                            object
disaster_type                       object
catalog_id                          object
original_width                       int64
original_height                      int64
width                                int64
height                               int64
id                                  object
img_name                            object
map_polygon                         object
building_id                         object
image_polygon                       object
feature_type                        object
damage                              object
dataset    

In [19]:
print(f"{len(label_df_final['id'].unique())=}")
print(f"{len(label_df_final['building_id'].unique())=}")
print()
print(label_df_final["damage"].value_counts())
print()
print(label_df_final["disaster"].value_counts())

len(label_df_final['id'].unique())=1866
len(label_df_final['building_id'].unique())=54863

damage
no-damage        41427
minor-damage      4798
major-damage      3850
destroyed         3775
un-classified     1012
Name: count, dtype: int64

disaster
palu-tsunami           25120
mexico-earthquake      22822
hurricane-harvey       15448
hurricane-michael      11324
socal-fire              8840
santa-rosa-wildfire     8456
hurricane-matthew       8390
midwest-flooding        5076
hurricane-florence      4546
guatemala-volcano         64
Name: count, dtype: int64


In [20]:
label_df_final.to_parquet(f"{BASE_DIR}/{CHALLENGE_TYPE}.parquet")

In [21]:
concat_list: list[pd.DataFrame] = [
    pd.read_parquet(f"{BASE_DIR}/{pq_file}")
    for pq_file in os.listdir(BASE_DIR)
    if pq_file.endswith(".parquet")
]

print(len(concat_list))

df = pd.concat(concat_list).reset_index(drop=True)

print(df.shape)
df.head()

3
(545816, 27)


Unnamed: 0,sensor,provider_asset_type,gsd,capture_date,off_nadir_angle,pan_resolution,sun_azimuth,sun_elevation,target_azimuth,disaster,...,img_name,map_polygon,building_id,image_polygon,feature_type,damage,dataset,image_id,is_pre_image,is_post_image
0,GEOEYE01,GEOEYE01,1.958603,2018-04-06 15:49:36+00:00,24.974987,0.489478,140.36385,55.891373,282.33646,hurricane-florence,...,hurricane-florence_00000305_pre_disaster.png,POLYGON ((-77.91138666970029 34.63734867897129...,4a33d9ed-43bc-492f-a775-d3fe42878f10,"POLYGON ((397.4192300260881 115.3648942447117,...",building,,hold,hurricane-florence_00000305,True,False
1,GEOEYE01,GEOEYE01,1.958603,2018-04-06 15:49:36+00:00,24.974987,0.489478,140.36385,55.891373,282.33646,hurricane-florence,...,hurricane-florence_00000305_pre_disaster.png,POLYGON ((-77.91132962263291 34.63732882782131...,884aab65-39f1-4a51-939d-f9601144d90b,"POLYGON ((408.7295498053606 120.5163551093028,...",building,,hold,hurricane-florence_00000305,True,False
2,GEOEYE01,GEOEYE01,1.958603,2018-04-06 15:49:36+00:00,24.974987,0.489478,140.36385,55.891373,282.33646,hurricane-florence,...,hurricane-florence_00000305_pre_disaster.png,POLYGON ((-77.91125705952103 34.63730560873788...,a976212b-86a2-4708-9c5e-583027fa5bca,"POLYGON ((423.1303919841092 126.5756666624842,...",building,,hold,hurricane-florence_00000305,True,False
3,GEOEYE01,GEOEYE01,1.958603,2018-04-06 15:49:36+00:00,24.974987,0.489478,140.36385,55.891373,282.33646,hurricane-florence,...,hurricane-florence_00000305_pre_disaster.png,"POLYGON ((-77.9112023974676 34.63728574095958,...",985447c3-31d6-4af7-a9e5-5af051921e79,"POLYGON ((433.9619211361394 131.7173140666901,...",building,,hold,hurricane-florence_00000305,True,False
4,GEOEYE01,GEOEYE01,1.958603,2018-04-06 15:49:36+00:00,24.974987,0.489478,140.36385,55.891373,282.33646,hurricane-florence,...,hurricane-florence_00000305_pre_disaster.png,POLYGON ((-77.91135754701126 34.63739885219834...,5f091b36-c5a7-4e0b-bd26-b1a8db568c52,"POLYGON ((403.6164962973445 103.3501846669777,...",building,,hold,hurricane-florence_00000305,True,False


In [22]:
df.to_parquet(f"{PROCESS_DIR}/full_challenge_data.parquet")
df.to_csv(f"{PROCESS_DIR}/full_challenge_data.csv", index=False)