In [0]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import requests
import os
import json
import boto3
from loguru import logger
from typing import Literal
from PIL import Image
from smart_open import open
from tqdm.autonotebook import tqdm
from concurrent.futures import ThreadPoolExecutor

from src.utils.config import LOCAL_DATA_DIR, S3_DATA_DIR, S3_BUCKET
from xview2_dataset_links import challenge_links as xview2_links

In [0]:
DESTINATION: Literal["LOCAL", "S3"] = "S3"
CHALLENGE_TYPE: Literal["train", "test", "hold"] = "train"

In [0]:
DATASET_PREFIX: str = "/xview2/challenge"

BASE_DIR: str = (
    LOCAL_DATA_DIR + DATASET_PREFIX
    if DESTINATION == "LOCAL"
    else S3_DATA_DIR + DATASET_PREFIX
    if DESTINATION == "S3"
    else None
)

print(f"{BASE_DIR=}")

assert all([BASE_DIR]), "Download Destination not Set"

BASE_DIR='s3://alivio/datasets/xview2/challenge'


Run the following shell commands on the terminal to download data locally:


```bash
mkdir test hold train
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/test ./test --request-payer requester
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/hold ./hold --request-payer requester
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/train ./train --request-payer requester
```

In [0]:
image_path, label_path, target_path = (
    f"{CHALLENGE_TYPE}/images",
    f"{CHALLENGE_TYPE}/labels",
    f"{CHALLENGE_TYPE}/targets",
)

image_dir, label_dir, target_dir = (
    [f"{image_path}/{f}" for f in os.listdir(image_path)],
    [f"{label_path}/{f}" for f in os.listdir(label_path)],
    [f"{target_path}/{f}" for f in os.listdir(target_path)],
)

print(f"{len(image_dir)=}")
print(f"{len(label_dir)=}")
print(f"{len(target_dir)=}")

print()
print(image_dir[0:5])
print(label_dir[0:5])
print(target_dir[0:5])

with open(label_dir[0]) as f:
    x = json.load(f)
    assert x, "Unable to read label data"

y = Image.open(image_dir[0])
assert y, "Unable to read image data"

z = Image.open(target_dir[0])
assert z, "Unable to read target data"

len(image_dir)=5598
len(label_dir)=5598
len(target_dir)=5598

['train/images/guatemala-volcano_00000000_pre_disaster.png', 'train/images/guatemala-volcano_00000002_pre_disaster.png', 'train/images/guatemala-volcano_00000001_pre_disaster.png', 'train/images/guatemala-volcano_00000007_post_disaster.png', 'train/images/guatemala-volcano_00000002_post_disaster.png']
['train/labels/guatemala-volcano_00000000_post_disaster.json', 'train/labels/guatemala-volcano_00000000_pre_disaster.json', 'train/labels/guatemala-volcano_00000001_post_disaster.json', 'train/labels/guatemala-volcano_00000001_pre_disaster.json', 'train/labels/guatemala-volcano_00000002_post_disaster.json']
['train/targets/guatemala-volcano_00000000_post_disaster_target.png', 'train/targets/guatemala-volcano_00000000_pre_disaster_target.png', 'train/targets/guatemala-volcano_00000001_post_disaster_target.png', 'train/targets/guatemala-volcano_00000001_pre_disaster_target.png', 'train/targets/guatemala-volcano_00000002_post_disa

## Process Labels

In [0]:
label_json_data: list[dict] = []


def read_and_store_label_json(label_json_path: str):
    """A thread-safe function that reads a json as a dictionary and writes to a global list"""
    with open(label_json_path) as f:
        label_json_data.append(json.load(f))


with ThreadPoolExecutor(max_workers=20) as executor:
    executor.map(read_and_store_label_json, label_dir)

label_json_series: pd.Series = pd.Series(label_json_data)
label_df_original: pd.DataFrame = pd.json_normalize(label_json_series)

print(label_df_original.shape)
label_df_original.head()

(5598, 20)


Unnamed: 0,features.lng_lat,features.xy,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name
0,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png
1,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22T16:55:40.000Z,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE2NA.73QhG8U-LJLGkyNS0FB24X8vQS8,guatemala-volcano_00000000_post_disaster.png
2,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22T16:55:40.000Z,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE3OQ.rLIRvqfOFINcGa1ZjyKXbgP8tH4,guatemala-volcano_00000015_post_disaster.png
3,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.397611,2018-02-05T17:10:04.000Z,21.536394,0.349023,150.52327,55.458157,145.26248,guatemala-volcano,volcano,1040010037B88300,1024,1024,1024,1024,MjUxMzMzMQ.qL2xxfgtI4QCHnyrhzpq74ZsKF8,guatemala-volcano_00000015_pre_disaster.png
4,"[{'properties': {'feature_type': 'building', '...","[{'properties': {'feature_type': 'building', '...",WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22T16:55:40.000Z,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE2NQ.fqr99QXF8mpcnRhhi9qrTLA1ALc,guatemala-volcano_00000001_post_disaster.png


In [0]:
label_df: pd.DataFrame = label_df_original.copy()

In [0]:
label_df_lng_lat: pd.DataFrame = (
    label_df.drop(columns=["features.xy", "features.lng_lat"])
    .join(label_df["features.lng_lat"].explode())
    .reset_index(drop=True)
)

print(label_df_lng_lat.shape)
label_df_lng_lat.head()

(326606, 19)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,features.lng_lat
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."


In [0]:
label_df_features: pd.DataFrame = (
    label_df.drop(columns=["features.xy", "features.lng_lat"])
    .join(label_df["features.xy"].explode())
    .reset_index(drop=True)
)

print(label_df_features.shape)
label_df_features.head()

(326606, 19)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,features.xy
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"{'properties': {'feature_type': 'building', 'u..."


In [0]:
lng_lat_normalized: pd.DataFrame = pd.json_normalize(label_df_lng_lat["features.lng_lat"]).rename(
    columns={
        "wkt": "map_polygon",
        "properties.feature_type": "map_feature_type",
        "properties.subtype": "map_damage",
        "properties.uid": "building_id",
    }
)

print(lng_lat_normalized.shape)
lng_lat_normalized.head()

(326606, 4)


Unnamed: 0,map_polygon,map_feature_type,building_id,map_damage
0,"POLYGON ((-90.8154467949004 14.39086318334907,...",building,486b0813-ecd2-4b84-856c-9c0e42156953,
1,POLYGON ((-90.81420592026568 14.38829423654955...,building,139cf2c8-ad52-4739-82b5-bb646b215e76,
2,POLYGON ((-90.81324060670399 14.38779112489009...,building,d43deb4a-529c-4df4-b666-26dd5b17e040,
3,POLYGON ((-90.81324307540712 14.38753750651894...,building,563b145d-732d-4eb4-8c77-380519842324,
4,POLYGON ((-90.81335464228361 14.38761690782337...,building,56f51b26-d511-461d-bb31-747901a4ea75,


In [0]:
features_normalized: pd.DataFrame = pd.json_normalize(
    label_df_features["features.xy"]
).rename(
    columns={
        "wkt": "image_polygon",
        "properties.feature_type": "image_feature_type",
        "properties.subtype": "image_damage",
        "properties.uid": "building_id",
    }
)

print(features_normalized.shape)
features_normalized.head()

(326606, 4)


Unnamed: 0,image_polygon,image_feature_type,building_id,image_damage
0,"POLYGON ((532.1863731987335 165.9824500704975,...",building,486b0813-ecd2-4b84-856c-9c0e42156953,
1,"POLYGON ((810.6063105745833 743.0540226914463,...",building,139cf2c8-ad52-4739-82b5-bb646b215e76,
2,"POLYGON ((1024 854.5625520879265, 1003.7774388...",building,d43deb4a-529c-4df4-b666-26dd5b17e040,
3,"POLYGON ((1024 911.7944398581552, 1020.8877501...",building,563b145d-732d-4eb4-8c77-380519842324,
4,"POLYGON ((999.2912769870155 894.1110629984253,...",building,56f51b26-d511-461d-bb31-747901a4ea75,


In [0]:
label_df_lng_lat_normalized = label_df_lng_lat.drop(columns=["features.lng_lat"]).join(
    lng_lat_normalized
)

label_df_features_normalized = label_df_features.drop(columns=["features.xy"]).join(
    features_normalized
)

label_df_final: pd.DataFrame = label_df_lng_lat_normalized.merge(
    label_df_features_normalized[
        [
            "metadata.id",
            "image_polygon",
            "image_feature_type",
            "image_damage",
            "building_id",
        ]
    ],
    "left",
    ["metadata.id", "building_id"],
)

print(label_df_final.shape)
label_df_final.head()

(326606, 25)


Unnamed: 0,metadata.sensor,metadata.provider_asset_type,metadata.gsd,metadata.capture_date,metadata.off_nadir_angle,metadata.pan_resolution,metadata.sun_azimuth,metadata.sun_elevation,metadata.target_azimuth,metadata.disaster,metadata.disaster_type,metadata.catalog_id,metadata.original_width,metadata.original_height,metadata.width,metadata.height,metadata.id,metadata.img_name,map_polygon,map_feature_type,building_id,map_damage,image_polygon,image_feature_type,image_damage
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"POLYGON ((-90.8154467949004 14.39086318334907,...",building,486b0813-ecd2-4b84-856c-9c0e42156953,,"POLYGON ((532.1863731987335 165.9824500704975,...",building,
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81420592026568 14.38829423654955...,building,139cf2c8-ad52-4739-82b5-bb646b215e76,,"POLYGON ((810.6063105745833 743.0540226914463,...",building,
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81324060670399 14.38779112489009...,building,d43deb4a-529c-4df4-b666-26dd5b17e040,,"POLYGON ((1024 854.5625520879265, 1003.7774388...",building,
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81324307540712 14.38753750651894...,building,563b145d-732d-4eb4-8c77-380519842324,,"POLYGON ((1024 911.7944398581552, 1020.8877501...",building,
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05T17:10:18.000Z,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81335464228361 14.38761690782337...,building,56f51b26-d511-461d-bb31-747901a4ea75,,"POLYGON ((999.2912769870155 894.1110629984253,...",building,


In [0]:
assert (
    label_df_final["image_damage"].to_list()
    == label_df_final["map_damage"].to_list()
), "Damage Classification Between Image Data and Map Data Differ - Possibly Wrong Source Data or Incorrect Joins"

In [0]:
label_df_final = (
    label_df_final.rename(
        columns={
            c: c.replace("metadata.", "")
            for c in label_df_final.columns
            if c.startswith("metadata.")
        }
    )
    .drop(
        columns=[
            "map_feature_type",
            "map_damage",
        ]
    )
    .rename(
        columns={
            "image_feature_type": "feature_type",
            "image_damage": "damage",
        }
    )
)

label_df_final["dataset"] = CHALLENGE_TYPE
label_df_final["capture_date"] = pd.to_datetime(label_df_final["capture_date"])

print(label_df_final.shape)
label_df_final.head()

(326606, 24)


Unnamed: 0,sensor,provider_asset_type,gsd,capture_date,off_nadir_angle,pan_resolution,sun_azimuth,sun_elevation,target_azimuth,disaster,disaster_type,catalog_id,original_width,original_height,width,height,id,img_name,map_polygon,building_id,image_polygon,feature_type,damage,dataset
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05 17:10:18+00:00,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,"POLYGON ((-90.8154467949004 14.39086318334907,...",486b0813-ecd2-4b84-856c-9c0e42156953,"POLYGON ((532.1863731987335 165.9824500704975,...",building,,train
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05 17:10:18+00:00,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81420592026568 14.38829423654955...,139cf2c8-ad52-4739-82b5-bb646b215e76,"POLYGON ((810.6063105745833 743.0540226914463,...",building,,train
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05 17:10:18+00:00,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81324060670399 14.38779112489009...,d43deb4a-529c-4df4-b666-26dd5b17e040,"POLYGON ((1024 854.5625520879265, 1003.7774388...",building,,train
3,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05 17:10:18+00:00,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81324307540712 14.38753750651894...,563b145d-732d-4eb4-8c77-380519842324,"POLYGON ((1024 911.7944398581552, 1020.8877501...",building,,train
4,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.345919,2018-02-05 17:10:18+00:00,17.933279,0.336265,150.80763,55.506645,118.24727,guatemala-volcano,volcano,10400100377A9B00,1024,1024,1024,1024,MjUxMzEyMA.FwIEh3iVRb5-R_EXW9amgViNSoA,guatemala-volcano_00000000_pre_disaster.png,POLYGON ((-90.81335464228361 14.38761690782337...,56f51b26-d511-461d-bb31-747901a4ea75,"POLYGON ((999.2912769870155 894.1110629984253,...",building,,train


In [0]:
label_df_final.dtypes

sensor                              object
provider_asset_type                 object
gsd                                float64
capture_date           datetime64[ns, UTC]
off_nadir_angle                    float64
pan_resolution                     float64
sun_azimuth                        float64
sun_elevation                      float64
target_azimuth                     float64
disaster                            object
disaster_type                       object
catalog_id                          object
original_width                       int64
original_height                      int64
width                                int64
height                               int64
id                                  object
img_name                            object
map_polygon                         object
building_id                         object
image_polygon                       object
feature_type                        object
damage                              object
dataset    

In [0]:
print(f"{len(label_df_final['id'].unique())=}")
print(f"{len(label_df_final['building_id'].unique())=}")
print()
print(label_df_final["damage"].value_counts())
print()
print(label_df_final["disaster"].value_counts())

len(label_df_final['id'].unique())=5598
len(label_df_final['building_id'].unique())=162788

no-damage        117426
minor-damage      14980
major-damage      14161
destroyed         13227
un-classified      2993
Name: damage, dtype: int64

mexico-earthquake      64542
palu-tsunami           62792
hurricane-harvey       46066
hurricane-michael      45390
hurricane-matthew      27908
santa-rosa-wildfire    25914
socal-fire             21780
midwest-flooding       17582
hurricane-florence     12918
guatemala-volcano       1714
Name: disaster, dtype: int64


In [0]:
label_df_final.to_parquet(f"{CHALLENGE_TYPE}.parquet")

In [0]:
concat_list: list[pd.DataFrame] = [
    pd.read_parquet(pq_file) for pq_file in os.listdir() if pq_file.endswith(".parquet")
]

print(len(concat_list))

df = pd.concat(concat_list).reset_index(drop=True)

print(df.shape)
df.head()

3
(545816, 24)


Unnamed: 0,sensor,provider_asset_type,gsd,capture_date,off_nadir_angle,pan_resolution,sun_azimuth,sun_elevation,target_azimuth,disaster,disaster_type,catalog_id,original_width,original_height,width,height,id,img_name,map_polygon,building_id,image_polygon,feature_type,damage,dataset
0,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22 16:55:40+00:00,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE2Nw.m-DFlxQeMVNP1AiBYflHzPY-lgA,guatemala-volcano_00000003_post_disaster.png,POLYGON ((-90.83554484998086 14.43845885230631...,88703461-a33d-4327-9244-a0d4e2242ede,"POLYGON ((452.9629646916362 86.81005767060303,...",building,minor-damage,test
1,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22 16:55:40+00:00,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE2Nw.m-DFlxQeMVNP1AiBYflHzPY-lgA,guatemala-volcano_00000003_post_disaster.png,POLYGON ((-90.83658244456636 14.43748886352666...,e168e405-3479-44ee-849a-7af2ed32dee1,"POLYGON ((226.8267143191367 307.8635026276182,...",building,destroyed,test
2,WORLDVIEW03_VNIR,WORLDVIEW03_VNIR,1.408524,2018-06-22 16:55:40+00:00,21.97336,0.351739,58.571484,71.182274,314.29373,guatemala-volcano,volcano,104001003E575F00,1024,1024,1024,1024,MjU0NzE2Nw.m-DFlxQeMVNP1AiBYflHzPY-lgA,guatemala-volcano_00000003_post_disaster.png,POLYGON ((-90.83487803042956 14.43776515972186...,9f8f8916-b6c5-4fa8-ab64-c8fd86eca035,"POLYGON ((601.0923397346356 241.9779197994199,...",building,minor-damage,test
3,GEOEYE01,GEOEYE01,2.895928,2018-09-18 16:32:47+00:00,41.74134,0.722669,162.69955,57.024815,69.02214,hurricane-florence,flooding,10500100123A5000,1024,1024,1024,1024,MjU0NjYxOA.oDPPzoZ4qHe2KBGmwerMXsQbF9I,hurricane-florence_00000005_post_disaster.png,POLYGON ((-79.05237408501472 33.58644838741007...,624f417b-fd9f-4544-ba25-9ba2be361d27,POLYGON ((43.56144524808627 0.0045224975870225...,building,no-damage,test
4,GEOEYE01,GEOEYE01,2.895928,2018-09-18 16:32:47+00:00,41.74134,0.722669,162.69955,57.024815,69.02214,hurricane-florence,flooding,10500100123A5000,1024,1024,1024,1024,MjU0NjYxOA.oDPPzoZ4qHe2KBGmwerMXsQbF9I,hurricane-florence_00000005_post_disaster.png,POLYGON ((-79.05057793741786 33.58642007435035...,557fbd1d-68bf-4776-a1a5-9bbd45cda227,POLYGON ((406.9497321265795 0.0095005985733089...,building,no-damage,test


In [0]:
df.to_parquet(f"s3://{S3_BUCKET}/datasets/xview2/processed/full_challenge_data.parquet")

df.to_csv(
    f"s3://{S3_BUCKET}/datasets/xview2/processed/full_challenge_data.csv", index=False
)