In [0]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import requests
import os
import json
import boto3
from loguru import logger
from typing import Literal
from PIL import Image
from smart_open import open
from tqdm.autonotebook import tqdm
from concurrent.futures import ThreadPoolExecutor

from src.utils.config import LOCAL_DATA_DIR, S3_DATA_DIR, S3_BUCKET
from xview2_dataset_links import challenge_links as xview2_links

In [0]:
# Run-mode switches for this notebook.
# DESTINATION selects which data root (LOCAL_DATA_DIR or S3_DATA_DIR) BASE_DIR is built from.
DESTINATION: Literal["LOCAL", "S3"] = "S3"
# CHALLENGE_TYPE names the split sub-directory that is read and the per-split parquet file that is written.
CHALLENGE_TYPE: Literal["train", "test", "hold"] = "train"

In [0]:
DATASET_PREFIX: str = "/xview2/challenge"

# Resolve the dataset root for the configured destination; any value other
# than "LOCAL" or "S3" leaves BASE_DIR as None so the assertion below fails.
if DESTINATION == "LOCAL":
    BASE_DIR: str = LOCAL_DATA_DIR + DATASET_PREFIX
elif DESTINATION == "S3":
    BASE_DIR = S3_DATA_DIR + DATASET_PREFIX
else:
    BASE_DIR = None

print(f"{BASE_DIR=}")

# An unset (None) or empty BASE_DIR means the destination was not configured.
assert BASE_DIR, "Download Destination not Set"

Run the following shell commands in a terminal, from the notebook's working directory, to download the data locally:


```bash
mkdir test hold train
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/test ./test --request-payer requester
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/hold ./hold --request-payer requester
```


```bash
aws s3 sync s3://alivio/datasets/xview2/challenge/train ./train --request-payer requester
```

In [0]:
# Sub-directories of the selected split, relative to the current working directory.
image_path, label_path, target_path = (
    f"{CHALLENGE_TYPE}/images",
    f"{CHALLENGE_TYPE}/labels",
    f"{CHALLENGE_TYPE}/targets",
)

# os.listdir returns entries in arbitrary order; sorting keeps the file lists
# (and everything derived from them) reproducible across runs and machines.
image_dir, label_dir, target_dir = (
    [f"{image_path}/{f}" for f in sorted(os.listdir(image_path))],
    [f"{label_path}/{f}" for f in sorted(os.listdir(label_path))],
    [f"{target_path}/{f}" for f in sorted(os.listdir(target_path))],
)

print(f"{len(image_dir)=}")
print(f"{len(label_dir)=}")
print(f"{len(target_dir)=}")

print()
print(image_dir[0:5])
print(label_dir[0:5])
print(target_dir[0:5])

# Smoke test: make sure one file of each kind can actually be opened.
with open(label_dir[0]) as f:
    x = json.load(f)
    assert x, "Unable to read label data"

# Image.open keeps the underlying file handle open until the image is closed,
# so use it as a context manager instead of leaking the handle.
with Image.open(image_dir[0]) as y:
    assert y, "Unable to read image data"

with Image.open(target_dir[0]) as z:
    assert z, "Unable to read target data"

## Process Labels

In [0]:
# Accumulator filled by the worker threads below; reset on every run of this cell.
label_json_data: list[dict] = []


def read_and_store_label_json(label_json_path: str):
    """Parse one label JSON file and append the result to ``label_json_data``.

    Intended to be called concurrently from worker threads; it relies on
    ``list.append`` being atomic in CPython. Entries are appended in the order
    the workers finish, not in the order of the input paths.

    Args:
        label_json_path: Path of the label JSON file to read.

    Raises:
        Any exception raised while opening or parsing the file.
    """
    with open(label_json_path) as f:
        label_json_data.append(json.load(f))


with ThreadPoolExecutor(max_workers=20) as executor:
    # Executor.map only re-raises a worker's exception when its result is
    # retrieved. Draining the iterator makes a failed read abort the cell
    # instead of silently producing an incomplete label_json_data.
    list(executor.map(read_and_store_label_json, label_dir))

# One row per label file, with nested keys flattened into dotted column names.
label_json_series: pd.Series = pd.Series(label_json_data)
label_df_original: pd.DataFrame = pd.json_normalize(label_json_series)

print(label_df_original.shape)
label_df_original.head()

In [0]:
# Work on an independent (deep) copy so label_df_original stays untouched.
label_df: pd.DataFrame = label_df_original.copy(deep=True)

In [0]:
# Expand the "features.lng_lat" lists so that every list element gets its own
# row, then attach the remaining (non-feature) columns to each of those rows.
lng_lat_exploded: pd.Series = label_df["features.lng_lat"].explode()
label_columns_without_features: pd.DataFrame = label_df.drop(
    columns=["features.xy", "features.lng_lat"]
)

label_df_lng_lat: pd.DataFrame = label_columns_without_features.join(
    lng_lat_exploded
).reset_index(drop=True)

print(label_df_lng_lat.shape)
label_df_lng_lat.head()

In [0]:
# Expand the "features.xy" lists so that every list element gets its own row,
# then attach the remaining (non-feature) columns to each of those rows.
xy_exploded: pd.Series = label_df["features.xy"].explode()
label_columns_without_xy: pd.DataFrame = label_df.drop(
    columns=["features.xy", "features.lng_lat"]
)

label_df_features: pd.DataFrame = label_columns_without_xy.join(
    xy_exploded
).reset_index(drop=True)

print(label_df_features.shape)
label_df_features.head()

In [0]:
# Flatten each "features.lng_lat" entry into columns and give those columns
# "map_"-prefixed names (the uid column becomes the shared "building_id" key).
lng_lat_column_names: dict[str, str] = {
    "wkt": "map_polygon",
    "properties.feature_type": "map_feature_type",
    "properties.subtype": "map_damage",
    "properties.uid": "building_id",
}

lng_lat_flat: pd.DataFrame = pd.json_normalize(label_df_lng_lat["features.lng_lat"])
lng_lat_normalized: pd.DataFrame = lng_lat_flat.rename(columns=lng_lat_column_names)

print(lng_lat_normalized.shape)
lng_lat_normalized.head()

In [0]:
# Flatten each "features.xy" entry into columns and give those columns
# "image_"-prefixed names (the uid column becomes the shared "building_id" key).
xy_column_names: dict[str, str] = {
    "wkt": "image_polygon",
    "properties.feature_type": "image_feature_type",
    "properties.subtype": "image_damage",
    "properties.uid": "building_id",
}

xy_flat: pd.DataFrame = pd.json_normalize(label_df_features["features.xy"])
features_normalized: pd.DataFrame = xy_flat.rename(columns=xy_column_names)

print(features_normalized.shape)
features_normalized.head()

In [0]:
# Swap the raw nested feature column for its flattened counterpart
# (index-aligned join) on both the map side and the image side.
label_df_lng_lat_normalized = label_df_lng_lat.drop(
    columns=["features.lng_lat"]
).join(lng_lat_normalized)

label_df_features_normalized = label_df_features.drop(
    columns=["features.xy"]
).join(features_normalized)

# Only the image-side columns plus the two join keys are taken from the right frame.
image_side_columns = [
    "metadata.id",
    "image_polygon",
    "image_feature_type",
    "image_damage",
    "building_id",
]

# Left join: keep every map-side row and attach the matching image-side row.
label_df_final: pd.DataFrame = label_df_lng_lat_normalized.merge(
    label_df_features_normalized[image_side_columns],
    how="left",
    on=["metadata.id", "building_id"],
)

print(label_df_final.shape)
label_df_final.head()

In [0]:
# Sanity check on the merge: the damage label taken from the image-side
# features must match the one taken from the map-side features on every row.
# Rows where both sides are missing count as a match. The previous list
# comparison only treated missing values as equal when both lists happened to
# hold the very same NaN object, so it could fail (or pass) by accident.
assert (
    (label_df_final["image_damage"] == label_df_final["map_damage"])
    | (label_df_final["image_damage"].isna() & label_df_final["map_damage"].isna())
).all(), "Damage Classification Between Image Data and Map Data Differ - Possibly Wrong Source Data or Incorrect Joins"

In [0]:
# Column names with the "metadata." prefix removed.
metadata_column_names = {
    column: column.replace("metadata.", "")
    for column in label_df_final.columns
    if column.startswith("metadata.")
}

# 1) strip the metadata prefix, 2) drop the map-side duplicates of the
# feature type / damage columns, 3) give the image-side columns neutral names.
label_df_final = (
    label_df_final.rename(columns=metadata_column_names)
    .drop(columns=["map_feature_type", "map_damage"])
    .rename(columns={"image_feature_type": "feature_type", "image_damage": "damage"})
)

In [0]:
# Tag every row with the split being processed and parse the capture timestamp.
label_df_final["dataset"] = CHALLENGE_TYPE
label_df_final["capture_date"] = pd.to_datetime(label_df_final["capture_date"])

# Rows without an image name are left out here; because assignment aligns on
# the index, those rows end up with missing values in the derived columns.
img_names = label_df_final["img_name"].dropna()

# image_id = the first two underscore-separated tokens of the image name.
label_df_final["image_id"] = img_names.apply(
    lambda name: "_".join(name.split("_")[:2])
)
label_df_final["is_pre_image"] = img_names.apply(
    lambda name: "_pre_disaster" in name
)
label_df_final["is_post_image"] = img_names.apply(
    lambda name: "_post_disaster" in name
)

In [0]:
# Inspect the size and the first rows of the label table after the column clean-up.
print(label_df_final.shape)
label_df_final.head()

In [0]:
# Check the column dtypes (e.g. that capture_date was parsed by pd.to_datetime).
label_df_final.dtypes

In [0]:
# Quick summary: number of distinct values in "id" and "building_id",
# followed by the row counts per damage value and per disaster value.
print(f"{len(label_df_final['id'].unique())=}")
print(f"{len(label_df_final['building_id'].unique())=}")
print()
print(label_df_final["damage"].value_counts())
print()
print(label_df_final["disaster"].value_counts())

In [0]:
# Persist this split as "<CHALLENGE_TYPE>.parquet" in the current working directory.
label_df_final.to_parquet(f"{CHALLENGE_TYPE}.parquet")

In [0]:
# Load every "*.parquet" file found in the current working directory
# (presumably the per-split files written by this notebook; confirm no
# unrelated parquet files live there). os.listdir returns entries in arbitrary
# order, so sort the names to make the row order of the combined frame
# reproducible across runs and machines.
concat_list: list[pd.DataFrame] = [
    pd.read_parquet(pq_file)
    for pq_file in sorted(os.listdir())
    if pq_file.endswith(".parquet")
]

print(len(concat_list))

# Stack all splits and renumber the rows 0..n-1.
df = pd.concat(concat_list).reset_index(drop=True)

print(df.shape)
df.head()

In [0]:
# Upload the combined table to the processed area of the bucket in two formats.
full_parquet_uri = f"s3://{S3_BUCKET}/datasets/xview2/processed/full_challenge_data.parquet"
full_csv_uri = f"s3://{S3_BUCKET}/datasets/xview2/processed/full_challenge_data.csv"

df.to_parquet(full_parquet_uri)

# index=False: the row index is not written to the CSV.
df.to_csv(full_csv_uri, index=False)