In [2]:
from pathlib import Path
import pickle
from IPython.display import display
import time
import random
import pandas as pd
from collections import Counter, OrderedDict
import numpy as np
import matplotlib.pyplot as plt
import os
from datetime import datetime
import copy
from typing import Tuple, Dict, Union, Any, List
from functools import partial
from tqdm import tqdm
import cv2
import json
from multiprocessing import Pool
import yaml
import shutil
from collections import defaultdict

**IMPORTANT**: The following parameters can be overridden with `papermill` (please keep the `parameters` cell tag).

In [3]:
# Default parameters

DATASET_NAME = "synthetic2real_day"  # Sub-directory name used under DATA_ROOT_DIRPATH / DATASET_SOURCE
DATASET_SOURCE = "2023_12" # One of: 2023_Q3, 2023_Q4, 2023_12 (selects the input pickle files)
ATD_RANGE_M = [-300., -6000.] # Valid along track distance range (m) for dev dataset, ordered [upper bound, lower bound]
TRAIN_RATIO = 1 # Target fraction of samples assigned to training airports (e.g. 0.95)
IMAGE_SIZE = [1024, 750]  # Output W,H (px)
DATA_ROOT_DIRPATH = "/mnt/data1/user_cache/geoffrey.g.delhomme/data" # Main location to store datasets
RUNWAY_HEADING_RANGE_DEG = 5  # Max heading difference (deg) to the active runway for a runway to be kept
MAX_ASPECT_RATIO = 1.5  # Maximum bounding-box aspect ratio (W/H)
MIN_AREA = 10  # Minimum bounding-box area (px) for a runway to be retained

# Load pickles & extract dev data

Define pickle filepaths:

In [4]:
# Dev-split pickle files of each supported delivery, keyed by DATASET_SOURCE.
PICKLE_FILEPATHS_BY_SOURCE = {
    "2023_Q3": [
        "/mnt/data1/ml_dev/data/AIP/aip_2023_Q3_delivery/2023-07-19/dataset/dev/dataset_dev.p"
    ],
    "2023_Q4": [
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v1_i1_f1/real_dev/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v1_i1_f1/sim_dev_q1/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v1_i1_f1/sim_dev_q2/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v1_i1_f1/sim_dev_q3/dataset/dev/dataset_dev.p",
    ],
    "2023_12": [
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/real/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/sim_q1_2023_trajectory/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/sim_q2_2023_disjoint/dataset/dev/dataset_dev.p",
        "/mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/sim_q3_2023_disjoint/dataset/dev/dataset_dev.p",
    ],
}

# Fail right away on an unsupported source (e.g. a typo in a papermill
# override) instead of hitting a NameError on PICKLE_FILEPATHS in a later cell.
if DATASET_SOURCE not in PICKLE_FILEPATHS_BY_SOURCE:
    raise ValueError(
        f"Unknown DATASET_SOURCE {DATASET_SOURCE!r}; "
        f"expected one of {sorted(PICKLE_FILEPATHS_BY_SOURCE)}"
    )

PICKLE_FILEPATHS = PICKLE_FILEPATHS_BY_SOURCE[DATASET_SOURCE]

Load pickle files:

In [5]:
# Concatenate the samples of every source pickle into a single list.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
dataset_raw = []

for pickle_filepath in PICKLE_FILEPATHS:
    print(f"Loading {pickle_filepath} ...")
    start_s = time.perf_counter()
    with open(pickle_filepath, 'rb') as pickle_file:
        dataset_raw += pickle.load(pickle_file)
    elapsed_s = time.perf_counter() - start_s
    print(f"Elapsed: {elapsed_s:.2f} s")

print(f"Number of raw samples: {len(dataset_raw)}")

Loading /mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/real/dataset/dev/dataset_dev.p ...
Elapsed: 82.22 s
Loading /mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/sim_q1_2023_trajectory/dataset/dev/dataset_dev.p ...
Elapsed: 31.13 s
Loading /mnt/data1/ml_dev/data/AIP/P2_VLA_v2_i1_f2/sim_q2_2023_disjoint/dataset/dev/dataset_dev.p ...


Number of airports:

In [5]:
# Distinct airport identifiers found across all loaded samples.
airports = {metadata["airport"] for metadata in dataset_raw}
print(f"Number of airports in raw dataset: {len(airports)}")

Number of airports in raw dataset: 777


Filter dev dataset samples:

In [6]:
def is_dev(metadata):
    """Tell whether a sample belongs to the dev dataset.

    A sample is kept when it is not flagged as night and the along-track
    offset of its first listed runway lies within
    ``[ATD_RANGE_M[1], ATD_RANGE_M[0]]`` (both bounds included).

    NOTE(review): the offset is read from ``runways[0]``, which is not
    necessarily the active runway -- confirm this is intended.
    """
    first_runway = metadata["labels"]["runways"][0]
    atd = first_runway["aircraft_rel_position"]["along_track_offset_m"]
    if metadata["is_night"]:
        return False
    lower_bound_m, upper_bound_m = ATD_RANGE_M[1], ATD_RANGE_M[0]
    return lower_bound_m <= atd <= upper_bound_m

# Keep only the samples accepted by `is_dev` and report the retained share.
dataset_dev = [metadata for metadata in dataset_raw if is_dev(metadata)]

print(f"Number of dev samples: {len(dataset_dev)} ({100.*len(dataset_dev)/len(dataset_raw):.2f}%)")

Number of dev samples: 1480612 (64.87%)


In [7]:
# Distinct airport identifiers remaining after the dev filter.
airports = {metadata["airport"] for metadata in dataset_dev}
print(f"Number of airports in dev dataset: {len(airports)}")

Number of airports in dev dataset: 777


Print metadata sample:

In [8]:
# Display one randomly chosen dev sample to inspect the metadata schema
# (the sample shown differs between runs unless `random` is seeded).
random.choice(dataset_dev)

{'image_path': '/mnt/data1/sim/datasets/aip/delivery_2023_q2/disjoint/v1.0/URKA_04_A_1/images/000/0000077.jpeg',
 'flight_test': 'sim_aip_q2_2023_disjoint_dev_58',
 'recording_seq_id': 18,
 'src_metadata': {'image_source': 'xplane-12',
  'is_trajectory': False,
  'ellipsoid_model': 'WGS84',
  'altitude_reference': 'MSL',
  'gravitational_model': 'EGM96-5',
  'trajectory_real_twin': ''},
 'is_synthetic': True,
 'is_night': False,
 'airport': 'URKA',
 'active_runway': '04',
 'labels': {'runways': [{'id': '22',
    'icao': 'URKA',
    'is_active_runway': False,
    'runway': {'top_left': {'x_px': 1251.3808291581167,
      'y_px': 1265.2763620015114,
      'lat_deg': 44.993709569693145,
      'lon_deg': 37.3367813540148,
      'alt_m': 39.081034671515226},
     'top_right': {'x_px': 1191.1016456779475,
      'y_px': 1233.147954021367,
      'lat_deg': 44.9939644299654,
      'lon_deg': 37.3363870451088,
      'alt_m': 38.99803467001766},
     'bottom_left': {'x_px': 1711.0961889395169,
   

# Split train / validation

Identify the airports that have both real and synthetic images, then keep only their daytime samples:

In [9]:
# Airports seen in real recordings vs. in synthetic renders.
airports_real = {m['airport'] for m in dataset_dev if not m["is_synthetic"]}
print("Number of real airports:", len(airports_real))
airports_synthetic = {m['airport'] for m in dataset_dev if m["is_synthetic"]}
print("Number of synthetic airports:", len(airports_synthetic))
# Keep the airports that are available in both domains.
airports = [
    a
    for a in set(list(airports_real) + list(airports_synthetic))
    if a in airports_real and a in airports_synthetic
]
print("Number of airports with real and synthetic images:", len(airports))

Number of real airports: 48
Number of synthetic airports: 776
Number of airports with real and synthetic images: 47


In [10]:
# Lightweight view (a few fields only) of the non-night samples of the selected airports.
reduced_keys = ["airport", "image_path", "is_synthetic", "environment"]
dataset_dev_reduced = [
    {k: m[k] for k in reduced_keys}
    for m in dataset_dev
    if not m["is_night"] and m["airport"] in airports
]
n_real_samples = sum(1 for m in dataset_dev_reduced if not m["is_synthetic"])
n_synthetic_samples = sum(1 for m in dataset_dev_reduced if m["is_synthetic"])
print("Number of samples for selected airports in `not night` conditions:", len(dataset_dev_reduced))
print("Number of real samples for selected airports in day conditions:", n_real_samples)
print("Number of synthetic samples for selected airports in day conditions:", n_synthetic_samples)

Number of samples for selected airports in `not night` conditions: 357023
Number of real samples for selected airports in day conditions: 177190
Number of synthetic samples for selected airports in day conditions: 179833


In [11]:
# Distribution of the environment attributes; the last two are only counted
# for the samples that actually carry the key.
environments = [m["environment"] for m in dataset_dev_reduced]
print(Counter(env["time_of_day"] for env in environments))
for optional_key in ("visibility_category", "cloud_type"):
    print(Counter(env[optional_key] for env in environments if optional_key in env))

Counter({'day': 314253, 'dusk': 34085, 'dawn': 8685})
Counter({'nominal': 179833, 'degraded': 2503})
Counter({'clear': 44966, 'scattered': 44966, 'few': 44966, 'broken': 44935})


In [12]:
# Full metadata of the selected airports, restricted to `time_of_day == "day"`.
dataset_dev_reduced = [
    m
    for m in dataset_dev
    if not m["is_night"]
    if m["airport"] in airports
    if m["environment"]["time_of_day"] == "day"
]
n_real_samples = sum(1 for m in dataset_dev_reduced if not m["is_synthetic"])
n_synthetic_samples = sum(1 for m in dataset_dev_reduced if m["is_synthetic"])
print("Number of samples for selected airports in day conditions:", len(dataset_dev_reduced))
print("Number of real samples for selected airports in day conditions:", n_real_samples)
print("Number of synthetic samples for selected airports in day conditions:", n_synthetic_samples)

Number of samples for selected airports in day conditions: 314253
Number of real samples for selected airports in day conditions: 144805
Number of synthetic samples for selected airports in day conditions: 169448


In [13]:
# Airports that still have samples after the daytime restriction.
airports = {m["airport"] for m in dataset_dev_reduced}
print(f"Number of airports in dev dataset: {len(airports)}")

Number of airports in dev dataset: 47


Select airports so that we keep the desired ratio of data for training:

In [14]:
# Rank airports by number of samples (largest first) and accumulate the counts.
c = Counter(metadata["airport"] for metadata in dataset_dev_reduced)
x, y = list(zip(*c.most_common()))
cy = np.cumsum(y)
# Index of the first airport whose cumulative count exceeds the training
# target; airports ranked before it go to training.
# np.argmax returns 0 on an all-False mask, which would silently produce an
# EMPTY training set when the target is never exceeded (e.g. TRAIN_RATIO >= 1);
# in that case every airport is assigned to training instead.
exceeds_target = cy > TRAIN_RATIO * np.sum(y)
xmax = int(np.argmax(exceeds_target)) if exceeds_target.any() else len(x)
train_airports = x[:xmax]
train_ratio = 1 - np.sum(y[xmax:]) / np.sum(y)
print(f"Train dataset ratio: {100*train_ratio:.2f}%")
print(f"Number of airports in training dataset: {len(train_airports)}")

Train dataset ratio: 94.38%
Number of airports in training dataset: 37


Sum up airports:

In [15]:
# Validation airports are all the remaining airports of the reduced dataset.
train_airports = set(train_airports)
valid_airports = {m['airport'] for m in dataset_dev_reduced} - train_airports

n_airports = len(airports)
print(f"Number of airports in training dataset: {len(train_airports)} ({100 * len(train_airports) / n_airports:.2f}%)")
print(f"Number of airports in validation dataset: {len(valid_airports)} ({100 * len(valid_airports) / n_airports:.2f}%)")

Number of airports in training dataset: 37 (78.72%)
Number of airports in validation dataset: 10 (21.28%)


Sum up the number of samples:

In [16]:
# Split the samples by airport membership.
# NOTE(review): the split is drawn from `dataset_dev`, not `dataset_dev_reduced`,
# so non-"day" (dusk/dawn) samples of the selected airports are included --
# confirm this is intended.
dataset_train = [m for m in dataset_dev if m["airport"] in train_airports]
dataset_valid = [m for m in dataset_dev if m["airport"] in valid_airports]

print(f"Number of samples in training dataset: {len(dataset_train)} ({100.*len(dataset_train)/len(dataset_dev):.2f}%)")
print(f"Number of samples in validation dataset: {len(dataset_valid)} ({100.*len(dataset_valid)/len(dataset_dev):.2f}%)")

Number of samples in training dataset: 319476 (21.58%)
Number of samples in validation dataset: 37547 (2.54%)


Save datasets in respective pickle files:

In [17]:
# Persist both splits as pickles under <root>/<source>/<name>/pickles.
save_pickle_dirpath = Path(DATA_ROOT_DIRPATH, DATASET_SOURCE, DATASET_NAME, "pickles")
save_pickle_dirpath.mkdir(parents=True, exist_ok=True)

dataset_train_pickle_filepath = save_pickle_dirpath / "dataset_train.p"
with dataset_train_pickle_filepath.open('wb') as pickle_file:
    pickle.dump(dataset_train, pickle_file)
print(f"Training dataset saved at: {dataset_train_pickle_filepath.as_posix()}")

dataset_valid_pickle_filepath = save_pickle_dirpath / "dataset_valid.p"
with dataset_valid_pickle_filepath.open('wb') as pickle_file:
    pickle.dump(dataset_valid, pickle_file)
print(f"Validation dataset saved at: {dataset_valid_pickle_filepath.as_posix()}")

Training dataset saved at: /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/pickles/dataset_train.p
Validation dataset saved at: /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/pickles/dataset_valid.p


# Generate

Define main constants:

In [25]:
# Calibration processes, from most to least preferred.
CALIBRATION_PRIORITY = ("sim", "interframe_runway", "central", "recording_runway", "recorded")

# Keypoint fields, as (label group, corner name), in output order.
KEYPOINTS = [
    ("runway", corner)
    for corner in ("bottom_left", "top_left", "top_right", "bottom_right")
]

Define where to generate the dataset files:

In [4]:
# Output root: <root>/<source>/<name>/<W>x<H>.
# To regenerate from scratch, delete it first with `shutil.rmtree(dest_dirpath)`.
dest_dirpath = Path(
    DATA_ROOT_DIRPATH,
    DATASET_SOURCE,
    DATASET_NAME,
    "{}x{}".format(IMAGE_SIZE[0], IMAGE_SIZE[1]),
)
dest_dirpath.mkdir(parents=True, exist_ok=True)
print(f"Destination directory: {dest_dirpath.as_posix()}")

Destination directory: /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750


Main functions to generate the dataset:

In [27]:
def _runway_heading_deg(runway_id):
    """Return the heading (deg) encoded by a runway designator.

    The designator is a number optionally followed by a single non-numeric
    suffix character (e.g. "04" or "22L"); the number is multiplied by 10.
    """
    runway_id = str(runway_id)
    number = runway_id if runway_id[-1].isnumeric() else runway_id[:-1]
    return int(number) * 10


def _replace_symlink(src, dst):
    """Create the symlink ``dst`` -> ``src``, replacing ``dst`` if it already exists."""
    try:
        os.symlink(src, dst)
    except FileExistsError:
        # Only an already-existing destination is recoverable; any other
        # OSError (permissions, missing parent, ...) is left to propagate.
        os.remove(dst)
        os.symlink(src, dst)


def process(
    split, sample
):
    """Export one sample: output image, bounding boxes and segmentation mask.

    Files are written under ``dest_dirpath`` in the sub-folder ``{split}A``
    for synthetic samples and ``{split}B`` for the others:

    - ``raw/<domain>/<index>.png``: the image, symlinked when it already has
      the ``IMAGE_SIZE`` resolution, resized otherwise;
    - ``bbox/<domain>/img/<index>.png``: symlink to the raw image;
    - ``bbox/<domain>/bbox/<index>.txt``: one ``cls xmin ymin xmax ymax`` line
      per runway (``cls`` is always 1, coordinates in output pixels);
    - ``mask/<domain>/img/<index>.png``: symlink to the raw image;
    - ``mask/<domain>/mask/<index>.png``: uint8 mask, 1 inside runway polygons.

    Only runways whose heading is within ``RUNWAY_HEADING_RANGE_DEG`` of the
    active runway are considered; when a runway id is annotated under several
    calibrations, the annotation of the first calibration in
    ``CALIBRATION_PRIORITY`` order is kept.

    Args:
        split: Split name used as folder prefix (e.g. "train" or "test").
        sample: ``(index, metadata)`` pair; ``index`` names the output files.

    Returns:
        ``((bbox_image_filepath, bbox_filepath),
        (mask_image_filepath, mask_filepath))`` on success, or a ``str``
        message when the sample is skipped (no valid annotation, or
        unreadable source image).

    Raises:
        KeyError: If a camera entry uses a calibration process that is not
            listed in ``CALIBRATION_PRIORITY``.
    """
    index, metadata = sample

    # get image dimensions
    image_width, image_height = metadata["camera_dict_list"][0]["resolution_px"]
    image_filepath = Path(metadata["image_path"])

    # Discard runways not aligned with the active one +- range
    active_rwy_heading = _runway_heading_deg(metadata["active_runway"])
    compatible_runways: List[Dict[str, Any]] = []
    for metadata_runway in metadata["labels"]["runways"]:
        rwy_heading = _runway_heading_deg(metadata_runway["id"])
        # smallest absolute angular difference, wrapped to [0, 180]
        delta_rwy_heading = abs(((rwy_heading - active_rwy_heading) + 180) % 360 - 180)
        if delta_rwy_heading <= RUNWAY_HEADING_RANGE_DEG:
            compatible_runways.append(metadata_runway)

    # Grab the first calibration id of each calibration process, in priority order
    calibrations: Dict[str, Any] = OrderedDict.fromkeys(CALIBRATION_PRIORITY)
    for camera_dict in metadata["camera_dict_list"]:
        calibration_process = camera_dict["calibration_process"]
        if calibrations[calibration_process] is None:
            calibrations[calibration_process] = camera_dict["calibration_id"]

    # Remove calibration duplicates: one annotation per runway id
    calibrated_runways: Dict[str, Dict[str, Any]] = {}
    for calibration_id in calibrations.values():
        if calibration_id is None:
            continue
        for metadata_runway in compatible_runways:
            runway_key = str(metadata_runway["id"])
            if runway_key in calibrated_runways:
                continue
            if metadata_runway["calibration_id"] != calibration_id:
                continue
            calibrated_runways[runway_key] = metadata_runway

    # compute bboxes and gather the contours used to draw the mask
    contours, bboxes = [], []
    for metadata_runway in calibrated_runways.values():
        try:
            contour = np.array(metadata_runway["runway"]["region"]["points"], dtype=np.float32)
        except (KeyError, ValueError):
            # missing region, or points that cannot be converted to a float array
            continue
        # expect a non-empty (N, 2) array without NaN (some values are NaN)
        if contour.ndim != 2 or contour.shape[1] != 2 or contour.shape[0] == 0:
            continue
        if np.any(np.isnan(contour)):
            continue
        # rescale from source resolution to output resolution
        contour[:, 0] = contour[:, 0] / image_width * IMAGE_SIZE[0]
        contour[:, 1] = contour[:, 1] / image_height * IMAGE_SIZE[1]
        x_min = float(contour[:, 0].min())
        x_max = float(contour[:, 0].max())
        y_min = float(contour[:, 1].min())
        y_max = float(contour[:, 1].max())
        w = x_max - x_min
        h = y_max - y_min
        # A zero-height box is degenerate; test it first so that `w / h`
        # cannot raise ZeroDivisionError and abort the whole worker pool.
        if h <= 0 or w / h > MAX_ASPECT_RATIO:
            continue
        if w * h < MIN_AREA:
            continue
        contours.append(contour)
        bboxes.append([
            1, *[int(v) for v in [x_min, y_min, x_max, y_max]]
        ])

    # check if there is any annotations
    if len(bboxes) == 0 or len(contours) == 0:
        return f"{image_filepath.as_posix()}: no annotations!"

    # "A" = synthetic domain, "B" = the other one
    domain = f"{split}A" if metadata["is_synthetic"] else f"{split}B"
    filename_stem = f"{index:010d}"

    # save image
    new_image_filepath = dest_dirpath / "raw" / domain / f"{filename_stem}.png"
    new_image_filepath.parent.mkdir(exist_ok=True, parents=True)
    if image_width == IMAGE_SIZE[0] and image_height == IMAGE_SIZE[1]:
        # already at the output resolution: a symlink is enough
        _replace_symlink(image_filepath, new_image_filepath)
    else:
        im = cv2.imread(image_filepath.as_posix())
        if im is None:
            # cv2.imread returns None instead of raising on failure
            return f"{image_filepath.as_posix()}: unreadable image!"
        im = cv2.resize(im, tuple(IMAGE_SIZE))
        cv2.imwrite(new_image_filepath.as_posix(), im)

    # save bbox
    bbox_image_filepath = dest_dirpath / "bbox" / domain / "img" / f"{filename_stem}.png"
    bbox_image_filepath.parent.mkdir(exist_ok=True, parents=True)
    _replace_symlink(new_image_filepath, bbox_image_filepath)
    bbox_filepath = dest_dirpath / "bbox" / domain / "bbox" / f"{filename_stem}.txt"
    bbox_filepath.parent.mkdir(exist_ok=True, parents=True)
    with open(bbox_filepath.as_posix(), "w") as f:
        f.write('\n'.join([' '.join([str(l) for l in line]) for line in bboxes]))

    # draw mask
    mask_image_filepath = dest_dirpath / "mask" / domain / "img" / f"{filename_stem}.png"
    mask_image_filepath.parent.mkdir(exist_ok=True, parents=True)
    _replace_symlink(new_image_filepath, mask_image_filepath)
    mask_filepath = dest_dirpath / "mask" / domain / "mask" / f"{filename_stem}.png"
    mask_filepath.parent.mkdir(exist_ok=True, parents=True)
    # numpy arrays are (rows, cols) = (H, W), hence the reversed IMAGE_SIZE
    mask = np.zeros(IMAGE_SIZE[::-1], dtype=np.uint8)
    for contour in contours:
        mask = cv2.fillPoly(mask, pts=[contour.astype(np.int32)], color=1)
    cv2.imwrite(mask_filepath.as_posix(), mask)

    return (bbox_image_filepath, bbox_filepath), (mask_image_filepath, mask_filepath)

# process("train", (0, dataset_dev_reduced[0]))

In [28]:
def generate(split, dataset):
    """Export every sample of ``dataset`` in parallel and index the results.

    Each sample is handled by ``process`` in a worker pool. For both domains
    of the split (``{split}A`` and ``{split}B``), a ``paths.txt`` file is
    written under ``dest_dirpath / "bbox"`` and ``dest_dirpath / "mask"``,
    with one ``<image path> <annotation path>`` line per exported sample.

    Only the ``paths.txt`` files of ``split`` are (re)created, so generating
    one split does not erase the index of another one.

    Args:
        split: Split name used as folder prefix (e.g. "train" or "test").
        dataset: Sized sequence of sample metadata.
    """
    domains = [f"{split}A", f"{split}B"]
    bbox, mask = {}, {}
    try:
        for k in domains:
            for annotation_kind, handles in (("bbox", bbox), ("mask", mask)):
                paths_filepath = dest_dirpath / annotation_kind / k / "paths.txt"
                paths_filepath.parent.mkdir(exist_ok=True, parents=True)
                handles[k] = open(paths_filepath.as_posix(), 'w')

        # os.cpu_count() may return None; keep at least one worker
        n_workers = max(1, int((os.cpu_count() or 1) * 0.9))
        # For debugging, replace the pool with a sequential loop:
        #     for sample in tqdm(enumerate(dataset), total=len(dataset)):
        #         sample = process(split, sample)
        with Pool(n_workers) as p:
            for sample in tqdm(
                p.imap_unordered(partial(process, split), enumerate(dataset)), total=len(dataset)
            ):
                # `process` returns a message (str) for skipped samples
                if sample is None or isinstance(sample, str):
                    continue
                (bbox_image_filepath, bbox_filepath), (mask_image_filepath, mask_filepath) = sample
                # route each entry to the index of the domain folder it was written in
                for k, f in bbox.items():
                    if k in bbox_filepath.parts:
                        f.write(f"{bbox_image_filepath.as_posix()} {bbox_filepath.as_posix()}\n")
                for k, f in mask.items():
                    if k in mask_filepath.parts:
                        f.write(f"{mask_image_filepath.as_posix()} {mask_filepath.as_posix()}\n")
    finally:
        # close the index files even if a worker raised
        for f in list(bbox.values()) + list(mask.values()):
            f.close()

In [29]:
# Export the training split (files land in the `trainA` / `trainB` sub-folders of `dest_dirpath`).
generate("train", dataset_train)

100%|██████████| 319476/319476 [31:22<00:00, 169.73it/s] 


In [30]:
# Export the validation split under the "test" prefix (`testA` / `testB` sub-folders of `dest_dirpath`).
generate("test", dataset_valid)

100%|██████████| 37547/37547 [03:23<00:00, 184.82it/s]


In [8]:
def generate_paths(dirpath):
    """Rebuild ``dirpath / "paths.txt"`` from the images found in ``dirpath / "img"``.

    One ``<image path> <target path>`` line is written per image, both paths
    being relative to the parent of ``dirpath``. The target is
    ``bbox/<stem>.txt`` when ``dirpath`` contains a ``bbox`` entry, and
    ``mask/<stem>.png`` otherwise. Lines are sorted by image path so that the
    file content is reproducible; a missing ``img`` folder yields an empty file.

    Args:
        dirpath: Domain folder (``str`` or ``Path``), e.g. ``.../bbox/trainA``.
    """
    dirpath = Path(dirpath)
    paths_filepath = dirpath / "paths.txt"
    is_bbox = "bbox" in os.listdir(dirpath.as_posix())
    target_dirpath = dirpath / ("bbox" if is_bbox else "mask")
    target_suffix = '.txt' if is_bbox else '.png'
    img_dirpath = dirpath / "img"
    # iterdir() yields entries in arbitrary order: sort for a deterministic index
    img_filepaths = sorted(img_dirpath.iterdir()) if img_dirpath.is_dir() else []
    print(f"Generate paths for {paths_filepath.as_posix()}")
    with open(paths_filepath, 'w') as f:
        for img_filepath in tqdm(img_filepaths):
            target_filepath = target_dirpath / (img_filepath.stem + target_suffix)
            rel_img_filepath = img_filepath.relative_to(dirpath.parent)
            rel_target_filepath = target_filepath.relative_to(dirpath.parent)
            f.write(f"{rel_img_filepath.as_posix()} {rel_target_filepath.as_posix()}\n")

# Rebuild the `paths.txt` index of every domain folder from the files present on disk.
for annotation_kind in ("bbox", "mask"):
    for domain_dirpath in (dest_dirpath / annotation_kind).iterdir():
        generate_paths(domain_dirpath)

Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/bbox/trainA/paths.txt


100%|██████████| 96235/96235 [00:01<00:00, 58113.36it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/bbox/trainB/paths.txt


100%|██████████| 150988/150988 [00:02<00:00, 58213.38it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/bbox/testA/paths.txt


100%|██████████| 5240/5240 [00:00<00:00, 58382.66it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/bbox/testB/paths.txt


100%|██████████| 21715/21715 [00:00<00:00, 58290.61it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/mask/trainB/paths.txt


100%|██████████| 150988/150988 [00:02<00:00, 57853.22it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/mask/trainA/paths.txt


100%|██████████| 96235/96235 [00:01<00:00, 58650.36it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/mask/testB/paths.txt


100%|██████████| 21715/21715 [00:00<00:00, 56735.99it/s]


Generate paths for /mnt/data1/user_cache/geoffrey.g.delhomme/data/2023_12/synthetic2real_day/1024x750/mask/testA/paths.txt


100%|██████████| 5240/5240 [00:00<00:00, 58696.22it/s]


-----------