In [13]:
# Sanity check: print the path of the `python` executable that the shell resolves on PATH.
!which python

/efs/players/to122838/projects/lard-yolov8/.conda/bin/python


In [14]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff

# Analyze

## Load data

Load metadata CSV files and airports database.

In [15]:
# Base working directory, read from the TMP_DIRPATH environment variable
# (a KeyError is raised if the variable is not set).
tmp_dirpath = Path(os.environ["TMP_DIRPATH"]).resolve()

# Every entry below is a pair:
#   (zip archive file name, path of the metadata CSV relative to the archive root)

# Training archives: the CSV shares the base name of its zip file.
train_archives = [
    (f"LARD_train_{airport_group}.zip", f"LARD_train_{airport_group}.csv")
    for airport_group in (
        "BIRK_LFST",
        "DAAG_DIAP",
        "KMSY",
        "LFMP_LFPO",
        "LFQQ",
        "LPPT_SRLI",
        "VABB",
    )
]

# Validation archives: the "real" archive provides two CSV entry points
# (nominal and edge cases), the synthetic archive provides one.
valid_archives = [
    ("LARD_test_real.zip", "LARD_test_real_nominal/Test_Real_Nominal.csv"),
    ("LARD_test_real.zip", "LARD_test_real_edge_cases/Test_Real_Edge_Cases.csv"),
    ("LARD_test_synth.zip", "LARD_test_synth.csv"),
]

In [16]:
def _read_archive_csvs(archives, root):
    """Read the metadata CSV referenced by each ``(archive name, CSV path)`` pair.

    Each CSV is looked up at ``root / <archive name without extension> / <CSV path>``
    and parsed as a semicolon-delimited file.

    Args:
        archives: iterable of ``(archive file name, CSV path relative to the archive root)``.
        root: base ``Path`` under which the per-archive directories are located.

    Returns:
        A list with one DataFrame per entry, in the same order as ``archives``.
    """
    # Descriptive loop names on purpose: calling them `zip` / `csv` would shadow the
    # builtin `zip` for the rest of the kernel session.
    return [
        pd.read_csv((root / Path(archive_name).stem / csv_relpath).as_posix(), delimiter=';')
        for archive_name, csv_relpath in archives
    ]


# airports
df_airports = pd.read_json('airports.json', orient="index")
# train
dfs_train = _read_archive_csvs(train_archives, tmp_dirpath)
df_train = pd.concat(dfs_train).reset_index(drop=True)
df_train["dataset"] = "train"
# valid
dfs_valid = _read_archive_csvs(valid_archives, tmp_dirpath)
df_valid = pd.concat(dfs_valid).reset_index(drop=True)
df_valid["dataset"] = "valid"
# concat: a single frame where the "dataset" column tells train rows from valid rows
df = pd.concat((df_train, df_valid)).reset_index(drop=True)
# rows without a watermark height are treated as having no watermark (0)
df['watermark_height'] = df['watermark_height'].fillna(0.)

## Airport coverage

In [17]:
# Count images per (airport, dataset) pair, then attach the airport metadata.
# The merge uses pandas' default inner join, so airports whose code has no
# match in df_airports['icao'] are dropped from both figures below.
dfc = (
    df[['airport', 'dataset', 'image']]
    .groupby(['airport', 'dataset'])
    .count()
    .rename(columns={'image': 'count'})
    .reset_index()
    .merge(df_airports, left_on='airport', right_on='icao')
)

# Sunburst: share of images per dataset
fig = px.sunburst(
    dfc,
    path=[px.Constant("ALL"), "dataset"],
    values="count",
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
fig.show()

# World map: one marker per (airport, dataset), sized by the number of images
fig = px.scatter_mapbox(
    dfc,
    lat='lat',
    lon='lon',
    size='count',
    color='dataset',
    hover_name='icao',
    hover_data=["name", "country", "city", "state"],
    zoom=1,
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_layout(
    template="plotly_dark",
    mapbox_style="carto-darkmatter",
    autosize=True,
    showlegend=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

- The validation set covers airports from nearly all over the world, whereas the training data come from a smaller number of airports, most of them in Europe. This limited landscape diversity could bias the trained model.

In [18]:
# Count images per (airport, dataset) pair (no airport metadata needed here)
dfc = (
    df[['airport', 'dataset', 'image']]
    .groupby(['airport', 'dataset'])
    .count()
    .rename(columns={'image': 'count'})
    .reset_index()
)

# Bar chart: number of images per airport, colored by dataset
fig = px.bar(
    dfc,
    x="airport",
    y="count",
    color="dataset",
    color_discrete_sequence=px.colors.qualitative.D3,
)
fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
fig.show()

- No overlap between training and validation set airports.

## Trajectory sampling

In [19]:
# Derive a 3D position for every sample from the recorded approach parameters:
#   dx: along-track distance, scaled by 1000
#   dz: height above the runway, taken as is
#   dy: lateral offset = along-track distance * tan(lateral path angle), scaled by 1000
# NOTE(review): the x1000 factor presumably converts km to m - confirm the unit
# of `along_track_distance`.
lateral_angle_rad = np.deg2rad(df["lateral_path_angle"])
df["dx"] = df["along_track_distance"] * 1000
df["dz"] = df["height_above_runway"]
df["dy"] = df["along_track_distance"] * np.tan(lateral_angle_rad) * 1000

# 3D scatter of the sampled positions, colored by dataset
fig = px.scatter_3d(
    df,
    x="dx",
    y="dy",
    z="dz",
    color="dataset",
    color_discrete_sequence=px.colors.qualitative.D3,
)
# tiny markers so that dense regions stay readable
fig.update_traces(marker_size=1)
fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    # 'data' aspect mode: axes are scaled proportionally to their data ranges
    scene={"aspectmode": "data"},
)
fig.show()


- The sampled positions lie in a cone centered on the standard -3° glide path, with a nearly Gaussian spread whose diameter grows as the distance from the landing point increases.
- There are no validation samples between 2800 and 4000 meters.

In [20]:
# Distribution of dx per dataset; infinite values are converted to NaN and
# dropped before plotting.
dx_per_dataset = [
    df.loc[df['dataset'] == dataset_name, 'dx'].replace([np.inf, -np.inf], np.nan).dropna()
    for dataset_name in ("train", "valid")
]
fig = ff.create_distplot(dx_per_dataset, ['train', "valid"], bin_size=50)
fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
fig.show()

- The validation and training datasets are imbalanced with respect to the along-track distance distribution: validation samples are more frequent in the range 0 to 2500 meters.

In [21]:
# Coordinates of the four labelled corners (A, B, C, D), one row per image
corner_xs = df[[f'x_{k}' for k in 'ABCD']].to_numpy()
corner_ys = df[[f'y_{k}' for k in 'ABCD']].to_numpy()
# Image height once the watermark height has been removed twice
usable_height = df['height'] - df['watermark_height']*2.

# Axis-aligned bounding box of the corners, normalized:
#   x by the image width,
#   y by the usable height, after shifting by the watermark height.
df['x_min'] = np.min(corner_xs, axis=-1).astype(float) / df['width']
df['x_max'] = np.max(corner_xs, axis=-1).astype(float) / df['width']
df['y_min'] = (np.min(corner_ys, axis=-1).astype(float) - df['watermark_height']) / usable_height
df['y_max'] = (np.max(corner_ys, axis=-1).astype(float) - df['watermark_height']) / usable_height

# Box size, box center and width / height ratio
df['w'] = df['x_max'] - df['x_min']
df['h'] = df['y_max'] - df['y_min']
df['cx'] = df['x_min'] + df['w']/2.
df['cy'] = df['y_min'] + df['h']/2.
# the small epsilon avoids a division by zero for boxes of zero height
df['aspect_ratio'] = df['w'] / (df['h']+1.e-6)

# Distribution of the aspect ratio per dataset; infinite values are converted
# to NaN and dropped before plotting.
aspect_ratio_per_dataset = [
    df.loc[df['dataset'] == dataset_name, 'aspect_ratio'].replace([np.inf, -np.inf], np.nan).dropna()
    for dataset_name in ('train', 'valid')
]
fig = ff.create_distplot(aspect_ratio_per_dataset, ['train', 'valid'], bin_size=0.05)
fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
# restrict the displayed x-range to [0, 3]
fig.update_xaxes(range=[0,3])
fig.show()

- The aspect ratio peaks near 0.8, meaning bounding boxes tend to be taller than they are wide. This is consistent with the fact that, during approach, the runway usually appears as a vertically elongated quadrilateral rather than a horizontally elongated one.
- There is a small shift between the validation and training distributions on the rising part of the curve.

In [22]:
# Density contours of the normalized bounding-box centers, one color per
# dataset, with violin plots of cx and cy on the margins.
fig = px.density_contour(
    df,
    x='cx',
    y='cy',
    color="dataset",
    marginal_x="violin",
    marginal_y="violin",
    range_x=[0, 1.],
    range_y=[0, 1.],
    color_discrete_sequence=px.colors.qualitative.D3,
)

# Mean center over the whole frame (train and valid together)
mean_cx = df['cx'].mean()
mean_cy = df['cy'].mean()
print(f"Mean cx, cy: {mean_cx}, {mean_cy}")

fig.update_layout(
    template="plotly_dark",
    autosize=True,
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
# force the [0, 1] range on the figure's x and y axes
fig.update_xaxes(range=[0,1.])
fig.update_yaxes(range=[0,1.])
fig.show()

Mean cx, cy: 0.49979980650258676, 0.46524801463547577


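- Most bounding boxes lie near the center of the image, with the mean center slightly offset along the vertical axis (mean cy ≈ 0.46). In the plot, whose y-axis points upward, this appears slightly below the center; if image rows are counted from the top, as is usual for pixel coordinates, it corresponds to slightly above the center of the image.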