# Load ADS-B Flight Data

This notebook demonstrates how to fetch flight data from the contrails.org [ADS-B API](https://apidocs.contrails.org/notebooks/adsb_api.html), impute missing flight IDs, and structure the data into a [`pycontrails.Flight`](https://py.contrails.org/api/pycontrails.Flight.html) DataFrame.

## Imports

In [None]:
import asyncio
import io
import time
from datetime import date, datetime, timedelta

import aiohttp
import pandas as pd
import plotly.graph_objects as go

from pycontrails import Flight
from pycontrails.core import flight

# Configuration

Set the date range for data retrieval and your contrails.org API key.

Contact api@contrails.org if you need an API key.

In [None]:
# Replace with the desired date range. End date is inclusive.
START_DATE = date(2025, 1, 15)
END_DATE = date(2025, 1, 16)

# Replace with your contrails.org API key.
# NOTE: the value below is a placeholder; avoid saving a real key in a shared notebook.
CONTRAILS_API_KEY = "your key here"  # @param {type:"string"}

# Telemetry endpoint; the fetch helper below requests it once per hour of data.
API_BASE_URL = "https://api.contrails.org/v1/adsb/telemetry"

# Fetch global ADS-B asynchronously

See API documentation: [https://apidocs.contrails.org/notebooks/adsb_api.html](https://apidocs.contrails.org/notebooks/adsb_api.html)

Fetch data in hourly chunks for the target date concurrently using `asyncio` and `aiohttp`.

In [None]:
async def fetch_adsb_data_hour(
    session: aiohttp.ClientSession, dt_hour: datetime, api_key: str
) -> pd.DataFrame | None:
    """Download one hour of ADS-B telemetry and parse it into a DataFrame.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Open HTTP session used to issue the GET request.
    dt_hour : datetime
        Start of the hour to fetch; sent to the API formatted as ``%Y-%m-%dT%H``.
    api_key : str
        contrails.org API key, sent in the ``x-api-key`` header.

    Returns
    -------
    pd.DataFrame | None
        The parsed Parquet payload, or ``None`` when the response body is empty
        or any error occurs (errors are printed, not raised).
    """
    request_kwargs = {
        "headers": {"accept": "application/vnd.apache.parquet", "x-api-key": api_key},
        # The /telemetry endpoint takes the start of the hour in its 'date' param
        "params": {"date": dt_hour.strftime("%Y-%m-%dT%H")},
    }

    try:
        async with session.get(API_BASE_URL, **request_kwargs) as response:
            response.raise_for_status()
            payload = await response.read()
            if payload:
                # Parse the Parquet bytes directly from memory
                return pd.read_parquet(io.BytesIO(payload))
            print(f"No content received for {dt_hour}")
    except aiohttp.ClientError as err:
        # HTTP/connection level failures (including non-2xx statuses)
        print(f"Error fetching data for {dt_hour}: {err}")
    except Exception as err:
        # Anything else, e.g. a payload that cannot be parsed as Parquet
        print(f"Error processing data for {dt_hour}: {err}")
    return None


async def fetch_all_day_data(target_date: date, api_key: str) -> pd.DataFrame:
    """Fetch the 24 hourly ADS-B chunks of one day concurrently and concatenate them.

    Parameters
    ----------
    target_date : date
        Calendar day to fetch; hours 0 through 23 of this day are requested.
    api_key : str
        contrails.org API key forwarded to every hourly request.

    Returns
    -------
    pd.DataFrame
        All non-empty hourly frames concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If no hour produced a non-empty DataFrame.
    """
    start_datetime = datetime(target_date.year, target_date.month, target_date.day)

    async with aiohttp.ClientSession() as session:
        tasks = [
            fetch_adsb_data_hour(session, start_datetime + timedelta(hours=hour), api_key)
            for hour in range(24)
        ]
        # With return_exceptions=True, failures come back as items in `results`
        # instead of being raised here.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    dataframes = []
    total_mem_bytes = 0
    for res in results:
        if isinstance(res, pd.DataFrame) and not res.empty:
            dataframes.append(res)
            # This is the in-memory footprint of the parsed frame, not the number
            # of bytes transferred over HTTP.
            total_mem_bytes += res.memory_usage(deep=True).sum()
        elif isinstance(res, Exception):
            print(f"An exception occurred during fetch: {res}")
        # Hours that returned None were already reported by the hourly fetcher.

    if not dataframes:
        raise ValueError("No data fetched. Check API key and date range.")

    print(f"Total in-memory size: {round(total_mem_bytes / 1000000, 2)} MB")
    return pd.concat(dataframes, ignore_index=True)


# Asynchronously fetch waypoint data for given date range
all_raw_dfs = []
date_range = pd.date_range(start=START_DATE, end=END_DATE, freq="D")

print(f"Fetching data from {START_DATE} to {END_DATE}")
start_time = time.time()

for target_date in date_range:
    print(f"Fetching data for {target_date.date()}")
    try:
        daily_df = await fetch_all_day_data(target_date.date(), CONTRAILS_API_KEY)
        if not daily_df.empty:
            all_raw_dfs.append(daily_df)
    except ValueError as e:
        print(f"Error for {target_date.date()}: {e}")

if all_raw_dfs:
    raw_df = pd.concat(all_raw_dfs, ignore_index=True)
    total_time_taken = time.time() - start_time
    print(f"Flight data ingestion completed in {total_time_taken:.2f}s")
    print(f"Fetched {len(raw_df)} waypoints in total.")
    print(raw_df.head())
else:
    raw_df = pd.DataFrame()  # Initialize empty DataFrame
    print("No data fetched for the specified date range.")

Fetching data from 2025-01-15 to 2025-01-16
Fetching data for 2025-01-15
Total response size: 16494.38 MB
Fetching data for 2025-01-16
Total response size: 16225.12 MB
Flight data ingestion completed in 136.58s
Fetched 53055476 waypoints in total.
            timestamp   latitude   longitude collection_type  altitude_baro  \
0 2025-01-15 00:59:59  44.242130  -93.978523     terrestrial           2550   
1 2025-01-15 00:59:59  41.310501 -112.014915     terrestrial           5000   
2 2025-01-15 00:59:59  40.158661  -80.841522     terrestrial          41000   
3 2025-01-15 00:59:59  34.386703  -82.267700     terrestrial           4250   
4 2025-01-15 00:59:59  33.856884  -84.697784     terrestrial          11625   

   altitude_gnss icao_address                             flight_id callsign  \
0            NaN       A3E5EA  c57f2c52-6c77-4025-89b8-58645c8af44b    MVK51   
1            NaN       A32A1E  d35885bc-221a-426b-8bb8-043fa907e8eb   N3027S   
2            NaN       A4C00D  190ffc

# Data Cleaning and Preparation

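Rename columns to match pycontrails conventions, convert timestamps to timezone-aware (UTC) datetimes, and keep only the columns needed for building flights.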

In [None]:
def clean_adsb_df(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and prepare the raw ADS-B DataFrame.

    Renames the API columns to the names used in the rest of this notebook,
    converts the time column to timezone-aware UTC datetimes, and drops every
    column that is not needed downstream. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Raw telemetry with (at least) a ``timestamp`` column. Other expected
        columns are kept only if present.

    Returns
    -------
    pd.DataFrame
        A new frame restricted to the selected columns. An empty input is
        returned unchanged (same object).

    Raises
    ------
    KeyError
        If a non-empty input has no ``timestamp`` (or ``time``) column.
    """
    if df.empty:
        return df

    # The API returns 'altitude_baro', 'icao_address', and 'timestamp'
    # Pycontrails expects 'altitude', 'icao', and 'time'
    rename_map = {
        "altitude_baro": "altitude",
        "icao_address": "icao",
        "timestamp": "time",
    }
    # `rename` already returns a new DataFrame, so no explicit `.copy()` is made
    # first: for tens of millions of waypoints that extra copy is a costly
    # duplicate of the whole frame. The caller's `df` is still left untouched.
    cleaned = df.rename(columns=rename_map)

    # Ensure time is a timezone-aware (UTC) datetime
    cleaned["time"] = pd.to_datetime(cleaned["time"], utc=True)

    # Columns needed downstream
    columns = [
        "time",
        "latitude",
        "longitude",
        "altitude",
        "icao",
        "flight_id",
        "tail_number",
        "collection_type",
    ]
    # Keep only the wanted columns that actually exist in the dataframe
    return cleaned[cleaned.columns.intersection(columns)]


if raw_df.empty:
    # Nothing was fetched: keep an empty frame so later cells can still check `.empty`
    cleaned_df = pd.DataFrame()
    print("Skipping cleaning, no data loaded.")
else:
    cleaned_df = clean_adsb_df(raw_df)
    print(f"Cleaned DataFrame has {len(cleaned_df)} waypoints.")
    display(cleaned_df.head())

Cleaned DataFrame has 53055476 waypoints.


Unnamed: 0,time,latitude,longitude,collection_type,altitude,icao,flight_id,tail_number
0,2025-01-15 00:59:59+00:00,44.24213,-93.978523,terrestrial,2550,A3E5EA,c57f2c52-6c77-4025-89b8-58645c8af44b,N350MK
1,2025-01-15 00:59:59+00:00,41.310501,-112.014915,terrestrial,5000,A32A1E,d35885bc-221a-426b-8bb8-043fa907e8eb,N3027S
2,2025-01-15 00:59:59+00:00,40.158661,-80.841522,terrestrial,41000,A4C00D,190ffc48-fbd1-4c43-985f-bad3adbdf6ec,N405JS
3,2025-01-15 00:59:59+00:00,34.386703,-82.2677,terrestrial,4250,A67040,1e557528-404c-4c14-a5b9-26812706e5a2,N51390
4,2025-01-15 00:59:59+00:00,33.856884,-84.697784,terrestrial,11625,A2FAF0,65be145b-3514-433a-a5c8-5834b0384817,N291TX


# Create Flights from grouped waypoints

Create a dataframe of flights grouped by Flight ID. Impute missing `flight_id` values based on temporal proximity for the same ICAO address.

**Methodology:**

1. Group waypoints by `icao`.
2. Identify segments where `flight_id` is missing.
3. Group consecutive missing `flight_id` waypoints if the time gap is less than `MAX_GAP_SECONDS`.
4. For each group of missing IDs, look for a known `flight_id` within `LOOKUP_WINDOW_SECONDS` before the start or after the end of the group.
5. If multiple known IDs are found, use the chronologically closest one.
6. If no known ID is found, generate a new unique `flight_id` for that segment.

**On flight ID generation:**

Flight IDs are generated based on the flight's start and end timestamps and its ICAO address. All IDs are prefixed with SPIRE-INFERRED-{icao_address}-. The rest of the ID depends on the time of day:

1. Midnight Rollover/Holdover: Special formatting is applied if the flight period crosses midnight within a certain threshold (midnight_threshold_mins).

* If the flight ends just after midnight (a "holdover"), the ID includes the dates of the day before the start and the start date, formatted as: {start_date - 1 day}-rollover-{start_date}.
* If the flight starts just before midnight (a "rollover"), the ID includes the start date and the day after the end date, formatted as: {start_date}-rollover-{end_date + 1 day}.
2. Standard: If the flight period doesn't cross the midnight threshold, the ID is generated using the Unix timestamp (in seconds) of the start and end times: {int(start_timestamp)}-{int(end_timestamp)}.

Examples:

* Holdover: `SPIRE-INFERRED-ABC123-2026-02-03-rollover-2026-02-04`
* Rollover: `SPIRE-INFERRED-ABC123-2026-02-04-rollover-2026-02-05`
* Standard: `SPIRE-INFERRED-ABC123-1760035200-1760042400`

In [11]:
# Run imputation and create Flight objects
MAX_DEMO_FLIGHT_IDS = 100  # only the first N distinct flight IDs are considered (saves memory)
MIN_WAYPOINTS = 200  # a flight needs more than this many waypoints to be kept

flights_data = []
if not cleaned_df.empty:
    imputed_df = flight.impute_flight_ids(cleaned_df)

    # Group into pycontrails Flight objects
    # We limit to a few flights for the demonstration to save memory
    unique_ids = imputed_df["flight_id"].unique()[:MAX_DEMO_FLIGHT_IDS]

    # Filter the full frame once and split it with a single groupby, rather than
    # re-scanning every waypoint with a boolean mask for each flight ID.
    # sort=False keeps the groups in order of first appearance, matching `unique()`.
    demo_df = imputed_df[imputed_df["flight_id"].isin(unique_ids)]
    for fid, f_df in demo_df.groupby("flight_id", sort=False, observed=True):
        if len(f_df) > MIN_WAYPOINTS:  # Only keep flights with enough points
            try:
                flights_data.append(Flight(f_df, flight_id=fid))
            except Exception as e:
                # Best effort for the demo: report the failing flight and continue
                print(f"Error creating Flight object for {fid}: {e}")

    print(f"Created {len(flights_data)} Flight objects.")



Created 23 Flight objects.


# Example: Accessing Data for a Flight

In [None]:
def plot_flight_on_globe(flight: Flight):
    """Draw a Flight's trajectory on an orthographic Plotly globe.

    The globe is rotated to the mean latitude/longitude of the waypoints and
    zoomed according to the larger of the latitude and longitude extents. The
    figure is shown immediately; nothing is returned.
    """
    track = flight.dataframe
    # The flight ID is stored in the Flight's attrs dictionary
    fid = flight.attrs.get("flight_id", "Unknown")

    lats = track["latitude"]
    lons = track["longitude"]

    # Camera center: mean position of the waypoints
    center_lat = lats.mean()
    center_lon = lons.mean()

    # Largest extent in degrees, floored at 0.1 to avoid dividing by (near) zero
    lat_range = lats.max() - lats.min()
    lon_range = lons.max() - lons.min()
    max_range = max(lat_range, lon_range, 0.1)

    # Projection-scale heuristic: the scale is inversely proportional to the extent,
    # then clamped to [1.0, 20.0] so the view stays within reasonable bounds.
    zoom_scale = 1.0 / (max_range / 100.0)
    zoom_scale = max(1.0, min(zoom_scale, 20.0))

    # Flight path: red line with blue waypoint markers, hover shows the time of day
    path_trace = go.Scattergeo(
        lat=lats,
        lon=lons,
        mode="lines+markers",
        line={"width": 2, "color": "red"},
        marker={"size": 5, "color": "blue"},
        name=f"Flight {fid}",
        hovertext=track["time"].dt.strftime("%H:%M:%S"),
    )

    fig = go.Figure()
    fig.add_trace(path_trace)

    # Globe appearance, centered and zoomed on the trajectory
    geo_settings = {
        "projection_type": "orthographic",
        "projection_rotation": {"lon": center_lon, "lat": center_lat, "roll": 0},
        "projection_scale": zoom_scale,
        "showcountries": True,
        "showcoastlines": True,
        "showland": True,
        "landcolor": "#E5ECF6",
        "showocean": True,
        "oceancolor": "#f9f9f9",
        "lataxis_showgrid": True,
        "lonaxis_showgrid": True,
    }
    fig.update_geos(**geo_settings)

    fig.update_layout(
        height=600,
        margin={"r": 0, "t": 40, "l": 0, "b": 0},
        title=f"Trajectory for Flight ID: {fid} (Zoom: {zoom_scale:.3f}x)",
    )

    fig.show()


# Example: plot the first flight in the list, if any were created
if not flights_data:
    print("No flights available to plot.")
else:
    plot_flight_on_globe(flights_data[0])


coroutine 'fetch_all_day_data' was never awaited

