# vessel-cable-anomaly-hunter
DTU Deep Learning project 29, group 80

## Required Libraries Installation
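Install the required packages before executing the rest of the notebook, either by running the cell below or by running the same command in your terminal: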

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


## 1) Data Download

#### File imports for the data download and preparation

In [2]:
import config
import src.data.ais_downloader as ais_downloader
import src.data.ais_filtering as ais_filtering
import src.data.ais_reader as ais_reader
import src.data.ais_to_parquet as ais_to_parquet

import src.data.ais_reader as ais_reader
import src.pre_proc.ais_query as ais_query 
import src.utils.ais_maps as ais_maps



#### Library imports for the data download and preparation

In [3]:
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from datetime import date, timedelta

#### Set data preferences and configuration inputs

In [4]:
# Local aliases for the values defined in config.py, grouped by purpose.

# --- Logging ---
# Whether to print verbose output
VERBOSE_MODE = config.VERBOSE_MODE

# --- Download window ---
# Start date for data downloading
START_DATE = config.START_DATE
# End date for data downloading
END_DATE = config.END_DATE

# --- Storage ---
# Name of the folder to store AIS data
AIS_DATA_NAME = config.AIS_DATA_FOLDER
# Whether to delete raw downloaded CSV files after processing
DELETE_DOWNLOADED_CSV = config.DELETE_DOWNLOADED_CSV

# --- Vessel selection ---
# AIS classes of vessels to include
VESSEL_AIS_CLASS = config.VESSEL_AIS_CLASS

# --- Speed Over Ground (SOG) handling ---
# Whether to remove vessels with zero Speed Over Ground
REMOVE_ZERO_SOG_VESSELS = config.REMOVE_ZERO_SOG_VESSELS
# If True, SOG is in meters/second; if False, SOG is in knots
SOG_IN_MS = config.SOG_IN_MS
# Minimum SOG in knots
SOG_MIN_KNOTS = config.SOG_MIN_KNOTS
# Maximum SOG in knots
SOG_MAX_KNOTS = config.SOG_MAX_KNOTS

# --- Spatial filters ---
# Bounding Box to prefilter AIS data
BBOX = config.BBOX
# Polygon coordinates for filter Area of Interest
POLYGON_COORDINATES = config.POLYGON_COORDINATES

#### Create Directories and Paths

In [5]:
# --- Define the data folder layout ---
# Root data folder plus one sub-folder for raw CSV files and one for Parquet output.
folder_path = Path(AIS_DATA_NAME)
csv_folder_path = folder_path.joinpath("csv")
parquet_folder_path = folder_path.joinpath("parquet")

# --- Create every folder that is missing (existing folders are left as they are) ---
for required_dir in (folder_path, csv_folder_path, parquet_folder_path):
    required_dir.mkdir(parents=True, exist_ok=True)

# Path to port locations file
file_port_locations = folder_path.joinpath("port_locodes.csv")

#### Main Script
1) Download a single AIS `.csv` data file from http://aisdata.ais.dk (description of the data columns: http://aisdata.ais.dk/!_README_information_CSV_files.txt);
2) For a given AOI in Denmark with known cable positions, filter the AIS messages by removing unrealistic/unphysical messages and duplicates, as well as error-prone messages within port areas;
3) Segment the vessel tracks per MMSI and date;
4) Convert the result to Parquet.

In [6]:
# Main pipeline: for every scheduled date, download the AIS CSV, load it,
# filter it and store the result as Parquet.

# --- If you want to download all csv files before, uncomment the line below ---
# ais_downloader.download_multiple_ais_data(START_DATE, END_DATE, folder_path)

# Dates before this cutoff are labelled by month, later dates by full day.
# NOTE(review): presumably the source publishes monthly archives before this date - confirm.
# Defined once here instead of being re-parsed on every loop iteration.
MONTHLY_TAG_CUTOFF = date.fromisoformat("2024-03-01")

# --- Build the schedule of dates to process ---
dates = ais_downloader.get_work_dates(START_DATE, END_DATE, csv_folder_path, filter=False)

# --- Iterate with tqdm: download, read, filter, save, and optionally delete the CSV ---
for day in tqdm(dates, desc="Processing data", unit="file"):
    tag = f"{day:%Y-%m}" if day < MONTHLY_TAG_CUTOFF else f"{day:%Y-%m-%d}"
    print(f"\nProcessing date: {tag}")

    # --- Download one day ---
    csv_path = ais_downloader.download_one_ais_data(day, csv_folder_path)

    # --- Load CSV into DataFrame ---
    df_raw = ais_reader.read_single_ais_df(csv_path, BBOX, verbose=VERBOSE_MODE)

    # --- Filter and split ---
    # The AIS classes that are kept are the ones listed in VESSEL_AIS_CLASS.
    df_filtered = ais_filtering.filter_ais_df(
        df_raw,                                               # raw AIS DataFrame
        polygon_coords=POLYGON_COORDINATES,                   # polygon coordinates for precise AOI filtering
        allowed_mobile_types=VESSEL_AIS_CLASS,                # vessel AIS class filter
        apply_polygon_filter=True,                            # keep polygon filtering enabled boolean
        remove_zero_sog_vessels=REMOVE_ZERO_SOG_VESSELS,      # use True/False to enable/disable 90% zero-SOG removal
        output_sog_in_ms=SOG_IN_MS,                           # convert SOG from knots in m/s (default) boolean
        sog_min_knots=SOG_MIN_KNOTS,                          # min SOG in knots to keep (None to disable)
        sog_max_knots=SOG_MAX_KNOTS,                          # max SOG in knots to keep (None to disable)
        port_locodes_path=file_port_locations,                # path to port locodes CSV
        exclude_ports=True,                                   # exclude port areas boolean
        verbose=VERBOSE_MODE,                                 # verbose mode boolean
    )

    # --- Parquet conversion ---
    # Save to Parquet by MMSI
    ais_to_parquet.save_by_mmsi(
        df_filtered,                                             # filtered AIS DataFrame
        verbose=VERBOSE_MODE,                                    # verbose mode boolean
        output_folder=parquet_folder_path                        # output folder path
    )

    # --- Optionally delete the downloaded CSV file ---
    # Done only after the Parquet output has been written, so a failure while
    # filtering or saving does not discard the raw download.
    if DELETE_DOWNLOADED_CSV:
        csv_path.unlink(missing_ok=True)

Processing data:   0%|          | 0/30 [00:00<?, ?file/s]


Processing date: 2025-08-01
Skipping 2025-08-01 download: already present in ais-data/csv folder
 Read AIS data: 1,128,873 rows within bbox,  511 unique vessels
 [filter_ais_df] Before filtering: 1,128,873 rows, 511 vessels
 [filter_ais_df] Type filtering: 1,093,101 rows (removed 35,772) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,093,079 rows, 508 vessels
 [filter_ais_df] Duplicate removal: 638,467 rows, 508 vessels
 [filter_ais_df] Polygon filtering: 337,847 rows (removed 300,620), 378 vessels
 [filter_ais_df] Port-area removal: removed 160,778 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 175,598 rows (removed 1,471) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 173,215 rows (removed 2,381) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 173,215 rows, 327 unique vessels (SOG in m/s)
 [save_by_mmsi] Removing existing partition: ais-data/parquet/MMSI=219021343/Date=2025-08-01
 [save_by_mmsi] Removing existing partition: ais-data/parquet/MMSI

Processing data:   3%|▎         | 1/30 [00:04<02:01,  4.18s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-02
Skipping 2025-08-02 download: already present in ais-data/csv folder
 Read AIS data: 1,161,214 rows within bbox,  452 unique vessels
 [filter_ais_df] Before filtering: 1,161,214 rows, 452 vessels
 [filter_ais_df] Type filtering: 1,120,127 rows (removed 41,087) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,114,486 rows, 447 vessels
 [filter_ais_df] Duplicate removal: 616,865 rows, 447 vessels
 [filter_ais_df] Polygon filtering: 314,494 rows (removed 302,371), 342 vessels
 [filter_ais_df] Port-area removal: removed 168,854 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 145,303 rows (removed 337) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 144,010 rows (removed 1,291) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 144,010 rows, 290 unique vessels (SOG in m/s)
 [save_by_mmsi] Removing existi

Processing data:   7%|▋         | 2/30 [00:08<02:06,  4.52s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-03
Skipping 2025-08-03 download: already present in ais-data/csv folder
 Read AIS data: 1,079,891 rows within bbox,  399 unique vessels
 [filter_ais_df] Before filtering: 1,079,891 rows, 399 vessels
 [filter_ais_df] Type filtering: 1,043,284 rows (removed 36,607) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,043,284 rows, 398 vessels
 [filter_ais_df] Duplicate removal: 599,515 rows, 398 vessels
 [filter_ais_df] Polygon filtering: 319,478 rows (removed 280,037), 288 vessels
 [filter_ais_df] Port-area removal: removed 165,183 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 153,909 rows (removed 386) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 152,641 rows (removed 1,266) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 152,641 rows, 248 unique vessels (SOG in m/s)
 [save_by_mmsi] Removing existi

Processing data:  10%|█         | 3/30 [00:13<02:01,  4.50s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-04
Skipping 2025-08-04 download: already present in ais-data/csv folder
 Read AIS data: 1,161,332 rows within bbox,  380 unique vessels
 [filter_ais_df] Before filtering: 1,161,332 rows, 380 vessels
 [filter_ais_df] Type filtering: 1,125,031 rows (removed 36,301) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,125,031 rows, 379 vessels
 [filter_ais_df] Duplicate removal: 637,785 rows, 379 vessels
 [filter_ais_df] Polygon filtering: 321,848 rows (removed 315,937), 272 vessels
 [filter_ais_df] Port-area removal: removed 156,758 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 164,980 rows (removed 110) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 162,923 rows (removed 2,057) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 162,923 rows, 232 unique vessels (SOG in m/s)


Processing data:  13%|█▎        | 4/30 [00:17<01:57,  4.51s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-05
Skipping 2025-08-05 download: already present in ais-data/csv folder
 Read AIS data: 1,157,746 rows within bbox,  268 unique vessels
 [filter_ais_df] Before filtering: 1,157,746 rows, 268 vessels
 [filter_ais_df] Type filtering: 1,119,953 rows (removed 37,793) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,116,828 rows, 264 vessels
 [filter_ais_df] Duplicate removal: 613,845 rows, 264 vessels
 [filter_ais_df] Polygon filtering: 301,015 rows (removed 312,830), 175 vessels
 [filter_ais_df] Port-area removal: removed 171,761 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 129,228 rows (removed 26) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 127,182 rows (removed 2,046) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 127,182 rows, 115 unique vessels (SOG in m/s)


Processing data:  17%|█▋        | 5/30 [00:22<01:50,  4.40s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-06
Skipping 2025-08-06 download: already present in ais-data/csv folder
 Read AIS data: 1,145,520 rows within bbox,  261 unique vessels
 [filter_ais_df] Before filtering: 1,145,520 rows, 261 vessels
 [filter_ais_df] Type filtering: 1,108,897 rows (removed 36,623) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,108,897 rows, 260 vessels
 [filter_ais_df] Duplicate removal: 622,309 rows, 260 vessels
 [filter_ais_df] Polygon filtering: 300,756 rows (removed 321,553), 169 vessels
 [filter_ais_df] Port-area removal: removed 163,859 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 136,873 rows (removed 24) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 134,698 rows (removed 2,175) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 134,698 rows, 123 unique vessels (SOG in m/s)


Processing data:  20%|██        | 6/30 [00:26<01:47,  4.49s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-07
Skipping 2025-08-07 download: already present in ais-data/csv folder
 Read AIS data: 1,202,886 rows within bbox,  385 unique vessels
 [filter_ais_df] Before filtering: 1,202,886 rows, 385 vessels
 [filter_ais_df] Type filtering: 1,166,600 rows (removed 36,286) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,162,713 rows, 382 vessels
 [filter_ais_df] Duplicate removal: 642,517 rows, 382 vessels
 [filter_ais_df] Polygon filtering: 301,062 rows (removed 341,455), 277 vessels
 [filter_ais_df] Port-area removal: removed 133,073 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 167,842 rows (removed 147) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 166,354 rows (removed 1,488) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 166,354 rows, 247 unique vessels (SOG in m/s)


Processing data:  23%|██▎       | 7/30 [00:31<01:45,  4.57s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-08
Skipping 2025-08-08 download: already present in ais-data/csv folder
 Read AIS data: 1,160,724 rows within bbox,  399 unique vessels
 [filter_ais_df] Before filtering: 1,160,724 rows, 399 vessels
 [filter_ais_df] Type filtering: 1,124,268 rows (removed 36,456) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,124,255 rows, 397 vessels
 [filter_ais_df] Duplicate removal: 618,010 rows, 397 vessels
 [filter_ais_df] Polygon filtering: 270,458 rows (removed 347,552), 285 vessels
 [filter_ais_df] Port-area removal: removed 115,708 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 154,292 rows (removed 458) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 151,933 rows (removed 2,359) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 151,933 rows, 262 unique vessels (SOG in m/s)


Processing data:  27%|██▋       | 8/30 [00:36<01:41,  4.60s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-09
Skipping 2025-08-09 download: already present in ais-data/csv folder
 Read AIS data: 1,206,051 rows within bbox,  424 unique vessels
 [filter_ais_df] Before filtering: 1,206,051 rows, 424 vessels
 [filter_ais_df] Type filtering: 1,168,258 rows (removed 37,793) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,168,258 rows, 423 vessels
 [filter_ais_df] Duplicate removal: 649,751 rows, 423 vessels
 [filter_ais_df] Polygon filtering: 292,802 rows (removed 356,949), 308 vessels
 [filter_ais_df] Port-area removal: removed 132,829 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 159,617 rows (removed 356) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 158,925 rows (removed 692) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 158,925 rows, 271 unique vessels (SOG in m/s)


Processing data:  30%|███       | 9/30 [00:41<01:41,  4.85s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-10
Skipping 2025-08-10 download: already present in ais-data/csv folder
 Read AIS data: 1,032,887 rows within bbox,  304 unique vessels
 [filter_ais_df] Before filtering: 1,032,887 rows, 304 vessels
 [filter_ais_df] Type filtering: 996,909 rows (removed 35,978) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 996,909 rows, 303 vessels
 [filter_ais_df] Duplicate removal: 577,793 rows, 303 vessels
 [filter_ais_df] Polygon filtering: 253,367 rows (removed 324,426), 202 vessels
 [filter_ais_df] Port-area removal: removed 135,402 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 117,942 rows (removed 23) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 116,931 rows (removed 1,011) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 116,931 rows, 160 unique vessels (SOG in m/s)


Processing data:  33%|███▎      | 10/30 [00:45<01:31,  4.59s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-11
Skipping 2025-08-11 download: already present in ais-data/csv folder
 Read AIS data: 1,156,494 rows within bbox,  368 unique vessels
 [filter_ais_df] Before filtering: 1,156,494 rows, 368 vessels
 [filter_ais_df] Type filtering: 1,120,818 rows (removed 35,676) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,120,789 rows, 365 vessels
 [filter_ais_df] Duplicate removal: 658,127 rows, 365 vessels
 [filter_ais_df] Polygon filtering: 332,555 rows (removed 325,572), 275 vessels
 [filter_ais_df] Port-area removal: removed 128,044 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 204,484 rows (removed 27) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 201,774 rows (removed 2,710) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 201,774 rows, 247 unique vessels (SOG in m/s)


Processing data:  37%|███▋      | 11/30 [00:50<01:27,  4.62s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-12
Skipping 2025-08-12 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,266,622 rows within bbox,  410 unique vessels
 [filter_ais_df] Before filtering: 1,266,622 rows, 410 vessels
 [filter_ais_df] Type filtering: 1,229,954 rows (removed 36,668) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,229,743 rows, 407 vessels
 [filter_ais_df] Duplicate removal: 684,142 rows, 407 vessels
 [filter_ais_df] Polygon filtering: 369,055 rows (removed 315,087), 295 vessels
 [filter_ais_df] Port-area removal: removed 115,656 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 252,095 rows (removed 1,304) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 250,511 rows (removed 1,581) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 250,511 rows, 276 unique vessels (SOG in m/s)


Processing data:  40%|████      | 12/30 [00:56<01:31,  5.10s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-13
Skipping 2025-08-13 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,795,771 rows within bbox,  469 unique vessels
 [filter_ais_df] Before filtering: 1,795,771 rows, 469 vessels
 [filter_ais_df] Type filtering: 1,748,562 rows (removed 47,209) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,748,279 rows, 464 vessels
 [filter_ais_df] Duplicate removal: 741,922 rows, 464 vessels
 [filter_ais_df] Polygon filtering: 385,020 rows (removed 356,902), 324 vessels
 [filter_ais_df] Port-area removal: removed 123,372 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 257,045 rows (removed 4,603) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 250,534 rows (removed 6,492) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 250,534 rows, 293 unique vessels (SOG in m/s)


Processing data:  43%|████▎     | 13/30 [01:03<01:36,  5.70s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-14
Skipping 2025-08-14 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,735,081 rows within bbox,  453 unique vessels
 [filter_ais_df] Before filtering: 1,735,081 rows, 453 vessels
 [filter_ais_df] Type filtering: 1,685,982 rows (removed 49,099) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,681,923 rows, 450 vessels
 [filter_ais_df] Duplicate removal: 686,298 rows, 450 vessels
 [filter_ais_df] Polygon filtering: 336,864 rows (removed 349,434), 319 vessels
 [filter_ais_df] Port-area removal: removed 120,164 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 212,437 rows (removed 4,263) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 209,294 rows (removed 3,142) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 209,294 rows, 276 unique vessels (SOG in m/s)


Processing data:  47%|████▋     | 14/30 [01:11<01:41,  6.32s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-15
Skipping 2025-08-15 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,174,842 rows within bbox,  448 unique vessels
 [filter_ais_df] Before filtering: 1,174,842 rows, 448 vessels
 [filter_ais_df] Type filtering: 1,136,029 rows (removed 38,813) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,135,364 rows, 444 vessels
 [filter_ais_df] Duplicate removal: 629,432 rows, 444 vessels
 [filter_ais_df] Polygon filtering: 309,560 rows (removed 319,872), 332 vessels
 [filter_ais_df] Port-area removal: removed 127,775 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 180,449 rows (removed 1,336) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 178,299 rows (removed 2,130) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 178,299 rows, 292 unique vessels (SOG in m/s)


Processing data:  50%|█████     | 15/30 [01:16<01:29,  5.98s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-16
Skipping 2025-08-16 download: already present in ais-data/csv folder
 Read AIS data: 1,255,838 rows within bbox,  416 unique vessels
 [filter_ais_df] Before filtering: 1,255,838 rows, 416 vessels
 [filter_ais_df] Type filtering: 1,217,153 rows (removed 38,685) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,214,773 rows, 412 vessels
 [filter_ais_df] Duplicate removal: 610,454 rows, 412 vessels
 [filter_ais_df] Polygon filtering: 312,456 rows (removed 297,998), 302 vessels
 [filter_ais_df] Port-area removal: removed 169,285 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 141,891 rows (removed 1,280) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 139,551 rows (removed 2,339) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 139,551 rows, 246 unique vessels (SOG in m/s)


Processing data:  53%|█████▎    | 16/30 [01:22<01:23,  5.98s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-17
Skipping 2025-08-17 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,401,996 rows within bbox,  420 unique vessels
 [filter_ais_df] Before filtering: 1,401,996 rows, 420 vessels
 [filter_ais_df] Type filtering: 1,359,332 rows (removed 42,664) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,358,756 rows, 416 vessels
 [filter_ais_df] Duplicate removal: 605,191 rows, 416 vessels
 [filter_ais_df] Polygon filtering: 319,959 rows (removed 285,232), 288 vessels
 [filter_ais_df] Port-area removal: removed 153,953 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 164,187 rows (removed 1,819) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 161,059 rows (removed 3,128) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 161,059 rows, 240 unique vessels (SOG in m/s)


Processing data:  57%|█████▋    | 17/30 [01:28<01:16,  5.88s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-18
Skipping 2025-08-18 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,835,907 rows within bbox,  472 unique vessels
 [filter_ais_df] Before filtering: 1,835,907 rows, 472 vessels
 [filter_ais_df] Type filtering: 1,788,245 rows (removed 47,662) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,787,688 rows, 469 vessels
 [filter_ais_df] Duplicate removal: 637,476 rows, 469 vessels
 [filter_ais_df] Polygon filtering: 356,262 rows (removed 281,214), 339 vessels
 [filter_ais_df] Port-area removal: removed 127,245 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 224,904 rows (removed 4,113) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 220,673 rows (removed 4,228) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 220,673 rows, 305 unique vessels (SOG in m/s)


Processing data:  60%|██████    | 18/30 [01:35<01:16,  6.39s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-19
Skipping 2025-08-19 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,116,204 rows within bbox,  343 unique vessels
 [filter_ais_df] Before filtering: 1,116,204 rows, 343 vessels
 [filter_ais_df] Type filtering: 1,078,257 rows (removed 37,947) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,078,165 rows, 340 vessels
 [filter_ais_df] Duplicate removal: 556,725 rows, 340 vessels
 [filter_ais_df] Polygon filtering: 305,401 rows (removed 251,324), 235 vessels
 [filter_ais_df] Port-area removal: removed 129,632 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 173,790 rows (removed 1,979) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 170,265 rows (removed 3,518) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 170,265 rows, 195 unique vessels (SOG in m/s)


Processing data:  63%|██████▎   | 19/30 [01:41<01:08,  6.26s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-20
Skipping 2025-08-20 download: already present in ais-data/csv folder
 Read AIS data: 1,015,814 rows within bbox,  360 unique vessels
 [filter_ais_df] Before filtering: 1,015,814 rows, 360 vessels
 [filter_ais_df] Type filtering: 980,694 rows (removed 35,120) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 978,608 rows, 356 vessels
 [filter_ais_df] Duplicate removal: 587,058 rows, 356 vessels
 [filter_ais_df] Polygon filtering: 320,782 rows (removed 266,276), 257 vessels
 [filter_ais_df] Port-area removal: removed 129,834 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 186,653 rows (removed 4,295) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 184,839 rows (removed 1,789) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 184,839 rows, 227 unique vessels (SOG in m/s)


Processing data:  67%|██████▋   | 20/30 [01:46<00:57,  5.73s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-21
Skipping 2025-08-21 download: already present in ais-data/csv folder
 Read AIS data: 1,032,805 rows within bbox,  317 unique vessels
 [filter_ais_df] Before filtering: 1,032,805 rows, 317 vessels
 [filter_ais_df] Type filtering: 998,010 rows (removed 34,795) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 993,072 rows, 313 vessels
 [filter_ais_df] Duplicate removal: 576,822 rows, 313 vessels
 [filter_ais_df] Polygon filtering: 310,064 rows (removed 266,758), 227 vessels
 [filter_ais_df] Port-area removal: removed 107,074 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 185,525 rows (removed 17,465) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 183,897 rows (removed 1,579) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 183,897 rows, 193 unique vessels (SOG in m/s)


Processing data:  70%|███████   | 21/30 [01:50<00:48,  5.42s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-22
Skipping 2025-08-22 download: already present in ais-data/csv folder
 Read AIS data: 1,067,486 rows within bbox,  322 unique vessels
 [filter_ais_df] Before filtering: 1,067,486 rows, 322 vessels
 [filter_ais_df] Type filtering: 1,030,816 rows (removed 36,670) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,030,463 rows, 317 vessels
 [filter_ais_df] Duplicate removal: 567,896 rows, 317 vessels
 [filter_ais_df] Polygon filtering: 259,900 rows (removed 307,996), 225 vessels
 [filter_ais_df] Port-area removal: removed 127,301 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 132,469 rows (removed 130) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 132,046 rows (removed 423) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 132,046 rows, 196 unique vessels (SOG in m/s)


Processing data:  73%|███████▎  | 22/30 [01:55<00:42,  5.32s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-23
Skipping 2025-08-23 download: already present in ais-data/csv folder
 Read AIS data: 929,511 rows within bbox,  288 unique vessels
 [filter_ais_df] Before filtering: 929,511 rows, 288 vessels
 [filter_ais_df] Type filtering: 893,704 rows (removed 35,807) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 889,249 rows, 284 vessels
 [filter_ais_df] Duplicate removal: 499,248 rows, 284 vessels
 [filter_ais_df] Polygon filtering: 226,615 rows (removed 272,633), 182 vessels
 [filter_ais_df] Port-area removal: removed 136,806 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 89,305 rows (removed 504) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 89,285 rows (removed 15) with range [0.5, 35.0] knots


Processing data:  77%|███████▋  | 23/30 [02:00<00:34,  4.95s/file]

 [filter_ais_df] Final: 89,285 rows, 144 unique vessels (SOG in m/s)
 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-24
Skipping 2025-08-24 download: already present in ais-data/csv folder
 Read AIS data: 1,237,843 rows within bbox,  379 unique vessels
 [filter_ais_df] Before filtering: 1,237,843 rows, 379 vessels
 [filter_ais_df] Type filtering: 1,199,187 rows (removed 38,656) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,198,723 rows, 377 vessels
 [filter_ais_df] Duplicate removal: 562,533 rows, 377 vessels
 [filter_ais_df] Polygon filtering: 262,149 rows (removed 300,384), 262 vessels
 [filter_ais_df] Port-area removal: removed 102,306 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 156,693 rows (removed 3,150) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 152,349 rows (removed 4,324) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 152,34

Processing data:  80%|████████  | 24/30 [02:05<00:30,  5.00s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-25
Skipping 2025-08-25 download: already present in ais-data/csv folder
 Read AIS data: 1,098,224 rows within bbox,  390 unique vessels
 [filter_ais_df] Before filtering: 1,098,224 rows, 390 vessels
 [filter_ais_df] Type filtering: 1,063,350 rows (removed 34,874) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,063,263 rows, 387 vessels
 [filter_ais_df] Duplicate removal: 605,356 rows, 387 vessels
 [filter_ais_df] Polygon filtering: 308,699 rows (removed 296,657), 280 vessels
 [filter_ais_df] Port-area removal: removed 68,865 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 235,327 rows (removed 4,507) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 229,652 rows (removed 5,663) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 229,652 rows, 259 unique vessels (SOG in m/s)


Processing data:  83%|████████▎ | 25/30 [02:10<00:25,  5.02s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-26
Skipping 2025-08-26 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,128,401 rows within bbox,  390 unique vessels
 [filter_ais_df] Before filtering: 1,128,401 rows, 390 vessels
 [filter_ais_df] Type filtering: 1,090,256 rows (removed 38,145) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,090,197 rows, 387 vessels
 [filter_ais_df] Duplicate removal: 621,389 rows, 387 vessels
 [filter_ais_df] Polygon filtering: 323,734 rows (removed 297,655), 282 vessels
 [filter_ais_df] Port-area removal: removed 77,024 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 244,339 rows (removed 2,371) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 239,853 rows (removed 4,463) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 239,853 rows, 260 unique vessels (SOG in m/s)


Processing data:  87%|████████▋ | 26/30 [02:15<00:20,  5.16s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-27
Skipping 2025-08-27 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,125,714 rows within bbox,  323 unique vessels
 [filter_ais_df] Before filtering: 1,125,714 rows, 323 vessels
 [filter_ais_df] Type filtering: 1,085,063 rows (removed 40,651) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,083,741 rows, 320 vessels
 [filter_ais_df] Duplicate removal: 568,815 rows, 320 vessels
 [filter_ais_df] Polygon filtering: 276,814 rows (removed 292,001), 213 vessels
 [filter_ais_df] Port-area removal: removed 88,540 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 188,100 rows (removed 174) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 184,610 rows (removed 3,486) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 184,610 rows, 187 unique vessels (SOG in m/s)


Processing data:  90%|█████████ | 27/30 [02:20<00:15,  5.15s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-28
Skipping 2025-08-28 download: already present in ais-data/csv folder


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

 Read AIS data: 1,520,546 rows within bbox,  369 unique vessels
 [filter_ais_df] Before filtering: 1,520,546 rows, 369 vessels
 [filter_ais_df] Type filtering: 1,469,075 rows (removed 51,471) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,467,773 rows, 365 vessels
 [filter_ais_df] Duplicate removal: 590,590 rows, 365 vessels
 [filter_ais_df] Polygon filtering: 266,691 rows (removed 323,899), 250 vessels
 [filter_ais_df] Port-area removal: removed 82,210 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 182,000 rows (removed 2,481) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 178,136 rows (removed 3,864) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 178,136 rows, 224 unique vessels (SOG in m/s)


Processing data:  93%|█████████▎| 28/30 [02:26<00:10,  5.43s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-29
Skipping 2025-08-29 download: already present in ais-data/csv folder
 Read AIS data: 1,146,177 rows within bbox,  336 unique vessels
 [filter_ais_df] Before filtering: 1,146,177 rows, 336 vessels
 [filter_ais_df] Type filtering: 1,105,492 rows (removed 40,685) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,104,921 rows, 329 vessels
 [filter_ais_df] Duplicate removal: 569,091 rows, 329 vessels
 [filter_ais_df] Polygon filtering: 245,059 rows (removed 324,032), 224 vessels
 [filter_ais_df] Port-area removal: removed 109,816 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 135,195 rows (removed 48) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 133,338 rows (removed 1,857) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 133,338 rows, 199 unique vessels (SOG in m/s)


Processing data:  97%|█████████▋| 29/30 [02:32<00:05,  5.53s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet

Processing date: 2025-08-30
Skipping 2025-08-30 download: already present in ais-data/csv folder
 Read AIS data: 1,220,338 rows within bbox,  346 unique vessels
 [filter_ais_df] Before filtering: 1,220,338 rows, 346 vessels
 [filter_ais_df] Type filtering: 1,175,090 rows (removed 45,248) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 1,172,421 rows, 343 vessels
 [filter_ais_df] Duplicate removal: 555,626 rows, 343 vessels
 [filter_ais_df] Polygon filtering: 250,792 rows (removed 304,834), 228 vessels
 [filter_ais_df] Port-area removal: removed 131,407 rows in 3 overlapping ports
 [filter_ais_df] COG sanity: 118,470 rows (removed 915) with range [0, 360] deg
 [filter_ais_df] SOG sanity: 116,550 rows (removed 1,920) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 116,550 rows, 195 unique vessels (SOG in m/s)


Processing data: 100%|██████████| 30/30 [02:37<00:00,  5.26s/file]

 [save_by_mmsi] Parquet dataset written/appended at: /Users/federicomarra/Documents/GitHub/dl-dark-vessel-hunter/ais-data/parquet





## 2) Pre-processing

#### File imports for the pre-processing

In [1]:
import config
import src.pre_proc.pre_processing_utils as pre_processing_utils
import src.pre_proc.ais_query as ais_query
import src.pre_proc.ais_segment as ais_segment

#### Library imports for the pre-processing

In [2]:
from pathlib import Path
import pandas as pd
import json

#### Set pre-processing preferences and input configuration

In [3]:
# Pre-processing settings, all read from the project-level `config` module so that
# main_pre_processing() can use them as module-level constants.
VERBOSE_MODE = config.VERBOSE_MODE  # when truthy, extra progress messages are printed

# Location of the AIS data folder and of the parquet dataset inside it
FOLDER_NAME = config.AIS_DATA_FOLDER
folder_path = Path(FOLDER_NAME)
parquet_folder_path = folder_path / "parquet"

# Thresholds forwarded to ais_segment.segment_ais_tracks
MAX_TIME_GAP_SEC = config.MAX_TIME_GAP_SEC
MAX_TRACK_DURATION_SEC = config.MAX_TRACK_DURATION_SEC
MIN_TRACK_DURATION_SEC = config.MIN_TRACK_DURATION_SEC
MIN_SEGMENT_LENGTH = config.MIN_SEGMENT_LENGTH


# Columns passed to pre_processing_utils.normalize_df
NUMERIC_COLS = config.NUMERIC_COLS
# Date ranges (inclusive start/end) expanded into one query date per day for each split.
# NOTE(review): an earlier note here suggested commenting out a date line to run without
# an end date; main_pre_processing reads all four date constants, so confirm that this
# still works before doing so.
TRAIN_START_DATE = config.TRAIN_START_DATE
TRAIN_END_DATE = config.TRAIN_END_DATE

TEST_START_DATE = config.TEST_START_DATE
TEST_END_DATE = config.TEST_END_DATE

# Rule passed to pre_processing_utils.resample_all_tracks
RESAMPLING_RULE = config.RESAMPLING_RULE

#### Pre-processing function

In [None]:
def main_pre_processing(dataframe_type: str = "all"):
    """Run the AIS pre-processing pipeline for one data split and save the result.

    The pipeline queries the filtered parquet dataset for every day of the split's
    date range, encodes COG as sine/cosine, drops unused columns and rows with
    missing values, groups ship types into coarse categories, segments the tracks,
    resamples and normalizes them, labels the ship types and finally writes the
    DataFrame (parquet) together with its metadata (JSON).

    Args:
        dataframe_type: "train" or "test" to process that split, or "all"
            (the default) to process "train" and then "test".

    Raises:
        ValueError: If dataframe_type is not "all", "train" or "test".
    """
    if dataframe_type == "all":
        main_pre_processing("train")
        main_pre_processing("test")
        return

    # Validate before touching any data so a typo fails immediately.
    if dataframe_type not in ("train", "test"):
        raise ValueError(
            f"Invalid dataframe_type: {dataframe_type}. Must be 'all', 'train' or 'test'."
        )

    # Resolve everything that differs between the two splits in one place.
    if dataframe_type == "train":
        period_label = "training"
        start_date, end_date = TRAIN_START_DATE, TRAIN_END_DATE
        output_path = config.PRE_PROCESSING_DF_TRAIN_PATH
        metadata_path = config.PRE_PROCESSING_METADATA_TRAIN_PATH
    else:
        period_label = "testing"
        start_date, end_date = TEST_START_DATE, TEST_END_DATE
        output_path = config.PRE_PROCESSING_DF_TEST_PATH
        metadata_path = config.PRE_PROCESSING_METADATA_TEST_PATH

    print(f"[pre_processing] Querying AIS data for {period_label} period: {start_date} to {end_date}")
    # Loading filtered data from parquet files, one query date per day of the range
    dates = (
        pd.date_range(start_date, end_date, freq="D")
        .strftime("%Y-%m-%d")
        .tolist()
    )
    df = ais_query.query_ais_duckdb(parquet_folder_path, dates=dates, verbose=VERBOSE_MODE)

    # Converting COG to sine and cosine components
    df = pre_processing_utils.cog_to_sin_cos(df)

    # Dropping columns that are not used downstream (missing ones are ignored)
    df.drop(columns=[
        'Type of mobile',
        'ROT',
        'COG',
        'Heading',
        'IMO',
        'Callsign',
        'Name',
        'Navigational status',
        'Cargo type',
        'Width',
        'Length',
        'Type of position fixing device',
        'Draught',
        'Destination',
        'ETA',
        'Data source type',
        'A', 'B', 'C', 'D',
        'Date'], inplace=True, errors='ignore')

    # Removing rows with a NaN value in any of the remaining columns
    df.dropna(inplace=True)

    # Grouping Ship types into coarse categories; anything unrecognised becomes "Other"
    commercial_types = ["Cargo", "Tanker"]
    passenger_types = ["Passenger", "Pleasure", "Sailing"]
    service_types = ["Dredging", "Law enforcement", "Military", "Port tender", "SAR", "Towing", "Towing long/wide","Tug"]
    valid_types =  ["Fishing", "Service", "Commercial", "Passenger"]

    df.loc[df["Ship type"].isin(commercial_types), "Ship type"] = "Commercial"
    df.loc[df["Ship type"].isin(passenger_types), "Ship type"] = "Passenger"
    df.loc[df["Ship type"].isin(service_types), "Ship type"] = "Service"
    df.loc[~df["Ship type"].isin(valid_types), "Ship type"] = "Other"

    if VERBOSE_MODE:
        print(f"[pre_processing] DataFrame after dropping unnecessary columns and NaNs: {len(df):,} rows")

    # Segmenting AIS tracks based on time gaps and max duration, filtering short segments
    df = ais_segment.segment_ais_tracks(
        df,
        max_time_gap_sec=MAX_TIME_GAP_SEC,
        max_track_duration_sec=MAX_TRACK_DURATION_SEC,
        min_track_duration_sec=MIN_TRACK_DURATION_SEC,
        min_track_len=MIN_SEGMENT_LENGTH,
        verbose=VERBOSE_MODE
    )

    print("[pre_processing] Ship type counts:")
    print(df["Ship type"].value_counts())

    # Adding segment uid feature
    df = pre_processing_utils.add_segment_nr(df)

    # Resampling all tracks to fixed time intervals
    df = pre_processing_utils.resample_all_tracks(df, rule=RESAMPLING_RULE)

    # Normalizing numeric columns
    # NOTE(review): normalize_df is invoked on each split independently and the returned
    # mean/std are stored per split, so the test split is presumably scaled with its own
    # statistics — confirm whether the train statistics should be reused instead.
    df, mean, std = pre_processing_utils.normalize_df(df, NUMERIC_COLS)

    # Ship type labeling (mapping to be used later)
    # NOTE(review): the mapping is also built per split — verify both splits end up
    # with the same ship_type_to_id mapping.
    df, ship_type_to_id = pre_processing_utils.label_ship_types(df)

    # Saving pre-processed DataFrame
    print(f"[pre_processing] Saving pre-processed DataFrame to {output_path}...")
    if VERBOSE_MODE:
        print(f"[pre_processing] Columns of pre-processed DataFrame:\n{df.columns.tolist()}")
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)

    # Saving preprocessing metadata
    if VERBOSE_MODE:
        print(f"[pre_processing] Saving preprocessing metadata to {metadata_path}...")
    meta = {
        "mean": mean.tolist(),
        "std": std.tolist(),
        "ship_type_to_id": ship_type_to_id
    }

    # The metadata file may live in a different folder than the parquet output,
    # so make sure its directory exists as well.
    Path(metadata_path).parent.mkdir(parents=True, exist_ok=True)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=4)

#### Pre-processing script

In [5]:
# Pre-process the training split only (writes the train parquet and metadata files).
main_pre_processing(dataframe_type="train")

[pre_processing] Querying AIS data for training period: 2025-08-01 to 2025-08-28
[ais_query] Querying parquet files from: ais-data/parquet  from date None  to date  None


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[ais_query] 4,845,439 rows, 3,060 vessels; Date IN [2025-08-01, 2025-08-02, 2025-08-03, 2025-08-04, 2025-08-05, 2025-08-06, 2025-08-07, 2025-08-08, 2025-08-09, 2025-08-10, 2025-08-11, 2025-08-12, 2025-08-13, 2025-08-14, 2025-08-15, 2025-08-16, 2025-08-17, 2025-08-18, 2025-08-19, 2025-08-20, 2025-08-21, 2025-08-22, 2025-08-23, 2025-08-24, 2025-08-25, 2025-08-26, 2025-08-27, 2025-08-28]
[pre_processing] DataFrame after dropping unnecessary columns and NaNs: 4,845,439 rows
[segment_ais_tracks] Starting with 4,845,439 rows, 3,060 unique vessels
[segment_ais_tracks] After segment-level filter: 4,838,589 rows, 7,331 segments
[pre_processing] Ship type counts:
Ship type
Commercial    1909334
Fishing       1679937
Passenger      960974
Service        157074
Other          131270
Name: count, dtype: int64
[pre_processing] Saving pre-processed DataFrame to ais-data/df_preprocessed/pre_processed_df_train.parquet...
[pre_processing] Columns of pre-processed DataFrame:
['Segment_nr', 'Timestamp', '

In [6]:
# Pre-process the testing split only (writes the test parquet and metadata files).
main_pre_processing(dataframe_type="test")

[pre_processing] Querying AIS data for testing period: 2025-08-29 to 2025-08-30
[ais_query] Querying parquet files from: ais-data/parquet  from date None  to date  None
[ais_query] 249,888 rows, 358 vessels; Date IN [2025-08-29, 2025-08-30]
[pre_processing] DataFrame after dropping unnecessary columns and NaNs: 249,888 rows
[segment_ais_tracks] Starting with 249,888 rows, 358 unique vessels
[segment_ais_tracks] After segment-level filter: 249,208 rows, 427 segments
[pre_processing] Ship type counts:
Ship type
Commercial    140526
Passenger      45466
Fishing        44096
Service        13495
Other           5625
Name: count, dtype: int64
[pre_processing] Saving pre-processed DataFrame to ais-data/df_preprocessed/pre_processed_df_test.parquet...
[pre_processing] Columns of pre-processed DataFrame:
['Segment_nr', 'Timestamp', 'Latitude', 'Longitude', 'SOG', 'COG_sin', 'COG_cos', 'TrackID', 'MMSI', 'ShipTypeID']
[pre_processing] Saving preprocessing metadata to ais-data/df_preprocessed/pr

## 3) Training

#### File imports for the training

#### Library imports for the training

#### Set training preferences

#### Script

## 4) Evaluation

#### File imports for the evaluation

#### Library imports for the evaluation

#### Set evaluation preferences

#### Script

## 5) Inspection