# dark-vessel-hunter
DTU Deep Learning project 29, group 80


### Install the dependencies before executing the notebook (run `pip install -r requirements.txt` in your terminal, or execute the cell below):

In [None]:
pip install -r requirements.txt

## Import the project modules

In [1]:
import ais_downloader
import ais_filtering
import ais_reader
import ais_to_parquet
import ais_query

## Data setup
### Set data preferences

In [2]:
# --- Date range to process (ISO "YYYY-MM-DD" strings) ---
START_DATE, END_DATE = "2025-11-03", "2025-11-03"

# --- Storage and logging options ---
FOLDER_NAME = "ais-data"          # root folder for the csv/ and parquet/ sub-folders
DELETE_DOWNLOADED_CSV = False     # True: remove each CSV once it has been loaded
VERBOSE_MODE = True               # forwarded as `verbose=` to the helper modules

# --- AIS transponder classes to keep ---
VESSEL_AIS_CLASS = ("Class A", "Class B")

# --- Minimum track length handed to the segmentation step ---
MIN_SEGMENT_LENGTH = 256

# --- Coarse pre-filter: bounding box ordered as [lat_max, lon_min, lat_min, lon_max] ---
bbox = [
    57.58,   # lat_max
    10.5,    # lon_min
    57.12,   # lat_min
    11.92,   # lon_max
]

# --- Precise Area of Interest (AOI): polygon vertices as (lon, lat) pairs ---
polygon_coords = [
    (10.5162, 57.3500),  # coast top left
    (10.9314, 57.5120),  # sea top left
    (11.5128, 57.5785),  # sea top right
    (11.9132, 57.5230),  # top right (Swedish coast)
    (11.9189, 57.4078),  # bottom right (Swedish coast)
    (11.2133, 57.1389),  # sea bottom right
    (11.0067, 57.1352),  # sea bottom left
    (10.5400, 57.1880),  # coast bottom left
]
# Repeat the first vertex at the end so the polygon ring is closed.
polygon_coords.append(polygon_coords[0])

### Imports for the script

In [3]:
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from datetime import date, timedelta

### Script

In [4]:
# --- Create the folder layout: <FOLDER_NAME>/csv and <FOLDER_NAME>/parquet ---
folder_path = Path(FOLDER_NAME)
csv_folder_path = folder_path / "csv"
parquet_folder_path = folder_path / "parquet"
# parents=True also creates folder_path itself, so no separate mkdir is needed for it.
csv_folder_path.mkdir(parents=True, exist_ok=True)
parquet_folder_path.mkdir(parents=True, exist_ok=True)

# CSV with port locations, passed to the filter step to exclude port areas.
file_port_locations = folder_path / "port_locodes.csv"


# --- If you want to download all csv files before, uncomment the line below ---
# ais_downloader.download_multiple_ais_data(START_DATE, END_DATE, folder_path)

# --- Build the schedule of dates to process ---
dates = ais_downloader.get_work_dates(START_DATE, END_DATE, csv_folder_path, filter=False)

# Days before this date are labelled by month (YYYY-MM), later ones by day (YYYY-MM-DD).
# NOTE(review): presumably mirrors monthly vs. daily source archives — confirm in ais_downloader.
# Parsed once here instead of on every loop iteration.
daily_tag_start = date.fromisoformat("2024-03-01")

# --- Per day: download, load, filter, segment and save to Parquet ---
for day in tqdm(dates, desc="Processing data", unit="file"):
    tag = f"{day:%Y-%m}" if day < daily_tag_start else f"{day:%Y-%m-%d}"
    # tqdm.write prints the message without breaking the progress bar.
    tqdm.write(f"\nProcessing date: {tag}")

    # --- Download one day ---
    csv_path = ais_downloader.download_one_ais_data(day, csv_folder_path)

    # --- Load CSV into DataFrame (pre-filtered by the bounding box) ---
    df_raw = ais_reader.read_single_ais_df(csv_path, bbox, verbose=VERBOSE_MODE)

    # --- Optionally delete the downloaded CSV file once it has been loaded ---
    if DELETE_DOWNLOADED_CSV:
        csv_path.unlink(missing_ok=True)

    # --- Filter ---
    # Keep only the AIS classes listed in VESSEL_AIS_CLASS and the rows inside the AOI.
    df_filtered = ais_filtering.filter_ais_df(
        df_raw,
        polygon_coords=polygon_coords,
        allowed_mobile_types=VESSEL_AIS_CLASS,
        bbox=bbox,                          # bounding box [lat_max, lon_min, lat_min, lon_max]
        apply_polygon_filter=True,          # keep polygon (AOI) filtering enabled
        remove_zero_sog_vessels=False,      # use True/False to enable/disable 90% zero-SOG removal
        sog_in_knots=False,                 # NOTE(review): presumably False converts SOG from knots to m/s — confirm in ais_filtering
        port_locodes_path=file_port_locations,
        exclude_ports=True,                 # exclude port areas
        verbose=VERBOSE_MODE,               # verbose logging
    )

    # --- Segment tracks, then save to the Parquet dataset by MMSI ---
    df_seg = ais_to_parquet.segment_ais_tracks(df_filtered, min_track_len=MIN_SEGMENT_LENGTH, verbose=VERBOSE_MODE)

    ais_to_parquet.save_by_mmsi(df_seg, verbose=VERBOSE_MODE, output_folder=parquet_folder_path)

Processing data:   0%|          | 0/1 [00:00<?, ?file/s]


Processing date: 2025-11-03
Skipping 2025-11-03 download: already present in ais-data\csv folder
 Read AIS data: 965,559 rows within bbox,  230 unique vessels
 [filter_ais_df] Before filtering: 965,559 rows, 230 vessels
 [filter_ais_df] Type filtering: 927,905 rows (removed 37,654) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 927,789 rows, 227 vessels
 [filter_ais_df] Duplicate removal: 526,789 rows, 227 vessels
 [filter_ais_df] BBOX filtering: 526,789 rows (removed 0), 227 vessels
 [filter_ais_df] Polygon filtering: 278,910 rows (removed 247,879), 153 vessels
 [filter_ais_df] Port-area removal: removed 37,542 rows in 2 overlapping ports
 [filter_ais_df] Final: 241,368 rows, 147 unique vessels
 [segment_ais_tracks] Starting with 241,368 rows,  147 unique vessels
 [segment_ais_tracks] After MMSI-level filter: 180,819 rows,  125 vessels
 [segment_ais_tracks] After segment-level filter: 175,610 rows,  139 segments
 [save_by_mmsi] Removing existing partition: ais-data\par

Processing data: 100%|██████████| 1/1 [00:44<00:00, 44.46s/file]

 [save_by_mmsi] Parquet dataset written/appended at: D:\Projects\dark-vessel-hunter\ais-data\parquet





In [5]:
# Query the Parquet dataset under parquet_folder_path through DuckDB.
# With no filter arguments the generated SQL (printed in verbose mode) is
# `SELECT * ... WHERE 1=1` over every *.parquet file below the folder, i.e. all rows.
# NOTE(review): the result is presumably a pandas DataFrame — confirm in ais_query.
test_df = ais_query.query_ais_duckdb(parquet_folder_path, verbose=VERBOSE_MODE)

[query_ais_duckdb] SQL:
 SELECT * FROM read_parquet('ais-data\parquet\**\*.parquet') WHERE 1=1
