# dark-vessel-hunter
DTU Deep Learning project 29, group 80

## Required Libraries Installation
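Install the dependencies listed in `requirements.txt` before running the rest of this notebook (run the cell below, or run the same `pip install` command in your terminal):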

In [None]:
pip install -r requirements.txt

# Data Download

## File imports for the data download and preparation

In [7]:
import config
import src.data.ais_downloader as ais_downloader
import src.data.ais_filtering as ais_filtering
import src.data.ais_reader as ais_reader
import src.data.ais_to_parquet as ais_to_parquet

import src.data.ais_reader as ais_reader
import src.pre_proc.ais_query as ais_query 
import src.utils.ais_maps as ais_maps

## Library imports for the data download and preparation

In [8]:
from tqdm import tqdm
from pathlib import Path
import pandas as pd
from datetime import date, timedelta

## Set data preferences and configuration inputs

In [9]:
# --- General ---
VERBOSE_MODE = True

# --- Download period (ISO dates, passed to the downloader as start and end) ---
START_DATE = "2025-05-21"
END_DATE = "2025-05-21"

# --- Storage ---
AIS_DATA_NAME = config.AIS_DATA_FOLDER  # folder that holds the AIS data
DELETE_DOWNLOADED_CSV = False           # delete each raw CSV once it has been loaded

# --- Vessel selection ---
VESSEL_AIS_CLASS = ("Class A", "Class B")  # AIS classes to keep

# --- Track segmentation ---
MIN_SEGMENT_LENGTH = 30          # datapoints
MAX_TIME_GAP_SEC = 30            # seconds
MIN_TRACK_DURATION_SEC = 3_600   # seconds (one hour)

# --- Speed Over Ground (SOG) handling ---
REMOVE_ZERO_SOG_VESSELS = False  # whether to remove vessels with zero SOG
SOG_IN_MS = True                 # True: SOG in metres/second, False: SOG in knots
SOG_MIN_KNOTS = 0.5              # lower SOG bound, in knots
SOG_MAX_KNOTS = 35.0             # upper SOG bound, in knots

# --- Area of interest ---
# Coarse bounding box used to prefilter the data: [lat_max, lon_min, lat_min, lon_max]
BBOX = [57.58, 10.5, 57.12, 11.92]

# Vertices of the precise Area of Interest (AOI) polygon, as (lon, lat) pairs
POLYGON_COORDINATES = [
    (10.5162, 57.3500),  # coast top left
    (10.9314, 57.5120),  # sea top left
    (11.5128, 57.5785),  # sea top right
    (11.9132, 57.5230),  # top right (Swedish coast)
    (11.9189, 57.4078),  # bottom right (Swedish coast)
    (11.2133, 57.1389),  # sea bottom right
    (11.0067, 57.1352),  # sea bottom left
    (10.5400, 57.1880),  # coast bottom left
]
# Repeat the first vertex at the end so the polygon is closed
POLYGON_COORDINATES.append(POLYGON_COORDINATES[0])



#### Script

In [10]:
# --- Define paths ---
folder_path = Path(AIS_DATA_NAME)
csv_folder_path = folder_path / "csv"
parquet_folder_path = folder_path / "parquet"
file_port_locations = folder_path / "port_locodes.csv"  # Path to port locations file

# --- Create the data folder and its csv/parquet sub-folders if they are missing ---
for directory in (folder_path, csv_folder_path, parquet_folder_path):
    directory.mkdir(parents=True, exist_ok=True)


In [None]:
# --- If you want to download all csv files before, uncomment the line below ---
# ais_downloader.download_multiple_ais_data(START_DATE, END_DATE, folder_path)

# --- Build the list of days to process ---
dates = ais_downloader.get_work_dates(START_DATE, END_DATE, csv_folder_path, filter=False)

# Days before this date are labelled by month (YYYY-MM), later days by full date
tag_cutoff = date.fromisoformat("2024-03-01")

# Keyword arguments shared by every filter_ais_df call in the loop below
filter_options = dict(
    polygon_coords=POLYGON_COORDINATES,               # AOI polygon
    allowed_mobile_types=VESSEL_AIS_CLASS,            # AIS classes to keep
    apply_polygon_filter=True,                        # polygon filtering enabled
    remove_zero_sog_vessels=REMOVE_ZERO_SOG_VESSELS,  # zero-SOG vessel removal switch
    output_sog_in_ms=SOG_IN_MS,                       # SOG unit switch (m/s vs knots)
    sog_min_knots=SOG_MIN_KNOTS,                      # lower SOG bound in knots
    sog_max_knots=SOG_MAX_KNOTS,                      # upper SOG bound in knots
    port_locodes_path=file_port_locations,            # port locodes CSV
    exclude_ports=True,                               # port-area exclusion enabled
    verbose=VERBOSE_MODE,
)

# --- Per day: download, load, filter, segment and save ---
for day in tqdm(dates, desc="Processing data", unit="file"):
    if day < tag_cutoff:
        tag = f"{day:%Y-%m}"
    else:
        tag = f"{day:%Y-%m-%d}"
    print(f"\nProcessing date: {tag}")

    # Download one day and load its CSV, prefiltered with the bounding box
    csv_path = ais_downloader.download_one_ais_data(day, csv_folder_path)
    df_raw = ais_reader.read_single_ais_df(csv_path, BBOX, verbose=VERBOSE_MODE)

    # The raw CSV is no longer needed once it has been loaded
    if DELETE_DOWNLOADED_CSV:
        csv_path.unlink(missing_ok=True)

    # Filter the raw AIS data
    df_filtered = ais_filtering.filter_ais_df(df_raw, **filter_options)

    # Segment the tracks, then save them to Parquet by MMSI
    df_seg = ais_to_parquet.segment_ais_tracks(
        df_filtered,
        min_track_len=MIN_SEGMENT_LENGTH,
        max_time_gap_sec=MAX_TIME_GAP_SEC,
        min_track_duration_sec=MIN_TRACK_DURATION_SEC,
        verbose=VERBOSE_MODE,
    )
    ais_to_parquet.save_by_mmsi(df_seg, verbose=VERBOSE_MODE, output_folder=parquet_folder_path)

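## Check

Sanity check: reload the raw CSV, re-apply the filters, and query the saved Parquet files so the three datasets can be compared on a map.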

In [11]:


# Rebuild the raw and filtered views of the first downloaded day from its CSV and
# load the saved Parquet tracks. The folders and the date come from the
# configuration cells above rather than from hard-coded "ais-data/..." strings,
# so the check keeps reading what the download step actually wrote.
# NOTE(review): assumes the daily "aisdk-YYYY-MM-DD.csv" file naming — confirm
# before using a START_DATE that falls in a period with a different naming.
check_csv_name = f"aisdk-{START_DATE}.csv"

# Raw CSV rows restricted to the bounding box / polygon, no time filter
df_csv = ais_reader.read_raw_csv_with_filters(
    csv_name=check_csv_name,
    bbox=BBOX,
    time_start=None,
    time_end=None,
    csv_root=csv_folder_path.as_posix(),
    timestamp_format="%d/%m/%Y %H:%M:%S",
    polygon_coords=POLYGON_COORDINATES,
    verbose=True,
)

# Same filter settings as the download loop, applied to the freshly read CSV
df_csv_filtered = ais_filtering.filter_ais_df(
    df_csv,
    polygon_coords=POLYGON_COORDINATES,
    allowed_mobile_types=VESSEL_AIS_CLASS,
    apply_polygon_filter=True,
    remove_zero_sog_vessels=REMOVE_ZERO_SOG_VESSELS,
    output_sog_in_ms=SOG_IN_MS,
    sog_min_knots=SOG_MIN_KNOTS,
    sog_max_knots=SOG_MAX_KNOTS,
    port_locodes_path=file_port_locations,
    exclude_ports=True,
    verbose=True,
)

# Everything that was segmented and saved to Parquet
df_parquet = ais_query.query_ais_duckdb(root_path=parquet_folder_path.as_posix(), verbose=True)

[read_raw_csv_with_filters] 513,285 rows, 207 vessels; no time filter applied
 [filter_ais_df] Before filtering: 513,285 rows, 207 vessels
 [filter_ais_df] Type filtering: 476,649 rows (removed 36,636) using ['Class A', 'Class B']
 [filter_ais_df] MMSI filtering: 476,646 rows, 205 vessels
 [filter_ais_df] Duplicate removal: 280,803 rows, 205 vessels
 [filter_ais_df] Polygon filtering: 280,803 rows (removed 0), 205 vessels
 [filter_ais_df] Port-area removal: removed 141,579 rows in 3 overlapping ports
 [filter_ais_df] SOG sanity: 135,422 rows (removed 3,773) with range [0.5, 35.0] knots
 [filter_ais_df] Final: 135,422 rows, 180 unique vessels (SOG in m/s)
[query_ais_duckdb] SQL:
 SELECT * FROM read_parquet('ais-data/parquet/**/*.parquet') WHERE 1=1
[query_ais_duckdb] 95,498 rows, 121 vessels; no time filter applied


In [12]:
def _save_tracks_map(df, html_name):
    """Build the AIS tracks map of one DataFrame, save it as HTML and return it."""
    tracks_map = ais_maps.make_ais_tracks_map(
        df_list=[df],
        bbox=BBOX,
        polygon_coords=POLYGON_COORDINATES,
        max_vessels=None,
    )
    tracks_map.save(html_name)
    return tracks_map


# One map per dataset: saved Parquet tracks, raw CSV rows, filtered CSV rows
m_parquet = _save_tracks_map(df_parquet, "ais_tracks_map_parquet.html")
m_csv = _save_tracks_map(df_csv, "ais_tracks_map_csv.html")
m_csv_filtered = _save_tracks_map(df_csv_filtered, "ais_tracks_map_csv_filtered.html")

## Pre processing

#### File imports for the pre processing

In [None]:
import config
import src.pre_proc.pre_processing_utils as pre_processing_utils
import src.pre_proc.ais_query as ais_query

#### Library imports for the pre processing

In [None]:
from pathlib import Path
import pandas as pd
import json

#### Set pre processing preferences

In [None]:
AIS_DATA_NAME = config.AIS_DATA_FOLDER
folder_path = Path(AIS_DATA_NAME)
parquet_folder_path = folder_path / "parquet"

# main_pre_processing reads VERBOSE_MODE. Keep the value chosen in the data
# download section when that cell has been run; otherwise default to verbose
# output so the pre-processing section also runs on its own in a fresh kernel.
VERBOSE_MODE = globals().get("VERBOSE_MODE", True)

SEGMENT_MAX_LENGTH = 300  # datapoints

NUMERIC_COLS = config.NUMERIC_COLS

# Training and test periods (YYYY-MM-DD). Both the start and the end date of each
# period are required; every day from start to end (inclusive) is queried.
TRAIN_START_DATE = "2025-10-20"
TRAIN_END_DATE = "2025-11-08"

TEST_START_DATE = "2025-11-09"
TEST_END_DATE = "2025-11-10"

#### Pre processing function

In [None]:
def main_pre_processing(dataframe_type: str = "all") -> None:
    """Pre-process the AIS data of one split and save it together with its metadata.

    The data of the selected period is queried from the Parquet folder, unused
    columns and rows with missing values are dropped, ship types are grouped into
    coarse categories, segments are split to a maximum length, numeric columns are
    normalized and ship types are labeled. The resulting DataFrame is written to
    Parquet and the normalization statistics plus the ship-type mapping are
    written to a JSON file.

    Args:
        dataframe_type: "train" or "test" to process a single split, or "all"
            (default) to process the training split followed by the test split.

    Raises:
        ValueError: If ``dataframe_type`` is not "train", "test" or "all".
    """
    if dataframe_type == "all":
        main_pre_processing("train")
        main_pre_processing("test")
        return

    # Validate before touching any data so a typo fails fast with an accurate message
    if dataframe_type not in ("train", "test"):
        raise ValueError(
            f"Invalid dataframe_type: {dataframe_type}. Must be 'train', 'test' or 'all'."
        )

    # Resolve everything that differs between the two splits in one place
    if dataframe_type == "train":
        period_name = "training"
        start_date, end_date = TRAIN_START_DATE, TRAIN_END_DATE
        output_path = config.PRE_PROCESSING_DF_TRAIN_PATH
        metadata_path = config.PRE_PROCESSING_METADATA_TRAIN_PATH
    else:
        period_name = "testing"
        start_date, end_date = TEST_START_DATE, TEST_END_DATE
        output_path = config.PRE_PROCESSING_DF_TEST_PATH
        metadata_path = config.PRE_PROCESSING_METADATA_TEST_PATH

    print(f"[pre_processing] Querying AIS data for {period_name} period: {start_date} to {end_date}")
    # Loading filtered data from parquet files: one "YYYY-MM-DD" string per day,
    # start and end date included
    dates = (
        pd.date_range(start_date, end_date, freq="D")
        .strftime("%Y-%m-%d")
        .tolist()
    )
    df = ais_query.query_ais_duckdb(parquet_folder_path, dates=dates, verbose=VERBOSE_MODE)

    # Dropping unnecessary columns and rows with missing values
    print(f"[pre_processing] Initial data size: {len(df)} records.")
    print(f"[pre_processing] Dropping unnecessary columns and rows with missing values...")
    # errors='ignore' keeps this working when some of the columns are absent
    df.drop(columns=[
        'Type of mobile',
        'ROT',
        'Heading',
        'IMO',
        'Callsign',
        'Name',
        'Navigational status',
        'Cargo type',
        'Width',
        'Length',
        'Type of position fixing device',
        'Draught',
        'Destination',
        'ETA',
        'Data source type',
        'A', 'B', 'C', 'D',
        'Date'], inplace=True, errors='ignore')

    df.dropna(inplace=True)
    print(f"[pre_processing] Data size after dropping: {len(df)} records.")

    # Grouping Ship types
    commercial_types = ["Cargo", "Tanker"]
    passenger_types = ["Passenger", "Pleasure", "Sailing"]
    service_types = ["Dredging", "Law enforcement", "Military", "Port tender", "SAR", "Towing", "Towing long/wide","Tug"]
    valid_types =  ["Fishing", "Service", "Commercial", "Passenger"]

    df.loc[df["Ship type"].isin(commercial_types), "Ship type"] = "Commercial"
    df.loc[df["Ship type"].isin(passenger_types), "Ship type"] = "Passenger"
    df.loc[df["Ship type"].isin(service_types), "Ship type"] = "Service"
    # Must run last: everything that is not one of the grouped names becomes "Other"
    df.loc[~df["Ship type"].isin(valid_types), "Ship type"] = "Other"

    print("[pre_processing] Ship type counts:")
    print(df["Ship type"].value_counts())

    # Adding the ΔT feature
    # NOTE(review): DeltaT is dropped again right after being added; presumably
    # add_delta_t is kept for another effect on the frame — TODO confirm or remove.
    df = pre_processing_utils.add_delta_t(df)
    df.drop(columns=["DeltaT"], inplace=True)

    # Splitting segments
    print(f"[pre_processing] Splitting segments to max length {SEGMENT_MAX_LENGTH}...")
    df = pre_processing_utils.split_segments_fixed_length(df, max_len=SEGMENT_MAX_LENGTH)

    # Normalizing numeric columns
    # NOTE(review): normalize_df is called on whichever split is being processed, so
    # the test split appears to be normalized with its own statistics rather than
    # the training ones — confirm this is intended.
    df, mean, std = pre_processing_utils.normalize_df(df, NUMERIC_COLS)

    # Encoding Navigational Status as one-hot (currently disabled)
    #df, nav_status_to_id = pre_processing_utils.one_hot_encode_nav_status(df)

    # Ship type labeling (mapping to be used later)
    df, ship_type_to_id = pre_processing_utils.label_ship_types(df)

    # Saving pre-processed DataFrame
    print(f"[pre_processing] Saving pre-processed DataFrame to {output_path}...")
    print(f"[pre_processing] Columns of pre-processed DataFrame:\n{df.columns.tolist()}")
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)

    # Saving preprocessing metadata
    print(f"[pre_processing] Saving preprocessing metadata to {metadata_path}...")
    meta = {
        "mean": mean.tolist(),
        "std": std.tolist(),
        #"nav_status_to_id": nav_status_to_id,
        "ship_type_to_id": ship_type_to_id
    }

    # The metadata file may live in a different folder than the DataFrame, so make
    # sure its parent exists as well before writing
    Path(metadata_path).parent.mkdir(parents=True, exist_ok=True)
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=4)

#### Pre processing script

In [None]:
# "all" runs the training split first and then the test split
main_pre_processing("all")

## Training

#### File imports for the training

#### Library imports for the training

#### Set training preferences

#### Script

## Evaluation

#### File imports for the evaluation

#### Library imports for the evaluation

#### Set evaluation preferences

#### Script

## Inspection