# vessel-cable-anomaly-hunter
DTU Deep Learning project 29, group 80

## Required Libraries Installation
Run this in your terminal before executing the rest of this notebook:

In [None]:
# Install the dependencies listed in requirements.txt (-q keeps pip's output quiet)
pip install -q -r requirements.txt

## Data Download
AIS Data scraper and filter

#### File imports

In [None]:
import config
import src.data.ais_downloader as ais_downloader
import src.data.ais_filtering as ais_filtering
import src.data.ais_reader as ais_reader
import src.data.ais_to_parquet as ais_to_parquet

#### Library imports

In [None]:
from pathlib import Path
from datetime import date
from tqdm import tqdm
import gc

#### Configuration

In [None]:
# Read configuration from config.py
# These aliases are the only settings the download/filter cells below read directly;
# a few more (e.g. config.COLUMNS_TO_DROP, config.SOG_MIN_KNOTS) are read inline in the loop.
VERBOSE_MODE = config.VERBOSE_MODE                          # Whether to print verbose output

START_DATE = config.START_DATE                              # Start date for data downloading
END_DATE   = config.END_DATE                                # End date for data downloading

DELETE_DOWNLOADED_CSV = config.DELETE_DOWNLOADED_CSV        # Whether to delete raw downloaded CSV files after parquet conversion

BBOX = config.BBOX                                          # Bounding Box to prefilter AIS data
POLYGON_COORDINATES = config.POLYGON_COORDINATES            # Polygon coordinates for the precise AOI filter (passed to filter_ais_df)

#### Paths

In [None]:
# Root folder for all AIS data, plus one subfolder per storage format.
folder_path = Path(config.AIS_DATA_FOLDER)
csv_folder_path = folder_path / config.AIS_DATA_FOLDER_CSV_SUBFOLDER
parquet_folder_path = folder_path / config.AIS_DATA_FOLDER_PARQUET_SUBFOLDER

# Create the directory tree up front so later cells can write without checking.
for data_dir in (folder_path, csv_folder_path, parquet_folder_path):
    data_dir.mkdir(parents=True, exist_ok=True)

# Port locations file, handed to the filter step as `port_locodes_path`.
file_port_locations = folder_path / config.FILE_PORT_LOCATIONS

#### Download, Filter and Save into Parquet
1. **Download:** Download one single .csv AIS data file from http://aisdata.ais.dk (link to data column description http://aisdata.ais.dk/!_README_information_CSV_files.txt);
2. **Filter:** For a given AOI in Denmark with known cable positions, filter AIS messages by removing unrealistic/unphysical messages and duplicates, and by removing error-prone messages within port areas;
3. **Segmentation:** Segment the cleaned data into tracks based on time gaps and track duration;
4. **Parquet Conversion:** Save the cleaned and filtered data into Parquet files for faster loading in the next steps.

In [None]:
# --- Build the schedule of download string dates ---
dates = ais_downloader.get_work_dates(START_DATE, END_DATE, csv_folder_path, filter=False)

# Days before this cutoff are tagged by month (YYYY-MM), later days by full date (YYYY-MM-DD).
# NOTE(review): presumably mirrors how the archive names its files - confirm against ais_downloader.
# Parsed once here instead of on every loop iteration.
TAG_FORMAT_CUTOFF = date.fromisoformat("2024-03-01")

# --- Iterate with tqdm and download, unzip and delete ---
for day in tqdm(dates, desc="Processing data", unit="file"):
    tag = f"{day:%Y-%m}" if day < TAG_FORMAT_CUTOFF else f"{day:%Y-%m-%d}"
    # tqdm.write (rather than print) keeps the message from corrupting the progress bar.
    tqdm.write(f"\nProcessing date: {tag}")

    # --- Download one day ---
    csv_path = ais_downloader.download_one_ais_data(day, csv_folder_path)

    # --- Load CSV into DataFrame (pre-filtered with the bounding box) ---
    df_raw = ais_reader.read_single_ais_df(
        csv_path,
        BBOX,
        columns_to_drop=config.COLUMNS_TO_DROP,
        verbose=VERBOSE_MODE,
    )

    # --- Optionally delete the downloaded CSV file ---
    # The raw CSV is no longer needed once it has been read into memory.
    if DELETE_DOWNLOADED_CSV:
        csv_path.unlink(missing_ok=True)

    # --- Filter ---
    df_filtered = ais_filtering.filter_ais_df(
        df_raw,                                                  # raw AIS DataFrame
        polygon_coords=POLYGON_COORDINATES,                      # polygon coordinates for precise AOI filtering
        allowed_mobile_types=config.VESSEL_AIS_CLASS,            # vessel AIS class filter
        apply_polygon_filter=True,                               # keep polygon filtering enabled
        remove_zero_sog_vessels=config.REMOVE_ZERO_SOG_VESSELS,  # enable/disable removal of mostly-zero-SOG vessels
        output_sog_in_ms=config.SOG_IN_MS,                       # whether to convert SOG from knots to m/s
        sog_min_knots=config.SOG_MIN_KNOTS,                      # min SOG in knots to keep (None to disable)
        sog_max_knots=config.SOG_MAX_KNOTS,                      # max SOG in knots to keep (None to disable)
        port_locodes_path=file_port_locations,                   # path to port locodes CSV
        exclude_ports=True,                                      # exclude port areas
        verbose=VERBOSE_MODE,                                    # verbose mode
    )

    # Free df_raw memory before the next large allocation
    del df_raw
    gc.collect()

    # --- Parquet conversion ---
    # Save to Parquet by MMSI
    ais_to_parquet.save_by_mmsi(
        df_filtered,                        # filtered AIS DataFrame
        verbose=VERBOSE_MODE,               # verbose mode
        output_folder=parquet_folder_path,  # output folder path
    )

    # Free df_filtered memory before the next day is loaded
    del df_filtered
    gc.collect()

## Preprocess

#### File imports

In [None]:
import config
import src.pre_proc.pre_processing_utils as pre_processing_utils
import src.pre_proc.ais_query as ais_query
import src.pre_proc.ais_segment as ais_segment

#### Library imports

In [None]:
from pathlib import Path

#### Configuration

In [None]:
# Read configuration from config.py
# These module-level names are read by main_preprocess() below at call time.
VERBOSE_MODE = config.VERBOSE_MODE

# Location of the per-MMSI parquet files written by the download stage
FOLDER_NAME = config.AIS_DATA_FOLDER
folder_path = Path(FOLDER_NAME)
parquet_folder_path = folder_path / config.AIS_DATA_FOLDER_PARQUET_SUBFOLDER

# Date range queried for the training set
TRAIN_START_DATE = config.TRAIN_START_DATE
TRAIN_END_DATE = config.TRAIN_END_DATE

# Date range queried for the test set
TEST_START_DATE = config.TEST_START_DATE
TEST_END_DATE = config.TEST_END_DATE

# Segmentation constraints, forwarded to ais_segment.segment_ais_tracks
MAX_TIME_GAP_SEC = config.MAX_TIME_GAP_SEC
MAX_TRACK_DURATION_SEC = config.MAX_TRACK_DURATION_SEC
MIN_TRACK_DURATION_SEC = config.MIN_TRACK_DURATION_SEC
MIN_SEGMENT_LENGTH = config.MIN_SEGMENT_LENGTH

# Density threshold, forwarded to pre_processing_utils.remove_notdense_segments
MIN_FREQ_POINTS_PER_MIN = config.MIN_FREQ_POINTS_PER_MIN

# Resampling rule, forwarded to pre_processing_utils.resample_all_tracks
RESAMPLING_RULE = config.RESAMPLING_RULE

#### Preprocess function
1. **Data Loading:** Queries DuckDB for AIS data within specified date ranges for 'train' or 'test'.
2. **Feature Engineering:** Converts COG (Course Over Ground) to sine/cosine components.
3. **Cleaning:** Drops unnecessary columns and rows with missing values.
4. **Ship Type Grouping:** Aggregates specific ship types into broader categories (Commercial,
Passenger, Service, Other).
5. **Segmentation:** Splits AIS tracks into segments based on time gaps and duration constraints using
`ais_segment`.
6. **Filtering:** Removes segments with low point density.
7. **Resampling:** Resamples tracks to a fixed time interval.
8. **Labeling:** Encodes ship types into numerical IDs.
9. **Saving:** Exports the processed DataFrame to a Parquet file.

In [None]:
def main_preprocess(dataframe_type: str = "all") -> None:
    """Build and save the pre-processed AIS DataFrame for one (or both) data splits.

    Args:
        dataframe_type: ``"train"`` or ``"test"`` to process a single split, or
            ``"all"`` (default) to process the training split and then the test split.

    Raises:
        ValueError: If ``dataframe_type`` is not ``"all"``, ``"train"`` or ``"test"``.

    The result is written as a Parquet file to ``config.PRE_PROCESSING_DF_TRAIN_PATH``
    or ``config.PRE_PROCESSING_DF_TEST_PATH``; nothing is returned.
    """
    if dataframe_type == "all":
        main_preprocess("train")
        main_preprocess("test")
        return

    # Validate before doing any work so a typo fails immediately.
    if dataframe_type not in ("train", "test"):
        raise ValueError(
            f"Invalid dataframe_type: {dataframe_type}. Must be 'all', 'train' or 'test'."
        )

    # Resolve everything that differs between the two splits in one place.
    if dataframe_type == "train":
        period_label = "training"
        date_start, date_end = TRAIN_START_DATE, TRAIN_END_DATE
        output_path = config.PRE_PROCESSING_DF_TRAIN_PATH
    else:
        period_label = "testing"
        date_start, date_end = TEST_START_DATE, TEST_END_DATE
        output_path = config.PRE_PROCESSING_DF_TEST_PATH

    # Loading filtered data from parquet files
    print(f"[preprocess] Querying AIS data for {period_label} period: {date_start} to {date_end}")
    df = ais_query.query_ais_duckdb(
        parquet_folder_path,
        date_start=date_start,
        date_end=date_end,
        verbose=VERBOSE_MODE,
    )

    # Converting COG to sine and cosine components
    df = pre_processing_utils.cog_to_sin_cos(df)

    # Dropping columns that are no longer needed (missing ones are ignored)
    df.drop(columns=['Type of mobile', 'COG', 'Date'], inplace=True, errors='ignore')

    # Removing every row that has a NaN in any remaining column
    df.dropna(inplace=True)

    # Grouping ship types into broader categories; "Fishing" is kept as-is and
    # anything not covered becomes "Other".
    ship_type_groups = {
        "Commercial": ["Cargo", "Tanker"],
        "Passenger": ["Passenger", "Pleasure", "Sailing"],
        "Service": ["Dredging", "Law enforcement", "Military", "Port tender",
                    "SAR", "Towing", "Towing long/wide", "Tug"],
    }
    for group_name, member_types in ship_type_groups.items():
        df.loc[df["Ship type"].isin(member_types), "Ship type"] = group_name

    valid_types = ["Fishing", *ship_type_groups]
    df.loc[~df["Ship type"].isin(valid_types), "Ship type"] = "Other"

    print("[preprocess] Ship type counts:")
    print(df["Ship type"].value_counts())

    if VERBOSE_MODE:
        print(f"[preprocess] DataFrame after dropping unnecessary columns and NaNs: {len(df):,} rows")

    # Segmenting AIS tracks based on time gaps and max duration, filtering short segments
    df = ais_segment.segment_ais_tracks(
        df,
        max_time_gap_sec=MAX_TIME_GAP_SEC,
        max_track_duration_sec=MAX_TRACK_DURATION_SEC,
        min_track_duration_sec=MIN_TRACK_DURATION_SEC,
        min_track_len=MIN_SEGMENT_LENGTH,
        verbose=VERBOSE_MODE,
    )

    # Adding segment nr feature
    df = pre_processing_utils.add_segment_nr(df)

    # Removing segments with low point density
    df = pre_processing_utils.remove_notdense_segments(df, min_freq_points_per_min=MIN_FREQ_POINTS_PER_MIN)

    # Resampling all tracks to fixed time intervals
    df = pre_processing_utils.resample_all_tracks(df, rule=RESAMPLING_RULE)

    print(f"[preprocess] Number of segments and rows after removing low-density segments and resampling: {df['Segment_nr'].nunique():,} segments, {len(df):,} rows")

    # NOTE: numeric columns are not normalized in this function.

    # Ship type labeling; the returned mapping is not used in this function.
    df, _ship_type_to_id = pre_processing_utils.label_ship_types(df)

    # Saving pre-processed DataFrame
    print(f"[preprocess] Saving pre-processed DataFrame to {output_path}...")
    if VERBOSE_MODE:
        print(f"[preprocess] Columns of pre-processed DataFrame:\n{df.columns.tolist()}")
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)


#### Preprocess for train and test

In [None]:
# Build and save the training set (queried from TRAIN_START_DATE to TRAIN_END_DATE)
main_preprocess("train")

In [None]:
# Build and save the test set (queried from TEST_START_DATE to TEST_END_DATE)
main_preprocess("test")

## Train

#### File imports

In [None]:
import config as config_file
from src.train.ais_dataset import AISDataset, ais_collate_fn
from src.train.model import AIS_LSTM_Autoencoder
from src.train.training_loop import run_experiment

#### Library imports

In [None]:
import datetime
import torch
from torch.utils.data import DataLoader, random_split
import os
import json
import itertools # Added for grid search

#### Configuration

In [None]:
# Path to pre-processed training data
PARQUET_FILE = config_file.PRE_PROCESSING_DF_TRAIN_PATH
# Directory that receives the weights, per-run config JSON files and the results summary
TRAIN_OUTPUT_DIR = config_file.TRAIN_OUTPUT_DIR

# ensure output directory exists
os.makedirs(TRAIN_OUTPUT_DIR, exist_ok=True)

# Fraction of the dataset assigned to the training split (the remainder is used for validation)
SPLIT_TRAIN_VAL_RATIO = config_file.SPLIT_TRAIN_VAL_RATIO
# Values copied unchanged into every run configuration of the grid search
EPOCHS = config_file.EPOCHS
PATIENCE = config_file.PATIENCE
FEATURES = config_file.FEATURE_COLS
# Number of ship-type classes passed to the model constructor
NUM_SHIP_TYPES = config_file.NUM_SHIP_TYPES

#### Hyperparameters grid search

In [None]:
# ---------------------------------------------------------
# HYPERPARAMETER GRID SEARCH
# ---------------------------------------------------------
# Full Cartesian grid over model width (hidden_dim), depth (num_layers),
# bottleneck tightness (latent_dim) and the optimisation settings.
# Every combination becomes one training run, so the grid grows multiplicatively.

param_grid = {
    'hidden_dim': [128, 256],       # Capacity of the LSTM
    'latent_dim': [16, 64],         # Bottleneck: 16 (Anomaly Detection) vs 64 (Reconstruction)
    'num_layers': [1, 2],           # Depth
    'lr': [0.001, 0.0001],          # Learning Rate
    'batch_size': [64, 128],        # Batch Size
    'dropout': [0.0, 0.2]           # Regularization
}

configs = []

# Use itertools.product to create all combinations
keys, values = zip(*param_grid.items())
for bundle in itertools.product(*values):
    params = dict(zip(keys, bundle))

    # Skip dropout > 0 for single-layer models to avoid duplicate equivalent runs.
    # NOTE(review): assumes the model only applies dropout between stacked LSTM
    # layers (as torch.nn.LSTM does) - confirm in AIS_LSTM_Autoencoder.
    if params['num_layers'] == 1 and params['dropout'] > 0:
        continue

    # Create a descriptive run name (also used in the weights/config file names)
    run_name = (f"H{params['hidden_dim']}_L{params['latent_dim']}_"
                f"Lay{params['num_layers']}_lr{params['lr']}_"
                f"BS{params['batch_size']}_Drop{params['dropout']}")

    # Deliberately NOT named `config`: that name is the imported config module in
    # the earlier sections, and rebinding it to a dict here would break re-running them.
    run_config = {
        "run_name": run_name,
        "epochs": EPOCHS,              # Fixed epochs
        "patience": PATIENCE,          # Fixed patience
        "features": FEATURES,
        "num_ship_types": NUM_SHIP_TYPES,
        "shiptype_emb_dim": 8,         # Keep embedding dim constant for now

        # Dynamic params: hidden_dim, latent_dim, num_layers, lr, batch_size, dropout
        **params,
    }
    configs.append(run_config)

print(f"Generated {len(configs)} unique configurations for training.")


#### Device setup

In [None]:
# Start from the CPU fallback and switch to an accelerator when one is present:
# CUDA (NVIDIA) is preferred, then MPS (Apple Silicon).
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device} (NVIDIA GPU)")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print(f"Using device: {device} (Apple GPU)")
else:
    print(f"Using device: {device} (CPU)")

#### Data load

In [None]:
# Fail fast when the pre-processed training set has not been generated yet.
if not os.path.exists(PARQUET_FILE):
    print(f"Error: {PARQUET_FILE} not found.")
    raise FileNotFoundError(f"{PARQUET_FILE} not found. Stopping execution.")

# Initialize Dataset
full_dataset = AISDataset(PARQUET_FILE)
input_dim = full_dataset.input_dim

# Split Train/Val: SPLIT_TRAIN_VAL_RATIO is the training fraction, the rest is validation.
train_size = int(SPLIT_TRAIN_VAL_RATIO * len(full_dataset))
val_size = len(full_dataset) - train_size

# Seed the split so every kernel run (and therefore every comparison of validation
# losses across runs) uses the same train/validation partition.
# An optional SPLIT_SEED in config.py overrides the default.
SPLIT_SEED = getattr(config_file, "SPLIT_SEED", 42)
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=split_generator,
)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(val_dataset)}")

#### Experiment Loop
1. **Create DataLoaders:** Creates PyTorch DataLoaders for training and validation datasets.
2. **Training Setup:** Initializes the model with a fixed `num_ship_types`, together with the optimizer and loss function.
3. **Training Loop:** Trains the model over a set number of epochs, implementing early stopping based on validation loss.
4. **Evaluation:** Assesses model performance on the validation set after each epoch.
5. **Model Saving:** Saves the best-performing model based on validation loss.

In [None]:
results = []

# Make sure the output directory exists BEFORE any run is handed a save path inside it.
os.makedirs(TRAIN_OUTPUT_DIR, exist_ok=True)

# The loop variable is deliberately not called `config`: that name is the imported
# config module in the earlier sections, and rebinding it would break re-running them.
for run_config in configs:
    # Create DataLoaders (batch size is one of the searched hyperparameters)
    train_loader = DataLoader(  # Training DataLoader
        train_dataset,
        batch_size=run_config['batch_size'],
        shuffle=True,
        collate_fn=ais_collate_fn
    )

    val_loader = DataLoader(    # Validation DataLoader
        val_dataset,
        batch_size=run_config['batch_size'],
        shuffle=False,
        collate_fn=ais_collate_fn
    )

    # Initialize Model with FIXED num_ship_types
    model = AIS_LSTM_Autoencoder(
        input_dim=input_dim,
        hidden_dim=run_config['hidden_dim'],
        latent_dim=run_config['latent_dim'],
        num_layers=run_config['num_layers'],
        num_ship_types=NUM_SHIP_TYPES,  # Always use the fixed constant
        shiptype_emb_dim=run_config['shiptype_emb_dim'],
        dropout=run_config['dropout']
    ).to(device)

    # Run Pipeline
    # NOTE(review): the weights are presumably written by run_experiment to save_path - confirm.
    history, best_loss = run_experiment(
        run_config,
        model,
        train_loader,
        val_loader,
        device,
        save_path=f"{TRAIN_OUTPUT_DIR}/weights_{run_config['run_name']}.pth"
    )

    # Collect results for the summary cell
    results.append({
        "config": run_config['run_name'],
        "best_val_loss": best_loss,
        "history": history
    })

    # Save the run configuration next to the weights so the run can be reloaded by name
    with open(f"{TRAIN_OUTPUT_DIR}/config_{run_config['run_name']}.json", 'w') as f:
        json.dump(run_config, f, indent=4)

#### Summary of the model

In [None]:
# Persist the outcome of every run in a timestamped JSON file, so earlier
# summaries are never overwritten (everything in `results` must be JSON-serializable).
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_path = os.path.join(TRAIN_OUTPUT_DIR, "results_summary_" + timestamp + ".json")
with open(results_path, "w") as summary_file:
    json.dump(results, summary_file, indent=4)

# Rank runs by validation loss (lowest first) and report only the best three.
sorted_results = sorted(results, key=lambda r: float(r["best_val_loss"]))
top_k = sorted_results[:3]

print("\n=== Top 3 Configurations ===")
for i, res in enumerate(top_k, 1):
    print(f"{i}. Run: {res['config']} | Best Val Loss: {float(res['best_val_loss']):.6f}")

## Test

#### File imports

In [None]:
import config as config_file
from src.test.ais_tester import AISTester

#### Library imports

In [None]:
import os
import json

#### Configuration

In [None]:
# Run name of the trained model to evaluate; it must match the suffix of a
# weights_*.pth / config_*.json pair in the training output directory.
MODEL_NAME = "H128_L16_Lay1_lr0.001_BS64_Drop0.0"  # Change as needed

# Plot/map sizes
N_BEST_WORST = config_file.N_BEST_WORST
N_MAP_RANDOM = config_file.N_MAP_RANDOM

# Pre-processed test data to evaluate on
PARQUET_FILE = config_file.PRE_PROCESSING_DF_TEST_PATH

# One output directory per evaluated model
OUTPUT_DIR = f"{config_file.TEST_OUTPUT_DIR}/{MODEL_NAME}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Artifacts of the selected training run
WEIGHTS_FILE = f"{config_file.TRAIN_OUTPUT_DIR}/weights_{MODEL_NAME}.pth"
MODEL_CONFIG_FILE = f"{config_file.TRAIN_OUTPUT_DIR}/config_{MODEL_NAME}.json"

#### Load Model and Init Tester

In [None]:
# Read back the run configuration JSON of the selected model.
with open(MODEL_CONFIG_FILE) as config_fh:
    model_config = json.load(config_fh)

# Build the tester from that configuration and the matching weights file.
tester = AISTester(model_config, WEIGHTS_FILE, output_dir=OUTPUT_DIR)

#### Run Testing and Evaluation

In [None]:
# Run the tester pipeline; relies on PARQUET_FILE, tester, N_BEST_WORST and
# N_MAP_RANDOM from the cells above. A missing data file is reported, not raised.
if not os.path.exists(PARQUET_FILE):
    print(f"File {PARQUET_FILE} not found.")
else:
    # 1. Load the data and evaluate all of it first
    tester.load_data(PARQUET_FILE)
    tester.evaluate()

    # 2. General error statistics
    tester.plot_error_distributions()

    # 3. Filtered statistics (example)
    # Pass a list of IDs to filter just the plot without re-running evaluate:
    # my_interesting_ids = ["segment_A", "segment_B"]
    # tester.plot_error_distributions(filter_ids=my_interesting_ids, filename_suffix="_special_group")

    # 4. Best / worst segments
    tester.plot_best_worst_segments(n=N_BEST_WORST)

    # 5. Maps
    tester.generate_maps(n_best_worst=N_BEST_WORST, n_random=N_MAP_RANDOM)

    # 6. Filtered map (example)
    # tester.generate_filtered_map(segment_ids=["segment_1", "segment_2"], map_name="map_special_segments")