# Outlier Detection

In [7]:
import os, sys
import numpy as np
import pandas as pd
from typing import List
from datetime import datetime, timedelta

from typing import List
from collections import Counter

import matplotlib.pyplot as plt

import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# Knowledge-graph edge table (tab-separated). Later cells read its
# node1 / label / node2 / edge_id columns.
data_path = "../kgdata/edges_v3.tsv"
df = pd.read_csv(data_path, delimiter="\t")

# List every distinct edge label so the available predicates are visible.
print(df["label"].unique())

['type' 'EventId' 'DescriptionText' 'EventType' 'StartTime' 'EndTime'
 'StartDistanceFromPort_km' 'StartDistanceFromShore_km'
 'EndDistanceFromPort_km' 'EndDistanceFromShore_km' 'Location'
 'EventGeometry' 'HasParticipant' 'ValidatedBy' 'ExplainedBy'
 'HasProvenance' 'BehaviorType' 'UncertaintyNote' 'AvgSpeed_knots'
 'PatternGeometry' 'UnexplainedByWeatherOrPort' 'ObservationId' 'MMSI'
 'Lat' 'Lon' 'Speed' 'Course' 'Timestamp' 'DistanceFromPortKM'
 'DistanceFromShoreKM' 'Source' 'SSVID' 'VesselName' 'CallSign' 'Flag'
 'VesselTypeId' 'GroupType' 'HasMember' 'AsWKT' 'CentroidLat'
 'CentroidLon' 'CRS' 'PortName' 'BerthGeometry' 'TypeCode' 'TypeName'
 'subClassOf' 'FishingEffortScore' 'GearType' 'ZoneCode' 'ZoneName'
 'ZoneType' 'ZoneGeometry' 'SourceID' 'SourceName' 'SourceType'
 'SourceURL' 'MemberVessel' 'MembershipRole' 'MemberOf' 'PatternTypeID'
 'PatternTypeName' 'domain' 'range' ' type']


In [11]:
# All edges whose label is "MMSI" (shared by vessel-identity and other nodes).
tmp = df[df["label"] == "MMSI"]

# Lookup table: vessel-identity node id (node1) -> MMSI value (node2).
# The vectorized string accessor with na=False keeps the mask strictly boolean,
# so missing node1 values or an empty selection yield an empty dict instead of
# an exception from a per-row Python lambda.
vessel2mmsi = dict(
    tmp.loc[
        tmp["node1"].str.startswith("VesselIdentity", na=False),
        ["node1", "node2"],
    ].values
)

def mmsi2ais(mmsi: str):
    """Return the unique node ids of AIS edges that carry the given MMSI.

    Selects rows of the module-level ``df`` whose ``label`` is ``"MMSI"`` and
    whose ``node2`` equals ``mmsi``, keeps those whose ``edge_id`` starts with
    ``"AIS"``, and returns the distinct ``node1`` values in order of first
    appearance.

    Args:
        mmsi: MMSI value to look up, compared for equality against ``node2``.

    Returns:
        Array of unique ``node1`` values; empty when nothing matches.
    """
    mmsi_edges = df[(df["label"] == "MMSI") & (df["node2"] == mmsi)]
    # na=False keeps the mask boolean even for missing edge ids or an empty
    # selection, so an unknown MMSI gives an empty result rather than an error.
    from_ais = mmsi_edges["edge_id"].str.startswith("AIS", na=False)
    return mmsi_edges.loc[from_ais, "node1"].unique()

def ais2timestamp(ais: str):
    """Return the ``node2`` value of the first "Timestamp" edge of a node.

    Args:
        ais: Node id matched against the ``node1`` column of ``df``.

    Returns:
        The raw ``node2`` value of the first matching row (not parsed here).

    Raises:
        IndexError: If the node has no edge labelled "Timestamp".
    """
    is_timestamp_edge = (df["node1"] == ais) & (df["label"] == "Timestamp")
    return df.loc[is_timestamp_edge, "node2"].values[0]

def ais2latlon(ais: str):
    """Return the ``(lat, lon)`` pair stored on a node as floats.

    Args:
        ais: Node id matched against the ``node1`` column of ``df``.

    Returns:
        Tuple ``(lat, lon)`` taken from the first "Lat" and first "Lon" edge
        of the node, each converted with ``float``.

    Raises:
        IndexError: If the node lacks a "Lat" or a "Lon" edge.
    """
    node_edges = df.loc[df["node1"] == ais]
    # Same lookup for both coordinates; "Lat" is resolved before "Lon".
    lat, lon = (
        float(node_edges.loc[node_edges["label"] == coord, "node2"].values[0])
        for coord in ("Lat", "Lon")
    )
    return lat, lon

def vessel2info(vessel: str):
    """Build the time-ordered list of position reports for one vessel.

    Resolves the vessel's MMSI through ``vessel2mmsi``, collects the matching
    AIS nodes with ``mmsi2ais``, and reads each node's timestamp and position.

    Args:
        vessel: Key of ``vessel2mmsi`` (e.g. ``"VesselIdentity_0"``).

    Returns:
        List of ``(timestamp, lat, lon)`` tuples sorted by timestamp, where
        ``timestamp`` is a ``datetime`` parsed with a UTC-offset directive.

    Raises:
        KeyError: If ``vessel`` is not a key of ``vessel2mmsi``.
    """
    # The stored timestamps are expected to begin with a literal "^"
    # (presumably the knowledge-graph date-literal prefix; confirm against data).
    timestamp_format = "^%Y-%m-%dT%H:%M:%S%z"

    records = []
    for ais_node in mmsi2ais(vessel2mmsi[vessel]):
        observed_at = datetime.strptime(ais2timestamp(ais_node), timestamp_format)
        lat, lon = ais2latlon(ais_node)
        records.append((observed_at, lat, lon))

    # Stable sort on the timestamp only, so ties keep their lookup order.
    records.sort(key=lambda record: record[0])
    return records

In [12]:
# Time-ordered (timestamp, lat, lon) track of one vessel; preview the first three reports.
vessel_info = vessel2info(vessel="VesselIdentity_0")
print(vessel_info[:3])

[(datetime.datetime(2014, 8, 21, 7, 19, 19, tzinfo=datetime.timezone.utc), 22.152597427399996, 113.4756546021), (datetime.datetime(2014, 8, 21, 7, 31, 53, tzinfo=datetime.timezone.utc), 22.152597427399996, 113.4756011963), (datetime.datetime(2014, 8, 21, 8, 34, 5, tzinfo=datetime.timezone.utc), 22.1525917053, 113.4756164551)]


## Claude-sonnet-4-5-20250929
1. **Speed over Ground (SOG)** - calculated from consecutive positions
2. **Course over Ground (COG)** - bearing between consecutive positions
3. **Acceleration** - change in speed
4. **Course change rate** - rate of turning
5. **Distance from last position**
6. **Time gap** - interval between consecutive messages
7. **Distance to port/coastline** (optional)
8. **MMSI-based behavioral patterns** (optional)