In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd

## Load data

In [2]:
def _read_pipe_delimited(path: str) -> pd.DataFrame:
    """Read a CSV file whose fields are separated by '|'."""
    return pd.read_csv(path, sep="|")


# AIS messages. The training file is '|'-separated; the test file is read with
# pandas' default separator.
train_df = _read_pipe_delimited("../data/ais_train.csv")
train_df["time"] = pd.to_datetime(train_df["time"])
test_df = pd.read_csv("../data/ais_test.csv")

# Auxiliary lookup tables (all '|'-separated).
ports_df = _read_pipe_delimited("../data/ports.csv")
schedules_df = _read_pipe_delimited("../data/schedules_to_may_2024.csv")
vessels_df = _read_pipe_delimited("../data/vessels.csv")

train_df.tail()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
1522060,2024-05-07 23:59:07,359.1,13.4,0,1,0,05-08 05:00,52.19131,-5.82223,clh6aqawa0002gh0zypfa5dut,634c4de270937fc01c3a7417
1522061,2024-05-07 23:59:08,12.3,17.1,0,13,0,05-10 03:00,38.96142,-12.00502,61e9f3aeb937134a3c4bfe43,634c4de270937fc01c3a76a1
1522062,2024-05-07 23:59:08,269.8,14.9,-1,270,0,05-15 23:00,49.71372,-5.22042,61e9f43db937134a3c4c0169,634c4de270937fc01c3a787b
1522063,2024-05-07 23:59:08,8.0,18.7,0,6,0,05-08 12:45,38.27895,10.7828,61e9f469b937134a3c4c029b,61d3781293c6feb83e5eb73b
1522064,2024-05-07 23:59:08,336.0,14.3,5,337,0,05-07 23:00,38.98635,-75.13275,62080cff66fc0a8e43c6123a,61d38528b7b7526e1adf3e6f


## Util

In [3]:
def df_to_gdf(
    df: pd.DataFrame, longitude_col="longitude", latitude_col="latitude"
) -> gpd.GeoDataFrame:
    """Wrap ``df`` in a GeoDataFrame with one point geometry per row.

    Args:
        df: Frame holding the coordinate columns.
        longitude_col: Column used as the point x coordinate.
        latitude_col: Column used as the point y coordinate.

    Returns:
        A GeoDataFrame built from ``df`` whose geometry is tagged "EPSG:4326".
    """
    points = gpd.points_from_xy(df[longitude_col], df[latitude_col])
    return gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")

## Preprocessing and feature engineering

In [4]:
# Order messages chronologically within each vessel. reset_index() (without
# drop=True) keeps the previous row labels as an "index" column.
train_df = train_df.sort_values(by=["vesselId", "time"]).reset_index()

# Whole seconds since the same vessel's previous message. Floor-dividing by a
# Timedelta is independent of the datetime resolution and yields NaN (instead of
# a huge negative integer) for each vessel's first message.
seconds_since_prev = train_df.groupby("vesselId")["time"].diff() // pd.Timedelta("1s")

# Assume time diff before first msg was 20 min
train_df["time_diff"] = seconds_since_prev.fillna(60 * 20).astype("int64")

In [5]:
# etaRaw has no year component (e.g. "05-08 05:00" in the sample output above),
# so prefix the year before parsing.
eta_with_year = "2024-" + train_df["etaRaw"]
# Strings that cannot be parsed become NaT rather than raising.
train_df["etaRaw"] = pd.to_datetime(eta_with_year, errors="coerce")

#### Replace missing/default values with NaN

In [6]:
# Readings at or beyond these thresholds are treated as missing/default values.
# NOTE(review): presumably these are the AIS "not available" codes - verify the
# thresholds (in particular heading uses ">" while cog uses ">=") against the
# data description.
sentinel_masks = {
    "cog": train_df["cog"] >= 360,
    "sog": train_df["sog"] >= 102.3,
    "rot": train_df["rot"].abs() >= 128,
    "heading": train_df["heading"] > 360,
    "navstat": train_df["navstat"] >= 9,
}
for col, is_sentinel in sentinel_masks.items():
    # Series.mask builds a new column (upcast to float where needed) instead of
    # writing NaN into an integer column through .loc, which pandas deprecates
    # as an incompatible-dtype setitem.
    train_df[col] = train_df[col].mask(is_sentinel)

In [7]:
# Add missing indicator for AIS data: 1 when any of the listed readings is NaN
# in that row, otherwise 0.
ais_signal_cols = ["cog", "sog", "rot", "heading", "navstat"]
row_has_gap = train_df[ais_signal_cols].isna().any(axis=1)
train_df["ais_missing"] = row_has_gap.astype(int)

#### Merge vessels df

In [8]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from category_encoders.one_hot import OneHotEncoder

In [9]:
# Vessel attributes that are imputed jointly.
fill_cols = [
    "CEU",
    "DWT",
    "GT",
    "vesselType",
    "breadth",
    "enginePower",
    "length",
    "yearBuilt",
]

# vesselType is one-hot encoded before imputation (missing values are handled
# by the encoder via handle_missing="value") and decoded again afterwards.
one_hot = OneHotEncoder(cols=["vesselType"], handle_missing="value")
estimator = BayesianRidge()
imputer = IterativeImputer(estimator=estimator, max_iter=25)

vessels_encoded = one_hot.fit_transform(vessels_df[fill_cols])
filled_array = imputer.fit_transform(vessels_encoded)

# Put the imputed array back into a frame with the encoder's column names so the
# one-hot encoding can be inverted.
filled_encoded = pd.DataFrame(
    data=filled_array,
    columns=one_hot.feature_names_out_,
)
filled = one_hot.inverse_transform(filled_encoded)

# Assignment aligns on the row index.
# NOTE(review): assumes vessels_df still has its default 0..n-1 index - confirm.
vessels_df[fill_cols] = filled[fill_cols]

In [10]:
# Vessel attributes attached to every AIS message ("vesselId" is the join key).
vessel_feature_cols = [
    "shippingLineId",
    "vesselId",
    "CEU",
    "DWT",
    "GT",
    "vesselType",
    "breadth",
    "length",
    "yearBuilt",
]

# Left join: every AIS row is kept even if its vessel is absent from vessels_df.
train_df = train_df.merge(vessels_df[vessel_feature_cols], on="vesselId", how="left")

#### Merge ports df (calculate distance to closest port)

In [11]:
# Port table with its id/coordinate columns renamed so they do not clash with
# the AIS message columns when the two frames are joined.
port_column_names = {
    "portId": "closest_port",
    "latitude": "latitude_port",
    "longitude": "longitude_port",
}
closest_ports = pd.read_csv("../data/ports.csv", sep="|").rename(
    columns=port_column_names
)

In [12]:
orig_cols = train_df.columns.to_list()
distance_col = "dist_to_port"

# Both layers are reprojected to EPSG:3857 before the nearest-neighbour join.
# NOTE(review): distances measured in that projection are not true ground
# distances - confirm this is acceptable for the "dist_to_port" feature.
ports_gdf = df_to_gdf(
    closest_ports, longitude_col="longitude_port", latitude_col="latitude_port"
).to_crs(epsg="3857")

train_gdf = df_to_gdf(train_df).to_crs(epsg="3857")
train_gdf = train_gdf.sjoin_nearest(ports_gdf, how="left", distance_col=distance_col)

# sjoin_nearest emits one output row per equally-near port, so ties would
# duplicate AIS messages (and their index labels). Keep the first match so the
# result has exactly one row per input message.
train_gdf = train_gdf[~train_gdf.index.duplicated(keep="first")]

# Back to a plain DataFrame with the original columns plus the port information.
train_df = pd.DataFrame(train_gdf)[
    orig_cols + [distance_col, "closest_port", "latitude_port", "longitude_port"]
]

#### Transform lat/lng to Euclidean (x, y, z) coordinates

In [13]:
def latlng_to_xyz(
    df: pd.DataFrame,
    lat_col="latitude",
    lng_col="longitude",
) -> pd.DataFrame:
    """Return a copy of ``df`` with unit-sphere Cartesian columns ``x``, ``y``, ``z``.

    Args:
        df: Frame containing the latitude/longitude columns, in degrees.
        lat_col: Name of the latitude column.
        lng_col: Name of the longitude column.

    Returns:
        A copy of ``df`` (the input is not modified) with ``x``, ``y`` and ``z``
        added, or overwritten if they already exist.
    """
    new_df = df.copy()
    lat_rad = np.deg2rad(df[lat_col])
    lng_rad = np.deg2rad(df[lng_col])
    new_df["x"] = np.cos(lat_rad) * np.cos(lng_rad)
    new_df["y"] = np.cos(lat_rad) * np.sin(lng_rad)
    new_df["z"] = np.sin(lat_rad)
    return new_df


def xyz_to_latlng(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``latitude``/``longitude`` (degrees) from ``x, y, z``.

    Latitude is computed as ``arctan2(z, hypot(x, y))``. For unit vectors this
    equals ``arcsin(z)``, but it also stays finite when ``|z|`` slightly exceeds 1
    (floating-point error) or when the vector is not normalised.

    Args:
        df: Frame with Cartesian columns ``x``, ``y`` and ``z``.

    Returns:
        A copy of ``df`` (the input is not modified) with ``latitude`` and
        ``longitude`` added, or overwritten if they already exist.
    """
    new_df = df.copy()
    horizontal_norm = np.hypot(df["x"], df["y"])
    new_df["latitude"] = np.rad2deg(np.arctan2(df["z"], horizontal_norm))
    new_df["longitude"] = np.rad2deg(np.arctan2(df["y"], df["x"]))
    return new_df

In [14]:
# Add x/y/z columns computed from each message's "latitude"/"longitude"
# (the function's default column names).
train_df = latlng_to_xyz(train_df)

In [15]:
# Convert the closest port's coordinates to x/y/z as well. Only the two port
# coordinate columns are passed in, so latlng_to_xyz does not have to copy the
# whole training frame just to derive three columns.
port_xyz = latlng_to_xyz(
    train_df[["latitude_port", "longitude_port"]],
    lat_col="latitude_port",
    lng_col="longitude_port",
)
train_df[["port_x", "port_y", "port_z"]] = port_xyz[["x", "y", "z"]]

#### Fill missing values

In [16]:
# Columns whose gaps are filled from the same vessel's nearest-in-time message.
fill_nearest_cols = ["etaRaw", "portId", "navstat"]

In [17]:
# Fill etaRaw, portId and navstat from the same vessel's nearest-in-time message
# in which none of the selected columns is missing.
to_fill_nearest = (
    train_df[["time", "vesselId"] + fill_nearest_cols]
    # merge_asof returns a fresh 0..n-1 index in time-sorted order, so an
    # index-aligned assignment back into train_df would land on the wrong rows.
    # Remember each row's position in train_df to restore the original order.
    .assign(_row_pos=np.arange(len(train_df)))
    .sort_values("time")
)
donors = to_fill_nearest.dropna().drop(columns="_row_pos")

filled = pd.merge_asof(
    to_fill_nearest,
    donors,
    on="time",
    by="vesselId",
    direction="nearest",
).sort_values("_row_pos")

# The "_y" columns hold the donor rows' values; write them back positionally.
for col in fill_nearest_cols:
    train_df[col] = filled[f"{col}_y"].to_numpy()

In [18]:
# Numeric AIS readings whose gaps are interpolated over time, per vessel.
interpolate_cols = ["cog", "sog", "rot", "heading"]

In [19]:
# Per vessel, index the readings by message time and interpolate the gaps with
# method="time".
# Note: some values will not be interpolated because a vessel has only NaNs for that column
# reset_index() turns the (vesselId, time) group index back into columns and
# leaves a default 0..n-1 index.
# NOTE(review): the result is later assigned back to train_df by row label, which
# presumably relies on train_df having a 0..n-1 index ordered by vesselId and
# then time - confirm.
filled = (
    train_df[interpolate_cols + ["vesselId", "time"]]
    .groupby("vesselId")
    .apply(
        lambda group: group.set_index("time")[interpolate_cols].interpolate(
            method="time"
        ),
        include_groups=False,
    )
    .reset_index()
)

In [20]:
# Write the interpolated readings back; pandas aligns this assignment on the
# row index of `filled` and `train_df`.
train_df[interpolate_cols] = filled[interpolate_cols]

#### Add temporal features

In [25]:
def add_temporal_cols(df: pd.DataFrame, time_col="time") -> pd.DataFrame:
    """Add cyclical sin/cos encodings of ``df[time_col]`` to ``df`` in place.

    Time is measured in whole seconds since 2024-01-01 and encoded with three
    periods: one day, one 366-day year, and one twelfth of that year ("month").
    Six columns named ``{time_col}_{day|year|month}_{sin|cos}`` are added.

    Args:
        df: Frame holding the datetime column; it is modified and returned.
        time_col: Name of the datetime column to encode.

    Returns:
        The same ``df`` object with the new columns.
    """
    seconds_per_day = 24 * 60 * 60
    seconds_per_year = (366) * seconds_per_day
    # Insertion order fixes the column order: day, year, month.
    period_seconds = {
        "day": seconds_per_day,
        "year": seconds_per_year,
        "month": seconds_per_year // 12,
    }
    elapsed_s = (df[time_col] - pd.Timestamp("2024-01-01")) // pd.Timedelta("1s")

    for period_name, period_s in period_seconds.items():
        phase = elapsed_s * (2 * np.pi / period_s)
        df[f"{time_col}_{period_name}_sin"] = np.sin(phase)
        df[f"{time_col}_{period_name}_cos"] = np.cos(phase)
    return df

In [26]:
# Cyclical day/year/month encodings of the message timestamp.
train_df = add_temporal_cols(train_df, "time")

In [27]:
# Same cyclical encodings for the "etaRaw" datetime column.
train_df = add_temporal_cols(train_df, "etaRaw")

#### Categorical encoding

In [28]:
from category_encoders.count import CountEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.one_hot import OneHotEncoder

In [29]:
# Counts of ports might be useful for estimating the size / popularity of the port
count_ports_encoder = CountEncoder(handle_missing="value", handle_unknown=-1)

# The encoder is fitted on the messages' portId values only; the closest-port
# ids are then encoded with those same fitted counts.
reported_port_ids = train_df["portId"].values
closest_port_ids = train_df["closest_port"].values

train_df["port_count"] = count_ports_encoder.fit_transform(reported_port_ids)
train_df["closest_port_count"] = count_ports_encoder.transform(closest_port_ids)

In [30]:
# Encode each shipping line by how many rows in train_df carry that id.
shipping_line_encoder = CountEncoder()
shipping_line_ids = train_df["shippingLineId"].values
train_df["shipping_line_count"] = shipping_line_encoder.fit_transform(shipping_line_ids)

In [31]:
# One-hot encode navstat and vesselType; use_cat_names=True produces column
# names of the form "<column>_<category>".
one_hot = OneHotEncoder(
    cols=["navstat", "vesselType"], handle_missing="value", use_cat_names=True
)
encoded = one_hot.fit_transform(train_df[["navstat", "vesselType"]])

# Merge navstat 0 and navstat 8 into a single indicator.
is_navstat_0 = encoded["navstat_0.0"]
is_navstat_8 = encoded["navstat_8.0"]
encoded["navstat_0or8"] = np.logical_or(is_navstat_0, is_navstat_8).astype(int)

# Only this subset of the indicator columns is copied into train_df.
encoded_cols = [
    "navstat_1.0",
    "navstat_0or8",
    "navstat_2.0",
    "navstat_5.0",
    "navstat_nan",
    "vesselType_83.0",
    "vesselType_21.0",
    "vesselType_14.0",
]
train_df[encoded_cols] = encoded[encoded_cols]

In [32]:
# Encode vesselId by target encoding for x, y, z (i.e. mix of group mean vs. global mean)
vessel_encoder = TargetEncoder(cols=["vesselId"], min_samples_leaf=20, smoothing=10)

# The encoder is re-fitted for every coordinate, so each vessel_<axis> column is
# encoded against its own target column.
for axis in ("x", "y", "z"):
    train_df[f"vessel_{axis}"] = vessel_encoder.fit_transform(
        train_df["vesselId"], train_df[axis]
    )

#### Finalize and standardize/normalize dataset

In [171]:
# Regression targets: the message position on the unit sphere.
target_cols = ["x", "y", "z"]

# Identifier, raw and already-encoded columns that are not passed on.
non_feature_cols = [
    "index",
    "time",
    "navstat",
    "etaRaw",
    "latitude",
    "longitude",
    "vesselId",
    "portId",
    "shippingLineId",
    "vesselType",
    "closest_port",
    "latitude_port",
    "longitude_port",
]
df = train_df.drop(columns=non_feature_cols)

In [172]:
# Split the finalized frame into targets (x, y, z) and everything else.
targets = df[target_cols]
features = df.drop(columns=target_cols)

In [173]:
# Fill the rest with the mean of entire df, but add extra missing indicator
# (computed before filling, so it flags rows that had any NaN feature).
row_has_nan = features.isna().any(axis=1)
features["still_missing"] = row_has_nan.astype(int)
column_means = features.mean()
features = features.fillna(column_means)

In [174]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer

In [175]:
# Column groups, one per scaling strategy. Columns not listed here are passed
# through unchanged (remainder="passthrough").
log_scaled_cols = ["sog", "time_diff", "DWT", "dist_to_port"]
robust_scaled_cols = ["rot", "GT", "breadth", "length", "yearBuilt"]
minmax_scaled_cols = [
    "cog",
    "heading",
    "CEU",
    "port_count",
    "closest_port_count",
    "shipping_line_count",
]

# log(x + 0.01) followed by robust scaling; the offset keeps log finite at x == 0.
log_scale_pipeline = make_pipeline(
    FunctionTransformer(lambda x: np.log(x + 0.01), feature_names_out="one-to-one"),
    RobustScaler(),
)
scale_pipeline = make_pipeline(RobustScaler())
minmax_pipeline = make_pipeline(MinMaxScaler((-1, 1)))

scaler = ColumnTransformer(
    [
        ("log_scale", log_scale_pipeline, log_scaled_cols),
        ("scale", scale_pipeline, robust_scaled_cols),
        ("minmax", minmax_pipeline, minmax_scaled_cols),
    ],
    remainder="passthrough",
)

# Fit first, then read the generated "<transformer>__<column>" names.
scaled_values = scaler.fit_transform(features)
features = pd.DataFrame(data=scaled_values, columns=scaler.get_feature_names_out())

In [178]:
# Re-attach the vessel id and timestamp to the scaled features (aligned on the
# row index).
# NOTE(review): assumes train_df still has the default 0..n-1 index - confirm.
features[["vesselId", "time"]] = train_df[["vesselId", "time"]]

In [179]:
# Display the scaled feature table (pandas truncates the rendering).
features

Unnamed: 0,log_scale__sog,log_scale__time_diff,log_scale__DWT,log_scale__dist_to_port,scale__rot,scale__GT,scale__breadth,scale__length,scale__yearBuilt,minmax__cog,...,remainder__navstat_nan,remainder__vesselType_83.0,remainder__vesselType_21.0,remainder__vesselType_14.0,remainder__vessel_x,remainder__vessel_y,remainder__vessel_z,remainder__still_missing,vesselId,time
0,0.484416,-0.320163,0.355653,0.677310,-6.0,0.431731,0.000000,0.0,-0.888889,0.712142,...,0.0,1.0,0.0,0.0,0.394635,-0.036839,0.314818,0.0,61e9f38eb937134a3c4bfd8b,2024-01-12 14:07:47
1,0.486018,1.211436,0.355653,0.672451,5.0,0.431731,0.000000,0.0,-0.888889,0.709364,...,0.0,1.0,0.0,0.0,0.394635,-0.036839,0.314818,0.0,61e9f38eb937134a3c4bfd8b,2024-01-12 14:31:00
2,0.482794,2.524539,0.355653,0.669823,5.0,0.431731,0.000000,0.0,-0.888889,0.704918,...,0.0,1.0,0.0,0.0,0.394635,-0.036839,0.314818,0.0,61e9f38eb937134a3c4bfd8b,2024-01-12 14:57:23
3,0.482794,0.382664,0.355653,0.669560,6.0,0.431731,0.000000,0.0,-0.888889,0.711031,...,0.0,1.0,0.0,0.0,0.394635,-0.036839,0.314818,0.0,61e9f38eb937134a3c4bfd8b,2024-01-12 15:18:48
4,0.477813,0.172742,0.355653,0.671307,7.0,0.431731,0.000000,0.0,-0.888889,0.706029,...,0.0,1.0,0.0,0.0,0.394635,-0.036839,0.314818,0.0,61e9f38eb937134a3c4bfd8b,2024-01-12 15:39:47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522060,0.451841,0.172742,-0.458035,0.658479,-2.0,-0.874693,-1.428571,-0.4,1.000000,0.801056,...,0.0,1.0,0.0,0.0,0.551898,0.106453,0.822448,0.0,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 22:36:16
1522061,0.449784,0.090846,-0.458035,0.661247,-3.0,-0.874693,-1.428571,-0.4,1.000000,0.801612,...,0.0,1.0,0.0,0.0,0.551898,0.106453,0.822448,0.0,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 22:57:05
1522062,0.437890,0.090846,-0.458035,0.643908,-1.0,-0.874693,-1.428571,-0.4,1.000000,0.981106,...,0.0,1.0,0.0,0.0,0.551898,0.106453,0.822448,0.0,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 23:17:54
1522063,0.486018,-0.158834,-0.458035,0.623019,3.0,-0.874693,-1.428571,-0.4,1.000000,-0.707697,...,0.0,1.0,0.0,0.0,0.551898,0.106453,0.822448,0.0,clh6aqawa0007gh0z9h6zi9bo,2024-05-07 23:38:13


## Testing

In [24]:
# All messages of one example vessel, in chronological order. reset_index()
# (without drop=True) keeps the previous row labels as an extra column.
single_vessel_rows = train_df.query("vesselId == 'clh6aqawa0002gh0zypfa5dut'")
vesselAis = single_vessel_rows.sort_values(by="time").reset_index()

In [25]:
# Take the last message in each 20-minute bin for this vessel and count how
# often each sog value occurs.
vesselAis.resample("20min", on="time").last()["sog"].value_counts()

sog
0.0     2884
13.6      72
13.5      69
13.8      69
13.7      59
        ... 
3.6        1
3.8        1
5.4        1
5.9        1
7.5        1
Name: count, Length: 189, dtype: int64

Idea: compute an adjacency matrix between vessels, linking two vessels when they are less than 5 nautical miles apart within the same 1-hour window