In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import geopandas as gpd
from geopy.distance import geodesic as geopy_geodesic

In [2]:
def mean_geodetic_distance(y_true_lat, y_true_long, y_pred_lat, y_pred_long):
    """Return the mean geodesic distance, in kilometres, between true and predicted points.

    The four arguments are parallel iterables; points are paired positionally
    and pairing stops at the shortest input.
    """
    per_point_km = []
    paired = zip(y_true_lat, y_true_long, y_pred_lat, y_pred_long)
    for lat_true, lng_true, lat_pred, lng_pred in paired:
        separation = geopy_geodesic((lat_true, lng_true), (lat_pred, lng_pred))
        per_point_km.append(separation.km)
    return np.mean(per_point_km)

## Load data

In [3]:
# The AIS training messages are pipe-delimited; the timestamp column is parsed on load.
train_df = pd.read_csv("data/ais_train.csv", sep="|").assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)
# The test file is read with pandas' default delimiter.
test_df = pd.read_csv("data/ais_test.csv").assign(
    time=lambda frame: pd.to_datetime(frame["time"])
)

# Auxiliary lookup tables, all pipe-delimited.
ports_df = pd.read_csv("data/ports.csv", sep="|")
schedules_df = pd.read_csv("data/schedules_to_may_2024.csv", sep="|")
vessels_df = pd.read_csv("data/vessels.csv", sep="|")
# train_df["etaRaw"] = pd.to_datetime(train_df["etaRaw"], format="%Y/%m/%d %H:%M")
train_df.tail()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
1522060,2024-05-07 23:59:07,359.1,13.4,0,1,0,05-08 05:00,52.19131,-5.82223,clh6aqawa0002gh0zypfa5dut,634c4de270937fc01c3a7417
1522061,2024-05-07 23:59:08,12.3,17.1,0,13,0,05-10 03:00,38.96142,-12.00502,61e9f3aeb937134a3c4bfe43,634c4de270937fc01c3a76a1
1522062,2024-05-07 23:59:08,269.8,14.9,-1,270,0,05-15 23:00,49.71372,-5.22042,61e9f43db937134a3c4c0169,634c4de270937fc01c3a787b
1522063,2024-05-07 23:59:08,8.0,18.7,0,6,0,05-08 12:45,38.27895,10.7828,61e9f469b937134a3c4c029b,61d3781293c6feb83e5eb73b
1522064,2024-05-07 23:59:08,336.0,14.3,5,337,0,05-07 23:00,38.98635,-75.13275,62080cff66fc0a8e43c6123a,61d38528b7b7526e1adf3e6f


In [4]:
# Boundary used when splitting full_df later: rows with time before this go to
# training, rows at or after it go to test.
test_start = "2024-05-08 00:00:00"

## Preprocessing and feature engineering

#### Drop vessels with very few rows

In [5]:
# Exclude the two listed vessel IDs from the training rows.
train_df = train_df.loc[
    lambda frame: ~frame["vesselId"].isin(
        ["61e9f3adb937134a3c4bfe37", "61e9f3cbb937134a3c4bff09"]
    )
]

#### Remove duplicated vessel

In [6]:
# `duplicated_vessel` is removed from training; `duplicated_from` is only
# referenced by the disabled test-set remapping below.
duplicated_vessel = "clh6aqawa0001gh0zmijpuho1"
duplicated_from = "63d27587e3fba838ce820405"
train_df = train_df.loc[train_df["vesselId"].ne(duplicated_vessel)]
# test_df.loc[test_df["vesselId"] == duplicated_vessel, "vesselId"] = duplicated_from

#### Set default/missing values to nan

In [7]:
# Replace out-of-range readings with NaN so they are treated as missing values.
# NOTE(review): the thresholds presumably follow the AIS "not available" codes —
# confirm against the data specification.
train_df.loc[train_df["cog"] >= 360, "cog"] = np.nan  # cog at or above 360
train_df.loc[train_df["sog"] >= 102.3, "sog"] = np.nan  # sog at or above 102.3
train_df.loc[abs(train_df["rot"]) >= 128, "rot"] = np.nan  # |rot| at or above 128
train_df.loc[train_df["heading"] > 360, "heading"] = np.nan  # strictly above 360; 360 itself is kept
train_df.loc[train_df["navstat"] >= 9, "navstat"] = np.nan  # status codes 9 and higher

#### Aggregate vessel stats and cluster

In [8]:
from sklearn.cluster import KMeans

In [9]:
# Per-vessel summary statistics computed from the training rows.
# The pct_navstat_* columns are percentages of a vessel's rows with the given
# navstat code; pct_navstat_0 counts both code 0 and code 8.
vessel_stats = train_df.groupby("vesselId").agg(
    mean_latitude=("latitude", "mean"),
    mean_longitude=("longitude", "mean"),
    mean_speed=("sog", "mean"),
    median_speed=("sog", "median"),
    std_speed=("sog", "std"),
    count_entries=("vesselId", "size"),
    pct_navstat_0=("navstat", lambda codes: codes.isin([0, 8]).mean() * 100),
    pct_navstat_1=("navstat", lambda codes: codes.eq(1).mean() * 100),
    pct_navstat_5=("navstat", lambda codes: codes.eq(5).mean() * 100),
).reset_index()

# Every statistic column (everything except the key) is later used as a feature.
vessel_stats_cols = vessel_stats.columns.drop("vesselId").to_list()
vessels_df = vessels_df.merge(vessel_stats, on="vesselId", how="left")

In [10]:
# Fit 20 KMeans clusters on the training positions and label every training row.
# NOTE(review): the cluster is derived from each row's own latitude/longitude,
# which are also the regression targets, and test_df never receives these
# columns, so they are NaN for test rows after stacking. Consider clustering on
# the previous position instead.
kmeans = KMeans(n_clusters=20, random_state=42)
kmeans.fit(train_df[["latitude", "longitude"]])
train_df["cluster"] = kmeans.predict(train_df[["latitude", "longitude"]])

# Attach the coordinates of the assigned cluster centre to each row
# (column 0 of the centres is latitude, column 1 is longitude).
cluster_centers = kmeans.cluster_centers_
train_df["cluster_center_latitude"] = cluster_centers[
    train_df["cluster"].to_numpy(), 0
]
train_df["cluster_center_longitude"] = cluster_centers[
    train_df["cluster"].to_numpy(), 1
]

cluster_cols = ["cluster", "cluster_center_latitude", "cluster_center_longitude"]

#### Stack train and test

In [11]:
# Add the two extra columns to the training rows before stacking. ID == -1 is
# the marker later cells use to recognise training rows.
# NOTE(review): assumes test_df carries "ID" and "scaling_factor" columns —
# confirm against data/ais_test.csv.
train_df["scaling_factor"] = np.nan
train_df["ID"] = -1
# Stack train and test and order each vessel's rows chronologically.
# reset_index() without drop=True keeps the old index as an extra "index" column.
full_df = pd.concat([train_df, test_df]).sort_values(["vesselId", "time"]).reset_index()

#### Util

In [12]:
def df_to_gdf(
    df: pd.DataFrame, longitude_col="longitude", latitude_col="latitude"
) -> gpd.GeoDataFrame:
    """Wrap ``df`` in a GeoDataFrame whose geometry is built from two coordinate columns.

    Args:
        df: Frame holding the coordinate columns.
        longitude_col: Name of the column used as the x coordinate.
        latitude_col: Name of the column used as the y coordinate.

    Returns:
        A GeoDataFrame over ``df`` with point geometry, tagged as EPSG:4326.
    """
    point_geometry = gpd.points_from_xy(df[longitude_col], df[latitude_col])
    return gpd.GeoDataFrame(df, geometry=point_geometry, crs="EPSG:4326")

In [13]:
# Whole seconds elapsed since the same vessel's previous message.
# Each vessel's first row has no predecessor (NaT); handle that explicitly
# instead of relying on NaT being cast to a negative sentinel integer.
# Assume time diff before first msg was 20 min
full_df["time_diff"] = (
    np.floor(full_df.groupby("vesselId")["time"].diff().dt.total_seconds())
    .fillna(60 * 20)
    .astype("int64")
)

# Keep the original guard: any negative difference also falls back to 20 min.
full_df.loc[full_df["time_diff"] < 0, "time_diff"] = 60 * 20

# full_df = full_df[full_df["time_diff"] >= 60].reset_index(drop=True)

In [14]:
# Training rows are the ones tagged with ID == -1.
train_mask = full_df["ID"] == -1
# The raw ETA strings in the training data carry no year (e.g. "05-08 05:00"),
# so "2024-" is prepended before parsing. errors="coerce" turns values that
# cannot be parsed into NaT instead of raising.
full_df.loc[train_mask, "etaRaw"] = pd.to_datetime(
    "2024-" + full_df.loc[train_mask, "etaRaw"], errors="coerce"
)

#### Add prev positions, distance, fwd_azimuth, computed sog

In [15]:
from pyproj import Geod

In [16]:
# Position from the same vessel's previous row (NaN on each vessel's first row).
# Rows are already ordered by vesselId and time, so "previous" means the
# preceding message in time.
full_df[["prev_lat", "prev_lng"]] = full_df.groupby("vesselId")[
    ["latitude", "longitude"]
].shift()

In [17]:
# Geodesic from each row's previous position to its current position on WGS84.
geodesic = Geod(ellps="WGS84")
fwd_azimuth, back_azimuth, distance = geodesic.inv(
    full_df["prev_lng"],
    full_df["prev_lat"],
    full_df["longitude"],
    full_df["latitude"],
)

# pyproj reports the distance in metres; dividing by 1852 gives nautical miles.
full_df["distance"] = distance / (1000 * 1.852)
full_df["fwd_azimuth"] = fwd_azimuth

# Lag within each vessel. A plain .shift() would copy the last row of one
# vessel into the first row of the next one, because full_df holds all vessels
# stacked on top of each other.
full_df["prev_distance"] = full_df.groupby("vesselId")["distance"].shift()
full_df["prev_fwd_azimuth"] = full_df.groupby("vesselId")["fwd_azimuth"].shift()

# Raw coordinate deltas to the same vessel's previous row.
full_df["dx"] = full_df.groupby("vesselId")["longitude"].diff()
full_df["dy"] = full_df.groupby("vesselId")["latitude"].diff()

# lng, lat, back_azimuth = geodesic.fwd(
#     train_df["prev_lng"], train_df["prev_lat"], fwd_azimuth, distance
# )
# train_df["lng_test"] = lng
# train_df["lat_test"] = lat

In [18]:
# Implied speed between consecutive messages: distance divided by elapsed hours.
full_df["computed_sog"] = full_df["distance"] / (full_df["time_diff"] / 3600)
# Lag within each vessel so the first row of a vessel does not inherit the
# speed of whichever vessel happens to precede it in full_df.
full_df["prev_sog"] = full_df.groupby("vesselId")["computed_sog"].shift()

In [19]:
# Upper bound on the implied speed between consecutive messages.
# NOTE(review): units are presumably knots — confirm.
speed_limit = 50

# Training rows (ID == -1) are kept only when computed_sog is below the limit;
# rows with any other ID are always kept. Training rows whose computed_sog is
# NaN fail the comparison and are dropped as well.
# The prev_* columns computed earlier are not recomputed after this filter.
full_df = full_df[(full_df["computed_sog"] < speed_limit) | (full_df["ID"] != -1)]

#### Merge vessels df

In [20]:
# Attach static vessel attributes and the per-vessel statistics to every row.
# Rows whose vesselId is missing from vessels_df keep NaN in the new columns.
full_df = pd.merge(
    full_df,
    vessels_df.loc[
        :,
        [
            "shippingLineId",
            "vesselId",
            "CEU",
            "DWT",
            "GT",
            "vesselType",
            "breadth",
            "length",
            "yearBuilt",
            *vessel_stats_cols,
        ],
    ],
    how="left",
    on="vesselId",
)

#### Merge ports df (calculate distance to closest port)

In [21]:
# Port table with port-specific column names, so the columns stay
# distinguishable from the vessel position columns after the spatial join.
closest_ports = pd.read_csv("data/ports.csv", sep="|").rename(
    columns={
        "portId": "closest_port",
        "latitude": "latitude_port",
        "longitude": "longitude_port",
    }
)

In [22]:
orig_cols = full_df.columns.to_list()
distance_col = "dist_to_port"

# Both layers are projected to EPSG:3857 so the join can measure distances in
# projected units.
ports_gdf = df_to_gdf(
    closest_ports, longitude_col="longitude_port", latitude_col="latitude_port"
).to_crs(epsg="3857")

# The nearest port is looked up for each row's *previous* position.
full_gdf = df_to_gdf(full_df, longitude_col="prev_lng", latitude_col="prev_lat").to_crs(
    epsg="3857"
)
full_gdf = full_gdf.sjoin_nearest(ports_gdf, how="left", distance_col=distance_col)

# sjoin_nearest returns one output row per equidistant nearest port, so an input
# row can appear more than once. Keep the first match per row; otherwise the
# index-aligned assignment below fails on duplicate index labels.
full_gdf = full_gdf[~full_gdf.index.duplicated(keep="first")]

full_df[[distance_col, "prev_closest_port", "latitude_port", "longitude_port"]] = (
    full_gdf[[distance_col, "closest_port", "latitude_port", "longitude_port"]]
)

#### Distance to land

In [23]:
# Land polygons, reprojected to EPSG:3857.
# NOTE(review): nothing in the visible notebook uses land_world; its only
# reference is in a commented-out cell.
land_world = gpd.read_file("data/land_and_ocean/ne_10m_land.zip")
land_world = land_world.to_crs(epsg=3857)

In [24]:
# orig_cols = full_df.columns.to_list()
# df_gdf = df_to_gdf(full_df).to_crs(epsg="3857")

# df_with_land = df_gdf.sjoin_nearest(
#     land_world, how="left", distance_col="distance_to_land"
# )

# df = pd.DataFrame(df_with_land)[orig_cols + ["distance_to_land"]]

#### Add temporal features

In [25]:
def add_temporal_cols_discrete(df: pd.DataFrame) -> "tuple[pd.DataFrame, list[str]]":
    """Add integer calendar features derived from ``df["time"]``.

    The frame is modified in place and also returned.

    Args:
        df: Frame with a datetime column named "time".

    Returns:
        ``(df, cols)`` where ``cols`` lists the feature columns meant for the
        model. "minute" is added to the frame but is not part of ``cols``.
    """
    timestamps = df["time"].dt
    df["minute"] = timestamps.minute
    df["hour"] = timestamps.hour
    df["day_of_week"] = timestamps.dayofweek
    df["day_of_month"] = timestamps.day
    df["month"] = timestamps.month
    df["day_of_year"] = timestamps.dayofyear
    df["week_of_year"] = timestamps.isocalendar().week
    cols = [
        "hour",
        "day_of_week",
        "day_of_month",
        "month",
        "day_of_year",
        "week_of_year",
    ]
    return df, cols

In [26]:
def add_temporal_cols_continuous(df: pd.DataFrame) -> "tuple[pd.DataFrame, list[str]]":
    """Add sine/cosine encodings of ``df["time"]`` over day, year and month periods.

    Time is measured in whole seconds since 2024-01-01. The year period is 366
    days and the month period is one twelfth of that. The frame is modified in
    place and also returned.

    Args:
        df: Frame with a datetime column named "time".

    Returns:
        ``(df, cols)`` where ``cols`` lists the six added column names.
    """
    day_s = 24 * 60 * 60
    year_s = 366 * day_s
    month_s = year_s // 12
    timestamp_s = (df["time"] - pd.Timestamp("2024-01-01")) // pd.Timedelta("1s")

    cols = []
    # Order matters: it fixes both the column order in df and the order of cols.
    for label, period_s in (("day", day_s), ("year", year_s), ("month", month_s)):
        angle = timestamp_s * (2 * np.pi / period_s)
        df[f"time_{label}_sin"] = np.sin(angle)
        df[f"time_{label}_cos"] = np.cos(angle)
        cols.extend([f"time_{label}_sin", f"time_{label}_cos"])
    return df, cols


# Only the discrete calendar features are added here;
# add_temporal_cols_continuous is not called in this cell.
full_df, time_cols = add_temporal_cols_discrete(full_df)

#### Categorical encoding

In [27]:
from category_encoders.count import CountEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.one_hot import OneHotEncoder

In [28]:
# Counts of ports might be useful for estimating the size / popularity of the port.
# The encoder is fitted and applied only on rows that have a nearest port;
# the remaining rows keep NaN in "closest_port_count".
count_ports_encoder = CountEncoder(handle_missing="value", handle_unknown=-1)
count_ports_encoder.fit(
    full_df.loc[full_df["prev_closest_port"].notna(), "prev_closest_port"].values
)
full_df.loc[full_df["prev_closest_port"].notna(), "closest_port_count"] = (
    count_ports_encoder.transform(
        full_df.loc[full_df["prev_closest_port"].notna(), "prev_closest_port"].values
    ).values
)

In [29]:
# Count-encode the shipping line: the encoder is fitted on, and applied to, the
# shippingLineId values of every row in full_df (train and test together).
shipping_line_encoder = CountEncoder()
shipping_line_encoder.fit(full_df["shippingLineId"].values)
full_df["shipping_line_count"] = shipping_line_encoder.transform(
    full_df["shippingLineId"].values
)

In [30]:
# Indicator columns kept from the one-hot encoding of vesselType.
# NOTE(review): these names must match what OneHotEncoder generates with
# use_cat_names=True, which assumes vesselType contains the values 83.0, 21.0
# and 14.0 — a KeyError below means the data changed.
encoded_cols = [
    "vesselType_83.0",
    "vesselType_21.0",
    "vesselType_14.0",
]

one_hot = OneHotEncoder(cols=["vesselType"], handle_missing="value", use_cat_names=True)
encoded = one_hot.fit_transform(full_df["vesselType"])

# Only the selected indicators are copied back onto full_df.
full_df[encoded_cols] = encoded[encoded_cols]

In [31]:
# Add lag features for key metrics
# Most of these columns are already "previous row" values, so e.g. prev_lat_lag1
# is the position two rows back for the same vessel.
lag_cols = ["prev_lat", "prev_lng", "prev_distance", "prev_fwd_azimuth", "time_diff"]
for col in lag_cols:
    # Add lags
    for lag in [1, 2, 3, 4]:
        full_df[f"{col}_lag{lag}"] = full_df.groupby("vesselId")[col].shift(lag)

    # Add rolling means
    # The window ends at (and includes) the current row; min_periods=1 yields a
    # value even when fewer than `window` rows are available. reset_index(0, drop=True)
    # drops the vesselId level so the result aligns with full_df's own index.
    # NOTE(review): for prev_fwd_azimuth this is a plain arithmetic mean of
    # angles — confirm that is intended.
    for window in [4, 12, 24]:
        full_df[f"{col}_rolling_mean_{window}"] = (
            full_df.groupby("vesselId")[col]
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(0, drop=True)
        )

# Rolling distance totals over the same windows, per vessel.
for window in [4, 12, 24]:
    full_df[f"prev_distance_rolling_sum_{window}"] = (
        full_df.groupby("vesselId")["prev_distance"]
        .rolling(window=window, min_periods=1)
        .sum()
        .reset_index(0, drop=True)
    )

# Add the new columns to features list
# The names are rebuilt here with the same patterns used in the loops above.
lag_rolling_cols = [
    f"{col}_{transform}"
    for col in lag_cols
    for transform in (
        [f"lag{i}" for i in range(1, 5)] + [f"rolling_mean_{w}" for w in [4, 12, 24]]
    )
] + [f"prev_distance_rolling_sum_{w}" for w in [4, 12, 24]]

In [32]:
def add_circular_features(df, lat_col, long_col, azimuth_col):
    """Add radian conversions and sine/cosine encodings of three angle columns.

    For each of latitude, longitude and azimuth the source column is converted
    with ``np.deg2rad`` and stored as ``lat_rad`` / ``lon_rad`` / ``azimuth_rad``,
    then its sine and cosine are stored as ``sin_*`` / ``cos_*``. The frame is
    modified in place and also returned.

    Returns:
        ``(df, cols)`` where ``cols`` lists the six sine/cosine column names.
    """
    source_by_prefix = {"lat": lat_col, "lon": long_col, "azimuth": azimuth_col}

    # Convert degrees to radians
    for prefix, source_col in source_by_prefix.items():
        df[f"{prefix}_rad"] = np.deg2rad(df[source_col])

    # Sine and Cosine transformations
    for prefix in source_by_prefix:
        radians = df[f"{prefix}_rad"]
        df[f"sin_{prefix}"] = np.sin(radians)
        df[f"cos_{prefix}"] = np.cos(radians)

    return df, ["sin_lat", "cos_lat", "sin_lon", "cos_lon", "cos_azimuth", "sin_azimuth"]


# Encode the previous position and previous heading, so these features describe
# where the vessel was before the row being predicted.
full_df, circular_cols = add_circular_features(
    full_df, "prev_lat", "prev_lng", "prev_fwd_azimuth"
)

#### Finalize dataset

In [33]:
# Model input columns. The list order is the column order passed to the model.
features = (
    [
        # Elapsed time and previous-step kinematics
        "time_diff",
        "prev_lat",
        "prev_lng",
        "prev_sog",
        "prev_distance",
        "prev_fwd_azimuth",
        # Static vessel attributes merged from vessels_df
        "CEU",
        "DWT",
        "GT",
        "breadth",
        "length",
        "yearBuilt",
        # Nearest port to the previous position
        "dist_to_port",
        "latitude_port",
        "longitude_port",
    ]
    + time_cols
    + encoded_cols
    + lag_rolling_cols
    # NOTE(review): cluster_cols are only computed for training rows, from the
    # row's own latitude/longitude (the targets) — confirm this is intended.
    + cluster_cols
    + vessel_stats_cols
    + circular_cols
    + [
        "closest_port_count",
        "shipping_line_count",
    ]
)

# Candidate target columns; the training cell fits only "latitude" and "longitude".
targets = ["distance", "fwd_azimuth", "latitude", "longitude"]

In [46]:
# Rows tagged as training data (ID == -1); the split below does not use this mask.
train_mask = full_df["ID"] == -1
# val_df = full_df[(full_df["time"] > val_cutoff) & (full_df["time"] < test_start)]
# Split on time: everything before test_start is training, the rest is test.
# This rebinds train_df/test_df from the raw frames to engineered slices of full_df.
train_df = full_df[full_df["time"] < test_start]
test_df = full_df[full_df["time"] >= test_start]

## Training

In [35]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor

#### Models

In [36]:
# MultiOutputRegressor fits one copy of the wrapped regressor per target column.
# TransformedTargetRegressor passes each target through np.deg2rad before
# fitting and maps predictions back with np.rad2deg.
model = MultiOutputRegressor(
    TransformedTargetRegressor(
        regressor=xgb.XGBRegressor(max_depth=12, n_estimators=100, random_state=42),
        func=np.deg2rad,
        inverse_func=np.rad2deg,
    )
)

In [37]:
# model = MultiOutputRegressor(
#     StackingRegressor(
#         estimators=[
#             (
#                 "xgb",
#                 xgb.XGBRegressor(
#                     alpha=0.05, max_depth=12, n_estimators=100, random_state=123
#                 ),
#             ),
#             (
#                 "hgb",
#                 HistGradientBoostingRegressor(
#                     max_depth=12, max_iter=100, random_state=123
#                 ),
#             ),
#             ("lgb", LGBMRegressor(max_depth=12, n_estimators=100, random_state=123)),
#             (
#                 "cat",
#                 CatBoostRegressor(
#                     max_depth=12, n_estimators=100, random_seed=123, verbose=False
#                 ),
#             ),
#         ],
#         final_estimator=xgb.XGBRegressor(
#             max_depth=6, n_estimators=50, random_state=123
#         ),
#         passthrough=True,
#         cv=2,
#     )
# )

#### Training

In [41]:
# Fit on the engineered features; output 0 is latitude and output 1 is longitude.
model.fit(train_df[features], train_df[["latitude", "longitude"]])

NameError: name 'X_train' is not defined

#### Make predictions autoregressively

In [43]:
from tqdm import tqdm
import warnings

# Silence every warning from this point on.
# NOTE(review): a blanket filter also hides warnings raised by the libraries
# used during prediction — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
def find_closest_port_for_row(row, longitude_col, latitude_col):
    """Find the port nearest to a single position.

    Relies on the notebook globals ``df_to_gdf`` and ``ports_gdf``.

    Args:
        row: The position, either as a Series (one record) or as a DataFrame,
            of which only the first row is used.
        longitude_col: Name of the longitude field in ``row``.
        latitude_col: Name of the latitude field in ``row``.

    Returns:
        ``(closest_port, dist_to_port, latitude_port, longitude_port)`` taken
        from the first match of the nearest-port join.
    """
    if isinstance(row, pd.DataFrame):
        # pd.DataFrame([frame]) does not build a one-row frame, so a DataFrame
        # input is used directly (copied, so the caller's frame is untouched).
        df_row = row.iloc[[0]].copy()
    else:
        df_row = pd.DataFrame([row])

    row_gdf = df_to_gdf(
        df_row, longitude_col=longitude_col, latitude_col=latitude_col
    ).to_crs(epsg="3857")

    row_with_nearest_port = row_gdf.sjoin_nearest(
        ports_gdf, how="left", distance_col="dist_to_port"
    )

    # Extract the closest port info (first match if several ports tie)
    closest_port = row_with_nearest_port["closest_port"].values[0]
    dist_to_port = row_with_nearest_port["dist_to_port"].values[0]
    latitude_port = row_with_nearest_port["latitude_port"].values[0]
    longitude_port = row_with_nearest_port["longitude_port"].values[0]

    return closest_port, dist_to_port, latitude_port, longitude_port

In [44]:
def time_series_prediction(test_df: pd.DataFrame):
    """Predict positions for every test row, one vessel at a time, autoregressively.

    For each vessel the rows are visited in frame order. The model predicts
    latitude and longitude for the current row, and that prediction is written
    into the next row's "previous step" features (position, distance, azimuth,
    speed, nearest-port features, lags and rolling aggregates) before the next
    row is predicted.

    Relies on the notebook globals ``model``, ``features``, ``lag_cols``,
    ``geodesic``, ``count_ports_encoder`` and ``find_closest_port_for_row``.

    NOTE(review): rolling aggregates are recomputed from the vessel's test rows
    only, so the windows do not reach back into the training history.

    Args:
        test_df: Rows to predict. Must contain "vesselId", "time_diff",
            "prev_lat", "prev_lng" and every column in ``features``.

    Returns:
        ``(lat_pred, lng_pred)``: two lists with one prediction per row, ordered
        by vessel (first-appearance order) and then by row order within a vessel.
    """
    lng_pred = []
    lat_pred = []
    total_steps = test_df.shape[0]

    with tqdm(total=total_steps) as progress_bar:
        for vesselId in test_df["vesselId"].unique():
            vessel_rows = test_df[test_df["vesselId"] == vesselId].copy()

            steps = vessel_rows.shape[0]
            vessel_rows["step"] = np.arange(steps)

            for i in range(steps):
                row = vessel_rows.loc[vessel_rows["step"] == i]

                # One predict call yields both outputs (column 0 = latitude,
                # column 1 = longitude).
                prediction = model.predict(row[features])
                latitude = prediction[:, 0].item()
                longitude = prediction[:, 1].item()
                lng_pred.append(longitude)
                lat_pred.append(latitude)

                if i == steps - 1:
                    # Nothing left to feed forward for this vessel.
                    progress_bar.update()
                    continue

                # Movement from the row's previous position to the prediction.
                fwd_azimuth, back_azimuth, distance = geodesic.inv(
                    row["prev_lng"].item(),
                    row["prev_lat"].item(),
                    longitude,
                    latitude,
                )
                distance = distance / (1.852 * 1000)
                elapsed_hours = row["time_diff"].item() / 3600
                # Avoid ZeroDivisionError when two messages share a timestamp.
                sog = distance / elapsed_hours if elapsed_hours > 0 else np.nan

                # The predicted position is the next row's "previous position",
                # so its port features must be looked up at the prediction,
                # matching how they were built for training.
                closest_port, dist_to_port, latitude_port, longitude_port = (
                    find_closest_port_for_row(
                        pd.Series({"prev_lat": latitude, "prev_lng": longitude}),
                        latitude_col="prev_lat",
                        longitude_col="prev_lng",
                    )
                )
                # The encoder was fitted on a 1-D array, so pass the same shape.
                closest_port_count = count_ports_encoder.transform(
                    np.array([closest_port], dtype=object)
                ).values.item()

                # Training computed sin/cos on radians (see add_circular_features),
                # so the degrees must be converted before encoding.
                lat_rad = np.deg2rad(latitude)
                lon_rad = np.deg2rad(longitude)
                azimuth_rad = np.deg2rad(fwd_azimuth)

                next_row = vessel_rows["step"] == i + 1
                vessel_rows.loc[
                    next_row,
                    [
                        "prev_distance",
                        "prev_fwd_azimuth",
                        "prev_lat",
                        "prev_lng",
                        "cos_lat",
                        "sin_lat",
                        "cos_lon",
                        "sin_lon",
                        "cos_azimuth",
                        "sin_azimuth",
                        "prev_sog",
                        "closest_port_count",
                        "dist_to_port",
                        "latitude_port",
                        "longitude_port",
                    ],
                ] = [
                    distance,
                    fwd_azimuth,
                    latitude,
                    longitude,
                    np.cos(lat_rad),
                    np.sin(lat_rad),
                    np.cos(lon_rad),
                    np.sin(lon_rad),
                    np.cos(azimuth_rad),
                    np.sin(azimuth_rad),
                    sog,
                    closest_port_count,
                    float(dist_to_port),
                    float(latitude_port),
                    float(longitude_port),
                ]

                # Shift the lag chain forward: the next row's lag1 is this row's
                # value and lagN is this row's lag(N-1). time_diff is known for
                # every test row up front, so it is left alone.
                for col in [col for col in lag_cols if col != "time_diff"]:
                    vessel_rows.loc[next_row, f"{col}_lag1"] = vessel_rows.loc[
                        vessel_rows["step"] == i, col
                    ].item()
                    for lag in range(2, 5):
                        vessel_rows.loc[next_row, f"{col}_lag{lag}"] = vessel_rows.loc[
                            vessel_rows["step"] == i, f"{col}_lag{lag-1}"
                        ].item()

                    # Rolling means over the rows seen so far, including the
                    # next row's freshly written value.
                    for window in [4, 12, 24]:
                        vessel_rows.loc[next_row, f"{col}_rolling_mean_{window}"] = (
                            vessel_rows.loc[vessel_rows["step"] <= i + 1, col]
                            .tail(window)
                            .mean()
                        )

                for window in [4, 12, 24]:
                    vessel_rows.loc[
                        next_row, f"prev_distance_rolling_sum_{window}"
                    ] = (
                        vessel_rows.loc[vessel_rows["step"] <= i + 1, "prev_distance"]
                        .tail(window)
                        .sum()
                    )

                progress_bar.update()

    return lat_pred, lng_pred


# Run the autoregressive rollout over all test rows. The two lists follow the
# order in which the function visits the rows (vessel by vessel).
lat_pred, lng_pred = time_series_prediction(test_df)

  0%|          | 0/60242 [00:00<?, ?it/s]


NameError: name 'find_closest_port_for_row' is not defined