In [12]:
# import necessary libraries
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.preprocessing import MinMaxScaler

In [13]:
# Read the three input tables from the working directory: the cleaned AIS
# training records (default comma separator) plus the pipe-delimited port
# and schedule tables.
pipe_delimited = {'sep': '|'}
ais_train = pd.read_csv('ais_train_cleaned.csv')
ports = pd.read_csv('ports.csv', **pipe_delimited)
schedules = pd.read_csv('schedules_to_may_2024.csv', **pipe_delimited)

  ais_train = pd.read_csv('ais_train_cleaned.csv')


In [16]:
# Prepare the schedule table for per-vessel, time-ordered processing:
# parse arrivalDate into datetimes, then order the rows by vessel and by
# arrival time within each vessel.
# NOTE(review): no previous-port column is derived in this cell; it only
# parses and sorts.
schedules = (
    schedules
    .assign(arrivalDate=lambda frame: pd.to_datetime(frame['arrivalDate']))
    .sort_values(by=['vesselId', 'arrivalDate'])
)


In [None]:
# Timing and port-presence features for every AIS record.

# Parse the timestamps; values that cannot be parsed become NaT.
ais_train['time'] = pd.to_datetime(ais_train['time'], errors='coerce')

# Hours elapsed since the same vessel's previous AIS record.
# NOTE(review): despite the column name, this is the gap between consecutive
# records of one vessel, not the time since its last port call - confirm the
# intended definition before relying on it.
# The gap is computed on a (vesselId, time)-ordered view so the result does not
# depend on the row order of the file. Assigning the Series back aligns on the
# index, so ais_train keeps its current row order (assumes the index is unique,
# as it is straight after read_csv).
hours_since_previous_record = (
    ais_train.sort_values(['vesselId', 'time'])
    .groupby('vesselId')['time']
    .diff()
    .dt.total_seconds()
    / 3600
)
ais_train['time_since_last_port_call'] = hours_since_previous_record

# 1 when the record carries a portId, 0 when portId is missing.
ais_train['at_port'] = ais_train['portId'].notna().astype(int)


In [20]:
# Weekend flag: 1 when day_of_week >= 5, otherwise 0 (missing values compare
# as False and therefore map to 0).
# NOTE(review): assumes day_of_week uses the Monday=0 ... Sunday=6 convention,
# so 5 and 6 are Saturday and Sunday - confirm where the column is built.
# Vectorized comparison instead of a per-row Python lambda.
ais_train['is_weekend'] = (ais_train['day_of_week'] >= 5).astype(int)

In [22]:
# Min-max scale the continuous columns with MinMaxScaler's default settings.
# The scaled values overwrite the raw ones in ais_train, so any later cell
# that reads these columns sees the scaled numbers.
numerical_features = [
    'latitude', 'longitude',
    'sog', 'cog',
    'distance_travelled',
    'delta_sog', 'delta_cog',
]

scaler = MinMaxScaler()
scaler.fit(ais_train[numerical_features])
ais_train[numerical_features] = scaler.transform(ais_train[numerical_features])


In [23]:
# Part-of-day bucketing used to build the time_window feature.
def time_window(hour):
    """Return the part-of-day label for an hour value.

    Half-open ranges: [5, 12) -> 'morning', [12, 17) -> 'afternoon',
    [17, 21) -> 'evening'. Any value outside those ranges (including one
    that fails every comparison, such as NaN) yields 'night'.
    """
    daytime_ranges = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for start, stop, label in daytime_ranges:
        if start <= hour < stop:
            return label
    return 'night'

# Bucket each record's hour_of_day into a part-of-day label, then one-hot
# encode it. The labels are stored as a Categorical with a fixed category list
# so the dummy columns are always time_window_evening / _morning / _night
# ('afternoon', the first category, is the dropped baseline), even when a
# bucket does not occur in the data. With all four buckets present this matches
# what get_dummies produces from the plain string labels.
time_window_labels = ['afternoon', 'evening', 'morning', 'night']
ais_train['time_window'] = pd.Categorical(
    ais_train['hour_of_day'].apply(time_window),
    categories=time_window_labels,
)
ais_train = pd.get_dummies(ais_train, columns=['time_window'], drop_first=True)


In [24]:
# Per-vessel mean of sog and cog, attached to every record of that vessel as
# avg_sog / avg_cog.
# NOTE(review): the means are taken over sog/cog as they currently stand in
# ais_train, and cog is averaged arithmetically - if it is a compass bearing a
# circular mean may be more appropriate; confirm.
avg_features = (
    ais_train.groupby('vesselId')
    .agg(avg_sog=('sog', 'mean'), avg_cog=('cog', 'mean'))
    .reset_index()
)

# Drop avg_* columns left over from an earlier run of this cell before merging;
# otherwise a re-run would produce avg_sog_x / avg_sog_y suffix columns.
ais_train = (
    ais_train.drop(columns=['avg_sog', 'avg_cog'], errors='ignore')
    .merge(avg_features, on='vesselId', how='left')
)


In [25]:

# Write the engineered feature table to CSV (index column omitted) so the
# modeling phase can load it.
engineered_csv = 'ais_train_feature_engineered.csv'
ais_train.to_csv(engineered_csv, index=False)
