In [6]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import joblib
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm





In [14]:
# load the feature-engineered dataset
ais_train = pd.read_csv('ais_train_feature_engineered.csv')

# handling missing values by filling with 0
ais_train['distance_travelled'].fillna(0, inplace=True)
ais_train['delta_sog'].fillna(0, inplace=True)
ais_train['delta_cog'].fillna(0, inplace=True)
ais_train['time_since_last_port_call'].fillna(0, inplace=True)

In [17]:
# selecting Features and Target Variables
features = ['latitude', 'longitude', 'sog', 'cog', 'distance_travelled', 'delta_sog', 'delta_cog', 'is_weekend', 'at_port', 'time_since_last_port_call', 'avg_sog', 'avg_cog']
target = ['latitude', 'longitude']  # predicting future latitude and longitude
features


['latitude',
 'longitude',
 'sog',
 'cog',
 'distance_travelled',
 'delta_sog',
 'delta_cog',
 'is_weekend',
 'at_port',
 'time_since_last_port_call',
 'avg_sog',
 'avg_cog']

In [18]:
# splitting the data into training and validation sets
X = ais_train[features]
y = ais_train[target]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:

# model Selection and Training
models = {
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

model_results = {}

In [20]:

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    model_results[model_name] = mse
    joblib.dump(model, f'{model_name}_model.pkl')
    print(f"{model_name} Model training complete. Model saved as '{model_name}_model.pkl'. Mean Squared Error: {mse}")

RandomForest Model training complete. Model saved as 'RandomForest_model.pkl'. Mean Squared Error: 1.4167622698423459e-08


ValueError: y should be a 1d array, got an array of shape (1217116, 2) instead.

In [None]:

# Feature Engineering Phase 3: Model Training



# ARIMA for Latitude and Longitude (Univariate Time Series Modeling)
try:
    arima_model_lat = ARIMA(y_train['latitude'], order=(5, 1, 0))
    arima_model_lat_fit = arima_model_lat.fit()
    y_pred_lat = arima_model_lat_fit.forecast(steps=len(y_val))
    mse_lat = mean_squared_error(y_val['latitude'], y_pred_lat)
    print(f"ARIMA Model Latitude - Mean Squared Error: {mse_lat}")
except Exception as e:
    print(f"ARIMA Model for Latitude failed: {e}")

# LSTM Model (Deep Learning)
try:
    X_train_lstm = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_val_lstm = X_val.values.reshape((X_val.shape[0], 1, X_val.shape[1]))

    lstm_model = Sequential()
    lstm_model.add(LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
    lstm_model.add(Dropout(0.2))
    lstm_model.add(Dense(2))
    lstm_model.compile(optimizer='adam', loss='mse')
    lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, validation_data=(X_val_lstm, y_val), verbose=0)

    y_pred_lstm = lstm_model.predict(X_val_lstm)
    mse_lstm = mean_squared_error(y_val, y_pred_lstm)
    print(f"LSTM Model - Mean Squared Error: {mse_lstm}")
except Exception as e:
    print(f"LSTM Model failed: {e}")
