In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.layers import Dense, Input, LSTM, Concatenate, Dropout, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

In [2]:
path = r'C:\Users\firoj\OneDrive\Desktop\thesis_test\Thesis_test_01\Final_data'
os.chdir(path)

In [3]:
df = pd.read_csv('main_data.csv')
df.head()

Unnamed: 0,ais_timestamp,vessel_mmsi,vessel_length,lon,lat,distance,speed,destination_lat,destination_lon,remaining_distance,lock_status,water_level_cm,max_cm,sig_cm,turb_cm,tp,sine,cosine,target
0,01.05.2023 07:08,210090000.0,88.0,8.545379,53.56142,0.0,110.0,53.49468,8.504,7.909105,1.0,502.0,58.06,36.66,52.61,356.8,0.48481,-0.87462,0.45
1,01.05.2023 07:11,210090000.0,88.0,8.554007,53.55247,1.15,112.0,53.49468,8.504,7.226372,1.0,506.0,53.66,35.75,53.88,347.8,0.5,-0.866025,0.45
2,01.05.2023 07:14,210090000.0,88.0,8.562378,53.54377,1.11,111.0,53.49468,8.504,6.685145,1.0,510.0,55.09,37.16,57.33,359.2,0.48481,-0.87462,0.45
3,01.05.2023 07:17,210090000.0,88.0,8.567384,53.53481,1.05,110.0,53.49468,8.504,6.121669,1.0,514.0,55.09,38.33,62.32,362.5,-0.052336,-0.99863,0.45
4,01.05.2023 07:20,210090000.0,88.0,8.56215,53.52593,1.05,110.0,53.49468,8.504,5.182657,1.0,518.0,55.09,36.16,65.35,346.5,-0.681998,-0.731354,0.45


In [4]:
df.shape

(77365, 19)

In [5]:
final_df = df.drop(columns=['ais_timestamp',
                            'vessel_mmsi',
                            'sig_cm'])

In [6]:
final_df.head()

Unnamed: 0,vessel_length,lon,lat,distance,speed,destination_lat,destination_lon,remaining_distance,lock_status,water_level_cm,max_cm,turb_cm,tp,sine,cosine,target
0,88.0,8.545379,53.56142,0.0,110.0,53.49468,8.504,7.909105,1.0,502.0,58.06,52.61,356.8,0.48481,-0.87462,0.45
1,88.0,8.554007,53.55247,1.15,112.0,53.49468,8.504,7.226372,1.0,506.0,53.66,53.88,347.8,0.5,-0.866025,0.45
2,88.0,8.562378,53.54377,1.11,111.0,53.49468,8.504,6.685145,1.0,510.0,55.09,57.33,359.2,0.48481,-0.87462,0.45
3,88.0,8.567384,53.53481,1.05,110.0,53.49468,8.504,6.121669,1.0,514.0,55.09,62.32,362.5,-0.052336,-0.99863,0.45
4,88.0,8.56215,53.52593,1.05,110.0,53.49468,8.504,5.182657,1.0,518.0,55.09,65.35,346.5,-0.681998,-0.731354,0.45


In [7]:
final_df.columns

Index(['vessel_length', 'lon', 'lat', 'distance', 'speed', 'destination_lat',
       'destination_lon', 'remaining_distance', 'lock_status',
       'water_level_cm', 'max_cm', 'turb_cm', 'tp', 'sine', 'cosine',
       'target'],
      dtype='object')

In [8]:
# Exclude the target column and any other columns we don't want to standardize
columns_to_exclude = ['lock_status', 'sine', 'cosine', 'target']  
features_to_standardize = final_df.columns.difference(columns_to_exclude)

# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the features
final_df[features_to_standardize] = scaler.fit_transform(final_df[features_to_standardize])

In [9]:
final_df.head()

Unnamed: 0,vessel_length,lon,lat,distance,speed,destination_lat,destination_lon,remaining_distance,lock_status,water_level_cm,max_cm,turb_cm,tp,sine,cosine,target
0,-0.144523,-0.509426,1.00343,-0.426346,1.513499,0.817323,-0.662276,0.19378,1.0,0.397603,-0.753862,-1.127682,0.052111,0.48481,-0.87462,0.45
1,-0.144523,-0.466317,0.976728,0.839483,1.560916,0.817323,-0.662276,0.092566,1.0,0.418867,-0.834383,-1.111667,-0.076907,0.5,-0.866025,0.45
2,-0.144523,-0.424493,0.950772,0.795454,1.537207,0.817323,-0.662276,0.012329,1.0,0.440132,-0.808213,-1.068162,0.086516,0.48481,-0.87462,0.45
3,-0.144523,-0.399481,0.924041,0.729411,1.513499,0.817323,-0.662276,-0.071205,1.0,0.461396,-0.808213,-1.005238,0.133823,-0.052336,-0.99863,0.45
4,-0.144523,-0.425632,0.897548,0.729411,1.513499,0.817323,-0.662276,-0.210412,1.0,0.48266,-0.808213,-0.967029,-0.095543,-0.681998,-0.731354,0.45


In [10]:
final_df.columns

Index(['vessel_length', 'lon', 'lat', 'distance', 'speed', 'destination_lat',
       'destination_lon', 'remaining_distance', 'lock_status',
       'water_level_cm', 'max_cm', 'turb_cm', 'tp', 'sine', 'cosine',
       'target'],
      dtype='object')

In [11]:
# features and target variable
X = final_df.drop('target', axis=1)
y = final_df['target']

# Split the dataset into training and temp (validation + test) sets
# Let's say we want 70% of data in the training set and 30% in the temp set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temp set equally into validation and test sets
# This will result in 15% of the original data in the validation set and 15% in the test set
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [12]:
# Define the base directory path
base_dir = r'C:\\Users\\firoj\\OneDrive\\Desktop\\thesis_test\\Thesis_test_01\\Feature_matrix\\'
os.chdir(base_dir)

In [13]:
# Save the feature sets
np.save(base_dir + 'X_train.npy', X_train)
np.save(base_dir + 'X_val.npy', X_val)
np.save(base_dir + 'X_test.npy', X_test)

# Save the target sets
np.save(base_dir + 'y_train.npy', y_train)
np.save(base_dir + 'y_val.npy', y_val)
np.save(base_dir + 'y_test.npy', y_test)