In [1]:
import main_2_pre_processing
import pandas as pd
from src.pre_proc import ais_query
import config
import gc

##### Load data from filtered parquet database

In [2]:
# "ais-data/parquet/" holds the downloaded AIS data after filtering (bbox, ship types, etc.).
# Nothing has been cleaned yet: NaN values and unprocessed columns are still present.
raw_df = ais_query.query_ais_duckdb("ais-data/parquet", verbose=True)

# Quick overview of what was loaded: dimensions first, then the column names.
print(
    f"Raw DataFrame shape: {raw_df.shape}",
    f"Raw DataFrame columns:\n{raw_df.columns.tolist()}",
    sep="\n",
)

[query_ais_duckdb] SQL:
 SELECT * FROM read_parquet('ais-data/parquet/**/*.parquet') WHERE 1=1
Raw DataFrame shape: (7276150, 28)
Raw DataFrame columns:
['Timestamp', 'Type of mobile', 'Latitude', 'Longitude', 'Navigational status', 'ROT', 'SOG', 'COG', 'Heading', 'IMO', 'Callsign', 'Name', 'Ship type', 'Cargo type', 'Width', 'Length', 'Type of position fixing device', 'Draught', 'Destination', 'ETA', 'Data source type', 'A', 'B', 'C', 'D', 'Date', 'MMSI', 'Segment']


##### Look at how many records and unique days are in the data

In [3]:
# Distinct calendar days covered by the raw data, in ascending order.
# Rows without a Timestamp are skipped: a NaT day would sort last, be merged into the
# final period by the fillna(1) below and then break the date formatting.
unique_days = (
    pd.Series(raw_df['Timestamp'].dt.normalize().dropna().unique())
    .sort_values()
)
print(f"Data loaded: {len(raw_df)} records.")
#print("Unique days:", [d.strftime('%Y-%m-%d') for d in unique_days])

# Print continuous periods (split when a day is missing)
if unique_days.empty:
    print("No days found in data.")
else:
    # Already sorted above; only the index needs to be made positional again.
    days = unique_days.reset_index(drop=True)
    # Distance in whole days to the previous entry (NaN for the very first one).
    diffs = days.diff().dt.days
    # The first entry counts as contiguous; any step other than exactly one day
    # starts a new group, so the cumulative sum yields one id per continuous period.
    groups = (diffs.fillna(1) != 1).cumsum()
    # (first day, last day) of every continuous period.
    periods = [(grp.iloc[0], grp.iloc[-1]) for _, grp in days.groupby(groups)]

    print("Continuous periods (split when a day is missing):")
    for start, end in periods:
        if start == end:
            # Isolated single day: print it on its own.
            print(start.strftime("%Y-%m-%d"))
        else:
            # Inclusive day count, hence the +1.
            length = (end - start).days + 1
            print(f"{start.strftime('%Y-%m-%d')} -> {end.strftime('%Y-%m-%d')} ({length} days)")
    print(f"Total periods: {len(periods)}")


Data loaded: 7276150 records.
Continuous periods (split when a day is missing):
2025-08-01 -> 2025-09-27 (58 days)
Total periods: 1


In [4]:
# Free up memory: raw_df was only used for the overview above and is not
# referenced by the cells that follow here.
del raw_df
# Last expression of the cell, so the value returned by gc.collect() is shown as the cell output.
gc.collect()

36

##### Pre-processing

In [5]:
# All values below are read from config.py; adjust them there to suit your run.

# Maximum length of a segment. The minimum length is set during filtering and should
# equal this value if fixed-length segments are wanted.
SEGMENT_MAX_LENGTH = config.SEGMENT_MAX_LENGTH
# Numeric columns to be normalized.
NUMERIC_COLS = config.NUMERIC_COLS

# Start and end date of the training data you want to pre-process.
TRAIN_START_DATE, TRAIN_END_DATE = config.TRAIN_START_DATE, config.TRAIN_END_DATE
# Start and end date of the test data you want to pre-process.
TEST_START_DATE, TEST_END_DATE = config.TEST_START_DATE, config.TEST_END_DATE

# Echo the active settings, one per line, so they are recorded with the notebook output.
print(
    f"SEGMENT_MAX_LENGTH: {SEGMENT_MAX_LENGTH}",
    f"NUMERIC_COLS: {NUMERIC_COLS}",
    f"TRAIN_START_DATE: {TRAIN_START_DATE}",
    f"TRAIN_END_DATE: {TRAIN_END_DATE}",
    f"TEST_START_DATE: {TEST_START_DATE}",
    f"TEST_END_DATE: {TEST_END_DATE}",
    sep="\n",
)

SEGMENT_MAX_LENGTH: 300
NUMERIC_COLS: ['Latitude', 'Longitude', 'SOG', 'COG']
TRAIN_START_DATE: 2025-08-01
TRAIN_END_DATE: 2025-08-31
TEST_START_DATE: 2025-09-01
TEST_END_DATE: 2025-09-01


In [6]:
# Run the pre-processing pipeline on the training period (TRAIN_START_DATE to TRAIN_END_DATE from config.py).
main_2_pre_processing.main_pre_processing(dataframe_type="train")
# The pre-processed DataFrame is now saved to the path specified in config.PRE_PROCESSING_DF_TRAIN_PATH

# Alternative: pre-process only the train DataFrame and split it into train/test later, if you prefer.

[pre_processing] Querying AIS data for training period: 2025-08-01 to 2025-08-31
[query_ais_duckdb] SQL:
 SELECT * FROM read_parquet('ais-data/parquet/**/*.parquet') WHERE 1=1 AND Date IN ('2025-08-01', '2025-08-02', '2025-08-03', '2025-08-04', '2025-08-05', '2025-08-06', '2025-08-07', '2025-08-08', '2025-08-09', '2025-08-10', '2025-08-11', '2025-08-12', '2025-08-13', '2025-08-14', '2025-08-15', '2025-08-16', '2025-08-17', '2025-08-18', '2025-08-19', '2025-08-20', '2025-08-21', '2025-08-22', '2025-08-23', '2025-08-24', '2025-08-25', '2025-08-26', '2025-08-27', '2025-08-28', '2025-08-29', '2025-08-30', '2025-08-31')
[pre_processing] Initial data size: 3996082 records.
[pre_processing] Dropping unnecessary columns and rows with missing values...
[pre_processing] Data size after dropping: 3925808 records.
[pre_processing] Splitting segments to max length 300...
[pre_processing] Saving pre-processed DataFrame to ais-data/pre_processed_df_train.parquet...
[pre_processing] Columns of pre-pro

In [7]:
# Run the same pre-processing pipeline on the test period (TEST_START_DATE to TEST_END_DATE from config.py).
main_2_pre_processing.main_pre_processing(dataframe_type="test")
# The pre-processed DataFrame is now saved to the path specified in config.PRE_PROCESSING_DF_TEST_PATH

[pre_processing] Querying AIS data for testing period: 2025-09-01 to 2025-09-01
[query_ais_duckdb] SQL:
 SELECT * FROM read_parquet('ais-data/parquet/**/*.parquet') WHERE 1=1 AND Date IN ('2025-09-01')
[pre_processing] Initial data size: 111347 records.
[pre_processing] Dropping unnecessary columns and rows with missing values...
[pre_processing] Data size after dropping: 111338 records.
[pre_processing] Splitting segments to max length 300...
[pre_processing] Saving pre-processed DataFrame to ais-data/pre_processed_df_test.parquet...
[pre_processing] Columns of pre-processed DataFrame:
['Timestamp', 'Latitude', 'Longitude', 'SOG', 'COG', 'MMSI', 'DeltaT', 'Segment_nr', 'NavStatus_0', 'NavStatus_1', 'NavStatus_2', 'NavStatus_3', 'ShipTypeID']
[pre_processing] Saving preprocessing metadata to ais-data/pre_processing_metadata_test.json...
