# Port Performance Project - Data Processing Workbook for Status Changes.

This workbook processes data from the AIS system and combines it with port and dock data to support the [Port Performance Project](https://github.com/epistemetrica/Port-Performance-Project). See the README.md file in the main directory for full details. Analysing AIS status changes, rather than all AIS messages, significantly reduces compute needs and is all we need for most of the analysis in the project. Some metrics, however, require analyzing the full set of AIS messages, which is handled in a separate notebook. 

In [1]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=50);


## Pre-process AIS Data for status changes only

The vessel locations and status (e.g., "under way", "anchored", "moored") data include all AIS messages. For much of the analysis of port performance, we only need to know when a vessel *changes* status. We drop all other observations to create our first dataframe. 

Early exploratory analysis identified that vessel AIS statuses often change for very short periods of time, likely due to single AIS pings of errant or missing status. For example, a ship at berth for 4 hours may occaisionally ping "under power" even though the vessel remained at berth. To correct for this, we drop very short duration statuses where the previous and next statuses match.

We also filter to large vessels (>100m in length).

In [2]:
#set minimum meaningful status duration (minutes)
min_duration = 10

In [3]:
#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    try:
        #check file integrity 
        pl.scan_parquet(file).collect_schema()
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift(fill_value=20))
                    .over('mmsi')
                ),
                status_previous = pl.col('status').shift().over('mmsi')
            )
            #keep only new status pings
            .filter(pl.col('status_change')==True)
            #drop change col
            .drop('status_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    except:
        print(f'{file} failed')

#collect all lazyframes
dfs = pl.collect_all(lfs)

In [4]:
#create single pandas dataframe
ais_gdf = (
    #concat dfs
    pl.concat(dfs, how='diagonal_relaxed')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #drop short changes in status between equal statuses
    .with_columns(
        short = ((pl.col('status').shift()==pl.col('status').shift(-1)) & 
                (pl.col('status_duration')<min_duration)).over('mmsi')
    )
    .filter(pl.col('short')!=True)
    #drop repeated same-status messsages
    .with_columns(
        repeat = (pl.col('status')==pl.col('status').shift()).over('mmsi')
    )
    .filter(pl.col('repeat')!=True)
    .drop('repeat')
    #recalculate duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #create docking event id - NOTE may need to ensure this captures all relevant messages
        docking_id = (
            #keep only docking ids associated with docking messages
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            #backfill over vessel
            .backward_fill().over('mmsi')
        )
    )
    #convert to pandas
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
)

### Port and Dock Data

Locations and descriptions for each dock and port come from the BTS and USACE online databases. 

In [5]:
#load port data
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#load dock data
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #drop duplicates with matching geometries, keeping most common data
    .groupby('geometry').agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'nav_unit_id',
        'NAV_UNIT_N':'nav_unit_name',
        'FACILITY_T':'facility_type',
        'CITY_OR_TO':'city',
        'STATE_POST':'state'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()

#coerse back to gdf - groupby appears to have kicked it back to pandas core
docks_gdf = gpd.GeoDataFrame(docks_gdf, geometry='geometry', crs=3857)

In [6]:
gpd.read_file('port data/Dock/Dock.shp').head(10)

Unnamed: 0,FID,LONGITUDE,LATITUDE,NAV_UNIT_I,UNLOCODE,NAV_UNIT_N,LOCATION_D,FACILITY_T,STREET_ADD,CITY_OR_TO,STATE_POST,ZIPCODE,COUNTY_NAM,COUNTY_FIP,CONGRESS,CONGRESS_F,WTWY_NAME,PORT_NAME,PSA_NAME,MILE,BANK,LATITUDE1,LONGITUDE1,OPERATORS,OWNERS,PURPOSE,HIGHWAY_NO,RAILWAY_NO,LOCATION,DOCK,COMMODITIE,CONSTRUCTI,MECHANICAL,REMARKS,VERTICAL_D,DEPTH_MIN,DEPTH_MAX,BERTHING_L,BERTHING_T,DECK_HEIGH,DECK_HEI_1,SERVICE_IN,SERVICE_TE,geometry
0,1,-162.02778,63.47667,01Y8,,"YUTANA BARGE LINES, ST. MICHAELS DK",,Dock,,SAINT MICHAEL,AK,99659,Nome,180,112,0,"Norton Sound, AK",,,100.0,,63.47667,-162.02778,,,,,,97690,775,"Gasoline, Jet Fuel, Kerosene | Distillate,Resi...",,,,,,,,,,,01-JAN-1990,,POINT (-18036849.964 9218097.559)
1,2,-162.289535,63.519975,01Y9,WMO,STEBBINS VILLAGE,,Dock,,WHITE MOUNTAIN,AK,99784,Nome,180,113,0,"Norton Sound, AK",,,85.0,,63.519975,-162.289535,,,,,,97690,800,"Gasoline, Jet Fuel, Kerosene | Distillate,Resi...",,,,,,,,,,,01-JAN-1990,,POINT (-18065988.397 9228900.858)
2,3,-122.193233,45.563392,01YB,,CANDIANA LIGHT WASH,,,,CORBETT,WA,97019,Skamania,59,112,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,132.0,,45.563392,-122.193233,,,,,,90132,401,,,,,,,,,,,,01-JAN-1990,,POINT (-13602488.476 5710656.422)
3,4,-122.194899,45.571659,01YC,,CAPE HORN WASH,,,,CORBETT,WA,97019,Skamania,59,116,3,"Columbia River between Vancouver, WA and The D...",,,132.0,,45.571659,-122.194899,,,,,,90132,501,,,,,,,,,,,,01-JAN-1990,,POINT (-13602673.934 5711970.977)
4,5,-122.181479,45.572182,01YD,,PHOCA ROCK,,,,CORBETT,OR,97019,Multnomah,51,116,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,132.0,,45.572182,-122.181479,,,,,,90132,705,,,,,,,,,,,,01-JAN-1990,,POINT (-13601180.027 5712054.147)
5,6,-122.15719,45.576059,01YF,,PRINDLE DIKE WASH,,,,CORBETT,WA,97019,Skamania,59,112,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,134.0,,45.576059,-122.15719,,,,,,90134,601,,,,,,,,,,,,01-JAN-1990,,POINT (-13598476.188 5712670.710)
6,7,-122.138294,45.580975,01YG,,CROWN ZELLERBACH LOG STG,,,,CORBETT,OR,97019,Multnomah,51,112,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,135.0,,45.580975,-122.138294,,,,,,90135,2,,,,,,,,,,,,01-JAN-1990,,POINT (-13596372.694 5713452.568)
7,8,-122.116946,45.578925,01YH,,MULTNOMAH BEACH,,,,CORBETT,OR,97019,Multnomah,51,116,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,136.0,,45.578925,-122.116946,,,,,,90136,4,,,,,,,,,,,,01-JAN-1990,,POINT (-13593996.246 5713126.520)
8,9,-122.081569,45.594357,01YJ,CZK,COL RIV MI 138 HORSETAIL CR. ORE.,,Dock,,CASCADE LOCKS,OR,97014,Multnomah,51,112,3,"Columbia River between Vancouver, WA and The D...",,,138.0,,45.594357,-122.081569,,,,,,90138,1,Fish,,,,,,,,,,,01-JAN-1990,,POINT (-13590058.096 5715581.235)
9,10,-122.090377,45.58613,01YK,CZK,ONEONTA BAR OREG,,,,CASCADE LOCKS,OR,97014,Multnomah,51,116,3,"Columbia River between Vancouver, WA and The D...","Port of Portland, OR",,138.0,,45.58613,-122.090377,,,,,,90138,3,,,,,,,,,,,,01-JAN-1990,,POINT (-13591038.598 5714272.510)


### Matching Port and Dock data with AIS Messages

First, we match each of the moored (aka "docked" aka "at berth") AIS messages with the nearest port, then backfill the preceeding AIS status changes with the relevant port info. This allows us to observe which port the vessel was headed towards at any give time. 

A similar operation is performed with each dock. 

In [7]:
start = time.time()
stops_gdf = (
    #filter to only moorings
    ais_gdf[ais_gdf.status == 5]
    #join in nearest port to each ais message
    .sjoin_nearest(ports_gdf, how='left', exclusive=True,
                   #set max distance to 10km
                   max_distance = 10000
                   )
    #drop unneeded cols
    .drop(['index_right', 'total'], axis=1)
    #rename cols for clarity
    .rename({'rank':'port_rank', 'type':'port_type'}, axis=1)
)

#create main df
main_gdf = (
    #merge stops back into AIS data
    ais_gdf.merge(stops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#mark port name to unknown for docking messages not matched within max distance
main_gdf.loc[(main_gdf.status==5) & main_gdf.port_name.isnull(), 'port_name'] = 'unknown'
#backfill port info across docking events, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['port_type','port_name','port_rank']] = (
    main_gdf[['docking_id', 'port_type','port_name','port_rank']].groupby('docking_id').bfill()
)
#drop messages with missing or unknown port info (these vessels did not dock near a US port)
main_gdf = main_gdf[main_gdf['port_name'].notnull() & (main_gdf['port_name']!='unknown')]
#merge port geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(ports_gdf[['port_name', 'geometry']], 
                          on='port_name', how='left', suffixes=[None, '_port'])
#compute distance from message loc to port loc
main_gdf['port_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_port'])
#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

In [8]:
#add dock info to main df
dockstops_gdf = (
    #filter to only moorings
    main_gdf[main_gdf.status == 5]
    #join in nearest dock to each ais message
    .sjoin_nearest(
        #keep only dock id
        docks_gdf[['nav_unit_id', 'geometry']],
        #max distanance 1km
        max_distance = 1000, 
        how='left')
    #drop unneeded cols
    .drop(['index_right'], axis=1)
)

#merge docks match back into main gdf
main_gdf = (
    #merge stops back into AIS data
    main_gdf.merge(dockstops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill dock info across docking events, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['nav_unit_id']] = (
    main_gdf[['docking_id', 'nav_unit_id']].groupby('docking_id').bfill()
)
#coerse nav_unit_ids to string for merge 
main_gdf['nav_unit_id'] = main_gdf['nav_unit_id'].astype(str)
docks_gdf['nav_unit_id'] = docks_gdf['nav_unit_id'].astype(str)
#merge dock geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(docks_gdf[['nav_unit_id', 'geometry']], 
                          on='nav_unit_id', how='left', suffixes=[None, '_dock'])
#compute distance from message loc to dock loc
main_gdf['dock_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_dock'])
#rename nav_unit_id to dock_id
main_gdf.rename({'nav_unit_id':'dock_id'}, axis=1, inplace=True)

#add port and dock lat and long to preserve geometries in polars and pandas dfs
main_gdf['port_lat'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.y
main_gdf['port_lon'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.x
main_gdf['dock_lat'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_dock.y
main_gdf['dock_lon'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_dock.x

In [9]:
#inspect
display(main_gdf.shape)
main_gdf.head()

(905499, 32)

Unnamed: 0,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_previous,status_duration,short,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month,dock_id,geometry_dock,dock_dist,port_lat,port_lon,dock_lat,dock_lon
0,48.0,205042000,2021-11-27 18:00:41,1.2,127.4,88.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,5.0,180.0,False,POINT (-10789425.685 3220135.203),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2492724.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2494451.0,33.73957,-118.2095,33.757222,-118.21888
1,48.0,205042000,2021-11-27 21:01:30,2.0,149.2,119.0,3.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,5.0,1881.0,False,POINT (-10745702.729 3209918.145),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2537462.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2539186.0,33.73957,-118.2095,33.757222,-118.21888
2,48.0,205042000,2021-11-29 04:22:45,3.6,121.9,109.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.1,80.0,3.0,534745.0,False,POINT (-10768169.229 3064348.133),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2565193.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2567023.0,33.73957,-118.2095,33.757222,-118.21888
3,48.0,205042000,2022-12-05 12:48:11,0.7,161.7,0.0,1.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,0.0,741.0,False,POINT (-13144130.139 3978148.735),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21665.87,2022,202212,0V0U,POINT (-13160065.524 3996249.272),24115.68,33.73957,-118.2095,33.757222,-118.21888
4,48.0,205042000,2022-12-06 01:09:17,2.9,207.3,211.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,3.0,753.0,False,POINT (-13144338.306 3978299.798),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21413.12,2022,202212,0V0U,POINT (-13160065.524 3996249.272),23864.81,33.73957,-118.2095,33.757222,-118.21888


## Save data

In [10]:
#convert to polars
main_df = pl.DataFrame(
    main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1)
    )

#save to parquet
main_df.write_parquet('port data/ais_status_changes.parquet')