# Port Performance Project - Data Processing Workbook for Status Changes.

This workbook processes data from the AIS system and combines it with port and dock data to support the [Port Performance Project](https://github.com/epistemetrica/Port-Performance-Project). See the README.md file in the main directory for full details. Analysing AIS status changes, rather than all AIS messages, significantly reduces compute needs and is all we need for most of the analysis in the project. Some metrics, however, require analyzing the full set of AIS messages, which is handled in a separate notebook. 

In [3]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=50);


## Pre-process AIS Data for status changes only

The vessel locations and status (e.g., "under way", "anchored", "moored") data include all AIS messages. For much of the analysis of port performance, we only need to know when a vessel *changes* status. We drop all other observations to create our first dataframe. 

Early exploratory analysis identified that vessel AIS statuses often change for very short periods of time, likely due to single AIS pings of errant or missing status. For example, a ship at berth for 4 hours may occaisionally ping "under power" even though the vessel remained at berth. To correct for this, we drop very short duration statuses where the previous and next statuses match.

We also filter to large vessels (>100m in length).

In [4]:
#set minimum meaningful status duration (minutes)
min_duration = 10

In [None]:
#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    try:
        #check file integrity 
        pl.scan_parquet(file).collect_schema()
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift())
                    .over('mmsi')
                )
            )
            #keep only new status pings
            .filter(pl.col('status_change')==True)
            #drop change col
            .drop('status_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    except:
        print(f'{file} failed')

#collect all lazyframes
dfs = pl.collect_all(lfs)

In [9]:
ports=gpd.read_file('port data/Dock/Dock.shp').drop_duplicates(subset='PORT_NAME')
a=.5
b=.5
def inPortWaters():
    for portName,portlat,portlng in list(zip(ports.PORT_NAME,ports.LATITUDE,ports.LONGITUDE)):
        yield ((pl.col('lat')<portlat+a)&(pl.col('lat')>portlat-a)&(pl.col('lon')>portlng-b)&(pl.col('lon')<portlng+b)).alias(f'{portName}_in_port_waters')


In [13]:

#init list of lazyframes
lfs = []
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    #try:
        #check file integrity 
        #pl.scan_parquet(file).collect_schema()
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
            # .with_columns(
            #     inPortWaters()    
            # )
            #identify if the vessel is in any port waters (any of the port_waters booleans are true)
            .with_columns(
                in_port_waters=pl.any_horizontal(inPortWaters()) #if any true, then in_port_waters==TRUE, in_port_waters is now a column in lf, TRUE if in any ports waters, false otherwise
            )
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift(fill_value=20))
                    .over('mmsi')
                ),
                status_previous = pl.col('status').shift().over('mmsi')
            )
            .with_columns(
                in_port_waters_change = (
                    pl.col('in_port_waters').ne(pl.col('in_port_waters').shift(fill_value=None))
                    .over('mmsi')
                ),
            )
            #keep only new status pings
            .filter(pl.col('status_change')|pl.col('in_port_waters_change'))
            #drop change col
            .drop('status_change','in_port_waters_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    # except:
    #     print(f'{file} failed')
#collect all lazyframes
#dfs = pl.collect_all(lfs)
#df=pl.concat(dfs,how='diagonal')
lfs[0].collect().head() 

mmsi,time,lat,lon,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,in_port_waters,status_previous
i64,datetime[μs],f64,f64,f64,f64,f64,f64,cat,f64,i64,f64,f64,f64,f64,bool,f64
366901880,2018-05-14 03:43:35,45.81636,-84.75229,13.2,93.2,93.0,0.0,"""BURNS HARBOR""",70.0,7514713,330.0,34.0,7.2,70.0,True,
255806069,2018-05-14 03:09:52,33.81874,-75.36683,17.6,192.3,192.0,0.0,"""AS FABRIZIA""",70.0,9395135,166.0,25.0,9.5,,False,
636016242,2018-05-14 09:00:01,35.75012,-74.95154,11.7,355.3,356.0,0.0,"""YM MODESTY""",70.0,9664885,293.0,40.0,14.0,,False,
366032000,2018-05-14 18:47:26,39.80478,-75.40969,0.0,124.5,60.0,1.0,"""CHEMICAL PIONEER""",80.0,6806444,209.0,24.0,10.8,,True,
311000324,2018-05-14 10:23:57,29.74548,-95.10478,0.0,192.0,219.0,0.0,"""AEC ABILITY II""",70.0,9317690,177.0,28.0,10.6,70.0,True,


In [19]:
unpivoted_df.shape

(15, 7)

In [None]:
#create single pandas dataframe
ais_gdf = (
    #concat dfs
    pl.concat(dfs, how='diagonal_relaxed')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #drop short changes in status between equal statuses
    .with_columns(
        short = ((pl.col('status').shift()==pl.col('status').shift(-1)) & 
                (pl.col('status_duration')<min_duration)).over('mmsi')
    )
    .filter(pl.col('short')!=True)
    .drop('short')
    #drop repeated same-status messsages
    .with_columns(
        repeat = (pl.col('status')==pl.col('status').shift()).over('mmsi')
    )
    .filter(pl.col('repeat')!=True)
    .drop('repeat')
    #recalculate duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #create docking event id - NOTE may need to ensure this captures all relevant messages
        docking_id = (
            #keep only docking ids associated with docking messages
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            #backfill over vessel
            .backward_fill().over('mmsi')
        )
    )
    #convert to pandas
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator; giving distances in meters
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
)

#inspect
display(ais_gdf.head())
ais_gdf.info()

Unnamed: 0,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry
0,9.0,205041000,2023-05-18 13:38:06,0.4,138.6,105.0,2.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,1.0,POINT (-13733098.187 6162516.103)
1,9.0,205041000,2023-05-18 13:39:15,0.3,148.0,103.0,1.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,1767.0,POINT (-13733085.942 6162502.708)
2,9.0,205041000,2023-05-19 19:06:36,0.8,8.0,84.0,0.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,8945.0,POINT (-13733242.903 6162469.218)
3,9.0,205041000,2023-05-26 00:11:49,0.0,224.0,224.0,1.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,0.0,0.0,1202.0,POINT (-13732216.537 6163278.033)
4,9.0,205041000,2023-05-26 20:14:03,0.6,220.9,100.0,0.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,329.0,POINT (-13733112.659 6162722.069)


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1823850 entries, 0 to 1823849
Data columns (total 16 columns):
 #   Column           Dtype         
---  ------           -----         
 0   docking_id       float64       
 1   mmsi             object        
 2   time             datetime64[us]
 3   speed            float64       
 4   course           float64       
 5   heading          float64       
 6   status           float64       
 7   vessel_name      category      
 8   vessel_type      float64       
 9   imo              float64       
 10  length           float64       
 11  width            float64       
 12  draft            float64       
 13  cargo            float64       
 14  status_duration  float64       
 15  geometry         geometry      
dtypes: category(1), datetime64[us](1), float64(12), geometry(1), object(1)
memory usage: 212.9+ MB


### Port and Dock Data

Locations and descriptions for each dock and port come from the BTS and USACE online databases. 

In [None]:
#load port data
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#load dock data
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #drop duplicates with matching geometries, keeping most common data
    .groupby('geometry').agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'nav_unit_id',
        'NAV_UNIT_N':'nav_unit_name',
        'FACILITY_T':'facility_type',
        'CITY_OR_TO':'city',
        'STATE_POST':'state'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()

#coerse back to gdf - groupby appears to have kicked it back to pandas core
docks_gdf = gpd.GeoDataFrame(docks_gdf, geometry='geometry', crs=3857)

### Matching Port and Dock data with AIS Messages

First, we match each of the moored (aka "docked" aka "at berth") AIS messages with the nearest port, then backfill the preceeding AIS status changes with the relevant port info. This allows us to observe which port the vessel was headed towards at any give time. 

A similar operation is performed with each dock. 

In [None]:
start = time.time()
stops_gdf = (
    #filter to only moorings
    ais_gdf[ais_gdf.status == 5]
    #join in nearest port to each mooring
    .sjoin_nearest(ports_gdf, how='left', exclusive=True,
                   #set max distance to 10km
                   max_distance = 10000
                   )
    #drop unneeded cols
    .drop(['index_right', 'total'], axis=1)
    #rename cols for clarity
    .rename({'rank':'port_rank', 'type':'port_type'}, axis=1)
)

#create main df
main_gdf = (
    #merge stops back into AIS data
    ais_gdf.merge(stops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#mark port name to unknown for docking messages not matched within max distance
main_gdf.loc[(main_gdf.status==5) & main_gdf.port_name.isnull(), 'port_name'] = 'unknown'
#backfill port info across docking events, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['port_type','port_name','port_rank']] = (
    main_gdf[['docking_id', 'port_type','port_name','port_rank']].groupby('docking_id').bfill()
)
#drop messages with missing or unknown port info (these vessels did not dock near a US port)
main_gdf = main_gdf[main_gdf['port_name'].notnull() & (main_gdf['port_name']!='unknown')]
#merge port geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(ports_gdf[['port_name', 'geometry']], 
                          on='port_name', how='left', suffixes=[None, '_port'])
#compute distance from message loc to port loc
main_gdf['port_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_port'])
#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

#inspect 
main_gdf.head()

Unnamed: 0,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month
0,44.0,205042000,2021-11-27 18:00:41,1.2,127.4,88.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,180.0,POINT (-10789425.685 3220135.203),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2492724.0,2021,202111
1,44.0,205042000,2021-11-27 21:01:30,2.0,149.2,119.0,3.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,1887.0,POINT (-10745702.729 3209918.145),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2537462.0,2021,202111
2,44.0,205042000,2021-11-29 04:29:27,4.0,178.3,177.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.1,80.0,534738.0,POINT (-10767436.746 3063793.247),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2566077.0,2021,202111
3,44.0,205042000,2022-12-05 12:48:11,0.7,161.7,0.0,1.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,741.0,POINT (-13144130.139 3978148.735),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21665.87,2022,202212
4,44.0,205042000,2022-12-06 01:09:17,2.9,207.3,211.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,752.0,POINT (-13144338.306 3978299.798),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21413.12,2022,202212


In [None]:
#inspect map of anchorage locations

#keep only anchor messages
gdf = main_gdf[main_gdf['status']==1].to_crs(4326)

import folium
from folium.plugins import HeatMap


# Extract latitude and longitude
gdf["lon"] = gdf.geometry.x
gdf["lat"] = gdf.geometry.y

# Initialize a Folium map centered on the data
m = folium.Map(location=[gdf["lat"].mean(), gdf["lon"].mean()], zoom_start=6)

# Add a heatmap layer
heat_data = list(zip(gdf["lat"], gdf["lon"]))
HeatMap(heat_data).add_to(m)

#show map
m

In [None]:
#inspect map of dock locations

#keep only anchor messages
gdf = main_gdf[main_gdf['status']==5].to_crs(4326)

import folium
from folium.plugins import HeatMap


# Extract latitude and longitude
gdf["lon"] = gdf.geometry.x
gdf["lat"] = gdf.geometry.y

# Initialize a Folium map centered on the data
m = folium.Map(location=[gdf["lat"].mean(), gdf["lon"].mean()], zoom_start=6)

# Add a heatmap layer
heat_data = list(zip(gdf["lat"], gdf["lon"]))
HeatMap(heat_data).add_to(m)

#show map
m

NameError: name 'main_gdf' is not defined

In [None]:
#add dock info to main df
dockstops_gdf = (
    #filter to only moorings
    main_gdf[main_gdf.status == 5]
    #join in nearest dock to each ais message
    .sjoin_nearest(
        #keep only dock id
        docks_gdf[['nav_unit_id', 'geometry']],
        #max distanance 500m
        max_distance = 500, 
        how='left')
    #drop unneeded cols
    .drop(['index_right'], axis=1)
)

#merge docks match back into main gdf
main_gdf = (
    #merge stops back into AIS data
    main_gdf.merge(dockstops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill dock info across docking events, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['nav_unit_id']] = (
    main_gdf[['docking_id', 'nav_unit_id']].groupby('docking_id').bfill()
)
#coerse nav_unit_ids to string for merge 
main_gdf['nav_unit_id'] = main_gdf['nav_unit_id'].astype(str)
docks_gdf['nav_unit_id'] = docks_gdf['nav_unit_id'].astype(str)
#merge dock geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(docks_gdf[['nav_unit_id', 'geometry']], 
                          on='nav_unit_id', how='left', suffixes=[None, '_dock'])
#compute distance from message loc to dock loc
main_gdf['dock_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_dock'])
#rename nav_unit_id to dock_id
main_gdf.rename({'nav_unit_id':'dock_id'}, axis=1, inplace=True)

#add lat and long to preserve geometries in polars and pandas dfs
#port
main_gdf['port_lat'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.y
main_gdf['port_lon'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.x
#dock
main_gdf['dock_lat'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_dock.y
main_gdf['dock_lon'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_dock.x
#vessel
main_gdf['vessel_lat'] = main_gdf.set_geometry('geometry').to_crs("EPSG:4326").geometry.y
main_gdf['vessel_lon'] = main_gdf.set_geometry('geometry').to_crs("EPSG:4326").geometry.x

In [None]:
#inspect
display(main_gdf.shape)
main_gdf.head()

(898559, 32)

Unnamed: 0,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month,dock_id,geometry_dock,dock_dist,port_lat,port_lon,dock_lat,dock_lon,vessel_lat,vessel_lon
0,44.0,205042000,2021-11-27 18:00:41,1.2,127.4,88.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,180.0,POINT (-10789425.685 3220135.203),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2492724.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2494451.0,33.73957,-118.2095,33.757222,-118.21888,27.77102,-96.92306
1,44.0,205042000,2021-11-27 21:01:30,2.0,149.2,119.0,3.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,1887.0,POINT (-10745702.729 3209918.145),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2537462.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2539186.0,33.73957,-118.2095,33.757222,-118.21888,27.68978,-96.53029
2,44.0,205042000,2021-11-29 04:29:27,4.0,178.3,177.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.1,80.0,534738.0,POINT (-10767436.746 3063793.247),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),2566077.0,2021,202111,0V0U,POINT (-13160065.524 3996249.272),2567907.0,33.73957,-118.2095,33.757222,-118.21888,26.52132,-96.72553
3,44.0,205042000,2022-12-05 12:48:11,0.7,161.7,0.0,1.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,741.0,POINT (-13144130.139 3978148.735),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21665.87,2022,202212,0V0U,POINT (-13160065.524 3996249.272),24115.68,33.73957,-118.2095,33.757222,-118.21888,33.62193,-118.07573
4,44.0,205042000,2022-12-06 01:09:17,2.9,207.3,211.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,20.3,80.0,752.0,POINT (-13144338.306 3978299.798),C,"Port of Long Beach, CA",5.0,POINT (-13159021.347 3993886.017),21413.12,2022,202212,0V0U,POINT (-13160065.524 3996249.272),23864.81,33.73957,-118.2095,33.757222,-118.21888,33.62306,-118.0776


## Add Port Area entry and exit status changes

Now that we have the tidy'd status changes associated with each docking event id, we can determine the effective port area by the radius within which anchorings occur before vessels visit that port. From this, we can reprocess the full AIS dataset to identify when each vessel entered/exited the port area before/after docking at the port. 

### Identify Port Area

In [None]:
#set quantile
quantile = 0.9

#get anchorings
port_area_gdf = main_gdf[main_gdf.status == 1]
#get percentiles of anchor distances
port_area_gdf = (
    #select port name and distance to anchor location
    port_area_gdf[['port_name', 'geometry_port', 'port_dist']]
    #get quantile of distance to anchor
    .groupby(['port_name', 'geometry_port']).quantile(quantile).reset_index()
    #reset geometry
    .set_geometry('geometry_port')
)
#add 1km to quantile to get final port radius
port_area_gdf['radius_port'] = port_area_gdf['port_dist']+1000
#drop port_dist
port_area_gdf = port_area_gdf.drop('port_dist', axis=1)
#coerse radius minimum of 5km
port_area_gdf['radius_port'] = port_area_gdf['radius_port'].clip(lower=5000)
#create circular geometry using radius
port_area_gdf['port_area'] = (
    #NOTE temporarily going with 10km buffer distance while analyzed distance glitches are resolved
    port_area_gdf.geometry_port.buffer(distance = 10000) #distance = port_area_gdf['radius_port']
)
#set geometry to port area
port_area_gdf = port_area_gdf.set_geometry('port_area')
#convert to lat/long
port_area_gdf = port_area_gdf.to_crs(4326)
#get bounding box for each port area
port_area_gdf = pd.concat([port_area_gdf, port_area_gdf.bounds], axis=1)
#drop everything but port name and bounding box
port_area_gdf = (
    port_area_gdf.drop(['radius_port', 'port_area', 'geometry_port'], axis=1)
)

#inspect
port_area_gdf.head()

Unnamed: 0,port_name,minx,miny,maxx,maxy
0,"Albany Port District, NY",-73.837992,42.576596,-73.658328,42.708754
1,"Alpena, MI",-83.511565,44.99848,-83.331901,45.125384
2,"Anacortes, WA",-122.689442,48.436379,-122.509778,48.555437
3,"Ashtabula Port Authority, OH",-80.883808,41.836791,-80.704144,41.970509
4,"Baltimore, MD",-76.651472,39.181228,-76.471808,39.320357


### Get entry and exit status changes

In [35]:
#create polars lazyframes for lat and lon bounds
bounds_lf = pl.LazyFrame(port_area_gdf[['miny', 'maxy', 'minx', 'maxx']])

#display(bounds_lf.collect().head())

#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    #try:
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
        )
        
        #get indicator for messages sent from within (any) port area
        filtered_lf = (
            #cross-join with bounds to get all port areas
            lf.join(bounds_lf, how='cross')
            #filter to only messages sent within each bounding box
            .filter(
                (pl.col('lat').is_between(pl.col('miny'), pl.col('maxy'))) &
                (pl.col('lon').is_between(pl.col('minx'), pl.col('maxx')))
            )
            #drop bounding box cols
            .drop('miny', 'maxy', 'minx', 'maxx')
            #deduplicate
            .unique()
            #create boolean col
            .with_columns(
                near_port = pl.lit(True)
            )
        )

        
        lf = (
            #inner-join on everything except the new col - NOTE this drops all messages from outside port areas
            lf.join(filtered_lf,
                    on=list(filtered_lf.select(pl.exclude('near_port'))
                            .collect_schema().names()), 
                    how='inner')
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #indicate whether status is the same as previous row
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift())
                    .over('mmsi')
                ),
                status_previous = pl.col('status').shift().over('mmsi')
            )
            #set status_change to true for first message in port area

            #keep only new status pings
            .filter(pl.col('status_change')==True)

        )

        #identify first time vessel apears in port area
        #keep only status changes

        #append to list of lazyframes
        lfs.append(lf)
    #except:
     #   print(f'{file} failed')

#collect all lazyframes
dfs = pl.collect_all(lfs[:5])

In [36]:
dfs[1].describe()

statistic,mmsi,time,lat,lon,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,near_port,status_change,status_previous
str,f64,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",270.0,"""270""",270.0,270.0,270.0,270.0,270.0,270.0,"""270""",270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",417490000.0,"""2023-01-21 12:53:13.562963""",33.375731,-94.021465,0.652963,182.461481,193.27037,1.92963,,74.625926,16404000.0,213.044444,32.174074,10.743704,75.777778,1.0,1.0,2.344444
"""std""",140560000.0,,5.842438,18.237378,1.904146,99.156012,112.711812,2.482401,,4.962768,81390000.0,58.285781,7.89683,2.867563,5.468622,,,2.658344
"""min""",205087000.0,"""2023-01-21 00:12:43""",17.69499,-157.87912,0.0,0.0,0.0,0.0,,70.0,5148417.0,101.0,15.0,3.8,70.0,1.0,1.0,0.0
"""25%""",309822000.0,"""2023-01-21 06:54:54""",29.719,-118.15185,0.0,109.0,93.0,0.0,,70.0,9301433.0,180.0,27.0,8.3,70.0,,,0.0
"""50%""",369390000.0,"""2023-01-21 13:05:49""",32.79482,-89.76423,0.0,172.2,190.0,0.0,,71.0,9436965.0,199.0,32.0,10.4,79.0,,,1.0
"""75%""",538008626.0,"""2023-01-21 18:42:24""",37.57225,-80.05031,0.3,266.8,302.0,5.0,,80.0,9674555.0,243.0,32.0,12.6,80.0,,,5.0
"""max""",636093143.0,"""2023-01-21 23:59:35""",48.50893,-64.75408,11.9,357.7,359.0,15.0,,89.0,956795800.0,366.0,60.0,22.5,89.0,1.0,1.0,15.0


## Save data

In [37]:
#convert to polars
main_df = pl.DataFrame(
    main_gdf
    #preserve lat/lon
    
    .drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1)
    )

#save to parquet
main_df.write_parquet('port data/ais_status_changes.parquet')