In [145]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly_express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=200);

# Port Performance Index Project - Main Notebook

This notebook presents the primary data analysis related to the PPI Project; see the README and other files in the [repo](https://github.com/epistemetrica/Port-Performance-Project) for full details. 

## Data Processing

AIS data is ingested from the Marine Cadastre database via the scripts found in the vessel data folder. Here, we combine the AIS data with port-level data and prepare for analysis. 

### AIS Data

The vessel locations and status (e.g., "under way", "anchored", "moored") data include all AIS messages; for the purposes of the PPI, we only need to know when a vessel *changes* status, so all other observations are dropped. 

We also limit the data for the PPI to vessels over 100m in length. 

In [146]:
%%script echo skipping: data to be read from previously processed parquet file in next cell

ais_gdf = (
    #read into lazyframe
    pl.scan_parquet('ais data/data/ais_clean/*.parquet')
    #drop smaller vessels
    .filter(pl.col('length')>100)
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
    .with_columns(
        status_change = (
            pl.col('status').ne(pl.col('status').shift(fill_value=20))
            .over('mmsi')
        )
    )
    #keep only new status pings
    .filter(pl.col('status_change')==True)
    #drop change col
    .drop('status_change')
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #collect to pandas df
    .collect()
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
)

#write to parquet
ais_gdf.to_parquet('ais data/data/ais_status_changes.parquet')

skipping: data to be read from previously processed parquet file in next cell


In [147]:
#%%script echo skipping: dataframe created in previous cell
#read previously processed AIS parquet file
ais_gdf = (gpd.read_parquet('ais data/data/ais_status_changes.parquet'))

#limit to time
ais_gdf = ais_gdf[ais_gdf['time'].dt.year>=2018]

### Port and Dock Data

Locations and descriptions for each dock and port come from the BTS and USACE online databases. 

In [148]:
#load port data
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#load dock data
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'nav_unit_id',
        'NAV_UNIT_N':'nav_unit_name',
        'FACILITY_T':'facility_type',
        'CITY_OR_TO':'city',
        'STATE_POST':'state'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()

### Matching

We first match each of the anchored and moored points from the AIS data with the nearest Port. 

In [149]:
start = time.time()
stops_gdf = (
    #filter to only moorings
    ais_gdf[ais_gdf.status == 5]
    #join in nearest port to each ais message
    .sjoin_nearest(ports_gdf, how='left', exclusive=True)
    #drop unneeded cols
    .drop(['index_right', 'total'], axis=1)
    #rename cols for clarity
    .rename({'rank':'port_rank', 'type':'port_type'}, axis=1)
)
print(f'Spatial Join time on {len(stops_gdf)} AIS messages took {time.time()-start} seconds')

#create main df
main_gdf = (
    #merge stops back into AIS data
    ais_gdf.merge(stops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill port info except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['port_type','port_name','port_rank']] = (
    main_gdf[['port_type','port_name','port_rank']].bfill()
)
#merge port geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(ports_gdf[['port_name', 'geometry']], 
                          on='port_name', how='left', suffixes=[None, '_port'])
#compute distance from message loc to port loc
main_gdf['port_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_port'])

Spatial Join time on 177268 AIS messages took 0.5833101272583008 seconds


In [150]:
#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

In [151]:
#inspect
display(main_gdf.head())
main_gdf.info()

Unnamed: 0,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month
0,36336,2019-10-28 10:45:42,12.5,46.0,49.0,0.0,NAVIOS LYRA,79.0,9498626.0,231.0,39.0,5.8,79.0,,POINT (-8231327.164 4246784.482),C,"Calhoun Port Authority, TX",77.0,POINT (-10752608.322 3328559.594),2683281.0,2019,201910
1,205174000,2018-04-17 08:16:50,12.3,293.7,294.0,0.0,SIMONE,80.0,9537769.0,331.0,60.0,22.6,80.0,7107.0,POINT (-9293240.522 2360809.194),C,"Calhoun Port Authority, TX",77.0,POINT (-10752608.322 3328559.594),1751084.0,2018,201804
2,205174000,2018-04-22 06:43:57,0.1,146.5,78.0,1.0,SIMONE,80.0,9537769.0,331.0,60.0,22.6,80.0,,POINT (-10376653.013 2124386.530),C,"Calhoun Port Authority, TX",77.0,POINT (-10752608.322 3328559.594),1261497.0,2018,201804
3,205250000,2018-07-05 03:27:55,12.4,80.2,79.0,0.0,MINERAL DRAGON,70.0,9508392.0,292.0,45.0,18.1,70.0,,POINT (-18663677.771 6529613.956),C,"Calhoun Port Authority, TX",77.0,POINT (-10752608.322 3328559.594),8534153.0,2018,201807
4,205283000,2018-04-10 21:35:14,15.5,304.9,304.0,0.0,KAPRIJKE,80.0,9687485.0,181.0,,,,2345.0,POINT (-9852667.718 2719066.832),C,"Calhoun Port Authority, TX",77.0,POINT (-10752608.322 3328559.594),1086911.0,2018,201804


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 603482 entries, 0 to 603481
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   mmsi             603482 non-null  int64         
 1   time             603482 non-null  datetime64[us]
 2   speed            603482 non-null  float64       
 3   course           602003 non-null  float64       
 4   heading          598253 non-null  float64       
 5   status           603482 non-null  float64       
 6   vessel_name      603482 non-null  category      
 7   vessel_type      603482 non-null  float64       
 8   imo              603482 non-null  float64       
 9   length           603482 non-null  float64       
 10  width            562241 non-null  float64       
 11  draft            559793 non-null  float64       
 12  cargo            322267 non-null  float64       
 13  status_duration  590870 non-null  float64       
 14  geometry    

## Exploratory Analysis

### Vessel calls in previous 12 months

In [152]:
#get most recent twelve month window that appears in the data
latest_date = main_gdf.time.max()
year_before = latest_date - pd.DateOffset(months=12)

#get port lat and long to preserve geometries in polars df
main_gdf['lat'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.y
main_gdf['lon'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.x

annual_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port'], axis=1))
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    .with_columns(
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique().over('port_name', 'month'),
        status_duration = pl.col('status_duration')
    )
    #aggregate
    .group_by('port_name')
    .agg(
        #keep lat and long
        lat = pl.col('lat').first(),
        lon = pl.col('lon').first(),
        #get avg vessels per month
        vessels_avg = pl.col('vessels_avg').mean(),
        #get median time at berth in hours
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get median time at anchor in hours
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)

#convert to geodataframe
annual_gdf = (
    gpd.GeoDataFrame(
        annual_gdf, geometry=gpd.points_from_xy(annual_gdf.lon, annual_gdf.lat),
        crs=3857
    )
)

### Monthly Port Statistics

In [153]:
monthly_gdf = (
    #convert to polars
    pl.DataFrame(main_gdf.drop(['geometry', 'geometry_port'], axis=1))
    #agg over ports and months
    .group_by('port_name', 'month')
    .agg(
        #keep lat and long
        lat = pl.col('lat').first(),
        lon = pl.col('lon').first(),
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique(),
        #get average time at berth
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get average time at anchor
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
)

### Visualizations

In [165]:
#scatterplot
fig = px.scatter_geo(
    annual_gdf,
    lon='lon',
    lat='lat',
    size='vessels_avg',
    color='time_at_berth_avg',
    range_color=[0,50],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Median Hours at Berth (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_berth_avg':'Hours at Berth'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to averages vessels per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

In [166]:
#scatterplot
fig = px.scatter_geo(
    annual_gdf,
    lon='lon',
    lat='lat',
    size='vessels_avg',
    color='time_at_anchor_avg',
    range_color=[0,50],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Median Hours at Anchor (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_anchor_avg':'Hours at Anchor'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to averages vessels per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

In [156]:
monthly_gdf.head()

port_name,month,lat,lon,vessels_avg,time_at_berth_avg,time_at_anchor_avg
str,str,f64,f64,u32,f64,f64
"""Monroe, MI""","""201911""",41.915134,-83.390609,4,8.75,20.266667
"""Cleveland-Cuyahoga Port, OH""","""201904""",41.47852,-81.67191,12,27.766667,6.741667
"""PortMiami, FL""","""201811""",25.782862,-80.181642,89,19.166667,23.433333
"""Unalaska Island, AK""","""201804""",53.894185,-166.549916,25,30.8,263.316667
"""Port Arthur, TX""","""201905""",29.83142,-93.96069,125,1.983333,9.25


In [157]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Average Vessels per month at Principal Ports',
    width=1000,
    height=500 
)

In [163]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_anchor_avg', color='port_name',
    title='Median Time at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [164]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_berth_avg', color='port_name',
    title='Median Time at Berth at Principal Ports',
    width=1000,
    height=500 
)

In [160]:
main_gdf[(main_gdf['port_name']=='Port of Los Angeles, CA') & (main_gdf['month']=='201712')]

Unnamed: 0,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month,lat,lon
