In [70]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly_express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=200);

# Port Performance Index Project - Main Notebook

This notebook presents the primary data analysis related to the PPI Project; see the README and other files in the [repo](https://github.com/epistemetrica/Port-Performance-Project) for full details. 

## Data Processing

AIS data is ingested from the Marine Cadastre database via the scripts found in the vessel data folder. Here, we combine the AIS data with port-level data and prepare for analysis. 

### AIS Data

The vessel locations and status (e.g., "under way", "anchored", "moored") data include all AIS messages; for the purposes of the PPI, we only need to know when a vessel *changes* status, so all other observations are dropped. 

We also limit the data for the PPI to vessels over 100m in length. 

In [71]:
%%script echo skipping: data to be read from previously processed parquet file in next cell

ais_gdf = (
    #read into lazyframe
    pl.scan_parquet('ais data/data/ais_clean/*.parquet')
    #drop smaller vessels
    .filter(pl.col('length')>100)
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
    .with_columns(
        status_change = (
            pl.col('status').ne(pl.col('status').shift(fill_value=20))
            .over('mmsi')
        )
    )
    #keep only new status pings
    .filter(pl.col('status_change')==True)
    #drop change col
    .drop('status_change')
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #collect to pandas df
    .collect()
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
)

#write to parquet
ais_gdf.to_parquet('ais data/data/ais_status_changes.parquet')

skipping: data to be read from previously processed parquet file in next cell


In [72]:
#%%script echo skipping: dataframe created in previous cell
#read previously processed AIS parquet file
ais_gdf = (gpd.read_parquet('ais data/data/ais_status_changes.parquet'))

### Port and Dock Data

Locations and descriptions for each dock and port come from the BTS and USACE online databases. 

In [73]:
#load port data
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#load dock data
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'nav_unit_id',
        'NAV_UNIT_N':'nav_unit_name',
        'FACILITY_T':'facility_type',
        'CITY_OR_TO':'city',
        'STATE_POST':'state'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()

### Matching

We first match each of the anchored and moored points from the AIS data with the nearest Port. 

In [74]:
start = time.time()
stops_gdf = (
    #filter to only moorings
    ais_gdf[ais_gdf.status == 5]
    #join in nearest port to each ais message
    .sjoin_nearest(ports_gdf, how='left', exclusive=True)
    #drop unneeded cols
    .drop(['index_right', 'total'], axis=1)
    #rename cols for clarity
    .rename({'rank':'port_rank', 'type':'port_type'}, axis=1)
)
print(f'Spatial Join time on {len(stops_gdf)} AIS messages took {time.time()-start} seconds')

#create main df
main_gdf = (
    #merge stops back into AIS data
    ais_gdf.merge(stops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill port info except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['port_type','port_name','port_rank']] = (
    main_gdf[['port_type','port_name','port_rank']].bfill()
)
#merge port geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(ports_gdf[['port_name', 'geometry']], 
                          on='port_name', how='left', suffixes=[None, '_port'])
#compute distance from message loc to port loc
main_gdf['port_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_port'])

Spatial Join time on 187873 AIS messages took 0.5822947025299072 seconds


In [75]:
#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

In [76]:
#inspect
display(main_gdf.head())
main_gdf.info()

Unnamed: 0,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month
0,3,2017-12-17 02:27:29,0.0,60.0,35.0,1.0,MAERSK PRIVILEGE,80.0,3.0,240.0,42.0,-12.6,80.0,66.0,POINT (-10021529.366 3474701.894),C,"PortMiami, FL",52.0,POINT (-8925779.558 2972212.233),1205472.0,2017,201712
1,3,2017-12-17 03:34:09,1.0,46.0,41.0,0.0,MAERSK PRIVILEGE,80.0,3.0,240.0,42.0,-12.6,80.0,,POINT (-10021509.329 3474712.155),C,"PortMiami, FL",52.0,POINT (-8925779.558 2972212.233),1205458.0,2017,201712
2,36336,2019-10-28 10:45:42,12.5,46.0,49.0,0.0,NAVIOS LYRA,79.0,9498626.0,231.0,39.0,5.8,79.0,,POINT (-8231327.164 4246784.482),C,"PortMiami, FL",52.0,POINT (-8925779.558 2972212.233),1451482.0,2019,201910
3,102810,2015-07-26 02:13:42,0.0,13.6,14.0,0.0,SINGLEVESSEL,79.0,4711.0,130.0,16.0,0.0,79.0,,POINT (-13135627.556 3988362.673),C,"PortMiami, FL",52.0,POINT (-8925779.558 2972212.233),4330748.0,2015,201507
4,21059000,2017-03-14 02:33:03,0.0,0.0,228.0,5.0,SANTIAGO,70.0,9255050.0,172.0,27.0,10.5,70.0,9853.0,POINT (-8338607.984 2350996.438),C,"PortMiami, FL",52.0,POINT (-8925779.558 2972212.233),854797.9,2017,201703


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 642350 entries, 0 to 642349
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   mmsi             642350 non-null  int64         
 1   time             642350 non-null  datetime64[us]
 2   speed            642350 non-null  float64       
 3   course           640866 non-null  float64       
 4   heading          636872 non-null  float64       
 5   status           642350 non-null  float64       
 6   vessel_name      642350 non-null  category      
 7   vessel_type      642350 non-null  float64       
 8   imo              642137 non-null  float64       
 9   length           642350 non-null  float64       
 10  width            601101 non-null  float64       
 11  draft            597079 non-null  float64       
 12  cargo            359561 non-null  float64       
 13  status_duration  628715 non-null  float64       
 14  geometry    

## Exploratory Analysis

### Vessel calls in previous 12 months

In [77]:
#get most recent twelve month window that appears in the data
latest_date = main_gdf.time.max()
year_before = latest_date - pd.DateOffset(months=12)

#get port lat and long to preserve geometries in polars df
main_gdf['lat'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.y
main_gdf['lon'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.x

annual_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port'], axis=1))
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    .with_columns(
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique().over('port_name', 'month')
    )
    #aggregate
    .group_by('port_name')
    .agg(
        #keep lat and long
        lat = pl.col('lat').first(),
        lon = pl.col('lon').first(),
        #get avg vessels per month
        vessels_avg = pl.col('vessels_avg').mean(),
        #get average time at berth
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).mean(),
        #get average time at anchor
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).mean()
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)

#convert to geodataframe
annual_gdf = (
    gpd.GeoDataFrame(
        annual_gdf, geometry=gpd.points_from_xy(annual_gdf.lon, annual_gdf.lat),
        crs=3857
    )
)

Unnamed: 0,port_name,lat,lon,vessels_avg,time_at_berth_avg,time_at_anchor_avg,geometry
0,"Mueller Township, MI",45.969424,-85.872972,6.655039,600.462366,743.0,POINT (-85.873 45.969)
1,"Tacoma, WA",47.28966,-122.4515,82.115549,1948.626594,5376.539007,POINT (-122.451 47.290)
2,"Rogers City, MI",45.422298,-83.811972,5.494118,992.580645,733.916667,POINT (-83.812 45.422)
3,"Port of Pascagoula, MS",30.34802,-88.55879,47.039336,1013.167665,1206.52949,POINT (-88.559 30.348)
4,"Panama City Port Authority, FL",30.133998,-85.633762,12.988406,3354.528926,3179.042553,POINT (-85.634 30.134)


### Monthly Port Statistics

In [90]:
monthly_gdf = (
    #convert to polars
    pl.DataFrame(main_gdf.drop(['geometry', 'geometry_port'], axis=1))
    #agg over ports and months
    .group_by('port_name', 'month')
    .agg(
        #keep lat and long
        lat = pl.col('lat').first(),
        lon = pl.col('lon').first(),
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique(),
        #get average time at berth
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).mean(),
        #get average time at anchor
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).mean()
    )
)

### Visualizations

In [103]:
#scatterplot
fig = px.scatter_geo(
    annual_gdf,
    lon='lon',
    lat='lat',
    size='vessels_avg',
    color='time_at_berth_avg',
    range_color=[0,5000],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Avg Time at Berth (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Show the figure
fig.show()

In [85]:
#scatterplot
fig = px.scatter_geo(
    annual_gdf,
    lon='lon',
    lat='lat',
    size='vessels_avg',
    color='time_at_anchor_avg',
    range_color=[0,7000],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Avg Time at Anchor (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600 
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Show the figure
fig.show()

In [93]:
monthly_gdf.head()

port_name,month,lat,lon,vessels_avg,time_at_berth_avg,time_at_anchor_avg
str,str,f64,f64,u32,f64,f64
"""Port of Longview, WA""","""201806""",46.14222,-122.914,65,4859.290323,10847.611111
"""Seattle, WA""","""201809""",47.587711,-122.359218,57,2362.186667,2776.6875
"""Port of Alaska, AK""","""201812""",61.23778,-149.895,9,1121.368421,2104.0
"""Port of Savannah, GA""","""201707""",32.084711,-81.095382,3,1322.666667,
"""PortMiami, FL""","""201602""",25.782862,-80.181642,9,2379.333333,346.0


In [101]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Average Vessels per month at Principal Ports',
    width=1000,
    height=500 
)

In [102]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_anchor_avg', color='port_name',
    title='Average Time at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [99]:
px.line(
    #data with month in dt format
    monthly_gdf
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_berth_avg', color='port_name',
    title='Average Time at Berth at Principal Ports',
    width=1000,
    height=500 
)

In [107]:
main_gdf[(main_gdf['port_name']=='Port of Los Angeles, CA') & (main_gdf['month']=='201712')]

Unnamed: 0,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry,port_type,port_name,port_rank,geometry_port,port_dist,year,month,lat,lon
498298,538007586,2017-12-15 00:58:39,0.9,235.1,200.0,0.0,CLIPPER TENACIOUS,70.0,9320348.0,179.0,28.0,9.6,70.0,622450.0,POINT (-13685615.972 5796184.913),C,"Port of Los Angeles, CA",11.0,POINT (-13165544.558 3992154.584),1877498.0,2017,201712,33.726635,-118.268099
499402,538007675,2017-12-30 17:37:29,0.0,0.0,0.0,0.0,SEAMAX NEW HAVEN,79.0,9293777.0,335.0,42.0,12.1,79.0,219160.0,POINT (-13159536.756 3996891.714),C,"Port of Los Angeles, CA",11.0,POINT (-13165544.558 3992154.584),7650.757,2017,201712,33.726635,-118.268099
