# Port Performance Index Project - Metrics Development Workbook

This workbook develops port performance metrics using the AIS data processed in 'ais_ingest.ipynb' in the 'ais data' folderm, along with port and dock data from the BTS and USACE. 

In [1]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=50);


## Load and Pre-process Data 

### AIS Data

The vessel locations and status (e.g., "under way", "anchored", "moored") data include all AIS messages; for the purposes of the PPI, we only need to know when a vessel *changes* status, so all other observations are dropped. 

Early exploratory analysis identified that vessel AIS statuses often change for very short periods of time, likely due to single AIS pings of errant or missing status. For example, a ship at berth for 4 hours may occaisionally ping "under power" even though the vessel remained at berth. To correct for this, we drop very short duration statuses where the previous and next statuses match.  

In [2]:
#set minimum meaningful status duration (minutes)
min_duration = 10

In [3]:
#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    try:
        #check file integrity 
        pl.scan_parquet(file).collect_schema()
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift(fill_value=20))
                    .over('mmsi')
                ),
                status_previous = pl.col('status').shift().over('mmsi')
            )
            #keep only new status pings
            .filter(pl.col('status_change')==True)
            #drop change col
            .drop('status_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    except:
        print(f'{file} failed')

#collect all lazyframes
dfs = pl.collect_all(lfs)

In [4]:
#create single pandas dataframe
ais_gdf = (
    #concat dfs
    pl.concat(dfs, how='diagonal_relaxed')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #drop short changes in status between equal statuses
    .with_columns(
        short = ((pl.col('status').shift()==pl.col('status').shift(-1)) & 
                (pl.col('status_duration')<min_duration)).over('mmsi')
    )
    .filter(pl.col('short')!=True)
    #drop repeated same-status messsages
    .with_columns(
        repeat = (pl.col('status')==pl.col('status').shift()).over('mmsi')
    )
    .filter(pl.col('repeat')!=True)
    .drop('repeat')
    #recalculate duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #convert to pandas
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
)

### Port and Dock Data

Locations and descriptions for each dock and port come from the BTS and USACE online databases. 

In [5]:
#load port data
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#load dock data
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #drop duplicates with matching geometries, keeping most common data
    .groupby('geometry').agg(pd.Series.mode).reset_index()
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'nav_unit_id',
        'NAV_UNIT_N':'nav_unit_name',
        'FACILITY_T':'facility_type',
        'CITY_OR_TO':'city',
        'STATE_POST':'state'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()

#coerse back to gdf - groupby appears to have kicked it back to pandas core
docks_gdf = gpd.GeoDataFrame(docks_gdf, geometry='geometry', crs=3857)

### Matching Port and Dock data with AIS Messages

First, we match each of the moored (aka "docked" aka "at berth") AIS messages with the nearest port, then backfill the preceeding AIS status changes with the relevant port info. This allows us to observe which port the vessel was headed towards at any give time. 

A similar operation is performed with each dock. 

In [6]:
start = time.time()
stops_gdf = (
    #filter to only moorings
    ais_gdf[ais_gdf.status == 5]
    #join in nearest port to each ais message
    .sjoin_nearest(ports_gdf, how='left', exclusive=True)
    #drop unneeded cols
    .drop(['index_right', 'total'], axis=1)
    #rename cols for clarity
    .rename({'rank':'port_rank', 'type':'port_type'}, axis=1)
)
print(f'Spatial Join time on {len(stops_gdf)} AIS messages took {time.time()-start} seconds')

#create main df
main_gdf = (
    #merge stops back into AIS data
    ais_gdf.merge(stops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill port info across vessels, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['port_type','port_name','port_rank']] = (
    main_gdf[['mmsi', 'port_type','port_name','port_rank']].groupby('mmsi').bfill()
)
#merge port geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(ports_gdf[['port_name', 'geometry']], 
                          on='port_name', how='left', suffixes=[None, '_port'])
#compute distance from message loc to port loc
main_gdf['port_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_port'])

#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

Spatial Join time on 526071 AIS messages took 1.7807488441467285 seconds


In [7]:
start = time.time()
dockstops_gdf = (
    #filter to only moorings
    main_gdf[main_gdf.status == 5]
    #join in nearest dock to each ais message
    .sjoin_nearest(
        #keep only dock id
        docks_gdf[['nav_unit_id', 'geometry']], 
        how='left')
    #drop unneeded cols
    .drop(['index_right'], axis=1)
)
print(f'Spatial Join time on {len(dockstops_gdf)} AIS messages took {time.time()-start} seconds')

#merge docks match back into main gdf
main_gdf = (
    #merge stops back into AIS data
    main_gdf.merge(dockstops_gdf, how='left')
    #sort by vessel then time of message
    .sort_values(by=['mmsi', 'time'])
)
#backfill dock info across vessels, except geometry (normal pandas syntax not supported for gpd geometry)
main_gdf[['nav_unit_id']] = (
    main_gdf[['mmsi', 'nav_unit_id']].groupby('mmsi').bfill()
)
#coerse nav_unit_ids to string for merge 
main_gdf['nav_unit_id'] = main_gdf['nav_unit_id'].astype(str)
docks_gdf['nav_unit_id'] = docks_gdf['nav_unit_id'].astype(str)
#merge dock geometries into main (NOTE backfill not supported for gpd geometry, hence the separate merge step)
main_gdf = main_gdf.merge(docks_gdf[['nav_unit_id', 'geometry']], 
                          on='nav_unit_id', how='left', suffixes=[None, '_dock'])
#compute distance from message loc to dock loc
main_gdf['dock_dist'] = main_gdf['geometry'].distance(main_gdf['geometry_dock'])

#rename nav_unit_id to dock_id
main_gdf.rename({'nav_unit_id':'dock_id'}, axis=1, inplace=True)

Spatial Join time on 526071 AIS messages took 2.429561138153076 seconds


## Performance Metrics and Exploratory Analysis

### Stats for most recent 12 months

The eventual dashboard will present some statistics and visualizations based on data from the most recently available 12 months. 

In [8]:
#get most recent twelve month window that appears in the data
latest_date = main_gdf.time.max()
year_before = latest_date - pd.DateOffset(months=12)

#get port and dock lat and long to preserve geometries in polars df
main_gdf['port_lat'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.y
main_gdf['port_lon'] = main_gdf.set_geometry('geometry_port').to_crs("EPSG:4326").geometry_port.x
main_gdf['dock_lat'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_port.y
main_gdf['dock_lon'] = main_gdf.set_geometry('geometry_dock').to_crs("EPSG:4326").geometry_port.x

#create ports stats for most recent 12 months
portstats_last12months_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1))
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    .with_columns(
        #get monthly count of vessels
        vessels = pl.col('mmsi').n_unique().over('port_name', 'month'),
        #get monthly count of vessel docking events - NOTE this may need revision 
        visits = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .n_unique().over('port_name', 'month')
        )
    )
    #aggregate
    .group_by('port_name')
    .agg(
        #keep lat and long
        port_lat = pl.col('port_lat').first(),
        port_lon = pl.col('port_lon').first(),
        #get avg vessels and visits per month
        vessels_avg = pl.col('vessels').mean(),
        visits_avg = pl.col('visits').mean(),
        #get median time at berth in hours
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get median time at anchor in hours
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)

#convert back to geodataframe
portstats_last12months_gdf = (
    gpd.GeoDataFrame(
        portstats_last12months_gdf, 
        geometry=gpd.points_from_xy(portstats_last12months_gdf.port_lon, 
                                    portstats_last12months_gdf.port_lat),
        crs=3857
    )
)

In [None]:
df = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1)
    )
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #get monthly count of vessels
        vessels = pl.col('mmsi').n_unique().over('dock_id', 'month'),
        #get monthly count of vessel docking events - NOTE this may need revision 
        visits = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .n_unique().over('dock_id', 'month')
        ),
        #create docking event id - NOTE may need to ensure this captures all relevant messages
        docking_id = (
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            .backward_fill().over('mmsi')
        )
    )
    #drop messages not associated with a docking event
    .drop_nulls(subset='docking_id')
    #aggregate to month, dock, vessel and docking event
    .group_by('mmsi', 'docking_id')
    .agg(
        #sum anchorage time for each docking event
        time_at_anchor = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum()
        )
    )
)

display(df.head())
df.describe()

mmsi,docking_id,time_at_anchor
str,u32,f64
"""352871000""",120379,0.0
"""636019828""",287047,10.0
"""636016417""",272446,272.0
"""338789000""",112129,0.0
"""352002747""",117970,4779.0


statistic,mmsi,docking_id,time_at_anchor
str,str,f64,f64
"""count""","""92973""",92973.0,92973.0
"""null_count""","""0""",0.0,0.0
"""mean""",,151769.718843,2038.752025
"""std""",,89382.47326,6797.006736
"""min""","""205097000""",40.0,0.0
"""25%""",,76957.0,0.0
"""50%""",,146262.0,0.0
"""75%""",,230017.0,1451.0
"""max""","""775345000""",311061.0,385850.0


In [42]:
df.with_row_index('new_index').head(10)

new_index,port_name,dock_id,month,mmsi,docking_id,time_at_anchor
u32,str,str,str,str,u32,f64
0,"""Philadelphia Regional Port, PA""","""0N54""","""202309""","""257593000""",53734.0,0.0
1,"""Houston Port Authority, TX""","""0SHS""","""202402""","""258640000""",55562.0,0.0
2,"""Virgin Islands - St. Croix, VI""","""0H8C""","""202307""","""377913000""",166045.0,0.0
3,"""Richmond, CA""","""0V6C""","""202401""","""431667000""",170581.0,0.0
4,,"""nan""","""202309""","""636019079""",,2045.0
5,"""Conneaut, OH""","""06BR""","""202405""","""316031772""",83919.0,0.0
6,,"""nan""","""202402""","""477050400""",,1673.0
7,"""Anacortes, WA""","""02JV""","""202402""","""316047565""",91016.0,0.0
8,,"""nan""","""202404""","""538007362""",,738.0
9,"""Manatee County Port, FL""","""0ZSZ""","""202401""","""245964000""",31780.0,3043.0


In [None]:
#check for disjoint docking ids across vessels (code from chatgpt)
# Step 1: Group by 'group_column' and collect 'value_column' as sets
grouped = df.group_by("mmsi").agg(
    pl.col("docking_id").unique().alias("value_set")
)
# Step 2: Convert the value sets to Python lists
all_value_sets = [set(values) for values in grouped["value_set"].to_list()]
# Step 3: Check for intersections between all pairs of sets
are_disjoint = not any(set1 & set2 for i, set1 in enumerate(all_value_sets) for set2 in all_value_sets[i + 1:])
print(f"Are the values disjoint across groups? {are_disjoint}")


In [37]:
df = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1)
    )
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #create docking event id
        docking_id = (
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            #.backward_fill().over('mmsi')
        )
    )
)

#check for disjoint docking ids across vessels (code from chatgpt)
# Step 1: Group by 'group_column' and collect 'value_column' as sets
grouped = df.group_by("mmsi").agg(
    pl.col("docking_id").unique().alias("value_set")
)
# Step 2: Convert the value sets to Python lists
all_value_sets = [set(values) for values in grouped["value_set"].to_list()]
# Step 3: Check for intersections between all pairs of sets
are_disjoint = not any(set1 & set2 for i, set1 in enumerate(all_value_sets) for set2 in all_value_sets[i + 1:])
print(f"Are the values disjoint across groups? {are_disjoint}")

Are the values disjoint across groups? False


In [27]:
#create docks stats for most recent 12 months
dockstats_last12months_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1))
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #get monthly count of vessels
        vessels = pl.col('mmsi').n_unique().over('dock_id', 'month'),
        #get monthly count of vessel docking events - NOTE this may need revision 
        visits = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .n_unique().over('dock_id', 'month')
        ),
        #create docking event id
        docking_id = (
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            .backward_fill().over('mmsi', 'dock_id')
        )
    )
    .with_columns(
        #sum anchorage time for each docking event
        time_at_anchor = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum().over('mmsi', 'docking_id')
        )
    )
    #aggregate
    .group_by('dock_id')
    .agg(
        #keep port name, dock lat and long
        dock_lat = pl.col('dock_lat').first(),
        dock_lon = pl.col('dock_lon').first(),
        port_name = pl.col('port_name').mode().first(),
        #get avg vessels and visits per month
        vessels_avg = pl.col('vessels').mean(),
        visits_avg = pl.col('visits').mean(),
        #get median time at berth in hours
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get median time at anchor in hours
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)

#convert back to geodataframe
dockstats_last12months_gdf = (
    gpd.GeoDataFrame(
        dockstats_last12months_gdf, 
        geometry=gpd.points_from_xy(dockstats_last12months_gdf.dock_lon, 
                                    dockstats_last12months_gdf.dock_lat),
        crs=3857
    )
)

dockstats_last12months_gdf.head()

Unnamed: 0,dock_id,dock_lat,dock_lon,port_name,vessels_avg,visits_avg,time_at_berth_avg,time_at_anchor_avg,geometry
0,0RZK,3221937.0,-9186346.0,"Tampa Port Authority, FL",2.481481,3.111111,133.325,20.2,POINT (-9186345.981 3221936.932)
1,['0SN5' '0SN6'],3517235.0,-10384640.0,"Lake Charles Harbor District, LA",2.071625,43.239669,0.733333,0.85,POINT (-10384643.415 3517234.688)
2,0SBB,3592714.0,-9800104.0,"Mobile, AL",3.793651,3.079365,104.283333,73.633333,POINT (-9800104.212 3592714.071)
3,0RGE,3992155.0,-13165540.0,"Port of Los Angeles, CA",4.662162,4.675676,110.425,20.5,POINT (-13165544.558 3992154.584)
4,0YGY,3225389.0,-10842280.0,"Corpus Christi, TX",4.735294,4.970588,68.583333,46.791667,POINT (-10842283.519 3225388.812)


### Monthly Stats

In [30]:
main_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1782198 entries, 0 to 1782197
Data columns (total 31 columns):
 #   Column           Dtype         
---  ------           -----         
 0   mmsi             object        
 1   time             datetime64[us]
 2   speed            float64       
 3   course           float64       
 4   heading          float64       
 5   status           float64       
 6   vessel_name      category      
 7   vessel_type      float64       
 8   imo              float64       
 9   length           float64       
 10  width            float64       
 11  draft            float64       
 12  cargo            float64       
 13  status_previous  float64       
 14  status_duration  float64       
 15  short            bool          
 16  geometry         geometry      
 17  port_type        object        
 18  port_name        object        
 19  port_rank        float64       
 20  geometry_port    geometry      
 21  port_dist        float6

In [35]:
monthly_df = (
    #convert to polars
    pl.DataFrame(main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1))
    #agg over ports and months
    .group_by('port_name', 'month')
    .agg(
        #keep lat and long
        lat = pl.col('port_lat').first(),
        lon = pl.col('port_lon').first(),
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique(),
        #get average time at berth
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get average time at anchor
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
)

## Visualizations

In [38]:
#scatterplot
fig = px.scatter_geo(
    portstats_last12months_gdf,
    lon='port_lon',
    lat='port_lat',
    size='vessels_avg',
    color='time_at_berth_avg',
    range_color=[0,50],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Median Hours at Berth (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_berth_avg':'Hours at Berth'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to averages vessels per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

In [37]:
#scatterplot
fig = px.scatter_geo(
    portstats_last12months_gdf,
    lon='port_lon',
    lat='port_lat',
    size='vessels_avg',
    color='time_at_anchor_avg',
    range_color=[0,50],
    hover_name='port_name',
    size_max=30,
    title='Avg Vessels per month & Median Hours at Anchor (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_anchor_avg':'Hours at Anchor'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to averages vessels per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

In [None]:
#get top 5 ports
top5ports = pl.Series(ports_gdf.sort_values('rank').head().port_name)

#get top 10 ports
top10ports = pl.Series(ports_gdf.sort_values('rank').head(10).port_name)

In [None]:
px.line(
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Vessels per month at Principal Ports',
    labels={'vessels_avg':'Unique Vessels'},
    width=1000,
    height=500 
)

In [None]:
px.line(
    monthly_df
    #restrict to top 10 ports
    .filter(pl.col('port_name').is_in(top10ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_anchor_avg', color='port_name',
    title='Median Time at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [None]:
px.line(
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Unique vessels at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [None]:
px.line(
    #data with month in dt format
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_berth_avg', color='port_name',
    title='Median Time at Berth at Principal Ports',
    width=1000,
    height=500 
)