# Port Statistics

This notebook develops and explores the various port statistics used in the [Port Performance Project](https://github.com/epistemetrica/Port-Performance-Project). See the README.md file in the main directory for more info.

The primary data set comes from a combination of AIS vessel data and port data, processed in the Port Geodata notebook.

Statistics and final dataframes developed here are used in the Port Performance Dashboard.



In [13]:
#prelims
import polars as pl
import polars.selectors as cs
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob
import folium
from folium.plugins import HeatMap

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=100);

## Load Data from geodata_prep notebook

In [14]:
#load data from parquet
main_lf = pl.scan_parquet('port data/dashboard/main.parquet')

## Generate Stats

In [15]:
#get stats for each call
calls_df = (
    main_lf
    #ensure sorting by vessel and time
    .sort(['imo', 'time'])
    #grouby by call id
    .group_by('call_id')
    .agg(
        #port name
        port_name = pl.first('port_name'),
        #port lat and lon
        port_lat = pl.first('port_lat'),
        port_lon = pl.first('port_lon'),
        #dock name
        dock_name = pl.first('dock_name'),
        #dock_id
        dock_id = pl.first('dock_id'),
        #facility type
        facility_type = pl.first('facility_type'),
        #dock lat and lon
        dock_lat = pl.first('dock_lat'),
        dock_lon = pl.first('dock_lon'),
        #vessel imo
        imo = pl.first('imo'),
        #vessel size
        vessel_size = pl.first('length'),
        #time entering port waters
        time_port_entry = pl.col('time').min(),
        #time of arrival at dock
        time_arrival = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
        ).min(),
        #time of departure from dock
        time_departure = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time')+pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).max(),
        #time port exit
        time_port_exit = (pl.col('time') + pl.col('status_duration')).max(),
        #hrs a berth
        hrs_at_berth = (
            ((pl.col('status')==5)*
            (pl.col('status_duration').dt.total_minutes()/60)
            )
        ).sum(),
        #hrs at anchor
        hrs_at_anchor = (
            (pl.col('status')==1)*
            (pl.col('status_duration').dt.total_minutes()/60)
        ).sum()
    )
    #drop calls with missing arrival or departure time
    .filter(pl.col('time_arrival').is_not_null() & 
            pl.col('time_departure').is_not_null())
    #drop calls with missing port entry or exit time
    .filter(pl.col('time_port_entry').is_not_null() & 
            pl.col('time_port_exit').is_not_null())
    #compute additional stats
    .with_columns(
        #time from port entry to docking in hrs
        hrs_to_dock = (
            (pl.col('time_arrival') - pl.col('time_port_entry'))
            .dt.total_minutes()/60
        ),
        #time in port waters after leaving dock
        hrs_in_port_after_dock = (
            (pl.col('time_port_exit') - pl.col('time_departure'))
            .dt.total_minutes()/60
        ),
        #total time in port waters in hrs
        hrs_in_port_waters =(
            (pl.col('time_port_exit') - pl.col('time_port_entry'))
            .dt.total_minutes()/60
        )
    )
    #collect
    .collect()
)

#inspect
display(calls_df.describe())
calls_df.head(5)


statistic,call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters
str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64
"""count""","""149060""","""149060""",149060.0,149060.0,"""149060""","""149060""","""148696""",149060.0,149060.0,149060.0,149060.0,"""149060""","""149060""","""149060""","""149060""",149060.0,149060.0,149060.0,149060.0,149060.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""364""",0.0,0.0,0.0,0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",,,32.355629,-94.734587,,,,32.355861,-94.733648,10145000.0,207.563203,"""2021-06-05 16:11:18.714215""","""2021-06-06 23:43:26.756433""","""2021-06-12 17:34:14.974124""","""2021-06-14 11:58:45.439413""",62.765083,14.088087,31.527654,42.400755,211.782574
"""std""",,,6.911825,20.752654,,,,6.912113,20.75299,26593000.0,58.898016,,,,,434.691401,90.94194,811.801086,1197.611755,2327.014701
"""min""","""0_Corpus Christi, TX_2020-04-0…","""Albany Port District, NY""",17.938939,-166.549916,"""ADM Corpus Christi Grain Eleva…","""00XE""","""Anchorage""",17.936081,-166.53444,0.0,101.0,"""2018-01-01 00:35:19""","""2018-01-01 00:35:19""","""2018-01-01 04:23:54""","""2018-01-01 09:15:57""",0.083333,0.0,0.0,0.0,0.133333
"""25%""",,,28.629389,-118.2095,,,,28.645767,-118.21111,9295945.0,176.0,"""2019-08-26 07:35:43""","""2019-08-27 07:09:16""","""2019-09-02 00:53:02""","""2019-09-04 14:23:09""",16.716667,0.0,2.666667,2.15,26.033333
"""50%""",,,30.69123,-90.085256,,,,30.712718,-90.112537,9402043.0,190.0,"""2021-07-18 17:41:27""","""2021-07-20 08:08:48""","""2021-07-27 01:17:07""","""2021-07-29 05:41:35""",31.416667,0.0,3.55,2.8,44.566667
"""75%""",,,36.86642,-80.117801,,,,36.875896,-80.114322,9613848.0,230.0,"""2023-02-24 19:12:22""","""2023-02-26 10:08:18""","""2023-03-03 23:04:34""","""2023-03-05 17:59:37""",58.25,0.0,5.516667,3.55,84.15
"""max""","""9992268_Morehead City, NC_2024…","""Wilmington, NC""",61.23778,-66.096678,"""YUSEN TERMINALS BERTHS 212-221""","""1JHK""","""Open Water""",61.24306,-66.086926,980002500.0,667.0,"""2024-09-30 12:54:53""","""2024-09-30 16:32:15""","""2024-09-30 23:22:34""","""2024-09-30 23:22:34""",45137.65,14177.583333,59022.816667,58916.25,59078.816667


call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters
str,str,f64,f64,str,str,str,f64,f64,i64,f64,datetime[μs],datetime[μs],datetime[μs],datetime[μs],f64,f64,f64,f64,f64
"""9332925_Galveston, TX_2024-02-…","""Galveston, TX""",29.31049,-94.8127,"""PORT OF GALVESTON, PIERS 37-38""","""0VXL""","""Dock""",29.308333,-94.811389,9332925,228.0,2024-02-10 07:50:18,2024-02-10 11:39:51,2024-02-11 00:06:26,2024-02-11 03:52:27,12.433333,0.0,3.816667,3.766667,20.033333
"""9021332_Port of Long Beach, CA…","""Port of Long Beach, CA""",33.73957,-118.2095,"""TOYOTA VEHICLE PROCESSORS BERT…","""0VEQ""","""Dock""",33.773889,-118.21888,9021332,180.0,2019-01-29 08:36:22,2019-01-29 17:59:08,2019-01-29 18:17:08,2019-01-30 04:38:18,0.3,0.0,9.366667,10.35,20.016667
"""9407251_Port Arthur, TX_2023-1…","""Port Arthur, TX""",29.83142,-93.96069,"""VALERO REFINING GROUP, PORT AR…","""0QVD""","""Dock""",29.846377,-93.968517,9407251,229.0,2023-11-01 22:21:55,2023-11-02 00:32:44,2023-11-04 20:33:22,2023-11-04 22:40:53,68.0,0.0,2.166667,2.116667,72.3
"""9357298_Port of Brunswick, GA_…","""Port of Brunswick, GA""",31.132426,-81.53666,"""GEORGIA PORTS AUTHORITY, COLON…","""0SB2""","""Dock""",31.132222,-81.537222,9357298,199.0,2019-05-08 06:15:27,2019-05-08 09:36:16,2019-05-08 22:18:16,2019-05-09 01:01:23,12.7,0.0,3.333333,2.716667,18.75
"""9705732_Galveston, TX_2021-09-…","""Galveston, TX""",29.31049,-94.8127,"""AGRILIANCE, GALVESTON PIERS 35…","""0VY2""","""Dock""",29.308333,-94.809167,9705732,145.0,2021-09-12 00:52:24,2021-09-12 06:00:00,2021-09-13 07:42:00,2021-09-14 03:47:48,21.4,8.666667,5.116667,20.083333,50.916667


### Notes on Calls Frame and additional cleaning

- hrs_in_port_after_dock can be 0 when a vessel docks in overlapping port waters and visits both ports. 4692 (3.2%) of port calls have hrs_in_port_after_dock == 0. 
- hrs_to_dock == 0 implies that the first time the vessel sent an AIS message while in port waters was while at dock. This would be expected with new vessels that send their first messages from a dock, and accounts for ~1.6% (2234) of port calls. 
    - Of these, ~17% (336) were docked in overlapping port waters prior to visiting the next dock. This would result in hrs_in_port_waters == hrs_at_berth. 
- The mean and quartile statistics for the port calls seems reasonable; however, some calls have very long (6+ years in some cases) hrs_at_berth and related stats. This would result from vessel AIS transponders going offline at some stage during their visit to port waters. Rectifying this issue will be done either in the AIS ingestion or geodata_prep stages at a later date. For now we simply drop these as outliers. 
- Null status_duration values exist in ~3k of 1.3M status changes in the main_df; this is expected whenever an AIS transciever goes offline while in port waters. 
    - There are no observations of entirely-null duration values for mooring statuses, which limits the potential impacts of this issue on the stats.
    - Null status_duration would cause undervalued time data (e.g., time_port_exit == the timestamp of the last status change + the status duration); however, dropping calls with null status_durations has no measurable impact on the statistics, so they are left in the data for now. 

In [16]:
#define outlier drop
def drop_outliers(df, cols, threshold=3):
    '''
    Drops outliers from the dataframe for the specified columns.
    Args:
        df: Polars DataFrame
        cols: List of columns to drop outliers from
        threshold: Z-score threshold for outlier detection
    Returns:
        Polars DataFrame with outliers dropped
    '''
    print(f'Outlier threshold: {threshold} Std Devs')
    for col in cols:
        #compute z scores
        df = df.with_columns(
            z_score = (pl.col(col) - pl.col(col).mean()) / pl.col(col).std()
        )
        #drop outliers
        df = df.filter(pl.col('z_score').abs() < threshold)
    return df.drop('z_score')

In [17]:
#get count of rows from calls_df before drop
rows_prior = calls_df.shape[0]

#list cols for outlier drop
outlier_cols = ['hrs_at_berth', 'hrs_to_dock', 'hrs_at_anchor', 
                'hrs_in_port_after_dock', 'hrs_in_port_waters']

#drop outliers
#print z_score thresholds
for col in outlier_cols:
    print(f'{col} outlier threshold: {calls_df[col].std()*3/24:.1f} days')
#drop outliers
calls_df = drop_outliers(calls_df, outlier_cols, threshold=3)
#print rows dropped
print(f'Total outlier rows dropped: {rows_prior - calls_df.shape[0]} of {rows_prior}')

#inspect
display(calls_df.describe())
calls_df.head()

hrs_at_berth outlier threshold: 54.3 days
hrs_to_dock outlier threshold: 101.5 days
hrs_at_anchor outlier threshold: 11.4 days
hrs_in_port_after_dock outlier threshold: 149.7 days
hrs_in_port_waters outlier threshold: 290.9 days
Outlier threshold: 3 Std Devs
Total outlier rows dropped: 2572 of 149060


statistic,call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters
str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64
"""count""","""146488""","""146488""",146488.0,146488.0,"""146488""","""146488""","""146133""",146488.0,146488.0,146488.0,146488.0,"""146488""","""146488""","""146488""","""146488""",146488.0,146488.0,146488.0,146488.0,146488.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""355""",0.0,0.0,0.0,0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",,,32.32609,-94.609283,,,,32.326194,-94.608303,10145000.0,207.268527,"""2021-06-07 05:30:12.034630""","""2021-06-07 17:03:22.014062""","""2021-06-09 21:58:21.931659""","""2021-06-10 03:12:20.055758""",47.831807,8.354447,11.544853,5.225082,69.694045
"""std""",,,6.918495,20.728085,,,,6.918757,20.728437,26607000.0,58.810358,,,,,62.675238,28.97474,36.133365,41.271337,95.772879
"""min""","""0_Corpus Christi, TX_2020-04-0…","""Albany Port District, NY""",17.938939,-166.549916,"""ADM Corpus Christi Grain Eleva…","""00XE""","""Anchorage""",17.936081,-166.53444,0.0,101.0,"""2018-01-01 00:35:19""","""2018-01-01 00:35:19""","""2018-01-01 04:23:54""","""2018-01-01 09:15:57""",0.083333,0.0,0.0,0.0,0.133333
"""25%""",,,28.629389,-118.2095,,,,28.645767,-118.21083,9295397.0,176.0,"""2019-08-27 09:40:39""","""2019-08-27 17:30:51""","""2019-08-29 21:19:05""","""2019-08-30 01:38:50""",16.55,0.0,2.666667,2.15,25.8
"""50%""",,,30.69123,-90.085256,,,,30.707675,-90.105844,9401128.0,190.0,"""2021-07-19 00:59:38""","""2021-07-19 14:22:59""","""2021-07-22 01:57:56""","""2021-07-22 07:16:28""",30.966667,0.0,3.516667,2.783333,43.766667
"""75%""",,,36.86642,-80.117801,,,,36.875896,-80.114322,9612882.0,230.0,"""2023-02-27 09:23:38""","""2023-02-27 17:39:00""","""2023-03-02 01:31:11""","""2023-03-02 06:10:50""",57.016667,0.0,5.366667,3.533333,80.7
"""max""","""9992268_Morehead City, NC_2024…","""Wilmington, NC""",61.23778,-66.096678,"""YUSEN TERMINALS BERTHS 212-221""","""1JHK""","""Open Water""",61.24306,-66.086926,980002500.0,667.0,"""2024-09-30 12:54:53""","""2024-09-30 16:32:15""","""2024-09-30 21:54:12""","""2024-09-30 22:57:53""",1365.033333,246.2,2152.533333,3560.95,4453.8


call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters
str,str,f64,f64,str,str,str,f64,f64,i64,f64,datetime[μs],datetime[μs],datetime[μs],datetime[μs],f64,f64,f64,f64,f64
"""9332925_Galveston, TX_2024-02-…","""Galveston, TX""",29.31049,-94.8127,"""PORT OF GALVESTON, PIERS 37-38""","""0VXL""","""Dock""",29.308333,-94.811389,9332925,228.0,2024-02-10 07:50:18,2024-02-10 11:39:51,2024-02-11 00:06:26,2024-02-11 03:52:27,12.433333,0.0,3.816667,3.766667,20.033333
"""9021332_Port of Long Beach, CA…","""Port of Long Beach, CA""",33.73957,-118.2095,"""TOYOTA VEHICLE PROCESSORS BERT…","""0VEQ""","""Dock""",33.773889,-118.21888,9021332,180.0,2019-01-29 08:36:22,2019-01-29 17:59:08,2019-01-29 18:17:08,2019-01-30 04:38:18,0.3,0.0,9.366667,10.35,20.016667
"""9407251_Port Arthur, TX_2023-1…","""Port Arthur, TX""",29.83142,-93.96069,"""VALERO REFINING GROUP, PORT AR…","""0QVD""","""Dock""",29.846377,-93.968517,9407251,229.0,2023-11-01 22:21:55,2023-11-02 00:32:44,2023-11-04 20:33:22,2023-11-04 22:40:53,68.0,0.0,2.166667,2.116667,72.3
"""9357298_Port of Brunswick, GA_…","""Port of Brunswick, GA""",31.132426,-81.53666,"""GEORGIA PORTS AUTHORITY, COLON…","""0SB2""","""Dock""",31.132222,-81.537222,9357298,199.0,2019-05-08 06:15:27,2019-05-08 09:36:16,2019-05-08 22:18:16,2019-05-09 01:01:23,12.7,0.0,3.333333,2.716667,18.75
"""9705732_Galveston, TX_2021-09-…","""Galveston, TX""",29.31049,-94.8127,"""AGRILIANCE, GALVESTON PIERS 35…","""0VY2""","""Dock""",29.308333,-94.809167,9705732,145.0,2021-09-12 00:52:24,2021-09-12 06:00:00,2021-09-13 07:42:00,2021-09-14 03:47:48,21.4,8.666667,5.116667,20.083333,50.916667


In [18]:
#save calls dataframe to parquet
calls_df.write_parquet('dashboard/calls.parquet')

## Simple delay calculations

Differentiating between delay time and the "efficient" time it takes for a ship to get to a dock is somewhat difficult. 

At its most basic, we can calculate the difference between the hrs_to_dock time for each port call and the minimum hrs_to_dock for that vessel and dock. 

In [19]:
## these stats tabled for now

#add min hrs to dock for each vessel-dock pair
delay_df = (
    calls_df
    #min hrs to dock for each vessel-dock pair
    .with_columns(
        min_hrs_to_dock = pl.col('hrs_to_dock').min().over('imo', 'dock_id')
    )
    #"delay" in hrs
    .with_columns(
        hrs_delay = pl.col('hrs_to_dock') - pl.col('min_hrs_to_dock')
    )
    #drop unnecessary columns
    .drop('min_hrs_to_dock')
)

64k of 145k port calls align show zero delay indicating those calls represent the only time that that vessel visited that dock. 

### Time Awaiting Berth

We define time awaiting berth as the total time it takes a vessel to get to the dock minus the amount of time the dock was occupied while that vessel was en route. 

Generating this statistic is tabled for now. 

In [20]:
%%script echo skipping
#calculate time awaiting berth

#for each call_id and dock, get the total time dock was occupied between time_port_entry and time_arrival

#get time_port_entry, time_arrival for each call id
lf = (calls_df.select('call_id', 'time_port_entry', 'time_arrival')
      .unique().lazy())
#join to main lf
main_lf = main_lf.join(lf, on='call_id', how='left')

for call in calls_df.select('call_id').unique().to_series():
      #get start time and end time
      start = (calls_df.filter(pl.col('call_id')==call)
               .select('time_port_entry').item())
      end = (calls_df.filter(pl.col('call_id')==call)
               .select('time_arrival').item())
      #get dock occupancy
      df = (
            main_lf
            .with_columns(
                  dock_occupied = (
                        (pl.col('status')==5)
                        .then(pl.col('status_duration'))
                        .otherwise(pl.lit(None))
                  )
            )
      )


skipping


In [21]:
#create monthly stats dataframe
monthly_df = (
    calls_df
    #get month from docking time
    .with_columns(
        #extract month from docking time
        month = pl.col('time_arrival').dt.strftime('%Y%m')
    )
    #group by port dock and month
    .group_by(['port_name', 'port_lat', 'port_lon', 
               'dock_id', 'dock_name', 'dock_lat', 'dock_lon', 
               'month'])
    .agg(
        #count number of vessels
        vessels = pl.n_unique('imo'),
        #mean vessel size
        vessel_size_mean = pl.mean('vessel_size'),
        #count number of vessel calls
        calls = pl.n_unique('call_id'),
        #time at dock stats for each vessel in hours
        hrs_occupied = pl.sum('hrs_at_berth'),
        hrs_at_berth_median = pl.median('hrs_at_berth'),
        hrs_at_berth_mean = pl.mean('hrs_at_berth'),
        #time at anchor stats for each vessel visit in hours
        hrs_at_anchor_median = pl.median('hrs_at_anchor'),
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor'),
        #time in port waters 
        hrs_in_port_waters_total = pl.sum('hrs_in_port_waters'),
        hrs_in_port_waters_mean = pl.mean('hrs_in_port_waters'),
        hrs_in_port_waters_median = pl.median('hrs_in_port_waters')
    )
    #get hours from each month
    .with_columns(
        hrs_in_month = (
            pl.when(pl.col('month').str.tail(2).is_in(['01', '03', '05', '07',
                                                       '08', '10', '12']))
            .then(31*24)
            .when(pl.col('month').str.tail(2).is_in(['04', '06', '09', '11']))
            .then(30*24)
            .otherwise(28*24)
        )
    )
    .with_columns(
        #dock utilization - percentage of time a dock is occupied
        utilization = (
            pl.col('hrs_occupied')/pl.col('hrs_in_month')
        )
    )
    #drop hours in month
    .drop('hrs_in_month')
    #sort by port dock then month
    .sort(['port_name', 'dock_id', 'month'])
)

In [22]:
monthly_df.describe()

statistic,port_name,port_lat,port_lon,dock_id,dock_name,dock_lat,dock_lon,month,vessels,vessel_size_mean,calls,hrs_occupied,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean,hrs_in_port_waters_total,hrs_in_port_waters_mean,hrs_in_port_waters_median,utilization
str,str,f64,f64,str,str,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""34459""",34459.0,34459.0,"""34459""","""34459""",34459.0,34459.0,"""34459""",34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0,34459.0
"""null_count""","""0""",0.0,0.0,"""0""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,32.729156,-95.518577,,,32.730879,-95.516727,,3.732058,195.76391,4.251081,203.336887,59.584558,62.746007,7.248826,9.392862,296.275032,88.042209,82.783433,0.278678
"""std""",,6.893308,20.907512,,,6.892972,20.908134,,3.711798,49.182437,4.39989,196.054813,69.330032,69.756424,23.665062,24.486733,310.444455,100.255194,98.645074,0.268427
"""min""","""Albany Port District, NY""",17.938939,-166.549916,"""00XE""","""ADM Corpus Christi Grain Eleva…",17.936081,-166.53444,"""201801""",1.0,101.0,1.0,0.083333,0.083333,0.083333,0.0,0.0,0.65,0.65,0.65,0.000112
"""25%""",,29.31049,-118.2095,,,29.309167,-118.21083,,1.0,168.0,1.0,67.833333,22.883333,25.533333,0.0,0.0,97.533333,37.676667,34.15,0.092966
"""50%""",,30.69123,-91.19934,,,30.710833,-91.199983,,3.0,184.75,3.0,148.183333,40.733333,44.022222,0.0,0.0,205.85,63.472222,57.916667,0.202867
"""75%""",,37.82152,-80.117801,,,37.811944,-80.114722,,5.0,211.5,5.0,278.266667,72.516667,76.516667,0.0,4.583333,387.633333,107.133333,100.716667,0.381138
"""max""","""Wilmington, NC""",61.23778,-66.096678,"""1JHK""","""YUSEN TERMINALS BERTHS 212-221""",61.24306,-66.086926,"""202409""",43.0,385.0,53.0,3851.016667,1312.966667,1312.966667,244.916667,244.916667,6076.233333,3290.8,3290.8,5.176098


#### Hours calc discussion

The current code first associates the call_id with the month in which the vessel arrived at dock, then counts total times for that call_id to that month. This results in some edge cases where hours stats far exceed the total hours in the month, as in the case that a vessel arrives at the dock and stays there for a very long period of time. 

This can be partially resolved by dropping statuses that are very long, which needs to be done anyway.
- what's the right strategy? set status duration to (the median for that dock? zero? 12hr?) and give an unknown status afterwards? 

It would be fully resolved by totaling monthly hrs (at dock or hrs utilized, for example) independently of call_id.  

In [23]:
ports_alltime_df = (
    calls_df
    #group by port 
    .group_by('port_name')
    .agg(
        #port lat and lon
        port_lat = pl.first('port_lat'),
        port_lon = pl.first('port_lon'),
        #count number of vessels
        vessels = pl.n_unique('imo'),
        #mean vessel size
        vessel_size_mean = pl.mean('vessel_size'),
        #count number of vessel calls
        calls = pl.n_unique('call_id'),
        #time at dock stats for each vessel in hours
        hrs_at_berth_median = pl.median('hrs_at_berth'),
        hrs_at_berth_mean = pl.mean('hrs_at_berth'),
        #time at anchor stats for each vessel visit in hours
        hrs_at_anchor_median = pl.median('hrs_at_anchor'),
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor')
    )
    #sort by port
    .sort('port_name')
)
#inspect
display(ports_alltime_df.describe())
ports_alltime_df.head()

statistic,port_name,port_lat,port_lon,vessels,vessel_size_mean,calls,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""70""",70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
"""null_count""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,34.780064,-98.610501,691.3,196.481713,2092.685714,52.168214,66.418701,0.772024,8.113449
"""std""",,9.238289,26.613336,701.067599,36.613303,2230.887427,102.710006,94.074057,2.76927,9.49449
"""min""","""Albany Port District, NY""",17.938939,-166.549916,3.0,117.586207,3.0,8.233333,10.619192,0.0,0.0
"""25%""",,29.31049,-121.541541,129.0,172.489815,409.0,22.658333,34.661621,0.0,1.126932
"""50%""",,32.788781,-90.61794,538.0,189.879402,1227.0,34.116667,49.61037,0.0,4.31346
"""75%""",,41.172,-76.72421,965.0,214.153277,3300.0,50.05,61.810556,0.0,13.490027
"""max""","""Wilmington, NC""",61.23778,-66.096678,2684.0,295.202899,9798.0,853.55,729.5625,16.7,43.169949


port_name,port_lat,port_lon,vessels,vessel_size_mean,calls,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean
str,f64,f64,u32,f64,u32,f64,f64,f64,f64
"""Albany Port District, NY""",42.64271,-73.74816,264,157.841398,372,65.841667,79.631452,0.0,0.248118
"""Anacortes, WA""",48.495943,-122.59961,272,219.191163,1109,36.966667,49.61037,0.0,25.777773
"""Baltimore, MD""",39.250827,-76.56164,2494,213.61283,7467,34.216667,47.927601,0.0,12.543871
"""Beaumont, TX""",30.084872,-94.094985,982,183.208705,1907,54.016667,66.59043,0.0,3.696067
"""Boston, MA""",42.342468,-71.032029,315,295.202899,966,17.7,27.743099,0.0,2.409472


## Visualizations

In [24]:
#scatterplot
fig = px.scatter_geo(
    ports_alltime_df,
    lon='port_lon',
    lat='port_lat',
    size='vessels',
    color='hrs_at_berth_median',
    range_color=[0,50],
    hover_name='port_name',
    size_max=30,
    title='Total Vessels (all time) and Median Hours at Berth',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_berth_avg':'Hours at Berth'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to averages vessels per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

### Port-level Scatter Plots

Developing core visualization and data-agg functions for geographic scatter plots. 

Initial goal:
- User selects port and time bounds; dashboard shows standard visualizations
    - calls over time (monthly)
    - avg hrs at berth over time (monthly)
    - vessel size and hrs at berth scatter
    - vessel size and hrs at anchors scatter


In [25]:
#set date range
start_month = '201801'
end_month = '202312'

#set port name
port_name = 'Seattle, WA'

#convert start and end month to datetime
start_month = pd.to_datetime(start_month, format='%Y%m')
end_month = pd.to_datetime(end_month, format='%Y%m')

#get dataframe
df = (
    calls_df
    #filter for Seattle
    .filter(pl.col('port_name') == port_name)
    #filter by date
    .filter(pl.col('time_arrival').is_between(start_month, end_month))
    #get month from docking time
    .with_columns(
        #extract month from docking time
        month = pl.col('time_arrival').dt.strftime('%Y%m')
    )
)

#get dock stats
docks_df = (
    df.group_by(['dock_name', 'dock_lat', 'dock_lon', 'facility_type'])
    .agg(
        #mean vessel size
        vessel_size_mean = pl.mean('vessel_size'),
        #median vessel size
        vessel_size_median = pl.median('vessel_size'),
        #mean hours at berth
        hrs_at_berth_mean = pl.mean('hrs_at_berth'),
        #median hours at berth
        hrs_at_berth_median = pl.median('hrs_at_berth'),
        #mean hours at anchor
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor'),
        #median hours at anchor
        hrs_at_anchor_median = pl.median('hrs_at_anchor'),
        #mean hours in port waters
        hrs_in_port_waters_mean = pl.mean('hrs_in_port_waters'),
        #median hours in port waters
        hrs_in_port_waters_median = pl.median('hrs_in_port_waters'),
    )
    #convert to pandas
    .to_pandas()
)

#inspect
docks_df.head()

Unnamed: 0,dock_name,dock_lat,dock_lon,facility_type,vessel_size_mean,vessel_size_median,hrs_at_berth_mean,hrs_at_berth_median,hrs_at_anchor_mean,hrs_at_anchor_median,hrs_in_port_waters_mean,hrs_in_port_waters_median
0,OPEN WATER-BUOYS & DOLPHINS,47.583046,-122.36105,Open Water,292.625,300.0,65.00625,37.366667,35.502083,13.683333,106.79375,65.9
1,PIER 34,47.587536,-122.342553,Dock,298.31383,294.0,51.524424,36.766667,6.267642,0.0,65.916356,49.975
2,"B P OIL CO., SEATTLE TERMINAL, PIER NO. 11",47.5825,-122.35833,Dock,183.0,183.0,18.133333,17.091667,2.829167,1.966667,25.3125,24.775
3,CRANE DOCK PIER 20,47.574757,-122.346077,Dock,160.444444,149.0,61.175926,60.4,0.0,0.0,68.744444,73.516667
4,"TRANS PACIFIC CONTAINER SERVICE CORP., TERMINA...",47.582943,-122.34293,Dock,263.711712,262.0,30.012988,23.366667,5.294845,0.0,42.13521,34.15


In [26]:
def mapbox_zoom_finder(lons, lats, lon_pad=0, lat_pad=0):
    """
    Calculates the optimal zoom level for a Plotly Mapbox plot.
    Args:
        lons (list): List of longitudes.
        lats (list): List of latitudes.
        lon_pad (float, optional): Padding to add to the longitude range. Defaults to 0.
        lat_pad (float, optional): Padding to add to the latitude range. Defaults to 0.
    Returns:
        zoom (int): the calculated zoom level
    """
    # Check if the lengths of lons and lats are equal and not empty
    if len(lons) != len(lats) or len(lons) == 0:
        return 10
    # Calculate the maximum and minimum longitude and latitude
    max_lon, min_lon = max(lons), min(lons)
    max_lat, min_lat = max(lats), min(lats)
    # Calculate the longitude and latitude ranges
    lon_range = max_lon - min_lon
    lat_range = max_lat - min_lat
    # Calculate the zoom level based on the ranges
    zoom = 7 - np.log2(max(lon_range + lon_pad, lat_range + lat_pad))
    return zoom

In [27]:
def plot_mapbox(df, lat_col, lon_col, size_col, color_col, title, zoom=None,
                width=800, height=600, size_max=30, hover_name=None, range_color=None,
                hover_data=None, mapbox_style='carto-positron', labels=None, 
                color_continuous_scale=None, color_outlier_z=None):
    """
    Plots a Mapbox scatter plot using Plotly.
    Args:
        df (pd.DataFrame): DataFrame containing the data to plot.
        lat_col (str): Column name for latitude.
        lon_col (str): Column name for longitude.
        size_col (str): Column name for size.
        color_col (str): Column name for color.
        title (str): Title of the plot.
        zoom (float, optional): Zoom level for the map. Defaults to None.
        width (int, optional): Width of the plot. Defaults to 800.
        height (int, optional): Height of the plot. Defaults to 600.
        size_max (int, optional): Maximum size of the markers. Defaults to 30.
        hover_name (str, optional): Column name for hover text. Defaults to None.
        range_color (list, optional): Range for color scale. Defaults to None.
        hover_data (list, optional): Additional data to show on hover. Defaults to None.
        mapbox_style (str, optional): Mapbox style. Defaults to 'carto-positron'.
        labels (dict, optional): Labels for the axes. Defaults to None.
        color_continuous_scale (list, optional): Color scale for the plot. Defaults to None.
        color_outlier_z (float, optional): Z-score threshold for outlier detection. Defaults to None.
    Returns:
        None
    """
    #Set default color scale if not provided
    if not color_continuous_scale:
        color_continuous_scale = px.colors.sequential.Viridis

    # Set the zoom level automatically if not provided
    if not zoom:
        zoom = mapbox_zoom_finder(df[lon_col], df[lat_col])

    #drop outliers if specified
    if color_outlier_z:
        #get color_col upper and lower limits based on z score
        color_col_mean, color_col_std = df[color_col].mean(), df[color_col].std()
        color_col_upper = color_col_mean + (color_col_std * color_outlier_z)
        color_col_lower = color_col_mean - (color_col_std * color_outlier_z)
        #set range color
        range_color = [color_col_lower, color_col_upper]

    # Create a scatter mapbox figure
    fig = px.scatter_mapbox(
        #data
        df, lat=lat_col, lon=lon_col,
        #categories
        size=size_col, color=color_col,
        #hover info
        hover_name=hover_name, hover_data=hover_data,
        #display settings
        range_color=range_color, size_max=size_max,
        color_continuous_scale=color_continuous_scale, mapbox_style=mapbox_style,
        width=width, height=height,
        #title and labals
        title=title, labels=labels
    )
    # Set the zoom level
    fig.update_layout(mapbox_zoom=zoom)
    # Show the figure
    fig.show()

In [28]:
plot_mapbox(
    df=docks_df,
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='vessel_size_mean',
    color_col='hrs_at_berth_mean',
    #symbol='facility_type', #NOTE not working; needs to be dock type (container terminal, bulk terminal, etc)
    size_max=20,
    title=f'Average Vessel Size and Average Hours at Berth for {port_name}',
    hover_name='dock_name',
    hover_data={'dock_name': True, 'vessel_size_mean': True},
    mapbox_style='carto-positron',
    color_outlier_z=1,
    labels={
        'vessel_size_median': 'Median Vessel Size (ft)',
        'hrs_at_berth_median': 'Median Hours at Berth'
    },
)

In [22]:

# seattle mapbox
fig_seattle = px.scatter_mapbox(
    docks_df,
    lon='dock_lon',
    lat='dock_lat',
    size='vessel_size_mean',
    color='hrs_at_berth_mean',
    hover_name='dock_name',
    #size_max=20,
    title='Vessel Size & Hours at Berth',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels={'hrs_at_berth_mean': 'Mean Hours at Berth'},
    height=600, width=800
)

# Set Mapbox style
fig_seattle.update_layout(
    mapbox_style="carto-positron", 
    mapbox_zoom=mapbox_zoom_finder(docks_df['dock_lon'], docks_df['dock_lat']),
    mapbox_center={"lat": docks_df['dock_lat'].mean(), 
                   "lon": docks_df['dock_lon'].mean()},
)

# Add footnote using add_annotation
fig_seattle.add_annotation(
    text="Circle size corresponds to mean vessel length",
    xref="paper", yref="paper",
    x=0, y=-0.05,
    showarrow=False,
    font=dict(size=13, color="black"),
    align="left"
)

fig_seattle.show()

## Point in Time Stats

Still under development

In [73]:
#get point in time stats

#create point in time (pit) df to join stats to
pit_df = (
    main_lf
    .with_columns(
        date = pl.col('time').dt.date(),
        month = pl.col('time').dt.strftime('%Y%m')
    )
    .select('port_name', 'dock_id', 'month', 'date')
    .unique().collect()
)

for hour in range(0, 24):
    #create a time object for each hour
    hour_dt = pl.time(hour)
    #create a dataframe for each hour
    hour_df = (
        main_lf
        .with_columns(
            #get end of status time
            end_time = pl.col('time') + pl.col('status_duration'),
            #get date from time
            date = pl.col('time').dt.date(),
            #get month from time
            month = pl.col('time').dt.strftime('%Y%m')
        )
        #group by port dock and hour
        .group_by(['port_name', 'dock_id', 'month', 'date'])
        .agg(
            #number of vessels at dock at each hour
            vessels_at_dock = (
                #when moored at hour
                pl.when((pl.col('status')==5) & 
                        (hour_dt.is_between(pl.col('time').dt.time(), 
                                         pl.col('end_time').dt.time())))
                #then count the individual vessels
                .then(pl.col('imo'))
                .otherwise(pl.lit(None))
                .drop_nulls() #n_unique counts nulls as unique values
                .n_unique()
            ),
            #number of vessels at anchor at each hour
            vessels_at_anchor = (
                #when anchored at hour
                pl.when((pl.col('status')==1) & 
                        (hour_dt.is_between(pl.col('time').dt.time(), 
                                        pl.col('end_time').dt.time())))
                #then count the individual vessels
                .then(pl.col('imo'))
                .otherwise(pl.lit(None))
                .drop_nulls()
                .n_unique()
            )
        )
        .collect()
    )
    #join the hour dataframe to the main pit dataframe
    pit_df = (
        pit_df
        .join(hour_df, 
              on=['port_name', 'dock_id', 'month', 'date'], 
              how='left')
        #rename the columns to include the hour
        .rename({
            'vessels_at_dock': f'vessels_at_dock_{hour}',
            'vessels_at_anchor': f'vessels_at_anchor_{hour}'
        })
    )

#get port stats by month
pit_df = (
    pit_df
    #group by port and date
    .group_by(['port_name', 'month', 'date'])
    .agg(
        #sum the number of vessels at all docks at each hour
        cs.starts_with('vessels_at_dock_').sum(),
        #sum the number of vessels at anchor at each hour
        cs.starts_with('vessels_at_anchor_').sum()
    )
    #get the max at any hour
    .with_columns(
        #get max at dock at any hour
        vessels_at_dock_max = (
            pl.max_horizontal(cs.starts_with('vessels_at_dock_'))
        ),
        #get mean at dock any hour
        vessels_at_dock_mean = (
            pl.mean_horizontal(cs.starts_with('vessels_at_dock_'))
        ),
        #get max at anchor at any hour
        vessels_at_anchor_max = (
            pl.max_horizontal(cs.starts_with('vessels_at_anchor_'))
        ),
        #get mean at anchor any hour
        vessels_at_anchor_mean = (
            pl.mean_horizontal(cs.starts_with('vessels_at_anchor_'))
        )
    )
    #select the columns to keep
    .select(['port_name', 'month', 'date', 'vessels_at_dock_max', 
             'vessels_at_dock_mean', 'vessels_at_anchor_max',
             'vessels_at_anchor_mean'])
    #aggregate by month
    .group_by(['port_name', 'month'])
    .agg(
        #get max at anchor on any date during that month
        vessels_at_anchor_max = pl.max('vessels_at_anchor_max'),
        #get mean at anchor on any date during that month
        vessels_at_anchor_mean = pl.mean('vessels_at_anchor_mean'),
        #get max at dock on any date during that month
        vessels_at_dock_max = pl.max('vessels_at_dock_max'),
        #get mean at dock on any date during that month
        vessels_at_dock_mean = pl.mean('vessels_at_dock_mean'
        )
    )
)

In [74]:
#inspect
display(pit_df.describe())
pit_df.head()

statistic,port_name,month,vessels_at_anchor_max,vessels_at_anchor_mean,vessels_at_dock_max,vessels_at_dock_mean
str,str,str,f64,f64,f64,f64
"""count""","""5200""","""5200""",5200.0,5200.0,5200.0,5200.0
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0
"""mean""",,,0.783462,0.046248,2.598462,0.493748
"""std""",,,0.796243,0.084986,1.76203,0.473728
"""min""","""Albany Port District, NY""","""201801""",0.0,0.0,0.0,0.0
"""25%""",,,0.0,0.0,1.0,0.194444
"""50%""",,,1.0,0.013889,2.0,0.349702
"""75%""",,,1.0,0.056452,3.0,0.610215
"""max""","""Wilmington, NC""","""202409""",6.0,0.951389,12.0,3.938889


port_name,month,vessels_at_anchor_max,vessels_at_anchor_mean,vessels_at_dock_max,vessels_at_dock_mean
str,str,u32,f64,u32,f64
"""Sacramento-Yolo Port, CA""","""201904""",0,0.0,2,0.6
"""New Haven, CT""","""202401""",1,0.048246,1,0.149123
"""Port of Brunswick, GA""","""201902""",1,0.070513,2,0.520833
"""Port of Vancouver USA, WA""","""202210""",1,0.074074,3,0.876543
"""Anacortes, WA""","""202308""",1,0.038462,2,0.344551


### Max/Mean stats for vessels_at_dock 

- Current output seems far too low - e.g. Port of LA shows a max of 11 vessels at dock at any time since 2018; since there are 37 docks at LA we expect a max in the 20s or higher. 

In [75]:
pit_df.filter(pl.col('port_name')=='Port of Los Angeles, CA').sort('month')

port_name,month,vessels_at_anchor_max,vessels_at_anchor_mean,vessels_at_dock_max,vessels_at_dock_mean
str,str,u32,f64,u32,f64
"""Port of Los Angeles, CA""","""201801""",2,0.147849,6,1.313172
"""Port of Los Angeles, CA""","""201802""",3,0.241071,4,0.837798
"""Port of Los Angeles, CA""","""201803""",2,0.283602,8,1.688172
"""Port of Los Angeles, CA""","""201804""",3,0.277778,7,1.15
"""Port of Los Angeles, CA""","""201805""",2,0.291667,5,1.221774
"""Port of Los Angeles, CA""","""201806""",2,0.222222,5,1.441667
"""Port of Los Angeles, CA""","""201807""",4,0.245968,5,1.271505
"""Port of Los Angeles, CA""","""201808""",2,0.165323,7,1.447581
"""Port of Los Angeles, CA""","""201809""",2,0.243056,7,1.172222
"""Port of Los Angeles, CA""","""201810""",3,0.241935,7,1.552419
