# Port Statistics

This notebook develops and explores the various port statistics used in the [Port Performance Project](https://github.com/epistemetrica/Port-Performance-Project). See the README.md file in the main directory for more info.

The primary data set comes from a combination of AIS vessel data and port data, processed in the Port Geodata notebook.

Statistics and final dataframes developed here are used in the Port Performance Dashboard.



In [17]:
#prelims
import polars as pl
import polars.selectors as cs
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob
import folium
from folium.plugins import HeatMap

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=100);

In [18]:
#load and inspect
lf = pl.scan_parquet('port data/dashboard/main.parquet')
display(lf.describe())
lf.limit(5).collect()

statistic,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,"Albany Port District, NY_in_port_waters","Anacortes, WA_in_port_waters","Baltimore, MD_in_port_waters","Beaumont, TX_in_port_waters","Boston, MA_in_port_waters","Bridgeport, CT_in_port_waters","Brownsville, TX_in_port_waters","Calhoun Port Authority, TX_in_port_waters","Canaveral Port District, FL_in_port_waters","Coos Bay OR, Port of_in_port_waters","Corpus Christi, TX_in_port_waters","Galveston, TX_in_port_waters","Grays Harbor Port District, WA_in_port_waters","Greater Lafourche Port, LA_in_port_waters","Guayama, PR_in_port_waters","Guaynabo, PR_in_port_waters","Hilo, Hawai'i, HI_in_port_waters","Honolulu, O'ahu, HI_in_port_waters","Houston Port Authority, TX_in_port_waters","Jacksonville, FL_in_port_waters","Kahului, Maui, HI_in_port_waters","Kalaeloa Barbers Point, HI_in_port_waters",…,"San Juan, PR_in_port_waters","Searsport, ME_in_port_waters","Seattle, WA_in_port_waters","South Jersey Port Corp, NJ_in_port_waters","South Louisiana, LA, Port of_in_port_waters","Stockton, CA_in_port_waters","Tacoma, WA_in_port_waters","Tampa Port Authority, FL_in_port_waters","Terrebonne Parish Port, LA_in_port_waters","Texas City, TX_in_port_waters","Unalaska Island, AK_in_port_waters","Valdez, AK_in_port_waters","Victoria, TX_in_port_waters","Virgin Islands - St. Croix, VI_in_port_waters","Virginia, VA, Port of_in_port_waters","West St. Mary Parish Port, LA_in_port_waters","Wilmington, DE_in_port_waters","Wilmington, NC_in_port_waters","Yabucoa, PR_in_port_waters",in_port_waters,status_duration,dock_id,dock_name,facility_type,port_name,port_area_desc,port_area_name,port_area_id,dist_to_dock,year,month,vessel_lat,vessel_lon,dock_lat,dock_lon,port_lat,port_lon
str,f64,str,str,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,str,str,str,str,f64,f64,str,f64,f64,f64,f64,f64,f64
"""count""",6535308.0,"""6535308""","""6535308""",6535308.0,6514673.0,6425901.0,6535308.0,"""6535308""",6535308.0,6535277.0,6535308.0,6037914.0,5983069.0,5059254.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,…,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6533771.0,"""6535308""","""6535308""","""6521979""","""6535308""","""6535308""","""3555593""","""6535308""",6535308.0,6535308.0,"""6535308""",6535308.0,6535308.0,6535308.0,6535308.0,6535308.0,6535308.0
"""null_count""",0.0,"""0""","""0""",0.0,20635.0,109407.0,0.0,"""0""",0.0,31.0,0.0,497394.0,552239.0,1476054.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1537.0,"""0""","""0""","""13329""","""0""","""0""","""2979715""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",5231000.0,,"""2021-04-03 02:26:51.983197""",0.284146,184.620127,183.159084,2.759917,,73.378548,10021000.0,209.241067,32.336211,11.016364,74.03041,0.001502,0.002242,0.026284,0.011172,0.001483,4.2e-05,0.003134,0.001355,0.004624,0.000584,0.011675,0.010286,1.7e-05,0.00041,0.000241,0.01071,0.0,0.006787,0.021514,0.002832,0.000378,0.003096,…,0.01071,0.000743,0.008152,0.003462,0.012324,0.008965,0.005348,8e-06,0.0,0.009847,0.000948,2e-06,0.0,0.0,0.011966,0.0,0.004705,0.005962,0.001866,0.465782,2023.69363,,,,,,,,47185.277298,2020.773354,,32.955287,-94.786258,32.96704,-94.819965,32.964848,-94.84618
"""std""",2993200.0,,,3.514977,90.180405,104.831186,2.484854,,4.927653,23600000.0,56.994161,7.654705,2.955969,8.294142,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,30775.124903,,,,,,,,403390.172291,1.926181,,6.019867,17.555548,5.989056,17.428123,5.984866,17.428692
"""min""",166.0,"""205042000""","""2018-01-01 00:09:04""",0.0,0.0,0.0,0.0,,70.0,0.0,101.0,0.0,-12.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""00XE""","""76 LUBRICANTS CO SAVANNAH TERM…","""Anchorage""","""Albany Port District, NY""","""""Portland Harbor"" means the en…","""Albany Port District, NY""","""1506""",1.0245e-08,2018.0,"""201801""",4.80316,-179.1786,17.672974,-166.582293,17.75271,-166.549916
"""25%""",2522315.0,,"""2019-07-18 04:21:36""",0.0,126.3,90.0,0.0,,70.0,9324643.0,179.0,28.0,9.0,70.0,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,21.0,,,,,,,,88.582581,2019.0,,29.73732,-97.26172,29.738928,-97.276042,29.747023,-97.39789
"""50%""",5299509.0,,"""2021-04-22 00:02:33""",0.0,184.9,181.0,5.0,,70.0,9499424.0,189.0,32.0,11.0,70.0,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,51.0,,,,,,,,147.419724,2021.0,,30.13318,-91.19956,30.145527,-91.199983,30.133998,-91.19934
"""75%""",7877415.0,,"""2022-10-31 21:39:24""",0.1,250.5,274.0,5.0,,80.0,9684976.0,229.0,32.0,13.1,80.0,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,123.0,,,,,,,,265.440858,2022.0,,36.95153,-81.13598,36.926079,-81.137103,36.86642,-81.095382
"""max""",10255767.0,"""725019920""","""2024-09-30 23:53:26""",102.3,359.9,369.0,15.0,,89.0,984903300.0,901.0,86.0,25.5,196.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,…,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,3501909.0,"""1JHK""","""ZEN-NOH GRAIN CORP. WHARF.""","""Tie Off""","""Yabucoa, PR""","""U.S. Census Bureau municipal l…","""Yabucoa, PR""","""99""",33858000.0,2024.0,"""202409""",82.36032,175.1242,61.24306,-64.740137,61.23778,-64.73242


docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,"Albany Port District, NY_in_port_waters","Anacortes, WA_in_port_waters","Baltimore, MD_in_port_waters","Beaumont, TX_in_port_waters","Boston, MA_in_port_waters","Bridgeport, CT_in_port_waters","Brownsville, TX_in_port_waters","Calhoun Port Authority, TX_in_port_waters","Canaveral Port District, FL_in_port_waters","Coos Bay OR, Port of_in_port_waters","Corpus Christi, TX_in_port_waters","Galveston, TX_in_port_waters","Grays Harbor Port District, WA_in_port_waters","Greater Lafourche Port, LA_in_port_waters","Guayama, PR_in_port_waters","Guaynabo, PR_in_port_waters","Hilo, Hawai'i, HI_in_port_waters","Honolulu, O'ahu, HI_in_port_waters","Houston Port Authority, TX_in_port_waters","Jacksonville, FL_in_port_waters","Kahului, Maui, HI_in_port_waters","Kalaeloa Barbers Point, HI_in_port_waters","Kawaihae, Hawai'i, HI_in_port_waters",…,"San Juan, PR_in_port_waters","Searsport, ME_in_port_waters","Seattle, WA_in_port_waters","South Jersey Port Corp, NJ_in_port_waters","South Louisiana, LA, Port of_in_port_waters","Stockton, CA_in_port_waters","Tacoma, WA_in_port_waters","Tampa Port Authority, FL_in_port_waters","Terrebonne Parish Port, LA_in_port_waters","Texas City, TX_in_port_waters","Unalaska Island, AK_in_port_waters","Valdez, AK_in_port_waters","Victoria, TX_in_port_waters","Virgin Islands - St. Croix, VI_in_port_waters","Virginia, VA, Port of_in_port_waters","West St. Mary Parish Port, LA_in_port_waters","Wilmington, DE_in_port_waters","Wilmington, NC_in_port_waters","Yabucoa, PR_in_port_waters",in_port_waters,status_duration,dock_id,dock_name,facility_type,port_name,port_area_desc,port_area_name,port_area_id,dist_to_dock,year,month,vessel_lat,vessel_lon,dock_lat,dock_lon,port_lat,port_lon
u32,str,datetime[μs],f64,f64,f64,f64,cat,f64,f64,f64,f64,f64,f64,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,…,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,bool,f64,str,str,str,str,str,str,str,f64,i32,str,f64,f64,f64,f64,f64,f64
166,"""205042000""",2021-11-25 15:05:50,0.0,202.0,202.0,5.0,"""DELOS""",80.0,9877767.0,336.0,60.0,11.0,80.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,…,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,3056.0,"""110V""","""ENBRIDGE INGLESIDE ENERGY CENT…","""Dock""","""Corpus Christi, TX""","""Per Port of Corpus Chisti legi…","""Corpus Christi, TX""","""2436""",204.903865,2021,"""202111""",27.82006,-97.20766,27.821683,-97.207517,27.81277,-97.39789
208,"""205042000""",2022-12-12 15:24:43,0.8,29.7,332.0,1.0,"""DELOS""",80.0,9877767.0,336.0,60.0,16.5,80.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,…,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,20.0,"""0V0U""","""ARCO WESTERN PIPELINE CO BERTH…","""Dock""","""Port of Long Beach, CA""","""As defined per legislation by …",,"""4110""",25918.551288,2022,"""202212""",33.62503,-118.04868,33.757222,-118.21888,33.73957,-118.2095
208,"""205042000""",2022-12-12 06:32:11,0.0,346.0,256.0,1.0,"""DELOS""",80.0,9877767.0,336.0,60.0,16.5,80.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,…,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,210.0,"""0V0U""","""ARCO WESTERN PIPELINE CO BERTH…","""Dock""","""Port of Long Beach, CA""","""As defined per legislation by …",,"""4110""",25849.267159,2022,"""202212""",33.62736,-118.04695,33.757222,-118.21888,33.73957,-118.2095
208,"""205042000""",2022-12-12 10:02:11,1.1,103.4,304.0,0.0,"""DELOS""",80.0,9877767.0,336.0,60.0,16.5,80.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,…,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,27.0,"""0V0U""","""ARCO WESTERN PIPELINE CO BERTH…","""Dock""","""Port of Long Beach, CA""","""As defined per legislation by …",,"""4110""",25976.311304,2022,"""202212""",33.62539,-118.04757,33.757222,-118.21888,33.73957,-118.2095
208,"""205042000""",2022-12-12 10:29:13,0.0,353.4,326.0,1.0,"""DELOS""",80.0,9877767.0,336.0,60.0,16.5,80.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,…,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,18.0,"""0V0U""","""ARCO WESTERN PIPELINE CO BERTH…","""Dock""","""Port of Long Beach, CA""","""As defined per legislation by …",,"""4110""",25964.956439,2022,"""202212""",33.62478,-118.04839,33.757222,-118.21888,33.73957,-118.2095


In [23]:
#define calls lazyframe for building stats
calls_lf = (
    lf
    #grouby by port dock and docking id
    .group_by(['port_name', 'dock_id', 'docking_id'])
    .agg(
        #vessel mmsi
        mmsi = pl.first('mmsi'),
        #vessel size
        vessel_size = pl.first('length'),
        #time of mooring
        docking_time = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .drop_nulls() #polars .first() evals to null if first val is null
        ).first(),
        #time a berth
        hrs_at_berth = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum()/60
        ),
        #time at anchor
        hrs_at_anchor = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum()/60
        ),
        #time entering port waters - not working as expected
        time_port_entry =(
            pl.when(pl.col('in_port_waters')==True) #need related port, not any ports
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .min()),
        #time in port waters to berth
        hrs_in_port_waters = (
            (pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .min()) - 
            (pl.when(pl.col('in_port_waters')==True)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .min())
        ).dt.total_hours(),
        #inspect in port waters
        sum_in_port_waters = pl.col('in_port_waters').sum()
    )
)

In [24]:
calls_lf.limit(5).collect()

port_name,dock_id,docking_id,mmsi,vessel_size,docking_time,hrs_at_berth,hrs_at_anchor,time_port_entry,hrs_in_port_waters,sum_in_port_waters
str,str,u32,str,f64,datetime[μs],f64,f64,datetime[μs],i64,u32
"""Port of Brunswick, GA""","""0XFY""",1132066,"""246293000""",185.0,2022-06-17 23:04:55,2.35,0.0,2022-06-17 23:04:55,0.0,1
"""Baltimore, MD""","""0RP6""",1342318,"""249262000""",229.0,2022-11-22 22:21:34,23.9,0.0,2022-11-22 22:21:34,0.0,1
"""Port of Greater Baton Rouge, L…","""0X2P""",3490908,"""354634000""",137.0,2022-12-28 12:44:41,1.15,0.0,,,0
"""Houston Port Authority, TX""","""0W0D""",6598967,"""538002355""",189.0,2018-10-04 23:02:24,0.6,0.0,2018-10-04 22:32:24,0.0,2
"""Corpus Christi, TX""","""0VM8""",270707,"""215497000""",248.0,2019-11-08 04:07:51,0.1,0.0,2019-11-08 04:07:51,0.0,1


In [25]:
calls_lf.select('sum_in_port_waters').describe()

statistic,sum_in_port_waters
str,f64
"""count""",3555593.0
"""null_count""",0.0
"""mean""",0.856124
"""std""",0.924127
"""min""",0.0
"""25%""",0.0
"""50%""",1.0
"""75%""",2.0
"""max""",104.0


In [26]:
#create monthly stats dataframe
monththly_df = (
    calls_lf
    #get month from docking time
    .with_columns(
        #extract month from docking time
        month = pl.col('docking_time').dt.strftime('%Y%m')
    )
    #group by port dock and month
    .group_by(['port_name', 'dock_id', 'month'])
    .agg(
        #count number of vessels
        vessels = pl.n_unique('mmsi'),
        #mean vessel size
        vessel_size_mean = pl.mean('vessel_size'),
        #count number of vessel calls
        calls = pl.n_unique('docking_id'),
        #time at dock stats for each vessel in hours
        hrs_occupied = pl.sum('hrs_at_berth'),
        hrs_at_berth_median = pl.median('hrs_at_berth'),
        hrs_at_berth_mean = pl.mean('hrs_at_berth'),
        #time at anchor stats for each vessel visit in hours
        hrs_at_anchor_median = pl.median('hrs_at_anchor'),
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor'),
        #time in port waters 
        hrs_in_port_waters_total = pl.sum('hrs_in_port_waters'),
        hrs_in_port_waters_mean = pl.mean('hrs_in_port_waters'),
        hrs_in_port_waters_median = pl.median('hrs_in_port_waters')
    )
    #get hours from each month
    .with_columns(
        hrs_in_month = (
            pl.when(pl.col('month').str.tail(2).is_in(['01', '03', '05', '07', '08', '10', '12']))
            .then(31*24)
            .when(pl.col('month').str.tail(2).is_in(['04', '06', '09', '11']))
            .then(30*24)
            .otherwise(28*24)
        )
    )
    .with_columns(
        #dock utilization - percentage of time a dock is occupied
        utilization = (
            pl.col('hrs_occupied')/pl.col('hrs_in_month')
        )
    )
    #drop hours in month
    .drop('hrs_in_month')
    #sort by port dock then month
    .sort(['port_name', 'dock_id', 'month'])
    #collect
    .collect()
)

#inspect
monththly_df.head()

port_name,dock_id,month,vessels,vessel_size_mean,calls,hrs_occupied,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean,hrs_in_port_waters_total,hrs_in_port_waters_mean,hrs_in_port_waters_median,utilization
str,str,str,u32,f64,u32,f64,f64,f64,f64,f64,i64,f64,f64,f64
"""Albany Port District, NY""","""0PST""","""201803""",1,171.0,4,44.316667,8.875,11.079167,0.0,9.8875,8,2.0,0.0,0.059565
"""Albany Port District, NY""","""0PST""","""201804""",1,189.0,1,165.45,165.45,165.45,5.933333,5.933333,0,0.0,0.0,0.229792
"""Albany Port District, NY""","""0PST""","""201806""",1,179.0,2,87.083333,43.541667,43.541667,0.0,0.0,31,15.5,15.5,0.120949
"""Albany Port District, NY""","""0PST""","""201807""",2,199.0,73,101.933333,0.8,1.396347,0.0,0.0,26,0.356164,0.0,0.137007
"""Albany Port District, NY""","""0PST""","""201808""",3,189.430556,72,146.266667,0.5,2.031481,0.0,0.365741,23,0.319444,0.0,0.196595


In [91]:
#create annual port stats dataframe
annual_df = (
    calls_lf
    #get year from docking time
    .with_columns(
        #extract year from docking time
        year = pl.col('docking_time').dt.year()
    )
    #group by port dock and month
    .group_by(['port_name', 'dock_id', 'year'])
    .agg(
        #count number of vessels
        vessels = pl.n_unique('mmsi'),
        #mean vessel size
        vessel_size_mean = pl.mean('vessel_size'),
        #count number of vessel calls
        calls = pl.n_unique('docking_id'),
        #time at dock stats for each vessel in hours
        hrs_at_berth_median = pl.median('hrs_at_berth')/60,
        hrs_at_berth_mean = pl.mean('hrs_at_berth')/60,
        #time at anchor stats for each vessel visit in hours
        hrs_at_anchor_median = pl.median('hrs_at_anchor')/60,
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor')/60
    )
    #sort by port then month
    .sort(['port_name', 'year'])
    #collect
    .collect()
)

#inspect
annual_df.head()

port_name,dock_id,year,vessels,vessel_size_mean,calls,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean
str,str,i32,u32,f64,u32,f64,f64,f64,f64
"""Albany Port District, NY""","""0Y0T""",2018,4,129.457627,59,0.883333,3.882486,0.0,1.201695
"""Albany Port District, NY""","""0PST""",2018,9,193.064516,186,0.6,3.367652,0.0,0.483333
"""Albany Port District, NY""","""0Y0V""",2018,24,148.264706,170,2.316667,7.601863,0.0,2.160686
"""Albany Port District, NY""","""0Q86""",2018,2,147.857143,14,1.95,4.632143,0.0,0.0
"""Albany Port District, NY""","""0RQB""",2018,4,148.76,25,1.5,3.44,0.0,0.149333


In [92]:
#get point in time stats

#create point in time (pit) df to join stats to
pit_df = (
    lf
    .with_columns(date = pl.col('time').dt.date())
    .select('port_name', 'dock_id', 'month', 'date')
    .unique().collect()
)

for hour in range(0, 24):
    #create a time object for each hour
    hour_dt = pl.time(hour)
    #create a dataframe for each hour
    hour_df = (
        lf
        .with_columns(
            #get end of status time
            end_time = (pl.col('time') + 
                        pl.duration(minutes=pl.col('status_duration'))),
            #get date from time
            date = pl.col('time').dt.date()
        )
        #group by port dock and hour
        .group_by(['port_name', 'dock_id', 'month', 'date'])
        .agg(
            #number of vessels at dock at each hour
            vessels_at_dock = (
                #when moored at hour
                pl.when((pl.col('status')==5) & 
                        (hour_dt.is_between(pl.col('time').dt.time(), 
                                         pl.col('end_time').dt.time())))
                #then count the individual vessels
                .then(pl.col('mmsi'))
                .otherwise(pl.lit(None))
                .drop_nulls() #n_unique counts nulls as unique values
                .n_unique()
            ),
            #number of vessels at anchor at each hour
            vessels_at_anchor = (
                #when anchored at hour
                pl.when((pl.col('status')==1) & 
                        (hour_dt.is_between(pl.col('time').dt.time(), 
                                        pl.col('end_time').dt.time())))
                #then count the individual vessels
                .then(pl.col('mmsi'))
                .otherwise(pl.lit(None))
                .drop_nulls()
                .n_unique()
            )
        )
        .collect()
    )
    #join the hour dataframe to the main dataframe
    pit_df = (
        pit_df
        .join(hour_df, 
              on=['port_name', 'dock_id', 'month', 'date'], 
              how='left')
        #rename the columns to include the hour
        .rename({
            'vessels_at_dock': f'vessels_at_dock_{hour}',
            'vessels_at_anchor': f'vessels_at_anchor_{hour}'
        })
    )

#get port stats by month
pit_df = (
    pit_df
    #group by port and date
    .group_by(['port_name', 'month', 'date'])
    .agg(
        #sum the number of vessels at dock at each hour
        cs.starts_with('vessels_at_dock_').sum(),
        #sum the number of vessels at anchor at each hour
        cs.starts_with('vessels_at_anchor_').sum()
    )
    #get the max at any hour
    .with_columns(
        #get max at dock at any hour
        vessels_at_dock_max = (
            pl.max_horizontal(cs.starts_with('vessels_at_dock_'))
        ),
        #get mean at dock any hour
        vessels_at_dock_mean = (
            pl.mean_horizontal(cs.starts_with('vessels_at_dock_'))
        ),
        #get max at anchor at any hour
        vessels_at_anchor_max = (
            pl.max_horizontal(cs.starts_with('vessels_at_anchor_'))
        ),
        #get mean at anchor any hour
        vessels_at_anchor_mean = (
            pl.mean_horizontal(cs.starts_with('vessels_at_anchor_'))
        )
    )
    #select the columns to keep
    .select(['port_name', 'month', 'date', 'vessels_at_dock_max', 
             'vessels_at_dock_mean', 'vessels_at_anchor_max',
             'vessels_at_anchor_mean'])
    #aggregate by month
    .group_by(['port_name', 'month'])
    .agg(
        #get max at anchor on any date
        vessels_at_anchor_max = pl.max('vessels_at_anchor_max'),
        #get mean at anchor on any date
        vessels_at_anchor_mean = pl.mean('vessels_at_anchor_mean'),
        #get max at dock on any date
        vessels_at_dock_max = pl.max('vessels_at_dock_max'),
        #get mean at dock on any date
        vessels_at_dock_mean = pl.mean('vessels_at_dock_mean'
        )
    )
)

In [93]:
#inspect
pit_df.head()

port_name,month,vessels_at_anchor_max,vessels_at_anchor_mean,vessels_at_dock_max,vessels_at_dock_mean
str,str,u32,f64,u32,f64
"""Panama City Port Authority, FL""","""201803""",1,0.033654,2,0.336538
"""Searsport, ME""","""202202""",1,0.016667,1,0.169444
"""Port Freeport, TX""","""202101""",3,0.168011,5,1.099462
"""Texas City, TX""","""202409""",2,0.174107,4,1.126488
"""Oxnard Harbor District, CA""","""202111""",1,0.047414,3,0.515805


In [None]:



    
    #group by month to get averages
    .group_by(['port_name', 'month'])
    .agg(
        #average number of vessels at dock
        vessels_at_dock_at_noon_mean = pl.mean('vessels_at_dock'),
        #average number of vessels at anchor
        vessels_at_anchor_at_noon_mean = pl.mean('vessels_at_anchor')
    )
    #sort by port then date
    .sort(['port_name', 'month'])
    #collect
    .collect()
)

#inspect
df.head()

#join to stats dfs



In [6]:
#get means for each port over all time
port_alltime_df = (
    port_monththly_df
    .group_by(['port_name'])
    .agg(
        #average number of vessels at dock
        vessels_at_dock_at_noon_mean = pl.mean('vessels_at_dock_at_noon_mean'),
        #average number of vessels at anchor
        vessels_at_anchor_at_noon_mean = pl.mean('vessels_at_anchor_at_noon_mean'),
        #average time at dock in hours
        hrs_at_berth_median = pl.mean('hrs_at_berth_median'),
        hrs_at_berth_mean = pl.mean('hrs_at_berth_mean'),
        #average time at anchor in hours
        hrs_at_anchor_median = pl.mean('hrs_at_anchor_median'),
        hrs_at_anchor_mean = pl.mean('hrs_at_anchor_mean')
    )
    #sort by port
    .sort(['port_name'])
)

#inspect
port_alltime_df.head()

port_name,vessels_at_dock_at_noon_mean,vessels_at_anchor_at_noon_mean,hrs_at_berth_median,hrs_at_berth_mean,hrs_at_anchor_median,hrs_at_anchor_mean
str,f64,f64,f64,f64,f64,f64
"""Albany Port District, NY""",0.360985,0.047641,9.389506,15.038997,13.703333,18.1363
"""Anacortes, WA""",0.510336,0.077529,3.805864,9.666307,28.969264,40.798921
"""Baltimore, MD""",5.141655,0.257404,1.034877,6.098101,20.794033,124.246689
"""Beaumont, TX""",3.915332,0.671338,1.14465,7.312186,19.619856,38.896071
"""Boston, MA""",0.642309,0.060388,1.930041,11.229007,17.306962,81.831545
