# PNREC Presentation Prep



In [89]:
#prelims
import numpy as np
import pandas as pd
import geopandas as gpd
import polars as pl
import plotly.express as px
import datetime as dt
from dateutil.relativedelta import relativedelta
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

#display settings
#pandas: None removes the column/row display limits, so frames are shown untruncated
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#polars: allow up to 100 rows in printed tables; the trailing semicolon suppresses the cell output
pl.Config(tbl_rows=100);

## Load Data and initial prep

Data used here is processed in the geodata_prep and port_stats notebooks in the main directory. 

The data used in the app is a dataframe with each row corresponding to a port call, including data such as port, dock, and vessel info, time of arrival, hours at berth and at anchor, time in port waters, etc. 

In [90]:
#build the main port-call dataframe (one row per call)
calls_df = (
    pl.read_parquet('../dashboard/calls.parquet')
    #calendar fields derived from the arrival timestamp
    .with_columns(
        year=pl.col('time_arrival').dt.year(),
        month=pl.col('time_arrival').dt.date().dt.month_start(),
        date=pl.col('time_arrival').dt.date(),
    )
    #port_group merges Seattle/Tacoma and Los Angeles/Long Beach;
    #every other port keeps its own name as its group
    .with_columns(
        port_group=(
            pl.when(pl.col('port_name').is_in(['Seattle, WA', 'Tacoma, WA']))
            .then(pl.lit('NW Seaport Alliance'))
            .when(
                pl.col('port_name').is_in(
                    ['Port of Los Angeles, CA', 'Port of Long Beach, CA']
                )
            )
            .then(pl.lit('San Pedro Ports'))
            .otherwise(pl.col('port_name'))
        )
    )
)

#first and last arrival dates; used as the default time_range of the plotting functions
earliest_date = calls_df.get_column('time_arrival').min().date()
latest_date = calls_df.get_column('time_arrival').max().date()

#summary statistics for a quick sanity check
calls_df.describe()

statistic,call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters,year,month,date,port_group
str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,str,str,str
"""count""","""152339""","""152339""",152339.0,152339.0,"""152339""","""152339""","""151966""",152339.0,152339.0,152339.0,152339.0,"""152339""","""152339""","""152339""","""152339""",152339.0,152339.0,152339.0,152339.0,152339.0,152339.0,"""152339""","""152339""","""152339"""
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""373""",0.0,0.0,0.0,0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,32.314243,-94.595966,,,,32.314355,-94.594993,10138000.0,207.282114,"""2021-07-25 12:09:45.520457""","""2021-07-25 23:42:49.418494""","""2021-07-28 04:42:50.440307""","""2021-07-28 09:58:48.028312""",47.902378,8.398775,11.543167,5.258262,69.809185,2021.068459,"""2021-07-10 17:16:44.551000""","""2021-07-25 11:11:14.325000""",
"""std""",,,6.905596,20.709946,,,,6.905846,20.710309,26423000.0,58.862874,,,,,62.483046,28.999441,35.440416,41.451654,96.098973,2.030807,,,
"""min""","""0_Corpus Christi, TX_2020-04-0…","""Albany Port District, NY""",17.938939,-166.549916,"""ADM Corpus Christi Grain Eleva…","""00XE""","""Anchorage""",17.936081,-166.53444,0.0,101.0,"""2018-01-01 00:35:19""","""2018-01-01 00:35:19""","""2018-01-01 04:23:54""","""2018-01-01 09:15:57""",0.083333,0.0,0.0,0.0,0.133333,2018.0,"""2018-01-01""","""2018-01-01""","""Albany Port District, NY"""
"""25%""",,,28.629389,-118.2095,,,,28.645767,-118.21083,9298636.0,176.0,"""2019-09-23 05:36:19""","""2019-09-23 14:53:58""","""2019-09-25 11:07:36""","""2019-09-25 15:04:34""",16.616667,0.0,2.666667,2.15,25.866667,2019.0,"""2019-09-01""","""2019-09-23""",
"""50%""",,,30.69123,-90.085256,,,,30.706768,-90.112537,9403451.0,190.0,"""2021-09-06 01:03:18""","""2021-09-06 20:50:23""","""2021-09-09 01:06:40""","""2021-09-09 05:20:12""",31.116667,0.0,3.516667,2.783333,43.933333,2021.0,"""2021-09-01""","""2021-09-06""",
"""75%""",,,36.86642,-80.05267,,,,36.875896,-80.053495,9619426.0,230.0,"""2023-05-07 10:13:53""","""2023-05-07 22:10:37""","""2023-05-10 09:57:31""","""2023-05-10 13:19:05""",57.183333,0.0,5.383333,3.533333,80.966667,2023.0,"""2023-05-01""","""2023-05-07""",
"""max""","""9993808_Honolulu, O'ahu, HI_20…","""Wilmington, NC""",61.23778,-66.096678,"""YUSEN TERMINALS BERTHS 212-221""","""1JHK""","""Tie Off""",61.24306,-66.086926,980002500.0,667.0,"""2024-12-31 18:55:48""","""2024-12-31 22:30:59""","""2024-12-31 23:37:00""","""2024-12-31 23:37:00""",1398.416667,244.45,2152.533333,3653.166667,4453.8,2024.0,"""2024-12-01""","""2024-12-31""","""Wilmington, NC"""


#### Volume Data from Hanouf

In [108]:
#infer_schema_length=0 skips type inference, so every column is loaded as a string
importvol_df = pl.read_csv(
    '../port data/volumes/portimports.csv',
    infer_schema_length=0,
)
importvol_df.head()

PORT,PORT_NAME,CTY_CODE,CTY_NAME,I_COMMODITY,GEN_VAL_MO,CNT_VAL_MO,CNT_WGT_MO,VES_VAL_MO,VES_WGT_MO,YEAR,MONTH,COMM_LVL,date,COMMODITY_NAME
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""-""","""TOTAL FOR ALL PORTS""","""-""",,"""1""","""215797758""","""392018""","""120408""","""392018""","""120408""","""2018""","""1""","""HS2""","""1/1/2018""","""Live Animals"""
"""-""","""TOTAL FOR ALL PORTS""","""0003""",,"""1""","""56261689""","""134532""","""21000""","""134532""","""21000""","""2018""","""1""","""HS2""","""1/1/2018""","""Live Animals"""
"""-""","""TOTAL FOR ALL PORTS""","""0014""",,"""1""","""3595903""","""257486""","""99408""","""257486""","""99408""","""2018""","""1""","""HS2""","""1/1/2018""","""Live Animals"""
"""-""","""TOTAL FOR ALL PORTS""","""0022""",,"""1""","""206506476""","""134532""","""21000""","""134532""","""21000""","""2018""","""1""","""HS2""","""1/1/2018""","""Live Animals"""
"""-""","""TOTAL FOR ALL PORTS""","""0023""",,"""1""","""155670763""","""134532""","""21000""","""134532""","""21000""","""2018""","""1""","""HS2""","""1/1/2018""","""Live Animals"""


In [107]:
#infer_schema_length=0 skips type inference, so every column is loaded as a string
exportvol_df = pl.read_csv(
    '../port data/volumes/portexports.csv',
    infer_schema_length=0,
)
exportvol_df.head()

PORT,PORT_NAME,CTY_CODE,CTY_NAME,E_COMMODITY,ALL_VAL_MO,CNT_VAL_MO,CNT_WGT_MO,VES_VAL_MO,VES_WGT_MO,YEAR,MONTH,COMM_LVL,date,COMMODITY_NAME
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""1003""","""NEWARK, NJ""","""5180""","""Qatar""","""9""","""36780""","""36780""","""2796""","""36780""","""2796""","""2018""","""8""","""HS2""","""2018-08-01""","""Coffee, Tea, Maté, and Spices"""
"""1003""","""NEWARK, NJ""","""5180""","""Qatar""","""4""","""40800""","""40800""","""15000""","""40800""","""15000""","""2023""","""8""","""HS2""","""2023-08-01""","""Dairy Produce; Birds' Eggs; Na…"
"""1003""","""NEWARK, NJ""","""5180""","""Qatar""","""8""","""26965""","""26965""","""5300""","""26965""","""5300""","""2022""","""1""","""HS2""","""2022-01-01""","""Edible Fruit and Nuts; Peel of…"
"""1003""","""NEWARK, NJ""","""5180""","""Qatar""","""9""","""168541""","""168541""","""19904""","""168541""","""19904""","""2018""","""11""","""HS2""","""2018-11-01""","""Coffee, Tea, Maté, and Spices"""
"""1003""","""NEWARK, NJ""","""5180""","""Qatar""","""9""","""54349""","""54349""","""5856""","""54349""","""5856""","""2023""","""6""","""HS2""","""2023-06-01""","""Coffee, Tea, Maté, and Spices"""


## Visualization Functions

For the dashboard and PNREC presentation, we define a handful of visualization types (e.g., scatter map, line plot, bar chart) along with code that supports both default visualizations and custom user-generated ones. 

In [91]:
#define zoom level function for plotly express scatter_mapbox
def mapbox_zoom_finder(lons, lats, lon_pad=0, lat_pad=0):
    """
    Estimates a zoom level for a Plotly Mapbox plot from the coordinate extent.

    The zoom is ``7 - log2(extent)``, where ``extent`` is the larger of the
    padded longitude range and the padded latitude range.

    Args:
        lons (sequence): Longitudes (anything supporting len(), min() and max()).
        lats (sequence): Latitudes, same length as ``lons``.
        lon_pad (float, optional): Padding to add to the longitude range. Defaults to 0.
        lat_pad (float, optional): Padding to add to the latitude range. Defaults to 0.
    Returns:
        zoom (float): the calculated zoom level, or the fallback of 10 when the
        inputs are empty, differ in length, or span no positive extent.
    """
    fallback_zoom = 10
    # Mismatched or empty inputs: nothing sensible to compute
    if len(lons) != len(lats) or len(lons) == 0:
        return fallback_zoom
    # Padded coordinate ranges
    lon_range = max(lons) - min(lons)
    lat_range = max(lats) - min(lats)
    extent = max(lon_range + lon_pad, lat_range + lat_pad)
    # A single point (or identical points) without padding has zero extent;
    # log2(0) is -inf and would produce an infinite zoom, so use the fallback.
    # Written as "not > 0" so a NaN extent also falls back.
    if not extent > 0:
        return fallback_zoom
    return 7 - np.log2(extent)

def plot_mapbox(df, cat_group,  lat_col, lon_col, 
                size_col, size_col_alias, color_col, color_col_alias, 
                title, filter_col=None, filter=None, time_col='date', 
                time_range=[earliest_date, latest_date], 
                zoom=None, center=None, width=800, height=600, 
                size_max=30, range_color=None, hover_name=None, hover_data=None, 
                mapbox_style='carto-positron', labels=None, 
                color_continuous_scale=None, color_outlier_z=None):
    """
    Plots a Mapbox scatter plot using Plotly, with one marker per category.

    Rows are optionally filtered, restricted to ``time_range``, and grouped by
    ``cat_group``. Each group keeps its first lat/lon and the mean of
    ``size_col`` (marker size) and ``color_col`` (marker color).

    Args:
        df (pl.DataFrame): Source data.
        cat_group (str): Column to group by (one marker per value).
        lat_col, lon_col (str): Latitude / longitude columns.
        size_col, size_col_alias (str): Column averaged for marker size, and
            the name given to that average.
        color_col, color_col_alias (str): Column averaged for marker color,
            and the name given to that average.
        title (str): Figure title.
        filter_col, filter: If ``filter_col`` is given, keep only rows whose
            ``filter_col`` value is in ``filter``.
        time_col (str): Column compared against ``time_range``.
        time_range (sequence): Lower and upper bound passed to ``is_between``.
        zoom (float, optional): Map zoom; derived from the coordinate extent
            via ``mapbox_zoom_finder`` when None.
        center (dict, optional): ``{'lat': ..., 'lon': ...}``; defaults to the
            midpoint of the plotted coordinates.
        color_outlier_z (float, optional): If given, the color range is set to
            mean +/- z standard deviations of the color values. Points are not
            removed; only the color scale range is limited.
        Remaining arguments are forwarded to ``px.scatter_mapbox``.
    Returns:
        The Plotly figure.
    """
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #aggregate to one row per category within the time range
    df = (
        df
        .filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
        .group_by(cat_group)
        .agg(
            #location of the category (first observed coordinates)
            pl.col(lat_col).first().alias(lat_col),
            pl.col(lon_col).first().alias(lon_col),
            #stats driving marker size and color
            pl.col(size_col).mean().alias(size_col_alias),
            pl.col(color_col).mean().alias(color_col_alias),
        )
    )

    #Set default color scale if not provided
    if not color_continuous_scale:
        color_continuous_scale = px.colors.sequential.Viridis

    #Set the zoom level automatically if not provided
    #(explicit None check so that zoom=0 is honored)
    if zoom is None:
        zoom = mapbox_zoom_finder(df[lon_col], df[lat_col])

    #Center on the midpoint of the plotted coordinates if not provided.
    #The midpoint is (max + min) / 2; (max - min) / 2 would be half the range.
    #Skipped for an empty frame, where max()/min() are None.
    if center is None and df.height > 0:
        center = {
            'lat': (df[lat_col].max() + df[lat_col].min()) / 2,
            'lon': (df[lon_col].max() + df[lon_col].min()) / 2,
        }

    #limit the color range to mean +/- z standard deviations if specified
    if color_outlier_z:
        color_mean = df[color_col_alias].mean()
        color_std = df[color_col_alias].std()
        #std is None when fewer than two categories remain; keep range_color as given
        if color_mean is not None and color_std is not None:
            spread = color_std * color_outlier_z
            range_color = [color_mean - spread, color_mean + spread]

    # Create a scatter mapbox figure
    # NOTE(review): plotly reports scatter_mapbox as deprecated in favor of
    # scatter_map (see the warning in this notebook's output); consider migrating.
    fig = px.scatter_mapbox(
        #data
        df, lat=lat_col, lon=lon_col,
        #categories
        size=size_col_alias, color=color_col_alias,
        #hover info
        hover_name=hover_name, hover_data=hover_data,
        #view: previously center was computed but never handed to plotly
        zoom=zoom, center=center,
        #display settings
        range_color=range_color, size_max=size_max,
        color_continuous_scale=color_continuous_scale, mapbox_style=mapbox_style,
        width=width, height=height,
        #title and labels
        title=title, labels=labels
    )

    #NOTE Add annotation if specified

    return fig

def _plot_line_agg(df, cat_group, time_col, y_col, y_col_alias, title, *,
                   y_agg, rank_agg, filter_col, filter, cat_limit, cat_limit_col,
                   time_range, width, height, hover_name, hover_data, labels,
                   highlight, highlight_color):
    """
    Shared implementation behind plot_line and plot_line_count.

    ``y_agg`` and ``rank_agg`` are callables that take a polars expression and
    return its aggregation (e.g. ``pl.Expr.mean``). ``y_agg`` builds the plotted
    value per (category, time) pair; ``rank_agg`` ranks categories when
    ``cat_limit`` is set.
    """
    #restrict to the time range
    df = df.filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #limit categories if specified
    if cat_limit:
        if cat_limit_col is None:
            raise ValueError('cat_limit_col must be provided when cat_limit is set')
        #names of the top n categories by the ranking aggregate
        keep = (
            df.group_by(cat_group)
            .agg(rank_agg(pl.col(cat_limit_col)))
            .sort(pl.col(cat_limit_col), descending=True)
            .limit(cat_limit)
            .to_series()
            .to_list()
        )
        #always keep the highlighted category; skip it when no highlight is
        #requested so that no null entry ends up in the keep-list
        if highlight is not None:
            keep.append(highlight)
        #an empty keep-list only occurs for an empty frame; nothing to filter then
        if keep:
            df = df.filter(pl.col(cat_group).is_in(keep))
    #one value per category and time step, in time order
    df = (
        df
        .group_by(cat_group, time_col)
        .agg(y_agg(pl.col(y_col)).alias(y_col_alias))
        .sort(time_col)
    )

    # Create a line figure
    fig = px.line(
        df,
        x=time_col,
        y=y_col_alias,
        color=cat_group,
        title=title,
        labels=labels,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # Set the width and height of the figure
    fig.update_layout(width=width, height=height)

    #grey out every line, then recolor the highlighted one
    if highlight:
        fig.update_traces(line_color='lightgray')
        fig.update_traces(patch={'line': {'color': highlight_color}}, 
                          selector=dict(name=highlight))

    return fig


def plot_line(df, cat_group, time_col, y_col, y_col_alias, title,
              filter_col=None, filter=None, cat_limit=None, cat_limit_col=None, 
              time_range=[earliest_date, latest_date], 
              width=800, height=600, hover_name=None, 
              hover_data=None, labels=None, color_continuous_scale=None, 
              highlight=None, highlight_color=None):
    """
    Plots a line chart of the mean of ``y_col`` per category over time.

    Args:
        df (pl.DataFrame): Source data.
        cat_group (str): Column defining one line per value.
        time_col (str): Column used for the x axis and the time filter.
        y_col, y_col_alias (str): Column averaged per (category, time) pair and
            the name given to that average.
        title (str): Figure title.
        filter_col, filter: If ``filter_col`` is given, keep only rows whose
            ``filter_col`` value is in ``filter``.
        cat_limit, cat_limit_col: If ``cat_limit`` is given, keep only the top
            ``cat_limit`` categories ranked by the sum of ``cat_limit_col``
            (plus the highlighted category, if any).
        time_range (sequence): Lower and upper bound passed to ``is_between``.
        color_continuous_scale: Accepted for interface compatibility; not used
            by the line chart.
        highlight, highlight_color: If ``highlight`` is given, all lines are
            drawn light gray except the trace named ``highlight``, which gets
            ``highlight_color``.
    Returns:
        The Plotly figure.
    Raises:
        ValueError: If ``cat_limit`` is set without ``cat_limit_col``.
    """
    return _plot_line_agg(
        df, cat_group, time_col, y_col, y_col_alias, title,
        y_agg=pl.Expr.mean, rank_agg=pl.Expr.sum,
        filter_col=filter_col, filter=filter,
        cat_limit=cat_limit, cat_limit_col=cat_limit_col,
        time_range=time_range, width=width, height=height,
        hover_name=hover_name, hover_data=hover_data, labels=labels,
        highlight=highlight, highlight_color=highlight_color,
    )

#plot_line takes the mean of y_col; this takes the count (e.g., number of calls)
def plot_line_count(df, cat_group, time_col, y_col, y_col_alias, title,
              filter_col=None, filter=None, cat_limit=None, cat_limit_col=None, 
              time_range=[earliest_date, latest_date], 
              width=800, height=600, hover_name=None, 
              hover_data=None, labels=None, color_continuous_scale=None, 
              highlight=None, highlight_color=None):
    """
    Plots a line chart of the count of ``y_col`` values per category over time.

    Takes the same arguments as plot_line. The differences are that the plotted
    value is the count of ``y_col`` per (category, time) pair, and that
    ``cat_limit`` ranks categories by the count of ``cat_limit_col``.

    Returns:
        The Plotly figure.
    Raises:
        ValueError: If ``cat_limit`` is set without ``cat_limit_col``.
    """
    return _plot_line_agg(
        df, cat_group, time_col, y_col, y_col_alias, title,
        y_agg=pl.Expr.count, rank_agg=pl.Expr.count,
        filter_col=filter_col, filter=filter,
        cat_limit=cat_limit, cat_limit_col=cat_limit_col,
        time_range=time_range, width=width, height=height,
        hover_name=hover_name, hover_data=hover_data, labels=labels,
        highlight=highlight, highlight_color=highlight_color,
    )


def bar_ranking(df, cat_group, stat_col, stat_alias, title, limit=20, 
                filter_col=None, filter=None, 
                time_col='month', time_range=[earliest_date, latest_date],
                labels=None, width=800, height=600, 
                highlight=None):
    """
    Plots a bar chart ranking categories by the mean of a statistic.

    Args:
        df (pl.DataFrame): Source data.
        cat_group (str): Column whose values are ranked (one bar each).
        stat_col, stat_alias (str): Column averaged per category and the name
            given to that average (used as the x axis).
        title (str): Figure title.
        limit (int): Number of top categories (by mean, descending) to show.
        filter_col, filter: If ``filter_col`` is given, keep only rows whose
            ``filter_col`` value is in ``filter``.
        time_col (str): Column compared against ``time_range``.
        time_range (sequence): Lower and upper bound passed to ``is_between``.
        labels, width, height: Forwarded to ``px.bar``.
        highlight: Category that is always included and keeps the default bar
            color while all other bars are drawn light grey.
    Returns:
        The Plotly figure.
    """
    #restrict to the time range
    df = df.filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #mean statistic per category, computed once and reused for ranking and plotting
    stats = (
        df
        .group_by(cat_group)
        .agg(pl.col(stat_col).mean().alias(stat_alias))
    )
    #names of the top n categories
    keep = (
        stats
        .sort(pl.col(stat_alias), descending=True)
        .limit(limit)
        .to_series()
        .to_list()
    )
    #always include the highlighted category; skip it when no highlight is
    #requested so that no null entry ends up in the keep-list
    if highlight is not None:
        keep.append(highlight)
    #an empty keep-list only occurs when there are no categories at all
    if keep:
        stats = stats.filter(pl.col(cat_group).is_in(keep))
    #ascending sort so the bars are ordered by the statistic
    stats = stats.sort(pl.col(stat_alias))

    # Create a bar figure
    fig = px.bar(
        stats,
        x=stat_alias, y=cat_group,
        title=title, labels=labels,
        width=width, height=height,
    )

    #set highlight if specified: grey out every bar except the highlighted one
    if highlight and len(fig["data"]) > 0:
        bars = fig["data"][0]
        base_color = bars["marker"]["color"]
        bars["marker"]["color"] = [
            base_color if cat == highlight else "lightgrey"
            for cat in bars["y"]
        ]
    
    return fig


## PNREC Visualizations

### National Context

To put the Northwest Seaport Alliance in context, we present various KPIs and associated trends.

#### Efficiency Rankings 

In [92]:
#efficiency metric per call: vessel size divided by hours at berth
df = calls_df.with_columns(
    (pl.col('vessel_size') / pl.col('hrs_at_berth')).alias('Efficiency Score')
)

#rank port groups by their mean efficiency score, highlighting NWSPA
bar_ranking(
    df, cat_group='port_group', stat_col='Efficiency Score',
    stat_alias='Vessel Size (m) per Hour at Berth',
    title='Port Efficiency: Vessel Size per Hour at Berth',
    labels={'port_group':'Port'}, highlight='NW Seaport Alliance',
    limit=30, width=800, height=600,
)

better: total volume per hour at berth

In [93]:
#the 20 port groups with the most calls
top_groups_calls = (
    calls_df
    .group_by('port_group')
    .agg(total_calls=pl.col('call_id').count())
    .sort('total_calls', descending=True)
    .head(20)
)

#mean time in port waters for those port groups
bar_ranking(
    df=calls_df,
    cat_group='port_group',
    stat_col='hrs_in_port_waters',
    stat_alias='Time in Port Waters (hrs)',
    filter_col='port_group',
    filter=top_groups_calls['port_group'],
    title='Average Time in Port Waters (Top 20 ports by total calls)',
    labels={'port_group':''},
    highlight='NW Seaport Alliance',
    limit=50,
    width=800,
    height=500,
)

Better to filter by volume

In [94]:
#monthly mean hours at berth for the top 10 port groups, NWSPA highlighted
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='hrs_at_berth',
    y_col_alias='Average Hours at Berth',
    title='Hours at Berth per call at Principal Ports',
    labels={'port_group':'Port', 'month':''},
    cat_limit=10,
    cat_limit_col='hrs_at_berth',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    width=None,
    height=None,
)

In [103]:
#monthly mean hours at berth, restricted to the two port groups being compared
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    filter_col='port_group',
    filter=['NW Seaport Alliance', 'San Pedro Ports'],
    y_col='hrs_at_berth',
    y_col_alias='Average Hours at Berth',
    title='Hours at Berth per call - San Pedro vs NWSPA',
    labels={'port_group':'Port', 'month':''},
    cat_limit=10,
    cat_limit_col='hrs_at_berth',
    width=None,
    height=None,
    #highlight='NW Seaport Alliance', highlight_color='blue'
)

Much more interesting: total hours at berth per unit volume in each month

In [96]:
#monthly mean vessel length for the top 10 port groups, NWSPA highlighted
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='vessel_size',
    y_col_alias='Mean Vessel Length (m)',
    title='Vessel Size at Top Ports',
    labels={'port_group':'Port', 'month':''},
    cat_limit=10,
    cat_limit_col='vessel_size',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    width=None,
    height=None,
)

In [97]:
#monthly mean vessel length for NWSPA and the Port of Virginia only
plot_line(
    df=calls_df.filter(
        pl.col('port_group').is_in(['NW Seaport Alliance', 'Virginia, VA, Port of'])
    ),
    cat_group='port_group',
    time_col='month',
    y_col='vessel_size',
    y_col_alias='Mean Vessel Length (m)',
    title='Vessel Size - Diverging Trends',
    labels={'port_group':'Port', 'month':''},
    cat_limit=10,
    cat_limit_col='vessel_size',
    width=None,
    height=None,
)

Note filtering on vessel size - consider swap to total volume

In [98]:
#monthly number of calls (count of hrs_at_berth values) for the top 10 port groups
plot_line_count(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='hrs_at_berth',
    y_col_alias='Number of Calls per Month',
    title='Total Calls at Principal Ports',
    labels={'port_group':'Port', 'month':''},
    cat_limit=10,
    cat_limit_col='hrs_at_berth',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    width=None,
    height=None,
)

Reality check here?

### Dock-level Visualizations

In [99]:
#Seattle docks: marker size = mean hours at berth, color = mean hours at anchor
plot_mapbox(
    df=calls_df,
    cat_group='dock_name',
    filter_col='port_name',
    filter=['Seattle, WA'],
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='hrs_at_berth',
    size_col_alias='Mean Hours at Berth',
    color_col='hrs_at_anchor',
    color_col_alias='Mean Hours at Anchor',
    size_max=20,
    color_outlier_z=2,
    title='Hours at Berth vs. Hours at Anchor - Seattle',
    hover_name='dock_name',
)


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



Note current issue with Tacoma Docks not showing up - will fix and re-run. 

In [100]:
#San Pedro docks: marker size = mean hours at berth, color = mean hours at anchor
plot_mapbox(
    df=calls_df,
    cat_group='dock_name',
    filter_col='port_group',
    filter=['San Pedro Ports'],
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='hrs_at_berth',
    size_col_alias='Mean Hours at Berth',
    color_col='hrs_at_anchor',
    color_col_alias='Mean Hours at Anchor',
    size_max=20,
    color_outlier_z=2,
    zoom=11.5,
    title='Hours at Berth vs. Hours at Anchor - San Pedro Ports',
    hover_name='dock_name',
)


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [101]:
#NWSPA calls only, with the per-call efficiency metric (vessel size / hours at berth)
df = (
    calls_df
    .filter(pl.col('port_group') == 'NW Seaport Alliance')
    .with_columns(
        (pl.col('vessel_size') / pl.col('hrs_at_berth')).alias('Efficiency Score')
    )
)

#rank NWSPA docks by their mean efficiency score
bar_ranking(
    df,
    cat_group='dock_name',
    stat_col='Efficiency Score',
    stat_alias='Vessel Size (m) per Hour at Berth',
    title='Top Performing Docks - NW Seaport Alliance',
    labels={'dock_name':''},
    limit=20,
    width=800,
    height=500,
)