# PNREC Presentation Prep



In [80]:
#prelims
import numpy as np
import polars as pl
import pandas as pd
import geopandas as gpd

import plotly.express as px
import datetime as dt
from dateutil.relativedelta import relativedelta
import geopy
from geopy.geocoders import Bing
from geopy.extra.rate_limiter import RateLimiter

#display settings: no cap on pandas columns/rows, polars tables show up to 100 rows
for pandas_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(pandas_option, None)
pl.Config(tbl_rows=100);

## Load Data and initial prep

Data used here is processed in the geodata_prep and port_stats notebooks in the main directory. 

The data used in the app is a dataframe with each row corresponding to a port call, including data such as port, dock, and vessel info, time of arrival, hours at berth and at anchor, time in port waters, etc. 

In [81]:
#build the main port-calls dataframe
#calendar fields are all derived from the arrival timestamp
arrival = pl.col('time_arrival')
#port group label: member ports of NWSPA and San Pedro are merged,
#every other port keeps its own name
port_group_label = (
    pl.when(pl.col('port_name').is_in(['Seattle, WA', 'Tacoma, WA']))
    .then(pl.lit('NW Seaport Alliance'))
    .when(pl.col('port_name').is_in(['Port of Los Angeles, CA',
                                     'Port of Long Beach, CA']))
    .then(pl.lit('San Pedro Ports'))
    .otherwise(pl.col('port_name'))
)
calls_df = (
    pl.read_parquet('../dashboard/calls.parquet')
    #year, first day of month, and calendar date of arrival
    .with_columns(
        year=arrival.dt.year(),
        month=arrival.dt.date().dt.month_start(),
        date=arrival.dt.date(),
    )
    .with_columns(port_group=port_group_label)
)
#date bounds of the data (used as default time ranges by the plotting functions)
arrival_times = calls_df['time_arrival']
earliest_date, latest_date = arrival_times.min().date(), arrival_times.max().date()

#inspect data
calls_df.describe()

statistic,call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters,year,month,date,port_group
str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,str,str,str
"""count""","""156205""","""156205""",156205.0,156205.0,"""156205""","""156205""","""155832""",156205.0,156205.0,156205.0,156205.0,"""156205""","""156205""","""156205""","""156205""",156205.0,156205.0,156205.0,156205.0,156205.0,156205.0,"""156205""","""156205""","""156205"""
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""373""",0.0,0.0,0.0,0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,"""0""","""0""","""0"""
"""mean""",,,32.686926,-95.288618,,,,32.686999,-95.287408,10116000.0,208.089351,"""2021-07-26 08:52:21.130284""","""2021-07-26 20:08:02.973278""","""2021-07-29 00:46:41.506270""","""2021-07-29 05:56:08.875797""",47.656672,8.141756,11.253709,5.149874,69.055084,2021.070702,"""2021-07-11 13:41:40.076000""","""2021-07-26 07:35:18.562000""",
"""std""",,,7.207185,20.905249,,,,7.207342,20.905297,26094000.0,58.853774,,,,,62.115883,28.449073,34.912037,40.933445,95.135948,2.030186,,,
"""min""","""0_Corpus Christi, TX_2020-04-0…","""Albany Port District, NY""",17.938939,-166.549916,"""ADM Corpus Christi Grain Eleva…","""00XE""","""Anchorage""",17.936081,-166.53444,0.0,101.0,"""2018-01-01 00:35:19""","""2018-01-01 00:35:19""","""2018-01-01 04:23:54""","""2018-01-01 09:15:57""",0.083333,0.0,0.0,0.0,0.083333,2018.0,"""2018-01-01""","""2018-01-01""","""Albany Port District, NY"""
"""25%""",,,28.96133,-118.2095,,,,28.936819,-118.217936,9294977.0,177.0,"""2019-09-24 13:52:02""","""2019-09-24 21:12:08""","""2019-09-26 17:14:26""","""2019-09-26 20:53:34""",16.45,0.0,2.583333,2.083333,25.516667,2019.0,"""2019-09-01""","""2019-09-24""",
"""50%""",,,30.69123,-90.085256,,,,30.723889,-90.12417,9401491.0,190.0,"""2021-09-06 09:40:43""","""2021-09-07 02:34:37""","""2021-09-09 04:35:42""","""2021-09-09 11:20:56""",30.85,0.0,3.483333,2.75,43.466667,2021.0,"""2021-09-01""","""2021-09-07""",
"""75%""",,,37.82152,-80.117801,,,,37.797222,-80.115556,9615042.0,231.0,"""2023-05-08 10:41:45""","""2023-05-08 22:48:08""","""2023-05-11 09:53:50""","""2023-05-11 13:02:31""",56.983333,0.0,5.3,3.5,80.233333,2023.0,"""2023-05-01""","""2023-05-08""",
"""max""","""9993808_Honolulu, O'ahu, HI_20…","""Wilmington, NC""",61.23778,-66.096678,"""YUSEN TERMINALS BERTHS 212-221""","""1JHK""","""Tie Off""",61.24306,-66.086926,980002500.0,667.0,"""2024-12-31 18:55:48""","""2024-12-31 22:30:59""","""2024-12-31 23:37:00""","""2024-12-31 23:37:00""",1398.416667,241.483333,2152.533333,3653.166667,4453.8,2024.0,"""2024-12-01""","""2024-12-31""","""Wilmington, NC"""


#### Volume Data from Hanouf

In [82]:
#load import and export volume data
def load_port_volumes(path, suffix):
    """
    Loads a port volume CSV and sums it to one row per date and port.
    Args:
        path (str): Path to the CSV file.
        suffix (str): Suffix for the output value/weight columns
            ('im' for imports, 'ex' for exports).
    Returns:
        pl.DataFrame: columns date, port_name, val_container_<suffix>,
            wt_container_<suffix>, val_total_<suffix>, wt_total_<suffix>
    """
    #only keeping total volumes for now
    measure_cols = ['CNT_VAL_MO', 'CNT_WGT_MO', 'VES_VAL_MO', 'VES_WGT_MO']
    return (
        #read every column as a string (infer_schema_length=0), then type explicitly
        pl.scan_csv(path, infer_schema_length=0)
        #keep only relevant columns
        .select(*measure_cols, 'date', 'matched_PORT_NAME')
        #set data types
        .cast({col: pl.Float64 for col in measure_cols})
        #parse the date string into a date
        .with_columns(pl.col('date').str.to_date())
        #sum over port and date
        .group_by('date', 'matched_PORT_NAME').sum()
        #housekeeping: drop rows with no matched port, use readable names
        .drop_nulls('matched_PORT_NAME')
        .rename({
            'matched_PORT_NAME': 'port_name',
            'CNT_VAL_MO': f'val_container_{suffix}',
            'CNT_WGT_MO': f'wt_container_{suffix}',
            'VES_VAL_MO': f'val_total_{suffix}',
            'VES_WGT_MO': f'wt_total_{suffix}'
        })
        .collect()
    )

#load import data
importvol_df = load_port_volumes('../port data/volumes/portimports.csv', 'im')

#load export data
exportvol_df = load_port_volumes('../port data/volumes/portexports.csv', 'ex')

In [83]:
#combine import and export volumes: one row per port and month
#port group label for NWSPA and San Pedro Ports; other ports keep their name
volumes_port_group = (
    pl.when(pl.col('port_name').is_in(['Seattle, WA', 'Tacoma, WA']))
    .then(pl.lit('NW Seaport Alliance'))
    .when(pl.col('port_name').is_in(['Port of Los Angeles, CA',
                                     'Port of Long Beach, CA']))
    .then(pl.lit('San Pedro Ports'))
    .otherwise(pl.col('port_name'))
)
volumes_df = (
    importvol_df
    #inner join: only port/date pairs present in both imports and exports survive
    .join(exportvol_df, on=['date', 'port_name'], how='inner', suffix='_export')
    #imports + exports, for values and weights
    .with_columns(
        val_container_total=pl.col('val_container_im') + pl.col('val_container_ex'),
        wt_container_total=pl.col('wt_container_im') + pl.col('wt_container_ex'),
        val_total=pl.col('val_total_im') + pl.col('val_total_ex'),
        wt_total=pl.col('wt_total_im') + pl.col('wt_total_ex'),
    )
    #bulk (non-container) = total minus containerized
    .with_columns(
        #combined imports and exports
        val_bulk=pl.col('val_total') - pl.col('val_container_total'),
        wt_bulk_total=pl.col('wt_total') - pl.col('wt_container_total'),
        #exports only
        val_bulk_ex=pl.col('val_total_ex') - pl.col('val_container_ex'),
        wt_bulk_ex=pl.col('wt_total_ex') - pl.col('wt_container_ex'),
        #imports only
        val_bulk_im=pl.col('val_total_im') - pl.col('val_container_im'),
        wt_bulk_im=pl.col('wt_total_im') - pl.col('wt_container_im'),
    )
    .with_columns(port_group=volumes_port_group)
    #rename date to month
    .rename({'date': 'month'})
)
#inspect
display(volumes_df.head())
volumes_df.describe()

month,port_name,val_container_im,wt_container_im,val_total_im,wt_total_im,val_container_ex,wt_container_ex,val_total_ex,wt_total_ex,val_container_total,wt_container_total,val_total,wt_total,val_bulk,wt_bulk_total,val_bulk_ex,wt_bulk_ex,val_bulk_im,wt_bulk_im,port_group
date,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
2022-03-01,"""Jacksonville, FL""",3048300000.0,1502700000.0,9662100000.0,4120900000.0,1690400000.0,372298700.0,2792100000.0,693370158.0,4738700000.0,1875000000.0,12454000000.0,4814200000.0,7715500000.0,2939200000.0,1101700000.0,321071458.0,6613800000.0,2618100000.0,"""Jacksonville, FL"""
2020-01-01,"""Corpus Christi, TX""",50000.0,6755.0,3120800000.0,8201500000.0,80103156.0,379686113.0,20740000000.0,46465000000.0,80153156.0,379692868.0,23861000000.0,54666000000.0,23781000000.0,54287000000.0,20660000000.0,46085000000.0,3120800000.0,8201500000.0,"""Corpus Christi, TX"""
2018-10-01,"""Milwaukee, WI""",440820.0,177230.0,174429622.0,472935569.0,140735.0,10456.0,60977654.0,151031047.0,581555.0,187686.0,235407276.0,623966616.0,234825721.0,623778930.0,60836919.0,151020591.0,173988802.0,472758339.0,"""Milwaukee, WI"""
2021-09-01,"""New Orleans, LA""",2391900000.0,1046300000.0,8094200000.0,8462100000.0,2484500000.0,1377500000.0,8032400000.0,17518000000.0,4876400000.0,2423800000.0,16127000000.0,25980000000.0,11250000000.0,23556000000.0,5547900000.0,16140000000.0,5702300000.0,7415800000.0,"""New Orleans, LA"""
2021-10-01,"""Port of Oakland, CA""",10663000000.0,2610400000.0,11158000000.0,2661600000.0,7995100000.0,3298100000.0,8636700000.0,3893500000.0,18658000000.0,5908400000.0,19795000000.0,6555000000.0,1137000000.0,646607664.0,641569957.0,595385521.0,495422826.0,51222143.0,"""Port of Oakland, CA"""


statistic,month,port_name,val_container_im,wt_container_im,val_total_im,wt_total_im,val_container_ex,wt_container_ex,val_total_ex,wt_total_ex,val_container_total,wt_container_total,val_total,wt_total,val_bulk,wt_bulk_total,val_bulk_ex,wt_bulk_ex,val_bulk_im,wt_bulk_im,port_group
str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
"""count""","""5461""","""5461""",5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,5461.0,"""5461"""
"""null_count""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""","""2021-05-16 02:51:39.652000""",,6143300000.0,1220300000.0,8518200000.0,3851700000.0,1720200000.0,712890000.0,4267600000.0,5427000000.0,7863500000.0,1933200000.0,12786000000.0,9278800000.0,4922300000.0,7345600000.0,2547400000.0,4714200000.0,2374900000.0,2631400000.0,
"""std""",,,17949000000.0,3255700000.0,20115000000.0,5930000000.0,3677200000.0,1533800000.0,8149900000.0,11666000000.0,21086000000.0,4630400000.0,25209000000.0,15675000000.0,8304300000.0,13843000000.0,6243800000.0,11154000000.0,3467300000.0,3664500000.0,
"""min""","""2018-01-01""","""Albany Port District, NY""",0.0,0.0,4140.0,5.0,0.0,0.0,11700.0,624.0,0.0,0.0,23436.0,1709.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Albany Port District, NY"""
"""25%""","""2019-09-01""",,6413388.0,1483942.0,330962328.0,515599522.0,1267824.0,329875.0,131661624.0,184547693.0,23116137.0,11204480.0,725399489.0,1312400000.0,441740070.0,1062800000.0,68818281.0,100923005.0,187801251.0,429459946.0,
"""50%""","""2021-05-01""",,165980679.0,54387745.0,1521200000.0,1828100000.0,69431127.0,46981721.0,1176200000.0,1117600000.0,309513723.0,150841770.0,3637400000.0,3227800000.0,1977800000.0,2509700000.0,648487957.0,727643492.0,968760829.0,1378600000.0,
"""75%""","""2023-02-01""",,2431000000.0,762321801.0,6418200000.0,4073400000.0,1424000000.0,406821671.0,4583800000.0,5290800000.0,4425400000.0,1212100000.0,11722000000.0,9559200000.0,5824300000.0,7021200000.0,2015800000.0,3513900000.0,3040900000.0,3024800000.0,
"""max""","""2024-10-01""","""Wilmington, NC""",151660000000.0,25113000000.0,161910000000.0,34814000000.0,19322000000.0,8606700000.0,73539000000.0,89714000000.0,165830000000.0,31019000000.0,179280000000.0,111600000000.0,74756000000.0,97339000000.0,56391000000.0,82669000000.0,22105000000.0,28697000000.0,"""Wilmington, NC"""


In [85]:
#construct columns dictionary for volumes_df (column name -> display label)
volumes_col_descriptions = {
    'val_container_im': 'Value of Containerized Imports',
    'val_container_ex': 'Value of Containerized Exports',
    'val_container_total': 'Value of Total Containerized Cargo',
    'val_bulk': 'Value of Bulk Cargo',
    'val_bulk_ex': 'Value of Bulk Exports',
    'val_bulk_im': 'Value of Bulk Imports',
    'val_total': 'Value of Total Cargo',
    'val_total_ex': 'Value of Total Exports',
    'val_total_im': 'Value of Total Imports',
    'wt_container_im': 'Weight of Containerized Imports',
    'wt_container_ex': 'Weight of Containerized Exports',
    'wt_container_total': 'Weight of Total Containerized Cargo',
    'wt_bulk': 'Weight of Bulk Cargo',
    #volumes_df names this column wt_bulk_total; 'wt_bulk' is kept for existing lookups
    'wt_bulk_total': 'Weight of Bulk Cargo',
    'wt_bulk_ex': 'Weight of Bulk Exports',
    'wt_bulk_im': 'Weight of Bulk Imports',
    'wt_total': 'Weight of Total Cargo',
    'wt_total_ex': 'Weight of Total Exports',
    'wt_total_im': 'Weight of Total Imports',
    'date': 'Date',
    #volumes_df renames 'date' to 'month'
    'month': 'Month',
    'port_name': 'Port Name'
}

In [86]:
#export value per port over time: all ports in gray, Seattle in blue
exports_by_port = exportvol_df.sort('port_name', 'date')
fig = px.line(
    exports_by_port,
    x='date',
    y='val_total_ex',
    color='port_name',
    title='Seattle Export Value In Context',
)
fig.update_traces(line_color='lightgray')
fig.update_traces(selector={'name': 'Seattle, WA'},
                  patch={'line': {'color': 'blue'}})
fig.show()

In [89]:
#import value per port over time: all ports in gray, Seattle in blue
volumes_by_port = volumes_df.sort('port_name', 'month')
fig = px.line(
    volumes_by_port,
    x='month',
    y='val_total_im',
    color='port_name',
    title='Seattle Import Value In Context',
)
fig.update_traces(line_color='lightgray')
fig.update_traces(selector={'name': 'Seattle, WA'},
                  patch={'line': {'color': 'blue'}})
fig.show()

In [90]:
#containerized import value per port over time: all ports in gray, Long Beach in blue
fig = px.line(volumes_df.sort('port_name','month'), x='month', y='val_container_im', 
        title='Containerized Value (imports)', color='port_name')
fig.update_traces(line_color='lightgray')
fig.update_traces(patch={'line': {'color': 'blue'}}, 
                          selector=dict(name='Port of Long Beach, CA'))
fig.show()

In [None]:
#total import weight per port over time: all ports in gray, Los Angeles in blue
#volumes_df has no 'date' column (it was renamed to 'month'), so sort and plot on 'month'
fig = px.line(volumes_df.sort('port_name','month'), x='month', y='wt_total_im', 
        title='Total Weight (imports)', color='port_name')
fig.update_traces(line_color='lightgray')
fig.update_traces(patch={'line': {'color': 'blue'}}, 
                          selector=dict(name='Port of Los Angeles, CA'))
fig.show()

## Visualization Functions

For the dashboard and PNREC presentation, we define a handful of visualization types (e.g. scatter map, line plot, bar chart, etc) and code to allow both default visualizations as well as custom user-generated visualizations. 

In [134]:
#define zoom level function for plotly express scatter_mapbox
def mapbox_zoom_finder(lons, lats, lon_pad=0, lat_pad=0):
    """
    Calculates a zoom level for a Plotly Mapbox plot from the extent of the points.
    Args:
        lons (list): List of longitudes.
        lats (list): List of latitudes.
        lon_pad (float, optional): Padding to add to the longitude range. Defaults to 0.
        lat_pad (float, optional): Padding to add to the latitude range. Defaults to 0.
    Returns:
        zoom (float): 7 - log2(largest padded range). Falls back to 10 when the
            inputs are empty, differ in length, or span no distance (e.g. a
            single point), where the formula would give an infinite zoom.
    """
    default_zoom = 10
    # Check if the lengths of lons and lats are equal and not empty
    if len(lons) != len(lats) or len(lons) == 0:
        return default_zoom
    # Calculate the longitude and latitude ranges
    lon_range = max(lons) - min(lons)
    lat_range = max(lats) - min(lats)
    # The larger padded range decides how far out the map must zoom
    extent = max(lon_range + lon_pad, lat_range + lat_pad)
    # log2 is undefined for a zero (or negative / NaN) extent
    if not extent > 0:
        return default_zoom
    # Calculate the zoom level based on the extent
    zoom = 7 - np.log2(extent)
    return zoom

def plot_mapbox(df, cat_group,  lat_col, lon_col, 
                size_col, size_col_alias, color_col, color_col_alias, 
                title, filter_col=None, filter=None, time_col='date', 
                time_range=[earliest_date, latest_date], 
                zoom=None, center=None, width=800, height=600, 
                size_max=30, range_color=None, hover_name=None, hover_data=None, 
                mapbox_style='carto-positron', labels=None, 
                color_continuous_scale=None, color_outlier_z=None):
    """
    Plots a Mapbox scatter plot using Plotly, one marker per category.
    Args:
        df (pl.DataFrame): Input data.
        cat_group (str): Column to group by (one marker per value).
        lat_col, lon_col (str): Coordinate columns; the first value per group is used.
        size_col, size_col_alias (str): Column whose group mean sets marker size,
            and the name given to that mean.
        color_col, color_col_alias (str): Column whose group mean sets marker color,
            and the name given to that mean.
        title (str): Figure title.
        filter_col, filter: If filter_col is given, keep rows whose filter_col
            value is in filter.
        time_col (str): Column filtered against time_range.
        time_range (sequence): [start, end] bounds passed to is_between.
        zoom (float, optional): Map zoom; computed from the marker extent if None.
        center (dict, optional): {'lat': ..., 'lon': ...}; defaults to the midpoint
            of the marker bounding box.
        color_outlier_z (float, optional): If given, the color range is set to
            mean +/- z standard deviations of the color values, so outliers
            saturate the scale instead of stretching it. No rows are removed.
        Remaining arguments are passed through to px.scatter_mapbox.
    Returns:
        The Plotly figure.
    """
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #restrict to the time range, then aggregate to one row per category
    df = (
        df
        .filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
        .group_by(cat_group)
        .agg(
            #location of the category
            pl.col(lat_col).first().alias(lat_col),
            pl.col(lon_col).first().alias(lon_col),
            #stats shown as marker size and color
            pl.col(size_col).mean().alias(size_col_alias),
            pl.col(color_col).mean().alias(color_col_alias),
        )
    )
    #Set default color scale if not provided
    if not color_continuous_scale:
        color_continuous_scale = px.colors.sequential.Viridis

    # Set the zoom level automatically if not provided
    if zoom is None:
        zoom = mapbox_zoom_finder(df[lon_col], df[lat_col])

    #Set the center of the map to the bounding-box midpoint if not provided
    #(skipped for an empty frame, where min/max are None)
    if center is None and df.height > 0:
        center = {
            'lat': (df[lat_col].max() + df[lat_col].min()) / 2,
            'lon': (df[lon_col].max() + df[lon_col].min()) / 2
        }

    #clip the color range around the mean if specified
    if color_outlier_z:
        color_col_mean = df[color_col_alias].mean()
        color_col_std = df[color_col_alias].std()
        #mean/std are None when there are too few rows; keep range_color as passed
        if color_col_mean is not None and color_col_std is not None:
            color_spread = color_col_std * color_outlier_z
            range_color = [color_col_mean - color_spread,
                           color_col_mean + color_spread]

    # Create a scatter mapbox figure
    fig = px.scatter_mapbox(
        #data
        df, lat=lat_col, lon=lon_col,
        #categories
        size=size_col_alias, color=color_col_alias,
        #hover info
        hover_name=hover_name, hover_data=hover_data,
        #display settings
        range_color=range_color, size_max=size_max,
        color_continuous_scale=color_continuous_scale, mapbox_style=mapbox_style,
        width=width, height=height,
        #title and labels
        title=title, labels=labels
    )
    # Apply the zoom level and, when known, the map center
    layout_updates = {'mapbox_zoom': zoom}
    if center is not None:
        layout_updates['mapbox_center'] = center
    fig.update_layout(**layout_updates)

    #NOTE Add annotation if specified

    return fig

def plot_line(df, cat_group, time_col, y_col, y_col_alias, title,
              filter_col=None, filter=None, cat_limit=None, cat_limit_col=None,
              stat='mean', time_conversion=None,
              time_range=[earliest_date, latest_date], 
              volume_df=None, volume_col=None,
              width=800, height=600, hover_name=None, 
              hover_data=None, labels=None, color_continuous_scale=None, 
              highlight=None, highlight_color=None):
    """
    Plots a line chart using Plotly Express, one line per category.
    Args:
        df (pl.DataFrame): Input data.
        cat_group (str): Column to group by (one line per value).
        time_col (str): Column for the x axis; also filtered against time_range.
        y_col (str): Column aggregated per category and time step.
        y_col_alias (str): Name of the plotted column.
        title (str): Figure title.
        filter_col, filter: If filter_col is given, keep rows whose filter_col
            value is in filter.
        cat_limit (int, optional): Keep only the top n categories.
        cat_limit_col (str, optional): Column summed per category to rank them.
            Ranking happens after aggregation, so this must be a column of the
            aggregated frame (y_col_alias or volume_col), not a raw column of df.
        stat (str): 'mean' or 'sum' of y_col.
        time_conversion (float, optional): Factor the plotted column is multiplied by.
        time_range (sequence): [start, end] bounds passed to is_between.
        volume_df, volume_col: If volume_df is given, volume_col is summed per
            category and time step and the plotted value becomes
            volume / aggregated y_col.
        color_continuous_scale: Accepted for signature compatibility; not used.
        highlight (str, optional): Category drawn in highlight_color while all
            others are drawn in light gray. Always kept when cat_limit is set.
    Returns:
        The Plotly figure.
    Raises:
        ValueError: if stat is neither 'sum' nor 'mean'.
    """
    #initialize df
    df = (
        df
        #filter by time range
        .filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
    )
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #aggregate to one row per category and time step
    df = (
        df
        .group_by(cat_group, time_col)
        .agg(
            #compute y col mean
            pl.col(y_col).mean().alias(y_col+'_mean'),
            #compute y col sum
            pl.col(y_col).sum().alias(y_col+'_sum'),
        )
        .sort(time_col)
    )
    #rename y col to match stat choice (the other stat column stays available)
    if stat == 'sum':
        df = df.rename({y_col+'_sum': y_col_alias})
    elif stat == 'mean':
        df = df.rename({y_col+'_mean': y_col_alias})
    else:
        raise ValueError(f"Invalid stat: {stat}. Must be 'sum' or 'mean'.")
    #merge in volume data if specified
    if volume_df is not None:
        df = (
            df
            .join(volume_df.group_by(cat_group, time_col)
                  .agg(pl.col(volume_col).sum().alias(volume_col)), 
                  on=[time_col, cat_group], how='inner')
            #plotted value becomes volume per unit of the aggregated y column
            .with_columns(
                (pl.col(volume_col) / pl.col(y_col_alias))
                .alias(y_col_alias)
            )
            .sort(time_col)
        )
    #limit categories if specified
    if cat_limit:
        #get the top n categories
        top_cats = (
            df.group_by(cat_group)
            .agg(pl.col(cat_limit_col).sum())
            .sort(pl.col(cat_limit_col), descending=True)
            .limit(cat_limit)
            .to_series()
        )
        #keep the top n categories plus the highlight category, if one was given
        keep_cats = top_cats.to_list()
        if highlight is not None:
            keep_cats.append(highlight)
        df = (
            df.filter(pl.col(cat_group).is_in(keep_cats))
            .sort(time_col)
        )
    #rescale the plotted column if specified
    if time_conversion:
        df = df.with_columns(
            pl.col(y_col_alias)*time_conversion
        )

    # Create a line figure
    fig = px.line(
        df,
        x=time_col,
        y=y_col_alias,
        color=cat_group,
        title=title,
        labels=labels,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # Set the width and height of the figure
    fig.update_layout(width=width, height=height)

    #highlight given line if specified: gray out everything, then recolor it
    if highlight:
        fig.update_traces(line_color='lightgray')
        fig.update_traces(patch={'line': {'color': highlight_color}}, 
                          selector=dict(name=highlight))

    return fig

#plot_line aggregates y_col by mean or sum; this takes the count (e.g., number of calls)
def plot_line_count(df, cat_group, time_col, y_col, y_col_alias, title,
              filter_col=None, filter=None, cat_limit=None, cat_limit_col=None, 
              time_range=[earliest_date, latest_date], 
              width=800, height=600, hover_name=None, 
              hover_data=None, labels=None, color_continuous_scale=None, 
              highlight=None, highlight_color=None):
    """
    Plots a line chart of row counts using Plotly Express, one line per category.
    Args:
        df (pl.DataFrame): Input data.
        cat_group (str): Column to group by (one line per value).
        time_col (str): Column for the x axis; also filtered against time_range.
        y_col (str): Column whose non-null values are counted per category
            and time step.
        y_col_alias (str): Name of the plotted count column.
        title (str): Figure title.
        filter_col, filter: If filter_col is given, keep rows whose filter_col
            value is in filter.
        cat_limit (int, optional): Keep only the top n categories.
        cat_limit_col (str, optional): Raw column of df counted per category to
            rank them (ranking happens before aggregation here).
        time_range (sequence): [start, end] bounds passed to is_between.
        color_continuous_scale: Accepted for signature compatibility; not used.
        highlight (str, optional): Category drawn in highlight_color while all
            others are drawn in light gray. Always kept when cat_limit is set.
    Returns:
        The Plotly figure.
    """
    #initialize df
    df = (
        df
        #filter by time range
        .filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
    )
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #limit categories if specified
    if cat_limit:
        #get the top n categories
        top_cats = (
            df.group_by(cat_group)
            .agg(pl.col(cat_limit_col).count())
            .sort(pl.col(cat_limit_col), descending=True)
            .limit(cat_limit)
            .to_series()
        )
        #keep the top n categories plus the highlight category, if one was given
        keep_cats = top_cats.to_list()
        if highlight is not None:
            keep_cats.append(highlight)
        df = df.filter(pl.col(cat_group).is_in(keep_cats))
    #aggregate to one row per category and time step
    df = (
        df
        .group_by(cat_group, time_col)
        .agg(
            #count y col values
            pl.col(y_col).count().alias(y_col_alias)
        )
        .sort(time_col)
    )

    # Create a line figure
    fig = px.line(
        df,
        x=time_col,
        y=y_col_alias,
        color=cat_group,
        title=title,
        labels=labels,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # Set the width and height of the figure
    fig.update_layout(width=width, height=height)

    #highlight given line if specified: gray out everything, then recolor it
    if highlight:
        fig.update_traces(line_color='lightgray')
        fig.update_traces(patch={'line': {'color': highlight_color}}, 
                          selector=dict(name=highlight))

    return fig


def bar_ranking(df, cat_group, stat_col, stat_alias, title, limit=20, 
                filter_col=None, filter=None, 
                time_col='month', time_range=[earliest_date, latest_date],
                labels=None, width=800, height=600, 
                highlight=None):
    '''
    Plots a horizontal bar chart ranking categories by the mean of a column.
    Args:
        df (pl.DataFrame): Input data.
        cat_group (str): Column to group by (one bar per value).
        stat_col (str): Column averaged per category.
        stat_alias (str): Name of the averaged column (x axis).
        title (str): Figure title.
        limit (int): Number of top categories (by mean of stat_col) to show.
        filter_col, filter: If filter_col is given, keep rows whose filter_col
            value is in filter.
        time_col (str): Column filtered against time_range.
        time_range (sequence): [start, end] bounds passed to is_between.
        labels, width, height: Passed through to px.bar.
        highlight (str, optional): Category that keeps the default bar color
            while all others are drawn in light grey. Always shown, even when
            it is outside the top `limit`.
    Returns:
        The Plotly figure.
    '''
    #initialize df
    df = (
        df
        #filter by time range
        .filter(pl.col(time_col).is_between(time_range[0], time_range[1]))
    )
    #filter if specified
    if filter_col:
        df = df.filter(pl.col(filter_col).is_in(filter))
    #get top n categories
    top_cats = (
        df
        .group_by(cat_group)
        .agg(pl.col(stat_col).mean().alias(stat_alias))
        .sort(pl.col(stat_alias), descending=True)
        .limit(limit)
        .to_series()
    )
    #keep the top n categories plus the highlight category, if one was given
    keep_cats = top_cats.to_list()
    if highlight is not None:
        keep_cats.append(highlight)
    #create df (sorted ascending on the mean)
    df = (
        df
        .filter(pl.col(cat_group).is_in(keep_cats))
        .group_by(cat_group)
        .agg(pl.col(stat_col).mean().alias(stat_alias))
        .sort(pl.col(stat_alias))
    )
    # Create a bar figure
    fig = px.bar(
        df,
        x=stat_alias, y=cat_group,
        title=title, labels=labels,
        width=width, height=height,
    )

    #set highlight if specified: per-bar colors, default color only for the highlight
    if highlight:
        fig["data"][0]["marker"]["color"] = (
            [fig["data"][0]["marker"]['color'] if c == highlight 
             else "lightgrey" for c in fig["data"][0]["y"]]
        )
    
    return fig


## PNREC Visualizations

### National Context

To put the Northwest Seaport Alliance in context, we present various KPIs and associated trends.


In [93]:
#monthly number of calls for the ten port groups with the most calls, NWSPA in blue
plot_line_count(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='hrs_at_berth',
    y_col_alias='Number of Calls per Month',
    cat_limit=10,
    cat_limit_col='hrs_at_berth',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    title='Total Calls at Principal Ports',
    labels={'port_group':'Port', 'month':''},
    width=None,
    height=None,
)

In [97]:
#monthly mean vessel size for ten port groups (ranked on the plotted column), NWSPA in blue
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='vessel_size',
    y_col_alias='Mean Vessel Length (m)',
    cat_limit=10,
    cat_limit_col='Mean Vessel Length (m)',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    title='Vessel Size at Top Ports',
    labels={'port_group':'Port', 'month':''},
    width=None,
    height=None,
)

In [96]:
#monthly mean vessel size for two port groups only: NWSPA and Virginia
diverging_groups = ['NW Seaport Alliance', 'Virginia, VA, Port of']
plot_line(
    df=calls_df.filter(pl.col('port_group').is_in(diverging_groups)),
    cat_group='port_group',
    time_col='month',
    y_col='vessel_size',
    y_col_alias='Mean Vessel Length (m)',
    cat_limit=10,
    cat_limit_col='Mean Vessel Length (m)',
    title='Vessel Size - Diverging Trends',
    labels={'port_group':'Port', 'month':''},
    width=None,
    height=None,
)

#### Efficiency and Performance

In [141]:
#monthly mean hours at berth for ten port groups (ranked on the plotted column), NWSPA in blue
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='hrs_at_berth',
    y_col_alias='Average Hours at Berth',
    cat_limit=10,
    cat_limit_col='Average Hours at Berth',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    title='Hours at Berth per call at Principal Ports',
    labels={'port_group':'Port', 'month':''},
    width=None,
    height=None,
)

In [142]:
#monthly import value divided by total hours at berth, top five port groups by import value
#NOTE(review): the title says "Trade Value" but only val_total_im (imports) is used - confirm
plot_line(
    df=calls_df,
    cat_group='port_group',
    time_col='month',
    y_col='hrs_at_berth',
    y_col_alias='Efficiency ($/hr at berth)',
    stat='sum',
    volume_df=volumes_df,
    volume_col='val_total_im',
    cat_limit=5,
    cat_limit_col='val_total_im',
    highlight='NW Seaport Alliance',
    highlight_color='blue',
    title='Dollars of Trade Value per Vessel-Hour at Berth at Principal Ports',
    labels={'port_group':'Port', 'month':''},
    width=None,
    height=None,
)

In [140]:
#monthly import value summed per port group: all groups in gray, NWSPA in blue
import_value_by_group = (
    volumes_df
    .group_by('port_group', 'month')
    .agg(pl.col('val_total_im').sum())
    .sort('port_group', 'month')
)
fig = px.line(
    import_value_by_group,
    x='month',
    y='val_total_im',
    color='port_group',
    title='NWSPA Import Value In Context',
)
fig.update_traces(line_color='lightgray')
fig.update_traces(selector={'name': 'NW Seaport Alliance'},
                  patch={'line': {'color': 'blue'}})
fig.show()

In [None]:
#monthly total weight divided by total hours at berth, top ten port groups by weight
#plot_line computes volume / hours, so the plotted value is weight per hour at berth;
#the previous 'ms/kg' label and the x3600000 factor did not describe that number
#NOTE(review): assumes wt_total is in kg, as the original label did - confirm
plot_line(
        df=calls_df, cat_group='port_group', time_col='month',
        y_col='hrs_at_berth', y_col_alias='Efficiency (kg/hr at berth)',
        stat='sum',
        volume_df=volumes_df, volume_col='wt_total',
        title='Kg Traded per Vessel-Hour at Berth at Principal Ports', width=None, height=None,
        labels={'port_group':'Port', 'month':''},
        cat_limit=10, cat_limit_col='wt_total',
        highlight='NW Seaport Alliance', highlight_color='blue'
    )

In [None]:
#monthly mean hours at berth, NWSPA vs San Pedro Ports only
#plot_line ranks categories after aggregating, when the raw 'hrs_at_berth' column
#no longer exists, so cat_limit_col must name the plotted (aliased) column
plot_line(
        df=calls_df, cat_group='port_group', time_col='month',
        filter_col='port_group', filter=['NW Seaport Alliance', 'San Pedro Ports'],
        y_col='hrs_at_berth', y_col_alias='Average Hours at Berth',
        title='Hours at Berth per call - San Pedro vs NWSPA', width=None, height=None,
        labels={'port_group':'Port', 'month':''},
        cat_limit=10, cat_limit_col='Average Hours at Berth',
        #highlight='NW Seaport Alliance', highlight_color='blue'
    )

In [None]:
#define the efficiency metric per call: vessel size divided by hours at berth
efficiency_score = pl.col('vessel_size') / pl.col('hrs_at_berth')
df = calls_df.with_columns(efficiency_score.alias('Efficiency Score'))

#rank port groups on their mean efficiency score, NWSPA highlighted
bar_ranking(
    df,
    cat_group='port_group',
    stat_col='Efficiency Score',
    stat_alias='Vessel Size (m) per Hour at Berth',
    limit=30,
    highlight='NW Seaport Alliance',
    title='Port Efficiency: Vessel Size per Hour at Berth',
    labels={'port_group':'Port'},
    width=800,
    height=600,
)

better: total weight per hour at berth

In [None]:
#the 20 port groups with the most calls
calls_per_group = (
    calls_df
    .group_by('port_group')
    .agg(pl.col('call_id').count().alias('total_calls'))
)
top_groups_calls = (
    calls_per_group
    .sort(pl.col('total_calls'), descending=True)
    .limit(20)
)

#rank those groups on mean time in port waters, NWSPA highlighted
bar_ranking(
    df=calls_df,
    cat_group='port_group',
    stat_col='hrs_in_port_waters',
    stat_alias='Time in Port Waters (hrs)',
    filter_col='port_group',
    filter=top_groups_calls['port_group'],
    limit=50,
    highlight='NW Seaport Alliance',
    title='Average Time in Port Waters (Top 20 ports by total calls)',
    labels={'port_group':''},
    width=800,
    height=500,
)

### Dock-level Visualizations

In [None]:
#Tacoma docks: marker size = mean hours at berth, color = mean hours at anchor
plot_mapbox(
    df=calls_df,
    cat_group='dock_name',
    filter_col='port_name',
    filter=['Tacoma, WA'],
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='hrs_at_berth',
    size_col_alias='Mean Hours at Berth',
    color_col='hrs_at_anchor',
    color_col_alias='Mean Hours at Anchor',
    size_max=20,
    color_outlier_z=2,
    hover_name='dock_name',
    title='Hours at Berth vs. Hours at Anchor - Tacoma',
)

Schema({'dock_name': String, 'dock_lat': Float64, 'dock_lon': Float64, 'Mean Hours at Berth': Float64, 'Mean Hours at Anchor': Float64})



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [None]:
#Seattle docks: marker size = mean hours at berth, color = mean hours at anchor
plot_mapbox(
    df=calls_df,
    cat_group='dock_name',
    filter_col='port_name',
    filter=['Seattle, WA'],
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='hrs_at_berth',
    size_col_alias='Mean Hours at Berth',
    color_col='hrs_at_anchor',
    color_col_alias='Mean Hours at Anchor',
    size_max=20,
    color_outlier_z=2,
    hover_name='dock_name',
    title='Hours at Berth vs. Hours at Anchor - Seattle',
)

Note current issue with Tacoma Docks not showing up - will fix and re-run. 

In [None]:
#San Pedro Ports docks: marker size = mean hours at berth, color = mean hours at anchor
#zoom is set by hand here instead of being derived from the dock extent
plot_mapbox(
    df=calls_df,
    cat_group='dock_name',
    filter_col='port_group',
    filter=['San Pedro Ports'],
    lat_col='dock_lat',
    lon_col='dock_lon',
    size_col='hrs_at_berth',
    size_col_alias='Mean Hours at Berth',
    color_col='hrs_at_anchor',
    color_col_alias='Mean Hours at Anchor',
    size_max=20,
    color_outlier_z=2,
    zoom=11.5,
    hover_name='dock_name',
    title='Hours at Berth vs. Hours at Anchor - San Pedro Ports',
)

In [None]:
#NWSPA calls only, with the efficiency metric: vessel size divided by hours at berth
nwspa_calls = calls_df.filter(pl.col('port_group')=='NW Seaport Alliance')
df = nwspa_calls.with_columns(
    (pl.col('vessel_size')/pl.col('hrs_at_berth')).alias('Efficiency Score')
)

#rank the top 20 NWSPA docks on their mean efficiency score
bar_ranking(
    df,
    cat_group='dock_name',
    stat_col='Efficiency Score',
    stat_alias='Vessel Size (m) per Hour at Berth',
    limit=20,
    title='Top Performing Docks - NW Seaport Alliance',
    labels={'dock_name':''},
    width=800,
    height=500,
)