# Port Performance Dashboard App

This notebook develops the visualizations and Dash app for the [Port Performance Project](https://github.com/epistemetrica/Port-Performance-Project).

Basic draft of app:
- National-level visualizations are displayed by default
    - users can select the date range via sliders
- Users may select a given port
    - drop down selection to start
    - hopefully clicking on a port from the national map could serve the same UI function
- For a selected port:
    - default visualizations are presented
    - users have option to generate custom visualizations based on their selection of date range, metric (hrs at berth, vessel size, etc), stat (mean, median, etc), and other options to be developed. 

In [100]:
#prelims
import numpy as np
import pandas as pd
import geopandas as gpd
import polars as pl
import plotly.express as px
import datetime as dt
from dateutil.relativedelta import relativedelta
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

#display settings
#passing None removes the pandas limits, so every column and row of a displayed frame is shown
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
#let polars tables print up to 100 rows; the trailing ';' suppresses the cell's repr output
pl.Config(tbl_rows=100);

## Load Data

The data used in the app is a dataframe in which each row corresponds to a port call, including data such as port, dock, and vessel info, time of arrival, hours at berth and at anchor, time in port waters, etc. 

NOTE this data is currently processed in the geodata_prep notebook followed by the port_stats notebook; final versions of this project may distill that down to a single data preparation step. 

In [101]:
#load calls data
#NOTE(review): relative path - assumes 'calls.parquet' is in the notebook's working directory
calls_df = pl.read_parquet('calls.parquet')
#inspect data
#describe() is the cell's last expression, so its summary table is rendered as the cell output
calls_df.describe()

statistic,call_id,port_name,port_lat,port_lon,dock_name,dock_id,facility_type,dock_lat,dock_lon,imo,vessel_size,time_port_entry,time_arrival,time_departure,time_port_exit,hrs_at_berth,hrs_at_anchor,hrs_to_dock,hrs_in_port_after_dock,hrs_in_port_waters
str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,str,str,str,str,f64,f64,f64,f64,f64
"""count""","""146488""","""146488""",146488.0,146488.0,"""146488""","""146488""","""146133""",146488.0,146488.0,146488.0,146488.0,"""146488""","""146488""","""146488""","""146488""",146488.0,146488.0,146488.0,146488.0,146488.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""","""0""","""355""",0.0,0.0,0.0,0.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0
"""mean""",,,32.32609,-94.609283,,,,32.326194,-94.608303,10145000.0,207.268527,"""2021-06-07 05:30:12.034630""","""2021-06-07 17:03:22.014062""","""2021-06-09 21:58:21.931659""","""2021-06-10 03:12:20.055758""",47.831807,8.354447,11.544853,5.225082,69.694045
"""std""",,,6.918495,20.728085,,,,6.918757,20.728437,26607000.0,58.810358,,,,,62.675238,28.97474,36.133365,41.271337,95.772879
"""min""","""0_Corpus Christi, TX_2020-04-0…","""Albany Port District, NY""",17.938939,-166.549916,"""ADM Corpus Christi Grain Eleva…","""00XE""","""Anchorage""",17.936081,-166.53444,0.0,101.0,"""2018-01-01 00:35:19""","""2018-01-01 00:35:19""","""2018-01-01 04:23:54""","""2018-01-01 09:15:57""",0.083333,0.0,0.0,0.0,0.133333
"""25%""",,,28.629389,-118.2095,,,,28.645767,-118.21083,9295397.0,176.0,"""2019-08-27 09:40:39""","""2019-08-27 17:30:51""","""2019-08-29 21:19:05""","""2019-08-30 01:38:50""",16.55,0.0,2.666667,2.15,25.8
"""50%""",,,30.69123,-90.085256,,,,30.707675,-90.105844,9401128.0,190.0,"""2021-07-19 00:59:38""","""2021-07-19 14:22:59""","""2021-07-22 01:57:56""","""2021-07-22 07:16:28""",30.966667,0.0,3.516667,2.783333,43.766667
"""75%""",,,36.86642,-80.117801,,,,36.875896,-80.114322,9612882.0,230.0,"""2023-02-27 09:23:38""","""2023-02-27 17:39:00""","""2023-03-02 01:31:11""","""2023-03-02 06:10:50""",57.016667,0.0,5.366667,3.533333,80.7
"""max""","""9992268_Morehead City, NC_2024…","""Wilmington, NC""",61.23778,-66.096678,"""YUSEN TERMINALS BERTHS 212-221""","""1JHK""","""Open Water""",61.24306,-66.086926,980002500.0,667.0,"""2024-09-30 12:54:53""","""2024-09-30 16:32:15""","""2024-09-30 21:54:12""","""2024-09-30 22:57:53""",1365.033333,246.2,2152.533333,3560.95,4453.8


#### Init variables

In [102]:
#get date bounds
#earliest and latest values of time_arrival, reduced to calendar dates with .date();
#these serve as the default date range of the stats functions defined below
earliest_date = calls_df['time_arrival'].min().date()
latest_date = calls_df['time_arrival'].max().date()

## Define DataFrame-generating functions

From the main calls dataframe, we generate various statistics such as average visits per month, maximum vessel size per dock, etc. Based on user inputs or defaults, we use these resulting dataframes to generate visualizations. 

In [103]:
def port_stats(ports=None, time_group='month',
                        start_date=earliest_date, end_date=latest_date):
    '''
    Function to calculate the average vessel-feet and berth hours per time group for each port
    Args:
        ports (list or polars.Series, optional): port names to keep. None or an empty
            collection keeps every port. Defaults to None
        time_group (str): time group to average over. Options are 'month' or 'year'
        start_date (date): lower bound on time_arrival. Defaults to the earliest arrival in calls_df
        end_date (date): upper bound on time_arrival. Defaults to the latest arrival in calls_df
    Returns:
        df (polars.DataFrame): one row per port with columns port_name, port_lat, port_lon,
            vessel_ft_mean_{time_group} and hrs_at_berth_mean_{time_group}
    '''
    #NOTE need to combine all time with these time_group stats
    #set df
    df = calls_df
    #filter by ports if specified
    #(explicit None/len test because the truth value of a polars Series is ambiguous)
    if ports is not None and len(ports) > 0:
        df = df.filter(pl.col('port_name').is_in(ports))
    #create per-time-group stats
    df = (
        #start from the port-filtered frame (starting from calls_df here would discard the filter)
        df
        #filter by date
        #NOTE(review): a plain date bound is presumably compared as midnight, which would
        #exclude arrivals later on end_date itself - confirm
        .filter((pl.col('time_arrival') >= start_date) &
                (pl.col('time_arrival') <= end_date))
        #derive the grouping columns from docking time
        .with_columns(
            #year-month label, e.g. '202106'
            month = pl.col('time_arrival').dt.strftime('%Y%m'),
            #calendar year, so that time_group='year' is also available
            year = pl.col('time_arrival').dt.year(),
        )
        #group by port and time group
        .group_by(['port_name', 'port_lat', 'port_lon',
                time_group])
        .agg(
            #NOTE(review): 3.28 presumably converts vessel_size from metres to feet - confirm units
            vessel_ft_sum = pl.sum('vessel_size')*3.28,
            hrs_at_berth_sum = pl.sum('hrs_at_berth'),
        )
        #group by port
        .group_by(['port_name', 'port_lat', 'port_lon'])
        .agg(
            #average vessel-ft per time group
            pl.mean('vessel_ft_sum').alias(f'vessel_ft_mean_{time_group}'),
            #average hrs at berth per time group
            pl.mean('hrs_at_berth_sum').alias(f'hrs_at_berth_mean_{time_group}'),
        )
    )
    return df

In [143]:
def dock_stats(ports=None, time_group='month',
                        start_date=earliest_date, end_date=latest_date):
    '''
    Function to calculate the average vessel-feet and berth hours per time group for each dock
    Args:
        ports (list or polars.Series, optional): port names whose docks are kept.
            Defaults to None (docks of all ports)
        time_group (str): time group to average over. Options are 'month' or 'year'
        start_date (date): lower bound on time_arrival. Defaults to the earliest arrival in calls_df
        end_date (date): upper bound on time_arrival. Defaults to the latest arrival in calls_df
    Returns:
        df (polars.DataFrame): one row per dock with columns dock_name, dock_lat, dock_lon,
            vessel_ft_mean_{time_group} and hrs_at_berth_mean_{time_group}
    '''
    #NOTE need to combine all time with these time_group stats
    #set df
    df = calls_df
    #filter by ports if specified
    if ports is not None:
        df = df.filter(pl.col('port_name').is_in(ports))
    #create per-time-group stats
    df = (
        df
        #filter by date
        #NOTE(review): a plain date bound is presumably compared as midnight, which would
        #exclude arrivals later on end_date itself - confirm
        .filter((pl.col('time_arrival') >= start_date) &
                (pl.col('time_arrival') <= end_date))
        #derive the grouping columns from docking time
        .with_columns(
            #year-month label, e.g. '202106'
            month = pl.col('time_arrival').dt.strftime('%Y%m'),
            #calendar year, so that time_group='year' is also available
            year = pl.col('time_arrival').dt.year(),
        )
        #group by dock and time group
        .group_by(['dock_name', 'dock_lat', 'dock_lon',
                time_group])
        .agg(
            #NOTE(review): 3.28 presumably converts vessel_size from metres to feet - confirm units
            vessel_ft_sum = pl.sum('vessel_size')*3.28,
            hrs_at_berth_sum = pl.sum('hrs_at_berth'),
        )
        #group by dock
        .group_by(['dock_name', 'dock_lat', 'dock_lon'])
        .agg(
            #average vessel-ft per time group
            pl.mean('vessel_ft_sum').alias(f'vessel_ft_mean_{time_group}'),
            #average hrs at berth per time group
            pl.mean('hrs_at_berth_sum').alias(f'hrs_at_berth_mean_{time_group}'),
        )
    )
    return df

In [132]:
def port_time_stats(time_group='month', earliest_date=earliest_date,
                    latest_date=latest_date):
    '''
    Summarise port activity per port and per time period
    Args:
        time_group (str): period to aggregate by. Options are 'month' or 'year'
        earliest_date (date): lower bound on time_arrival. Defaults to the earliest arrival in calls_df
        latest_date (date): upper bound on time_arrival. Defaults to the latest arrival in calls_df
    Returns:
        df (polars.DataFrame): one row per port and period, sorted by port name then period
    '''
    arrival = pl.col('time_arrival')
    #columns identifying one port within one period
    group_keys = ['port_name', 'port_lat', 'port_lon', time_group]

    #keep only the calls arriving inside the requested window
    calls_in_window = calls_df.filter(
        (arrival >= earliest_date) & (arrival <= latest_date)
    )

    #tag every call with both candidate periods; time_group selects which one is used
    calls_with_period = calls_in_window.with_columns(
        month = arrival.dt.date().dt.month_start(),
        year = arrival.dt.year(),
    )

    #summary statistics for each port and period
    period_stats = calls_with_period.group_by(group_keys).agg(
        #distinct vessels
        vessels = pl.col('imo').n_unique(),
        #average vessel size
        vessel_size_mean = pl.col('vessel_size').mean(),
        #distinct vessel calls
        calls = pl.col('call_id').n_unique(),
        #hours at berth: total, median and mean
        hrs_occupied = pl.col('hrs_at_berth').sum(),
        hrs_at_berth_median = pl.col('hrs_at_berth').median(),
        hrs_at_berth_mean = pl.col('hrs_at_berth').mean(),
        #hours at anchor: median and mean
        hrs_at_anchor_median = pl.col('hrs_at_anchor').median(),
        hrs_at_anchor_mean = pl.col('hrs_at_anchor').mean(),
        #hours in port waters: total, mean and median
        hrs_in_port_waters_total = pl.col('hrs_in_port_waters').sum(),
        hrs_in_port_waters_mean = pl.col('hrs_in_port_waters').mean(),
        hrs_in_port_waters_median = pl.col('hrs_in_port_waters').median(),
    )

    #order by port name, then chronologically within each port
    return period_stats.sort(['port_name', time_group])

In [106]:
def port_dock_time_stats(ports=None):
    '''
    Function to calculate monthly stats for each dock of each port
    Args:
        ports (list or polars.Series, optional): port names to keep.
            Defaults to None (all ports)
    Returns:
        df (polars.DataFrame): one row per port, dock and month ('%Y%m' label) with vessel,
            call, berth, anchor and port-waters statistics
    '''
    #NOTE refactor to allow user defined groupings
    #set df
    df = calls_df
    #filter by ports if specified (previously the argument was accepted but never applied)
    if ports is not None:
        df = df.filter(pl.col('port_name').is_in(ports))
    df = (
        df
        #get month from docking time
        .with_columns(
            #extract month from docking time
            month = pl.col('time_arrival').dt.strftime('%Y%m')
        )
        #group by port dock and month
        .group_by(['port_name', 'port_lat', 'port_lon',
                'dock_id', 'dock_name', 'dock_lat', 'dock_lon',
                'month'])
        .agg(
            #count number of vessels
            vessels = pl.n_unique('imo'),
            #mean vessel size
            vessel_size_mean = pl.mean('vessel_size'),
            #count number of vessel calls
            calls = pl.n_unique('call_id'),
            #time at dock stats for each vessel in hours
            hrs_occupied = pl.sum('hrs_at_berth'),
            hrs_at_berth_median = pl.median('hrs_at_berth'),
            hrs_at_berth_mean = pl.mean('hrs_at_berth'),
            #time at anchor stats for each vessel visit in hours
            hrs_at_anchor_median = pl.median('hrs_at_anchor'),
            hrs_at_anchor_mean = pl.mean('hrs_at_anchor'),
            #time in port waters
            hrs_in_port_waters_total = pl.sum('hrs_in_port_waters'),
            hrs_in_port_waters_mean = pl.mean('hrs_in_port_waters'),
            hrs_in_port_waters_median = pl.median('hrs_in_port_waters')
        )
    )
    return df

## Visualization Functions

In [107]:
def mapbox_zoom_finder(lons, lats, lon_pad=0, lat_pad=0):
    """
    Calculates a zoom level for a Plotly Mapbox plot from the spread of the points.
    Args:
        lons (list): List of longitudes.
        lats (list): List of latitudes.
        lon_pad (float, optional): Padding to add to the longitude range. Defaults to 0.
        lat_pad (float, optional): Padding to add to the latitude range. Defaults to 0.
    Returns:
        zoom (float): the calculated zoom level, or the fallback of 10 when no zoom can
            be derived (empty or mismatched inputs, or a padded extent that is not positive)
    """
    # Fallback used whenever the inputs do not define a usable extent
    default_zoom = 10
    # Check if the lengths of lons and lats are equal and not empty
    if len(lons) != len(lats) or len(lons) == 0:
        return default_zoom
    # Calculate the longitude and latitude ranges
    lon_range = max(lons) - min(lons)
    lat_range = max(lats) - min(lats)
    # Larger of the two padded ranges drives the zoom
    extent = max(lon_range + lon_pad, lat_range + lat_pad)
    # A single point (or identical points) has zero extent: log2(0) is -inf, which would
    # give an infinite zoom. 'not extent > 0' also catches negative and NaN extents.
    if not extent > 0:
        return default_zoom
    # Each doubling of the extent lowers the zoom by one level
    return float(7 - np.log2(extent))

In [108]:
def plot_mapbox(df, lat_col, lon_col, size_col, color_col, title, zoom=None,
                width=800, height=600, size_max=30, hover_name=None, range_color=None,
                hover_data=None, mapbox_style='carto-positron', labels=None, 
                color_continuous_scale=None, color_outlier_z=None, center=None,
                annotation=None):
    """
    Plots a Mapbox scatter plot using Plotly and displays it.
    Args:
        df (DataFrame): DataFrame containing the data to plot (this notebook passes polars frames).
        lat_col (str): Column name for latitude.
        lon_col (str): Column name for longitude.
        size_col (str): Column name for size.
        color_col (str): Column name for color.
        title (str): Title of the plot.
        zoom (float, optional): Zoom level for the map. Calculated from the data when None.
        width (int, optional): Width of the plot. Defaults to 800.
        height (int, optional): Height of the plot. Defaults to 600.
        size_max (int, optional): Maximum size of the markers. Defaults to 30.
        hover_name (str, optional): Column name for hover text. Defaults to None.
        range_color (list, optional): Range for color scale. Defaults to None.
            Overridden when color_outlier_z is given.
        hover_data (list, optional): Additional data to show on hover. Defaults to None.
        mapbox_style (str, optional): Mapbox style. Defaults to 'carto-positron'.
        labels (dict, optional): Labels for the axes. Defaults to None.
        color_continuous_scale (list, optional): Color scale for the plot. Defaults to Viridis.
        color_outlier_z (float, optional): Z-score threshold; when given, the color range is
            clipped to mean +/- z standard deviations of color_col. Defaults to None.
        center (dict, optional): Center of the map as {'lat': ..., 'lon': ...}. Defaults to
            the midpoint of the lat/lon columns when None.
        annotation (dict, optional): Annotation for the plot. Currently unused. Defaults to None.
    Returns:
        None
    """
    #Set default color scale if not provided
    if not color_continuous_scale:
        color_continuous_scale = px.colors.sequential.Viridis

    # Set the zoom level automatically if not provided
    # ('is None' so that an explicit zoom of 0 is respected)
    if zoom is None:
        zoom = mapbox_zoom_finder(df[lon_col], df[lat_col])

    #Set the center of the map if not provided
    if not center:
        #midpoint of the bounding box is (max + min) / 2; (max - min) / 2 is only the half-range
        center = {
            'lat': ((df[lat_col].max() + df[lat_col].min()) / 2),
            'lon': ((df[lon_col].max() + df[lon_col].min()) / 2)
        }

    #clip the color range if an outlier threshold is specified
    if color_outlier_z:
        #get color_col upper and lower limits based on z score
        color_col_mean, color_col_std = df[color_col].mean(), df[color_col].std()
        color_col_upper = color_col_mean + (color_col_std * color_outlier_z)
        color_col_lower = color_col_mean - (color_col_std * color_outlier_z)
        #set range color
        range_color = [color_col_lower, color_col_upper]

    # Create a scatter mapbox figure
    fig = px.scatter_mapbox(
        #data
        df, lat=lat_col, lon=lon_col,
        #categories
        size=size_col, color=color_col,
        #hover info
        hover_name=hover_name, hover_data=hover_data,
        #display settings
        range_color=range_color, size_max=size_max,
        color_continuous_scale=color_continuous_scale, mapbox_style=mapbox_style,
        width=width, height=height,
        #title and labels
        title=title, labels=labels
    )
    # Set the zoom level and the map center (the center was previously computed but never applied)
    fig.update_layout(mapbox_zoom=zoom, mapbox_center=center)

    #NOTE Add annotation if specified

    # Show the figure
    fig.show()

In [75]:
def plot_line(df, x_col, y_col, title, width=800, height=600,
             color_col=None, hover_name=None, hover_data=None,
             labels=None, color_continuous_scale=None):
    """
    Plots a line chart using Plotly Express and displays it.
    Args:
        df (DataFrame): DataFrame containing the data to plot (this notebook passes polars frames).
        x_col (str): Column name for x-axis.
        y_col (str): Column name for y-axis.
        title (str): Title of the plot.
        width (int, optional): Width of the plot. Defaults to 800.
        height (int, optional): Height of the plot. Defaults to 600.
        color_col (str, optional): Column name for color. Defaults to None.
        hover_name (str, optional): Column name for hover text. Defaults to None.
        hover_data (list, optional): Additional data to show on hover. Defaults to None.
        labels (dict, optional): Labels for the axes. Defaults to None.
        color_continuous_scale (list, optional): List of colors for the lines. Line colors are
            discrete, so the list is used as the discrete color sequence. Defaults to None,
            which keeps Plotly's default colors.
    Returns:
        None
    """
    # Create a line figure
    fig = px.line(
        df,
        x=x_col,
        y=y_col,
        color=color_col,
        title=title,
        labels=labels,
        hover_name=hover_name,
        hover_data=hover_data,
        # Only override Plotly's default colors when the caller supplied a scale
        # (previously the argument was assigned a default and then never used)
        color_discrete_sequence=color_continuous_scale or None,
    )
    # Set the width and height of the figure
    fig.update_layout(width=width, height=height)

    # Show the figure
    fig.show()

## Static Visualizations

In [125]:
#national map: one marker per port, built from port_stats() with its default arguments
#marker size = mean vessel-feet per month, marker color = mean berth hours per month
plot_mapbox(
    df = port_stats(),
    lat_col = 'port_lat', lon_col = 'port_lon',
    size_col = 'vessel_ft_mean_month', color_col = 'hrs_at_berth_mean_month',
    title = 'Average Vessel Throughput at Principal Ports',
    hover_name = 'port_name',
    #show the two metrics on hover and hide the raw coordinates
    hover_data = {'vessel_ft_mean_month':True, 'hrs_at_berth_mean_month':True,
                  'port_lat':False, 'port_lon':False},
    labels={
        'vessel_ft_mean_month': 'Vessel-Feet per Month',
        'hrs_at_berth_mean_month': 'Dock Hours per Month',
        'port_name': 'Port Name'
    },
    width=800, height=500,
    #fixed zoom instead of the automatic zoom calculation
    zoom=2.2
)

In [None]:
#dock-level map for Seattle
#NOTE the draft loop over (size, color) metric pairs and the filters/group_vars arguments were
#never completed (and are not parameters of plot_mapbox), so a single default pair is plotted
#NOTE(review): ports are matched on the substring 'Seattle' - confirm this selects the intended port
seattle_ports = (
    calls_df
    .filter(pl.col('port_name').str.contains('Seattle', literal=True))
    .get_column('port_name')
    .unique()
    .to_list()
)

plot_mapbox(
    df = dock_stats(ports=seattle_ports),
    lat_col = 'dock_lat', lon_col = 'dock_lon',
    size_col = 'vessel_ft_mean_month', color_col = 'hrs_at_berth_mean_month',
    title = 'Average Vessel Throughput at Seattle Docks',
    hover_name = 'dock_name',
    #show the two metrics on hover and hide the raw coordinates
    hover_data = {'vessel_ft_mean_month':True, 'hrs_at_berth_mean_month':True,
                'dock_lat':False, 'dock_lon':False},
    labels={
        'vessel_ft_mean_month': 'Vessel-Feet per Month',
        'hrs_at_berth_mean_month': 'Dock Hours per Month',
        'dock_name': 'Dock Name'
    }
)

In [134]:
#limit to top 10 ports - NOTE incorporate into function
#compute the monthly stats once and reuse them for both the ranking and the plot
monthly_port_stats = port_time_stats()

#rank ports by their total number of calls and keep the ten busiest
top_ports = (
    monthly_port_stats
    .group_by('port_name')
    .agg(pl.sum('calls'))
    .sort('calls', descending=True)
    .limit(10)
    .select('port_name')
    .to_series()
)

#one line per top port: vessel calls in each month
plot_line(
    df=monthly_port_stats.filter(pl.col('port_name').is_in(top_ports)), x_col='month', y_col='calls',
    color_col='port_name', hover_name='port_name',
    title='Vessel Calls per Month',
)

## Run App

In [6]:
%%script echo skip
#run
#NOTE the %%script magic above hands this cell to `echo skip` instead of the Python kernel,
#so the code below is not executed and the cell only prints 'skip'
#NOTE(review): `app` is not defined in the visible part of this notebook - a Dash app
#must be created before this cell is re-enabled
if __name__ == '__main__':
    app.run()

skip
