In [1]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=50);

## Load data from data processing notebook

In [3]:
#save to parquet
main_df = pl.read_parquet('port data/ais_status_changes.parquet')

## Performance Metrics and Exploratory Analysis

### Stats for most recent 12 months

The eventual dashboard will present some statistics and visualizations based on data from the most recently available 12 months. 

In [11]:
#get most recent twelve month window that appears in the data
latest_date = main_gdf.time.max()
year_before = latest_date - pd.DateOffset(months=12)

#create ports stats for most recent 12 months
portstats_last12months_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1)
    )
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #create docking event id - NOTE may need to ensure this captures all relevant messages
        docking_id = (
            #keep only docking ids associated with docking messages
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            #backfill over vessel
            .backward_fill().over('mmsi')
        )
    )
    #drop messages not associated with a docking event
    .drop_nulls(subset='docking_id')
    .with_columns(
        #sum anchorage time for each docking event
        time_at_anchor = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum().over('docking_id')
        ),
        #get monthly vessels and visits
        vessels = pl.col('mmsi').n_unique().over('port_name', 'month'),
        visits = pl.col('docking_id').n_unique().over('port_name', 'month')
    )
    #aggregate to ports
    .group_by('port_name')
    .agg(
        #keep lat and long
        port_lat = pl.col('port_lat').first(),
        port_lon = pl.col('port_lon').first(),
        #get monthly average of unique vessels seen at each port
        vessels_avg = pl.col('vessels').mean(),
        #get monthly average of vessel visits at each port
        visits_avg = pl.col('visits').mean(),
        #get median time at berth in hours
        time_at_berth_median = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get median time at anchor in hours
        time_at_anchor_median = pl.col('time_at_anchor').median()/60,
        #get mean time at anchor in hours
        time_at_anchor_mean = pl.col('time_at_anchor').mean()/60
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)
#convert back to geodataframe
portstats_last12months_gdf = (
    gpd.GeoDataFrame(
        portstats_last12months_gdf, 
        geometry=gpd.points_from_xy(portstats_last12months_gdf.port_lon, 
                                    portstats_last12months_gdf.port_lat),
        crs=3857
    )
)

DuplicateError: column with name 'docking_id' has more than one occurrences

In [107]:
#create docks stats for most recent 12 months
dockstats_last12months_gdf = (
    #convert main gdf to polars
    pl.DataFrame(
        main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1))
    #filter to most recent 12 months
    .filter(pl.col('time')>= year_before)
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #get monthly count of vessels
        vessels = pl.col('mmsi').n_unique().over('dock_id', 'month'),
        #get monthly count of vessel docking events - NOTE this may need revision 
        visits = (
            pl.when(pl.col('status')==5)
            .then(pl.col('time'))
            .otherwise(pl.lit(None))
            .n_unique().over('dock_id', 'month')
        ),
        #create docking event id
        docking_id = (
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            .backward_fill().over('mmsi', 'dock_id')
        )
    )
    .with_columns(
        #sum anchorage time for each docking event
        time_at_anchor = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
            .sum().over('mmsi', 'docking_id')
        )
    )
    #aggregate
    .group_by('dock_id')
    .agg(
        #keep port name, dock lat and long
        dock_lat = pl.col('dock_lat').first(),
        dock_lon = pl.col('dock_lon').first(),
        port_name = pl.col('port_name').mode().first(),
        #get avg vessels and visits per month
        vessels_avg = pl.col('vessels').mean(),
        visits_avg = pl.col('visits').mean(),
        #get median time at berth in hours
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get median time at anchor in hours
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
    #convert to pandas to that geopandas is happy
    .to_pandas()
)

#convert back to geodataframe
dockstats_last12months_gdf = (
    gpd.GeoDataFrame(
        dockstats_last12months_gdf, 
        geometry=gpd.points_from_xy(dockstats_last12months_gdf.dock_lon, 
                                    dockstats_last12months_gdf.dock_lat),
        crs=3857
    )
)

### Monthly Stats

In [12]:
monthly_df = (
    #convert to polars
    pl.DataFrame(main_gdf.drop(['geometry', 'geometry_port', 'geometry_dock'], axis=1))
    #agg over ports and months
    .group_by('port_name', 'month')
    .agg(
        #keep lat and long
        lat = pl.col('port_lat').first(),
        lon = pl.col('port_lon').first(),
        #get monthly avg vessels
        vessels_avg = pl.col('mmsi').n_unique(),
        #get average time at berth
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #get average time at anchor
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
)

## Visualizations

In [13]:
#scatterplot
fig = px.scatter_geo(
    portstats_last12months_gdf,
    lon='port_lon',
    lat='port_lat',
    size='visits_avg',
    color='time_at_berth_median',
    range_color=[0,50],
    hover_name='port_name',
    size_max=20,
    title='Average visits per month & Median Hours at Berth (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_berth_median':'Median Hours at Berth'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to average vessel visits per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

NameError: name 'portstats_last12months_gdf' is not defined

In [131]:
#scatterplot
#scatterplot
fig = px.scatter_geo(
    portstats_last12months_gdf,
    lon='port_lon',
    lat='port_lat',
    size='visits_avg',
    color='time_at_berth_median',
    range_color=[0,50],
    hover_name='port_name',
    size_max=20,
    title='Average visits per month & Median Hours at Berth (previous 12 months)',
    color_continuous_scale=px.colors.sequential.Viridis,
    width=1000,
    height=600,
    labels={
        'time_at_berth_median':'Median Hours at Berth'
    }
)

# Fit the view to ports
fig.update_geos(fitbounds="locations")

# Add footnote using add_annotation
fig.add_annotation(
    text="Note: Circle size corresponds to average vessel visits per month",  # Footnote text
    xref="paper", yref="paper",  # Position relative to the plot area
    x=0, y=0-0.05,  # Adjust to footnote position
    showarrow=False,  # No arrow, just text
    font=dict(size=14, color="black"),  # Customize the font style
    align="left"
)

# Show the figure
fig.show()

In [128]:
#get top 5 ports
top5ports = pl.Series(ports_gdf.sort_values('rank').head().port_name)

#get top 10 ports
top10ports = pl.Series(ports_gdf.sort_values('rank').head(10).port_name)

In [127]:
px.line(
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Vessels per month at Principal Ports',
    labels={'vessels_avg':'Unique Vessels'},
    width=1000,
    height=500 
)

In [129]:
px.line(
    monthly_df
    #restrict to top 10 ports
    .filter(pl.col('port_name').is_in(top10ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_anchor_avg', color='port_name',
    title='Median Time at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [130]:
px.line(
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='vessels_avg', color='port_name',
    title='Unique vessels at Anchor at Principal Ports',
    width=1000,
    height=500 
)

In [None]:
px.line(
    #data with month in dt format
    monthly_df
    #restrict to top 5 ports
    .filter(pl.col('port_name').is_in(top5ports))
    #month in dt format
    .with_columns(pl.col('month').str.strptime(pl.Date, format='%Y%m'))
    .sort(by='month'), 
    #plot specs
    x='month', y='time_at_berth_avg', color='port_name',
    title='Median Time at Berth at Principal Ports',
    width=1000,
    height=500 
)