In [1]:
#prelims
import glob
import os
import time

import contextily as cx
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import polars as pl

#turn on the global string cache used by polars categoricals
pl.enable_string_cache()
#display settings: never truncate pandas columns/rows, show up to 50 polars rows
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pl.Config(tbl_rows=50);

In [5]:
#load the dashboard dataset
df = pl.read_parquet('port data/dashboard/main.parquet')
#first and last timestamp in the data (handy as a default date window)
earliest_date, latest_date = (
    df.select(
        earliest=pl.col('time').min(),
        latest=pl.col('time').max(),
    )
    .row(0)
)
#define port stats function
def port_stats(df, start_date=earliest_date, end_date=latest_date):
    """Summarise vessel activity per port over a date window.

    Parameters
    ----------
    df : polars.DataFrame
        Message-level data. The columns read here are ``time``, ``mmsi``,
        ``status``, ``status_duration``, ``port_name``, ``month``,
        ``port_lat`` and ``port_lon``.
    start_date, end_date
        Bounds applied to ``time``; both ends are inclusive (the
        ``is_between`` default). The defaults are the earliest and latest
        timestamps of the main dataset, captured when this function is defined.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per ``port_name`` with ``port_lat``, ``port_lon``,
        ``vessels_avg``, ``visits_avg``, ``time_at_berth_median``,
        ``time_at_anchor_median``, ``time_at_anchor_mean`` and a point
        ``geometry`` built from ``port_lon``/``port_lat`` in EPSG:4326.
    """
    portstats_df = (
        df
        #keep only messages inside the requested window
        .filter(pl.col('time').is_between(start_date, end_date))
        #ensure sorting
        .sort(['mmsi', 'time'])
        #create row index (for identifying docking events)
        .with_row_index('docking_id')
        .with_columns(
            #create docking event id - NOTE may need to ensure this captures all relevant messages
            docking_id = (
                #keep only the ids of status-5 (docking) messages...
                pl.when(pl.col('status')==5)
                .then(pl.col('docking_id'))
                .otherwise(pl.lit(None))
                #...and give each vessel's earlier messages the id of its next docking message
                .backward_fill().over('mmsi')
            )
        )
        #drop messages not followed by a docking message for the same vessel
        .drop_nulls(subset='docking_id')
        .with_columns(
            #sum status-1 (anchorage) durations within each docking event
            time_at_anchor = (
                pl.when(pl.col('status')==1)
                .then(pl.col('status_duration'))
                .otherwise(pl.lit(None))
                .sum().over('docking_id')
            ),
            #unique vessels and unique docking events per port and month
            vessels = pl.col('mmsi').n_unique().over('port_name', 'month'),
            visits = pl.col('docking_id').n_unique().over('port_name', 'month')
        )
        #aggregate to ports
        #NOTE(review): vessels, visits and time_at_anchor are repeated on every message row,
        #so the means/medians below are weighted by message count per month / per docking
        #event rather than counting each month or event once - confirm this is intended
        .group_by('port_name')
        .agg(
            #keep lat and long
            port_lat = pl.col('port_lat').first(),
            port_lon = pl.col('port_lon').first(),
            #average of the monthly unique-vessel counts
            vessels_avg = pl.col('vessels').mean(),
            #average of the monthly visit counts
            visits_avg = pl.col('visits').mean(),
            #median status-5 duration, /60 (presumably minutes -> hours; confirm units)
            time_at_berth_median = (
                pl.when(pl.col('status')==5)
                .then(pl.col('status_duration'))
                .otherwise(pl.lit(None))
            ).median()/60,
            #median anchorage time, same /60 conversion
            time_at_anchor_median = pl.col('time_at_anchor').median()/60,
            #mean anchorage time, same /60 conversion
            time_at_anchor_mean = pl.col('time_at_anchor').mean()/60
        )
        #convert to pandas so that geopandas is happy
        .to_pandas()
    )
    #convert to geodataframe
    portstats_gdf = (
        gpd.GeoDataFrame(
            portstats_df,
            geometry=gpd.points_from_xy(portstats_df.port_lon,
                                        portstats_df.port_lat),
            #the points are longitude/latitude in degrees (they are plotted as lon/lat
            #downstream), so the CRS is WGS84; 3857 would declare them to be metres
            crs='EPSG:4326'
        )
    )
    return portstats_gdf


def update_map(start_date, end_date):
    """Build the port scatter map for a date window.

    Port statistics come from ``port_stats`` run on a clone of the main
    ``df``. A falsy ``start_date`` or ``end_date`` is not forwarded, so
    ``port_stats`` uses its own default for that bound.

    Returns the plotly figure: one marker per port, sized by ``visits_avg``
    and coloured by ``time_at_berth_median`` on a fixed 0-50 colour range.
    """
    #forward only the bounds that were actually supplied
    window = {}
    if start_date:
        window['start_date'] = start_date
    if end_date:
        window['end_date'] = end_date
    #work on a clone so the main df is never touched
    port_summary = port_stats(df.clone(), **window)

    #create map figure
    fig = px.scatter_geo(
        port_summary,
        lat='port_lat',
        lon='port_lon',
        hover_name='port_name',
        size='visits_avg',
        size_max=20,
        color='time_at_berth_median',
        range_color=[0, 50],
        color_continuous_scale=px.colors.sequential.Viridis,
        labels={'time_at_berth_median': 'Median Hours at Berth'},
        title='Average visits per month & Median Hours at Berth (previous 12 months)',
        width=1000,
        height=600,
    )

    #zoom the view to the plotted ports
    fig.update_geos(fitbounds="locations")

    #footnote placed just below the plot area (paper coordinates)
    fig.add_annotation(
        text="Note: Circle size corresponds to average vessel visits per month",
        xref="paper",
        yref="paper",
        x=0,
        y=-0.05,
        showarrow=False,
        font={'size': 14, 'color': "black"},
        align="left",
    )
    return fig

update_map(earliest_date, latest_date)

In [2]:
#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
#(sorted so the files are always visited in the same order)
for file in sorted(glob.glob('ais data/data/ais_clean/*.parquet')):
    try:
        #scan once and reuse the same lazyframe for the check and the query
        scanned = pl.scan_parquet(file)
        #check file integrity
        scanned.collect_schema()
        #build the lazy query
        lf = (
            scanned
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            #.with_columns(
            #    status_change = (
            #        pl.col('status').ne(pl.col('status').shift(fill_value=20))
            #        .over('mmsi')
            #    ),
            #    status_previous = pl.col('status').shift().over('mmsi')
            #)
            #keep only new status pings
            #.filter(pl.col('status_change')==True)
            #drop change col
            #.drop('status_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    except Exception as exc:
        #best effort: skip the file but report why; catching Exception rather than
        #using a bare except lets a keyboard/kernel interrupt stop the loop
        print(f'{file} failed: {exc!r}')


In [3]:
#stack the per-file lazyframes into one lazy query
lf = pl.concat(lfs, how='diagonal_relaxed')
#total row count - .collect() is what actually executes the query
lf.select(pl.len()).collect().item()

1769422121

In [None]:
#rows in excess of one per (mmsi, time) pair: total rows minus rows left after de-duplicating on that pair
#NOTE: there are two separate .collect() calls here, so the lazy query is executed twice
lf.select(pl.len()).collect().item() - lf.unique(subset=['mmsi', 'time']).select(pl.len()).collect().item()

In [None]:
#NOTE(review): absolute, machine-specific path; os.chdir also changes where every later relative path (e.g. 'ais data/...') resolves
os.chdir('/Users/adamwilson/Downloads/')

def bad_lines(line):
    """Return ``line`` without its first element.

    Intended as a repair callback for malformed CSV rows (it is passed to
    ``pd.read_csv(on_bad_lines=...)``). The input is sliced from index 1 and
    is not modified.
    """
    return line[1:]

In [None]:
#a callable on_bad_lines is only supported by the python engine; bad_lines drops the first field of each malformed row
#NOTE(review): this rebinds df - the name update_map reads as a global - to a pandas frame
df = pd.read_csv('AIS_2023_05_09.csv', engine='python', on_bad_lines=bad_lines)

In [None]:
df.info()

### What's happening in April 2023??

In [None]:
#bar chart of vessels_avg summed over all rows of each month
#NOTE(review): monthly_df is not defined in any cell visible here - confirm it exists before this runs
px.bar(
    monthly_df
    .group_by('month').agg(pl.col('vessels_avg').sum())
    .sort('month'),
    y='vessels_avg', x='month',
    title='Vessels per month',
    labels={'vessels_avg':'Vessel Count'}
)

#per-port, per-day summary of the main data
#NOTE(review): main_gdf is not defined in any cell visible here - confirm it exists before this runs
daily_df = (
    #convert to polars
    pl.DataFrame(main_gdf.drop(['geometry', 'geometry_port'], axis=1))
    #create day column as a 'YYYYMMDD' string
    .with_columns(day = pl.col('time').dt.strftime('%Y%m%d'))
    #agg over ports and days
    .group_by('port_name', 'day')
    .agg(
        #keep lat and long
        lat = pl.col('lat').first(),
        lon = pl.col('lon').first(),
        #unique vessels seen at the port that day (a daily count, despite the _avg name)
        vessels_avg = pl.col('mmsi').n_unique(),
        #median (not mean, despite the _avg name) status-5 duration, /60 (presumably minutes -> hours; confirm units)
        time_at_berth_avg = (
            pl.when(pl.col('status')==5)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60,
        #median (not mean, despite the _avg name) status-1 duration, same /60 conversion
        time_at_anchor_avg = (
            pl.when(pl.col('status')==1)
            .then(pl.col('status_duration'))
            .otherwise(pl.lit(None))
        ).median()/60
    )
)

#daily vessel counts for Feb-Jun 2023, summed across ports
px.bar(
    daily_df
    #day is a 'YYYYMMDD' string, so its first six characters are the year and month
    .filter(
        pl.col('day').str.slice(0, 6)
        .is_in(['202302', '202303', '202304', '202305', '202306'])
    )
    .group_by('day').agg(pl.col('vessels_avg').sum())
    .sort('day'),
    x='day', y='vessels_avg',
    labels={'vessels_avg':'Vessel Count'},
    title='Vessels per day'
)

#### checking raw AIS data
#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
#(sorted so the files are always visited in the same order)
for file in sorted(glob.glob('ais data/data/ais_clean/*.parquet')):
    try:
        #scan once and reuse the same lazyframe for the check and the query
        scanned = pl.scan_parquet(file)
        #check file integrity
        scanned.collect_schema()
        #build the lazy query
        lf = (
            scanned
            #keep only observations from calendar year 2023; the window is half-open
            #[2023-01-01, 2024-01-01) so that all of 31 Dec is included (an inclusive
            #upper bound of 2023-12-31 00:00 would drop that whole day)
            .filter(pl.col('time').is_between(pl.datetime(2023,1,1),
                                              pl.datetime(2024,1,1),
                                              closed='left')
            )
            #drop smaller vessels
            .filter(pl.col('length')>100)
        )
        #append to list of lazyframes
        lfs.append(lf)
    except Exception as exc:
        #best effort: skip the file but report why; catching Exception rather than
        #using a bare except lets a keyboard/kernel interrupt stop the loop
        print(f'{file} failed: {exc!r}')

#materialise every lazyframe in a single call
dfs = pl.collect_all(lfs)

#stack into one dataframe ordered by vessel, then time
ais_df = (
    pl.concat(dfs, how='diagonal_relaxed')
    .sort(['mmsi', 'time'])
    .with_columns(
        #seconds since the same vessel's previous ping (null on each vessel's first ping)
        time_since_last = pl.col('time').diff().over('mmsi').dt.total_seconds()
    )
)

#histogram of the gap (seconds) since a vessel's previous ping, for status-5 messages with a gap of at most 120 minutes
#NOTE: limit runs before filter, so only the first 10,000,000 rows of ais_df are ever considered
px.histogram(ais_df
             #limit to keep plotly from losing its mind
             .limit(10000000)
             #filter to pings less than 2hr apart
             .filter((pl.col('time_since_last')/60<=120) & (pl.col('status')==5)), 
             x='time_since_last', nbins=100)

#messages per day among pings that arrived less than 45 seconds after the same vessel's previous ping
px.bar(
    ais_df
    .filter(pl.col('time_since_last')<45)
    .with_columns(
        #split by coast: west of 103°W is 'west', everything else 'east'.
        #Western-hemisphere longitudes are negative, so the previous test (lon > 103)
        #could never match them and labelled nearly every row 'east'.
        #NOTE(review): assumes lon is signed decimal degrees (-180..180) - confirm
        coast = (
            pl.when(pl.col('lon') < -103)
            .then(pl.lit('west'))
            .otherwise(pl.lit('east'))
        ),
        #calendar day of the ping as a Date (no string round trip needed)
        day = pl.col('time').dt.date()
    )
    .group_by(['coast','day']).agg(
        messages = pl.col('mmsi').count(),
        avg_time_since_last = pl.col('time_since_last').mean()
    )
    .sort('day'),
    y='messages', x='day',
    #color='coast'
    )

In [None]:
pd.read_csv('AIS_2023_05_25.csv')

In [None]:
pf = pl.read_csv('AIS_2023_05_09.csv', truncate_ragged_lines=True, infer_schema_length=0)

In [None]:
pf.describe()

In [None]:
#lazy query over one month of AIS data; nothing is read until .collect() is called
#the steps are written sort -> filter -> select on purpose-agnostic order: this is the
#unoptimised plan that show_graph(optimized=False) will display as written
polars_lf = (
    #load from parquet file - NOTE scan_ tells polars to be in lazy mode
    pl.scan_parquet('ais data/data/ais_clean/ais_2023_12.parquet')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #drop smaller vessels
    .filter(pl.col('length')>100)
    #keep necessary columns
    .select(['mmsi','time', 'status'])
)

In [None]:
polars_lf.show_graph(optimized=False)

In [None]:
#NOTE(review): absolute, machine-specific path; after this chdir the relative 'ais data/...' paths used elsewhere resolve against this folder
os.chdir('/Users/adamwilson/Library/CloudStorage/OneDrive-WashingtonStateUniversity(email.wsu.edu)/Port Performance/data/AIS/')

#peek at the first rows of 2015_3.parquet in that folder
pl.read_parquet('2015_3.parquet').head()

In [None]:
%timeit polars_lf.collect()

In [None]:
pl.read_parquet('ais data/data/ais_clean/ais_2023_12.parquet').to_pandas().to_parquet('ais data/data/file.parquet')

In [None]:
%%timeit

#read parquet
df = pd.read_parquet('ais data/data/file.parquet', engine='pyarroe')
#sort by mmsi and time
df = df.sort_values(by=['mmsi', 'time'])
#filter by vessel length
df = df[df.length>100]
#drop unused columns
df = df[['mmsi', 'time', 'status']]

In [None]:
%%timeit

#read parquet
df = pd.read_parquet('ais data/data/file.parquet')[['mmsi','time', 'status', 'length']]
#sort by mmsi and time
df = df.sort_values(by=['mmsi', 'time'])
#drop unused column
df.drop('length', axis=1)

In [None]:
df.head()

In [None]:
polars_lf.collect().describe()

In [None]:
#read every parquet file in the clean AIS folder (skipping hidden files) and stack them
#NOTE: this is an eager read of all files into memory; pl.scan_parquet on the folder glob is the lazy alternative
ais_dir = 'ais data/data/ais_clean/'
file_dfs = [
    #read_parquet needs the full path, not just the file name
    pl.read_parquet(os.path.join(ais_dir, file))
    for file in sorted(os.listdir(ais_dir))
    if not file.startswith('.')
]
#fall back to an empty frame when the folder holds no files (pl.concat rejects an empty list)
df = pl.concat(file_dfs, how='diagonal_relaxed') if file_dfs else pl.DataFrame()

In [None]:
df = (
    #read into lazyframe
    pl.scan_parquet('ais data/data/ais_clean/*.parquet'))

In [None]:
df.collect_schema()

In [None]:
import contextily as ctx
import matplotlib.pyplot as plt

#load dock data: read, prune, reproject and tidy column names in one chain
docks_gdf = (
    #shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop(columns=[
        #randomly assigned table id
        'FID',
        #already coded in 'geometry'
        'LONGITUDE', 'LATITUDE',
        #text description of dock location
        'LOCATION_D',
        #street address details
        'STREET_ADD', 'ZIPCODE',
        #statistical area name, rarely used
        'PSA_NAME',
        #county and congress info
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F',
        #redundant location data
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1',
        #owner info
        'OPERATORS', 'OWNERS',
        #long-form text description of dock uses
        'PURPOSE',
        #unknown number (not unique to each row/dock)
        'DOCK',
        #redundant location info
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION',
        #rarely used stats on construction
        'COMMODITIE', 'CONSTRUCTI', 'MECHANICAL', 'REMARKS', 'VERTICAL_D',
        'DEPTH_MIN', 'DEPTH_MAX', 'BERTHING_L', 'BERTHING_T', 'DECK_HEIGH',
        'DECK_HEI_1',
        #rarely used indicators of data entry date
        'SERVICE_IN', 'SERVICE_TE',
    ])
    #reproject to WGS84 lat/long
    .to_crs('EPSG:4326')
    #spell out the truncated shapefile field names
    .rename(columns={
        'NAV_UNIT_I': 'nav_unit_id',
        'NAV_UNIT_N': 'nav_unit_name',
        'FACILITY_T': 'facility_type',
        'CITY_OR_TO': 'city',
        'STATE_POST': 'state',
    })
    #set remaining col names to pythonic lowercase
    .rename(columns=str.lower)
)

In [None]:
gdf.geometry.y

In [None]:
import numpy as np
from sklearn.neighbors import KDTree

#fix the seed so the demo points are the same on every run
np.random.seed(0)
#5 points in 2 dimensions
X = np.random.random((5, 2))
tree = KDTree(X)
#k=2 nearest neighbors where k1 = identity (each point's closest match is itself)
nearest_dist, nearest_ind = tree.query(X, k=2)
#show the points, then column 1 of the distances and of the indices
#(drops the identity column; assumes sorted -> see args!)
print(X, nearest_dist[:, 1], nearest_ind[:, 1], sep='\n')

In [None]:
pd.DataFrame(X)

In [None]:
#small demo frame
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})

#blank out column 'B' on the rows where column 'A' equals 2
df.loc[df['A'].eq(2), 'B'] = None

print(df)