# Port Performance Sandbox

In [1]:
#prelims
import polars as pl
import pandas as pd
import geopandas as gpd
import time
import plotly.express as px
import matplotlib.pyplot as plt
import contextily as cx
import numpy as np
import glob

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=100);

In [5]:
df = pl.read_csv('port data/volumes/imports.csv')

In [6]:
df.head(20)

PORT,PORT_NAME,CTY_CODE,CTY_NAME,I_COMMODITY,GEN_VAL_MO,CNT_VAL_MO,CNT_WGT_MO,VES_VAL_MO,VES_WGT_MO,YEAR,MONTH,COMM_LVL,I_COMMODITY_duplicated_0,date
str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,str
"""-""","""TOTAL FOR ALL PORTS""","""-""","""TOTAL FOR ALL COUNTRIES""",1,215797758,392018,120408,392018,120408,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0001""","""OPEC""",1,95000,0,0,0,0,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0003""","""EUROPEAN UNION""",1,56261689,134532,21000,134532,21000,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0014""","""PACIFIC RIM COUNTRIES""",1,3595903,257486,99408,257486,99408,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0017""","""CAFTA-DR""",1,334151,0,0,0,0,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0020""","""NAFTA""",1,145173799,0,0,0,0,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0021""","""TWENTY LATIN AMERICAN REPUBLIC…",1,51139330,0,0,0,0,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0022""","""OECD""",1,206506476,134532,21000,134532,21000,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0023""","""NATO""",1,155670763,134532,21000,134532,21000,2018,1,"""HS2""",1,"""1/1/2018"""
"""-""","""TOTAL FOR ALL PORTS""","""0024""","""LAFTA""",1,50796857,0,0,0,0,2018,1,"""HS2""",1,"""1/1/2018"""


In [6]:
df = pl.read_excel('Book1.xlsx')
print(df)

shape: (10, 4)
┌─────┬─────┬─────┬─────────┐
│ id1 ┆ id2 ┆ id3 ┆ call_id │
│ --- ┆ --- ┆ --- ┆ ---     │
│ i64 ┆ i64 ┆ i64 ┆ i64     │
╞═════╪═════╪═════╪═════════╡
│ 1   ┆ 4   ┆ 9   ┆ null    │
│ 1   ┆ 5   ┆ 9   ┆ null    │
│ 1   ┆ 5   ┆ 9   ┆ null    │
│ 2   ┆ 5   ┆ 9   ┆ null    │
│ 2   ┆ 6   ┆ 9   ┆ 2       │
│ 2   ┆ 7   ┆ 10  ┆ null    │
│ 3   ┆ 7   ┆ 11  ┆ null    │
│ 3   ┆ 7   ┆ 12  ┆ null    │
│ 3   ┆ 7   ┆ 13  ┆ 7       │
│ 3   ┆ 8   ┆ 13  ┆ null    │
└─────┴─────┴─────┴─────────┘


In [None]:
#manually create a dataframe identical to the one above


In [None]:
#set minimum meaningful status duration (minutes)
min_duration = 10


#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    #check file integrity 
    pl.scan_parquet(file).collect_schema()
    #read and process file
    lf = (
        pl.scan_parquet(file)
        #drop smaller vessels
        .filter(pl.col('length')>100)
        #sort by vessel and time
        .sort(['mmsi', 'time'])
        #set status to undefined when moored but non-zero velocity
        .with_columns(
            pl.when((pl.col('status')==5)&(pl.col('speed')!=0))
            .then(pl.col('status')==15)
            .otherwise(pl.col('status'))
        )
        #set status to unknown when at anchor but high velocity (> 1 knot)
        .with_columns(
            pl.when((pl.col('status')==1)&(pl.col('speed')>1))
            .then(pl.col('status')==15)
            .otherwise(pl.col('status'))
        )
        #drop messages from the same vessel with same timestamp
        .unique(subset=['mmsi', 'time'])
        #identify if a message comes from within each port's waters
        .with_columns(in_port_waters(port_waters))
        #indentify if a message comes from within any port's waters
        .with_columns(
            pl.any_horizontal(in_port_waters(port_waters)).alias('in_port_waters')
        )
    )
    #get column names ending in '_in_port_waters'
    in_waters_cols = [col for col in lf.collect_schema().names() if col.endswith('_in_port_waters')]
    index_cols = [col for col in lf.collect_schema().names() if not col.endswith('_in_port_waters')]
    lf = (
        lf
        #unpivot in_port_waters cols
        .unpivot(
            on=in_waters_cols,
            index=index_cols,
            variable_name='port_waters_name',
            value_name='value'
        )
        #set port_waters_name to empty if value is false
        .with_columns(
            pl.when(pl.col('value')==False)
            .then(None)
            .otherwise(pl.col('port_waters_name'))
            .alias('port_waters_name')
        )
        #deduplicate from unpivot
        .unique()
        #keep status change and entering port waters messages
        .filter(
            (pl.col('status').ne(pl.col('status').shift()).over('mmsi')) |
            (pl.col('in_port_waters').ne(pl.col('in_port_waters')
                                         .shift()).over('mmsi'))
        )
    )
    #append to list of lazyframes
    lfs.append(lf)
print('files loaded; beginning collection')
# Collect all lazyframes
ais_df = pl.concat(pl.collect_all(lfs), how='diagonal_relaxed')
#inspect
ais_df.head()

## Load and Process Docks and Ports Data

### [Principal Ports from BTS](https://data-usdot.opendata.arcgis.com/datasets/usdot::principal-ports/about)

In [2]:
#load principal ports from BTS
ports_gdf = (
    #read in shape file downloaded from BTS
    gpd.read_file('port data/Principal_Ports/Principal_Ports.shp')
    #coerse web mercator
    .to_crs(3857)
    #drop unneeded columns
    .drop([
        'FID', #randomly assigned table id
        'PORT', #unknown numeric ID - not CBP or UN code,
        'RANK',
        'TOTAL',
        'FOREIGN_','EXPORTS', 'IMPORTS', 'DOMESTIC' #breadown of total vol (tons)
    ], axis=1)
    #rename for clarity
    .rename({'TYPE':'port_type'}, axis=1)
)
#set col names to pythonic lowercase
ports_gdf.columns = ports_gdf.columns.str.lower()

#inspect
ports_gdf.head()

Unnamed: 0,port_type,port_name,geometry
0,C,"Albany Port District, NY",POINT (-8209607.618 5257745.826)
1,L,"Alpena, MI",POINT (-9286464.839 5631282.203)
2,I,"America's Central Port, IL",POINT (-10030336.964 4699050.768)
3,C,"Anacortes, WA",POINT (-13647726.157 6189762.265)
4,L,"Ashtabula Port Authority, OH",POINT (-8993944.267 5146562.825)


### [Docks and Anchorages from Army Corp](https://geospatial-usace.opendata.arcgis.com/datasets/23d91bd988ac4fc9943128965bddfa37_0/about)

In [3]:
#load docks and anchorages from CoE
docks_gdf = (
    #read in shape file downloaded from USACE
    gpd.read_file('port data/Dock/Dock.shp')
    #drop unneeded columns
    .drop([
        'PORT_NAME', #unneeded and conflicts with port data
        'UNLOCODE', #UN Location Code, rarely used
        'CITY_OR_TO', 'STATE_POST', 'WTWY_NAME', #unneeded 
        'FID', #randomly assigned table id
        'LONGITUDE', 'LATITUDE', #already coded in 'geometry' 
        'LOCATION_D', #text description of dock location
        'STREET_ADD','ZIPCODE', #street address details
        'PSA_NAME', #statistical area name, rarely used
        'COUNTY_NAM', 'COUNTY_FIP', 'CONGRESS', 'CONGRESS_F', #county and congress info
        'MILE', 'BANK', 'LATITUDE1', 'LONGITUDE1', #redundant locaation data
        'OPERATORS', 'OWNERS', #owner info
        'PURPOSE', #long-form text description of dock uses
        'DOCK', #unknown number (not unique to each row/dock)
        'HIGHWAY_NO', 'RAILWAY_NO', 'LOCATION', #redundant location info
        'COMMODITIE', 'CONSTRUCTI','MECHANICAL', 'REMARKS', 'VERTICAL_D', 
        'DEPTH_MIN', 'DEPTH_MAX','BERTHING_L', 'BERTHING_T', 'DECK_HEIGH', 
        'DECK_HEI_1', #these are rarely used stats on construction
        'SERVICE_IN','SERVICE_TE', #rarely used indicators of data entry date 
    ], axis=1)
    #drop duplicates with matching geometries, keeping most common data
    .groupby('geometry').agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None).reset_index()
    #rename cols for clarity
    .rename(columns={
        'NAV_UNIT_I':'dock_id',
        'NAV_UNIT_N':'dock_name',
        'FACILITY_T':'facility_type'
    })
)
#set col names to pythonic lowercase
docks_gdf.columns = docks_gdf.columns.str.lower()
#coerse back to gdf - groupby appears to have kicked it back to pandas core
docks_gdf = gpd.GeoDataFrame(docks_gdf, geometry='geometry', crs=3857)

#inspect
docks_gdf.head()

Unnamed: 0,geometry,dock_id,dock_name,facility_type
0,POINT (-19217933.954 -1519086.611),0552,ASAU SMALL BOAT HARBOR,
1,POINT (-18999867.643 -1605625.815),058N,PAGO PAGO AMERICAN SAMOA,Dock
2,POINT (-18986748.196 -1606739.459),058M,AUNU'U SMALL BOAT HARBOR,
3,POINT (-18864337.721 -1602880.791),0551,TA'U HARBOR,
4,POINT (-18884324.802 -1593797.256),0550,OFU HARBOR,


### [Port Statistical Areas from BTS](https://geospatial-usace.opendata.arcgis.com/datasets/b7fd6cec8d8c43e4a141d24170e6d82f_0/about)

In [4]:
#load port stat areas from BTS
port_areas_gdf = (
    gpd.read_file('port data/Port Statistical Areas/Ports_and_Port_Statistical_Areas.shp')
    #coerse web mercator
    .to_crs(3857)
    #drop unneeded cols
    .drop(['INSTALLATI', 'MEDIAID', 'METADATAID', 'SDSID', 'DATA_YEAR', 
           'OBJECTID', 'Shape__Are', 'Shape__Len'], axis=1)
    #rename cols
    .rename({
        'geometry':'geometry_area',
        'PORTIDPK':'port_area_id',
        'FEATUREDES':'port_area_desc',
        'FEATURENAM':'port_area_name'
        }, axis=1)
    .set_geometry('geometry_area')
)

#inspect
port_areas_gdf.head()

Unnamed: 0,port_area_desc,port_area_name,port_area_id,geometry_area
0,U.S. Census Bureau municipal limit,"Galveston, TX",2417,"POLYGON ((-10589014.480 3386402.959, -10588974..."
1,"Per legislation, all of Shelby County, TN exce...","Memphis and Shelby County, TN",2294,"MULTIPOLYGON (((-9978777.748 4174967.904, -997..."
2,"All those portions of the St. Louis Bay, St. L...","Duluth-Superior, MN and WI",3924,"POLYGON ((-10274934.649 5888396.250, -10274930..."
3,Area defined by Texas state legislation creati...,"Port Freeport, TX",2408,"POLYGON ((-10672631.006 3404927.407, -10672585..."
4,"Corporate limits of Henderson county, Kentucky.","Henderson County Riverport Authority, KY",2329,"POLYGON ((-9787954.639 4565536.864, -9787769.1..."


### Merge Ports, Port Areas, and Docks

Since we are only interested in principal ports, we spatially join the port areas to the principal ports dataframe, effectively dropping all areas not containing a principal port. We then join this data into the docks and anchorages file in order to assign the relevant principal port to each dock within the port area. 

In [5]:
stops_gdf = (
    #left join areas into ports_gdf
    ports_gdf.sjoin(port_areas_gdf, how='left', predicate='within')
    .drop('index_right', axis=1)
    #join port area geometry back in (there seems to be no native way to do this in geopandas sjoin)
    .merge(port_areas_gdf, how='left')
)

#join stops with port and area info onto docks
stops_gdf = (
    docks_gdf.sjoin(
        #inner join to drop docks and anchorages not within port areas
        stops_gdf.set_geometry('geometry_area'), how='inner', predicate='within'
    )
    .drop('index_right', axis=1)
    #rename geometry cols
    .rename({'geometry_left':'geometry_dock', 
             'geometry_right':'geometry_port'}, axis=1)
    #recover port area geometry as above
    .merge(port_areas_gdf, how='left')
)


#inspect
stops_gdf.head()

Unnamed: 0,geometry_dock,dock_id,dock_name,facility_type,port_type,port_name,geometry_port,port_area_desc,port_area_name,port_area_id,geometry_area
0,POINT (-9190384.318 3204025.920),0JU9,PINEY POINT,,C,"Manatee County Port, FL",POINT (-9190998.357 3202867.355),County limits of Manatee per Legislation,"Manatee County Port Authority, FL",2437,"POLYGON ((-9161269.898 3171826.661, -9161284.0..."
1,POINT (-9190910.637 3202310.097),0XDE,"MANATEE COUNTY PORT AUTHORITY, BERTH NOS. 12 A...",Dock,C,"Manatee County Port, FL",POINT (-9190998.357 3202867.355),County limits of Manatee per Legislation,"Manatee County Port Authority, FL",2437,"POLYGON ((-9161269.898 3171826.661, -9161284.0..."
2,POINT (-9190939.135 3202649.098),0ZSY,"MANATEE COUNTY PORT AUTHORITY, BERTH NO. 11",Dock,C,"Manatee County Port, FL",POINT (-9190998.357 3202867.355),County limits of Manatee per Legislation,"Manatee County Port Authority, FL",2437,"POLYGON ((-9161269.898 3171826.661, -9161284.0..."
3,POINT (-9190908.188 3203242.559),0ZU6,"MANATEE COUNTY PORT AUTHORITY, BERTH Nos. 5 and 4",Dock,C,"Manatee County Port, FL",POINT (-9190998.357 3202867.355),County limits of Manatee per Legislation,"Manatee County Port Authority, FL",2437,"POLYGON ((-9161269.898 3171826.661, -9161284.0..."
4,POINT (-9190537.160 3203033.092),0ZTL,"MANATEE COUNTY PORT AUTHORITY, BERTH NO. 7",Dock,C,"Manatee County Port, FL",POINT (-9190998.357 3202867.355),County limits of Manatee per Legislation,"Manatee County Port Authority, FL",2437,"POLYGON ((-9161269.898 3171826.661, -9161284.0..."


## Load and Process AIS Data

AIS messages are obtained from the Marine Cadastre database and processed in a separate notebook; see the README for full details.

In [6]:
#set minimum meaningful status duration (minutes)
min_duration = 10

#init list of lazyframes
lfs = []
#process each parquet file individually into lazyframes
for file in glob.glob('ais data/data/ais_clean/*.parquet'):
    try:
        #check file integrity 
        pl.scan_parquet(file).collect_schema()
        #read file
        lf = (
            pl.scan_parquet(file)
            #drop smaller vessels
            .filter(pl.col('length')>100)
            #sort by vessel and time
            .sort(['mmsi', 'time'])
            #drop messages from the same vessel with same timestamp
            .unique(subset=['mmsi', 'time'])
            #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
            .with_columns(
                status_change = (
                    pl.col('status').ne(pl.col('status').shift())
                    .over('mmsi')
                )
            )
            #keep only new status pings
            .filter(pl.col('status_change')==True)
            #drop change col
            .drop('status_change')
        )
        #append to list of lazyframes
        lfs.append(lf)
    except:
        print(f'{file} failed')

#collect all lazyframes
dfs = pl.collect_all(lfs)

In [7]:
#process ais data
ais_gdf = (
    #concat dfs
    pl.concat(dfs, how='diagonal_relaxed')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #create duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #drop short changes in status between equal statuses
    .with_columns(
        short = ((pl.col('status').shift()==pl.col('status').shift(-1)) & 
                (pl.col('status_duration')<min_duration)).over('mmsi')
    )
    .filter(pl.col('short')!=True)
    .drop('short')
    #drop repeated same-status messsages
    .with_columns(
        repeat = (pl.col('status')==pl.col('status').shift()).over('mmsi')
    )
    .filter(pl.col('repeat')!=True)
    .drop('repeat')
    #recalculate duration column
    .with_columns(
        status_duration = (pl.col('time').shift(-1) - pl.col('time'))
        .over('mmsi').dt.total_minutes()
    )
    #ensure sorting
    .sort(['mmsi', 'time'])
    #create row index (for identifying docking events)
    .with_row_index('docking_id')
    .with_columns(
        #create docking event id - NOTE may need to ensure this captures all relevant messages
        docking_id = (
            #keep only docking ids associated with docking messages
            pl.when(pl.col('status')==5)
            .then(pl.col('docking_id'))
            .otherwise(pl.lit(None))
            #backfill over vessel
            .backward_fill().over('mmsi')
        )
    )
    #convert to pandas
    .to_pandas()
)

#convert to geopandas dataframe
ais_gdf = (
    #convert to geodataframe
    gpd.GeoDataFrame(
        ais_gdf,
        geometry=gpd.points_from_xy(ais_gdf.lon, ais_gdf.lat, crs='EPSG:4326')
    )
    #convert to WGS84 pseudo-mercator; giving distances in meters
    .to_crs(3857)
    #drop old lat lon cols
    .drop(['lat', 'lon'], axis=1)
    #rename geometry col for clarity
    .rename({'geometry':'geometry_vessel'}, axis=1)
    .set_geometry('geometry_vessel')
)

#inspect
display(ais_gdf.head())
ais_gdf.info()

Unnamed: 0,docking_id,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry_vessel
0,9.0,205041000,2023-05-18 13:38:06,0.4,138.6,105.0,2.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,1.0,POINT (-13733098.187 6162516.103)
1,9.0,205041000,2023-05-18 13:39:15,0.3,148.0,103.0,1.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,11915.0,POINT (-13733085.942 6162502.708)
2,9.0,205041000,2023-05-26 20:15:12,0.6,201.4,102.0,0.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,320.0,POINT (-13733123.791 6162706.998)
3,9.0,205041000,2023-05-27 01:35:31,0.5,146.0,326.0,1.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,53.0,POINT (-13762110.273 6277590.274)
4,9.0,205041000,2023-05-27 02:29:30,3.8,320.0,317.0,0.0,LOWLANDS PROSPERITY,79.0,9586813.0,292.0,46.0,10.8,79.0,15.0,POINT (-13760275.728 6275582.561)


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1823668 entries, 0 to 1823667
Data columns (total 16 columns):
 #   Column           Dtype         
---  ------           -----         
 0   docking_id       float64       
 1   mmsi             object        
 2   time             datetime64[us]
 3   speed            float64       
 4   course           float64       
 5   heading          float64       
 6   status           float64       
 7   vessel_name      category      
 8   vessel_type      float64       
 9   imo              float64       
 10  length           float64       
 11  width            float64       
 12  draft            float64       
 13  cargo            float64       
 14  status_duration  float64       
 15  geometry_vessel  geometry      
dtypes: category(1), datetime64[us](1), float64(12), geometry(1), object(1)
memory usage: 212.9+ MB


## Match AIS messages with relevant port calls

In [8]:
#join stops to AIS based on mooring locations
calls_gdf = (
    #filter to only ais moorings
    ais_gdf[ais_gdf.status ==5]
    #join nearest dock (within 500m)
    .sjoin_nearest(
        stops_gdf.set_geometry('geometry_dock'), how='left', 
        exclusive=True, max_distance=500
    )
    #drop unneeded cols
    .drop(['index_right'], axis=1)
    #recover dock geometry
    .merge(stops_gdf[['dock_id', 'geometry_dock']], how='left')
)

In [9]:
#merge calls back into ais to create main df
main_gdf = (
    ais_gdf.merge(calls_gdf, how='left')
    #sort by vessel then by time
    .sort_values(by=['mmsi', 'time'])
)
#backfill port and dock info across docking event id
main_gdf = main_gdf.groupby('docking_id').bfill()
#drop messages with missing dock/port matches
main_gdf = main_gdf[main_gdf['port_name'].notnull()]
#get distance from ais message to next dock
main_gdf['dist_to_dock'] = main_gdf['geometry_vessel'].distance(main_gdf['geometry_dock'])
#create year and month cols for convenience
main_gdf['year'] = main_gdf['time'].dt.year
main_gdf['month'] = main_gdf['time'].dt.strftime('%Y%m')

#inspect
main_gdf.head()

Unnamed: 0,mmsi,time,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo,status_duration,geometry_vessel,dock_id,dock_name,facility_type,port_type,port_name,geometry_port,port_area_desc,port_area_name,port_area_id,geometry_area,geometry_dock,dist_to_dock,year,month
31,205042000,2021-07-24 13:23:47,1.9,119.3,271.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,11.0,80.0,176221.0,POINT (-10020194.646 3357741.339),110V,ENBRIDGE INGLESIDE ENERGY CENTER (EIEC),Dock,C,"Corpus Christi, TX",POINT (-10842283.519 3225388.812),"Per Port of Corpus Chisti legislature, TIGER f...","Corpus Christi, TX",2436,"POLYGON ((-10875869.425 3268476.673, -10875803...",POINT (-10821091.294 3226510.640),811576.8,2021,202107
32,205042000,2021-11-23 22:25:16,0.3,3.5,70.0,1.0,DELOS,80.0,9877767.0,336.0,60.0,11.0,80.0,2221.0,POINT (-10781429.606 3220750.432),110V,ENBRIDGE INGLESIDE ENERGY CENTER (EIEC),Dock,C,"Corpus Christi, TX",POINT (-10842283.519 3225388.812),"Per Port of Corpus Chisti legislature, TIGER f...","Corpus Christi, TX",2436,"POLYGON ((-10875869.425 3268476.673, -10875803...",POINT (-10821091.294 3226510.640),40077.79,2021,202111
33,205042000,2021-11-25 11:26:30,0.6,183.0,148.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,11.0,80.0,219.0,POINT (-10781429.606 3221031.005),110V,ENBRIDGE INGLESIDE ENERGY CENTER (EIEC),Dock,C,"Corpus Christi, TX",POINT (-10842283.519 3225388.812),"Per Port of Corpus Chisti legislature, TIGER f...","Corpus Christi, TX",2436,"POLYGON ((-10875869.425 3268476.673, -10875803...",POINT (-10821091.294 3226510.640),40038.43,2021,202111
34,205042000,2021-11-25 15:05:50,0.0,202.0,202.0,5.0,DELOS,80.0,9877767.0,336.0,60.0,11.0,80.0,3054.0,POINT (-10821107.212 3226306.356),110V,ENBRIDGE INGLESIDE ENERGY CENTER (EIEC),Dock,C,"Corpus Christi, TX",POINT (-10842283.519 3225388.812),"Per Port of Corpus Chisti legislature, TIGER f...","Corpus Christi, TX",2436,"POLYGON ((-10875869.425 3268476.673, -10875803...",POINT (-10821091.294 3226510.640),204.9039,2021,202111
35,205042000,2021-11-27 18:00:41,1.2,127.4,88.0,0.0,DELOS,80.0,9877767.0,336.0,60.0,13.1,80.0,180.0,POINT (-10789425.685 3220135.203),0V0U,ARCO WESTERN PIPELINE CO BERTH T-121,Dock,C,"Port of Long Beach, CA",POINT (-13159021.347 3993886.017),As defined per legislation by the City of Long...,"Port of Long Beach, CA",4110,"POLYGON ((-13158445.210 3999690.643, -13158447...",POINT (-13160065.524 3996249.272),2494451.0,2021,202111


## Get Effective Port Zones based on anchorings

In [10]:
#set quantile
quantile = .8

#create port zones
zones_df = (
    #get anchoring locations from main
    pl.DataFrame(main_gdf[main_gdf.status==1]
                 .drop(['geometry_dock', 'geometry_port', 'geometry_vessel', 
                        'geometry_area',], axis=1))
    .group_by('port_name')
    .agg(pl.col('dist_to_dock').quantile(quantile))
    .sort('port_name')
)


#inspect
zones_df.head()

port_name,dist_to_dock
str,f64
"""Albany Port District, NY""",317841.64996
"""Alpena, MI""",195439.144551
"""Anacortes, WA""",1645600.0
"""Ashtabula Port Authority, OH""",625294.642756
"""Baltimore, MD""",345611.484117
