# 

# Lane Matching between PIERS and Drewery databases

Problem: PIERS BOL data on ports/lanes and Drewery lane categories do not match

Strategy: 
- geocode ports from both databases
- match on haversine distance to associate PIERS -> Drewery (many:1 matching)
- merge drewery lanes and associated price info into main_lf to allow aggregation and analysis

## Prelims and load data

In [3]:
#preliminaries 
import pandas as pd 
import polars as pl
import numpy as np
import geopy
import geopy.distance
from geopy.geocoders import Bing
from geopy.extra.rate_limiter import RateLimiter

#display settings
pd.set_option('display.max_columns', None)

#enable string cache for polars categoricals
pl.enable_string_cache()

#load exports from PIERS
exports_lf = (
    pl.scan_parquet('../data/main/*.parquet')
    #limit to exports
    .filter(pl.col('direction')=='export')
)

#load drewery data
drewery_df = (
    #load CSV
    pl.read_csv('../data/rates/tidy_rates.csv')
    #filter by US ports
    .filter(pl.col('route').str.contains('US '))
    #drop lanes containing "via" - these are not coast ports
    .filter(~pl.col('route').str.contains(' via '))
)

#get unified ports, regions and territories over lane_ids (NOTE this step may live in oca_data_prep)
exports_lf = (
    exports_lf
    .with_columns(
        pl.col('origin_territory').drop_nulls().mode().first().over('lane_id').alias('origin_territory'),
        pl.col('origin_region').drop_nulls().mode().first().over('lane_id').alias('origin_region'),
        pl.col('dest_territory').drop_nulls().mode().first().over('lane_id').alias('dest_territory'),
        pl.col('dest_region').drop_nulls().mode().first().over('lane_id').alias('dest_region'),
        pl.col('arrival_port_name').drop_nulls().mode().first().over('arrival_port_code').alias('arrival_port_name'),
        pl.col('departure_port_name').drop_nulls().mode().first().over('departure_port_code').alias('departure_port_name'),
        pl.col('us_port').drop_nulls().mode().first().over('departure_port_code').alias('us_port')
    )
)

In [4]:
#get piers_lanes_df (used to merge back into main_lf)
piers_lanes_df = (
    exports_lf
    #select columns
    .select('lane_id', 'lane_name', 'origin_territory', 'departure_port_name', 
            'coast_region', 'dest_territory', 'arrival_port_name', 'direction')
    #group by to get modes (NOTE: territory data is uncommonly messy/incorrect; this step avoids gets around that issue)
    .group_by('direction', 'lane_id')
    .agg(
        pl.all().mode().first()
    )
    #construct origin and destination port names for geocoder
    .with_columns(
        pl.when(pl.col('direction')=='import')
        .then(pl.col('origin_territory').cast(pl.Utf8)+' '+pl.col('departure_port_name').cast(pl.Utf8))
        .otherwise('US Port of '+pl.col('departure_port_name').cast(pl.Utf8))
        .alias('piers_origin'),
        pl.when(pl.col('direction')=='import')
        .then('US Port of '+pl.col('arrival_port_name').cast(pl.Utf8))
        .otherwise(pl.col('dest_territory').cast(pl.Utf8)+' '+pl.col('arrival_port_name').cast(pl.Utf8))
        .alias('piers_dest')
    )
    #drop unnessary cols
    .drop('origin_territory', 'departure_port_name', 'coast_region', 'dest_territory', 'arrival_port_name', 'direction')
    #recast to categorical data
    .cast(pl.Categorical)
    #drop duplicates
    .unique()
    #drop nulls
    .drop_nulls()
    #collect to memory
    .collect()
)

#get piers_ports_df
#convert origin ports to series
piers_ports = (
    piers_lanes_df
    .select('piers_origin')
    .rename({'piers_origin':'piers_ports'})
    .drop_nulls()
    .unique()
    .to_series()
)
#append dest ports
piers_ports_df = (
    pl.DataFrame(
        piers_ports.append(
            piers_lanes_df
            .select('piers_dest')
            .drop_nulls()
            .unique()
            .to_series()
        )
    )
    #cast to strings
    .cast(pl.Utf8)
    #convert to pandas
    .to_pandas()
)

In [5]:
piers_lanes_df.head()

lane_id,lane_name,piers_origin,piers_dest
cat,cat,cat,cat
"""5310_79100""","""Galveston — Pt Ngqura""","""US Port of GALVESTON""","""REP. OF SOUTH AFRICA JOHANNESB…"
"""5204_28319""","""W Palm Bch — Gustavia""","""US Port of W PALM BCH""","""GUADELOUPE GUSTAVIA"""
"""5301_45512""","""Houston — Swinoujscie""","""US Port of HOUSTON""","""POLAND SWINOUJSCIE"""
"""2205_58030""","""Pnt Comfort — Daesan""","""US Port of PNT COMFORT""","""REPUBLIC OF KOREA DAESAN"""
"""4601_33741""","""New York — Iquique""","""US Port of NEW YORK""","""CHILE IQUIQUE"""


In [6]:
#get drewery_lanes_df
drewery_lanes_df = (
    drewery_df
    .select('route')
    .unique()
    #split route col
    .with_columns(
        pl.col('route').str.split_exact(by=' to ', n=1)
        .alias('split')
    )
    #unnest into separate cols
    .unnest('split')
    #rename
    .rename({
        'field_0':'drewery_origin',
        'field_1':'drewery_dest'
    })
    #drop nulls
    .drop_nulls()
)

#get drewery_ports
#convert origin col to series
drewery_ports = (
    drewery_lanes_df
    .select('drewery_origin')
    .rename({'drewery_origin':'drewery_port'})
    .drop_nulls()
    .unique()
    .to_series()
)
#append dest col
drewery_ports_df = (
    pl.DataFrame(
        drewery_ports.append(
            drewery_lanes_df
            .select('drewery_dest')
            .drop_nulls()
            .unique()
            .to_series()
        )
    )
    #drop non-coast ports
    .filter(~pl.col('drewery_port').str.contains(' via '))
    #convert to pandas
    .to_pandas()
)

## Geocode

In [7]:
def geocoder_trg(locations, bing_rest_api_key='Am19ZYf8qoO0j2DJGJDu6oZJkhtyvG9v9-8zJ-RDowSZ8QIKLMbjDIq0w7qAzSv1', 
                 df_export=False):
    '''
    Converts location inputs to geographic coordinates (decimal degrees format, datum WGS-84) using the Bing REST Services geocoder API 
    INPUTS:
        locations - array-like - the address/es or place name/s to be geocoded.
        bing_rest_api_key - an API key issued by Bing Rest Services. Uses Adam Wilson's by default.
        df_export - boolean - default=False - when True, returns a pandas dataframe containing the 'locations' inputs in the first column, 
                    the latitude in the second column, and the longitude in the third column.  
    RETURNS:
        when df_export = False (default), returns a list of (lat, long) tuples corresponding to the 'locations' input list. Uninterpretable
                    inputs are listed as np.NaN.
        when df_export = True, returns a pandas dataframe containing the 'locations' inputs in the first column, 
                    the latitude in the second column, and the longitude in the third column.
    RELIES ON:
        pandas
        numpy
        geopy
        Bing from geopy.geocoders
        RateLimiter from geopy.extras
    '''
    #define geocoder function
    def geocoder_latlong(loc):
        '''returns latitute and longitude of given location if interpretable by Bing, else NaN'''
        #instantiate Bing client
        geocoder_bing = Bing(bing_rest_api_key)
        #rate limit
        geocoder_bing = RateLimiter(geocoder_bing.geocode, min_delay_seconds=0.5)
        #geocode location
        geoloc = geocoder_bing(loc)
        #return latitude and longitude results 
        if type(geoloc) == geopy.Location:
            return geoloc.latitude, geoloc.longitude
        else:
            return np.NaN, np.NaN
    #coerse locations input to pd.Series
    locations = pd.Series(locations)
    #init df
    df = pd.DataFrame({'locations': locations})
    #apply geocoder to each location 
    df[['lat', 'long']] = df.apply(lambda row: geocoder_latlong(row), axis=1, result_type='expand')
    #create coordinate list
    coord_list = [coord if ~np.isnan(coord[0]) else np.NaN for coord in list(zip(df.lat, df.long))]
    #return results 
    if df_export:
        return df
    elif len(df)==1:
        return coord_list[0]
    else:
        return coord_list

In [8]:
%%script echo skipping #api calls are limited; only execute when necessary

#geocode drewery ports
drewery_ports_df['drewery_port_loc'] = (
    drewery_ports_df.drewery_port
    .apply(lambda r: geocoder_trg(r))
    .dropna()
)

#geocode piers ports
piers_ports_df['piers_port_loc'] = (
    piers_ports_df.piers_ports
    .apply(lambda r: geocoder_trg(r))
    .dropna()
)

#save geolocations
drewery_ports_df.to_parquet('../data/misc/drewery_port_geolocations.parquet')
piers_ports_df.to_parquet('../data/misc/piers_port_geolocations.parquet')

skipping #api calls are limited; only execute when necessary


## Match on Haversine Distance

In [9]:
#%%script echo skipping
#load previously geocoded data
drewery_ports_df = pl.read_parquet('../data/misc/drewery_port_geolocations.parquet').to_pandas()
piers_ports_df = pl.read_parquet('../data/misc/piers_port_geolocations.parquet').to_pandas()

In [10]:
#merge distances back to lanes

#merge drewery origin locs
drewery_loc_lanes_df = (
    drewery_lanes_df
    .join(
        pl.DataFrame(drewery_ports_df),
        left_on='drewery_origin',
        right_on='drewery_port',
    )
    .rename({'drewery_port_loc':'drewery_origin_loc'})
)
#merge drewery dest locs
drewery_loc_lanes_df = (
    drewery_loc_lanes_df
    .join(
        pl.DataFrame(drewery_ports_df),
        left_on='drewery_dest',
        right_on='drewery_port',
    )
    .rename({'drewery_port_loc':'drewery_dest_loc'})
    .unique()
)

#merge drewery origin locs
piers_loc_lanes_df = (
    piers_lanes_df
    .join(
        pl.DataFrame(piers_ports_df).cast({'piers_ports':pl.Categorical}),
        left_on='piers_origin',
        right_on='piers_ports',
    )
    .rename({'piers_port_loc':'piers_origin_loc'})
)
#merge drewery dest locs
piers_loc_lanes_df = (
    piers_loc_lanes_df
    .join(
        pl.DataFrame(piers_ports_df).cast({'piers_ports':pl.Categorical}),
        left_on='piers_dest',
        right_on='piers_ports',
    )
    .rename({'piers_port_loc':'piers_dest_loc'})
    .unique()
)

#inspect
display(drewery_loc_lanes_df.head())
piers_loc_lanes_df.head()

route,drewery_origin,drewery_dest,drewery_origin_loc,drewery_dest_loc
str,str,str,list[f64],list[f64]
"""Malaysia (Tanjung Pelepas) to …","""Malaysia (Tanjung Pelepas)""","""US West Coast (Los Angeles)""","[3.6024549, 114.704468]","[34.052238, -118.243347]"
"""Malaysia (Tanjung Pelepas) to …","""Malaysia (Tanjung Pelepas)""","""US East Coast (New York)""","[3.6024549, 114.704468]","[40.713047, -74.007233]"
"""US East Coast (New York) to No…","""US East Coast (New York)""","""North Continent Europe (Rotter…","[40.713047, -74.007233]","[54.261223, 17.669846]"
"""US West Coast (Los Angeles) to…","""US West Coast (Los Angeles)""","""Saudi Arabia (Jeddah)""","[34.052238, -118.243347]","[21.487305, 39.181335]"
"""US East Coast (New York) to Vi…","""US East Coast (New York)""","""Vietnam (Ho Chi Minh)""","[40.713047, -74.007233]","[10.77653, 106.700974]"


lane_id,lane_name,piers_origin,piers_dest,piers_origin_loc,piers_dest_loc
cat,cat,cat,cat,list[f64],list[f64]
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]"
"""1803_40301""","""Jacksonville — Aalesund""","""US Port of JACKSONVILLE""","""NORWAY AALESUND""","[30.325968, -81.656761]","[59.91333, 10.73897]"
"""0401_47598""","""Boston — Arzignano""","""US Port of BOSTON""","""ITALY ARZIGNANO""","[39.503571, -99.018341]","[45.519333, 11.339563]"
"""5201_35120""","""Miami — Pecem""","""US Port of MIAMI""","""BRAZIL PECEM""","[25.772247, -80.165138]","[-3.548698, -38.827728]"
"""2010_48471""","""S Louisiana — Thessaloniki""","""US Port of S LOUISIANA""","""GREECE THESSALONIKI""","[30.981009, -91.891823]","[40.640316, 22.935272]"


In [37]:
matched_df = (
    #cross join piers and drewery tables
    piers_loc_lanes_df.join(drewery_loc_lanes_df, how='cross')
    #convert to pandas
    .to_pandas()
)


In [68]:
pd.Series(df[['piers_origin_loc', 'drewery_origin_loc']].sample(1).iloc[0])

piers_origin_loc      [39.50357056, -99.01834106]
drewery_origin_loc     [54.26122284, 17.66984558]
Name: 364236, dtype: object

In [76]:
def haversine(row, col1, col2):
    return geopy.distance.great_circle(row[col1], row[col2]).km

df = matched_df
df['origin_dist'] = (
    df.apply(lambda r: haversine(row=r, col1='piers_origin_loc', col2='drewery_origin_loc'), axis=1)
)
df['dest_dist'] = (
    df.apply(lambda r: haversine(row=r, col1='piers_dest_loc', col2='drewery_dest_loc'), axis=1)
)
df['dist'] = df.origin_dist + df.dest_dist
df.head()

Unnamed: 0,lane_id,lane_name,piers_origin,piers_dest,piers_origin_loc,piers_dest_loc,route,drewery_origin,drewery_dest,drewery_origin_loc,drewery_dest_loc,origin_dist,dest_dist,dist
0,3010_57043,Anacortes — Tianjin,US Port of ANACORTES,CHINA (MAINLAND) TIANJIN,"[39.50357056, -99.01834106]","[36.55937195, 103.7533493]",Malaysia (Tanjung Pelepas) to US West Coast (L...,Malaysia (Tanjung Pelepas),US West Coast (Los Angeles),"[3.6024549, 114.70446777]","[34.05223846, -118.24334717]",14111.409741,11038.141163,25149.550904
1,3010_57043,Anacortes — Tianjin,US Port of ANACORTES,CHINA (MAINLAND) TIANJIN,"[39.50357056, -99.01834106]","[36.55937195, 103.7533493]",Malaysia (Tanjung Pelepas) to US East Coast (N...,Malaysia (Tanjung Pelepas),US East Coast (New York),"[3.6024549, 114.70446777]","[40.71304703, -74.00723267]",14111.409741,11419.765037,25531.174777
2,3010_57043,Anacortes — Tianjin,US Port of ANACORTES,CHINA (MAINLAND) TIANJIN,"[39.50357056, -99.01834106]","[36.55937195, 103.7533493]",US East Coast (New York) to North Continent Eu...,US East Coast (New York),North Continent Europe (Rotterdam),"[40.71304703, -74.00723267]","[54.26122284, 17.66984558]",2124.12922,6556.839727,8680.968946
3,3010_57043,Anacortes — Tianjin,US Port of ANACORTES,CHINA (MAINLAND) TIANJIN,"[39.50357056, -99.01834106]","[36.55937195, 103.7533493]",US West Coast (Los Angeles) to Saudi Arabia (J...,US West Coast (Los Angeles),Saudi Arabia (Jeddah),"[34.05223846, -118.24334717]","[21.48730469, 39.18133545]",1812.00053,6380.051366,8192.051895
4,3010_57043,Anacortes — Tianjin,US Port of ANACORTES,CHINA (MAINLAND) TIANJIN,"[39.50357056, -99.01834106]","[36.55937195, 103.7533493]",US East Coast (New York) to Vietnam (Ho Chi Minh),US East Coast (New York),Vietnam (Ho Chi Minh),"[40.71304703, -74.00723267]","[10.77653027, 106.70097351]",2124.12922,2882.178891,5006.308111


In [80]:
matched_df = (
    pl.DataFrame(df)
    .sort(by='dist')
    .group_by('lane_id')
    .agg(
        pl.col('route').first(),
        pl.col('dist').min()
    )
)

In [81]:
matched_df.head()

lane_id,route,dist
cat,str,f64
"""1801_21527""","""US Gulf Coast (Houston) to Mex…",2435.338413
"""5306_57024""","""US Gulf Coast (Houston) to Nor…",2361.464441
"""5201_56051""","""US Gulf Coast (Houston) to Sin…",2740.670867
"""4601_71401""","""US East Coast (New York) to We…",1868.220539
"""1601_47076""","""US East Coast (New York) to We…",2438.912516


In [83]:
matched_df.write_csv('../data/misc/matched_lanes_dist.csv')

In [21]:
matched_df = (
    #cross join piers and drewery tables
    piers_loc_lanes_df.join(drewery_loc_lanes_df, how='cross')
    #split coordinates
    .with_columns(
        piers_origin_lat = pl.col('piers_origin_loc').list[0],
        piers_origin_long = pl.col('piers_origin_loc').list[1],
        piers_dest_lat = pl.col('piers_dest_loc').list[0],
        piers_dest_long = pl.col('piers_dest_loc').list[1],
        drewery_origin_lat = pl.col('drewery_origin_loc').list[0],
        drewery_origin_long = pl.col('drewery_origin_loc').list[1],
        drewery_dest_lat = pl.col('drewery_dest_loc').list[0],
        drewery_dest_long = pl.col('drewery_dest_loc').list[1]
    )
)
matched_df.head()

lane_id,lane_name,piers_origin,piers_dest,piers_origin_loc,piers_dest_loc,route,drewery_origin,drewery_dest,drewery_origin_loc,drewery_dest_loc,piers_origin_lat,piers_origin_long,piers_dest_lat,piers_dest_long,drewery_origin_lat,drewery_origin_long,drewery_dest_lat,drewery_dest_long
cat,cat,cat,cat,list[f64],list[f64],str,str,str,list[f64],list[f64],f64,f64,f64,f64,f64,f64,f64,f64
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]","""Malaysia (Tanjung Pelepas) to …","""Malaysia (Tanjung Pelepas)""","""US West Coast (Los Angeles)""","[3.6024549, 114.704468]","[34.052238, -118.243347]",39.503571,-99.018341,36.559372,103.753349,3.6024549,114.704468,34.052238,-118.243347
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]","""Malaysia (Tanjung Pelepas) to …","""Malaysia (Tanjung Pelepas)""","""US East Coast (New York)""","[3.6024549, 114.704468]","[40.713047, -74.007233]",39.503571,-99.018341,36.559372,103.753349,3.6024549,114.704468,40.713047,-74.007233
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]","""US East Coast (New York) to No…","""US East Coast (New York)""","""North Continent Europe (Rotter…","[40.713047, -74.007233]","[54.261223, 17.669846]",39.503571,-99.018341,36.559372,103.753349,40.713047,-74.007233,54.261223,17.669846
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]","""US West Coast (Los Angeles) to…","""US West Coast (Los Angeles)""","""Saudi Arabia (Jeddah)""","[34.052238, -118.243347]","[21.487305, 39.181335]",39.503571,-99.018341,36.559372,103.753349,34.052238,-118.243347,21.487305,39.181335
"""3010_57043""","""Anacortes — Tianjin""","""US Port of ANACORTES""","""CHINA (MAINLAND) TIANJIN""","[39.503571, -99.018341]","[36.559372, 103.753349]","""US East Coast (New York) to Vi…","""US East Coast (New York)""","""Vietnam (Ho Chi Minh)""","[40.713047, -74.007233]","[10.77653, 106.700974]",39.503571,-99.018341,36.559372,103.753349,40.713047,-74.007233,10.77653,106.700974


In [25]:
df = (
    matched_df
    .with_columns(
        origin_dist = 2*6378.1*(
            ((((((pl.col('piers_origin_lat')-pl.col('drewery_origin_lat'))/2).sin())**2)+
                                  pl.col('drewery_origin_lat').cos()*pl.col('piers_origin_lat')*
                                  ((((pl.col('piers_origin_long')-pl.col('drewery_origin_long'))/2).sin())**2))
                                  .sqrt())
                                  .arcsin()),
        dest_dist = 2*6378.1*(
            ((((((pl.col('piers_dest_lat')-pl.col('drewery_dest_lat'))/2).sin())**2)+
                                  pl.col('drewery_dest_lat').cos()*pl.col('piers_dest_lat')*
                                  ((((pl.col('piers_dest_long')-pl.col('drewery_dest_long'))/2).sin())**2))
                                  .sqrt())
                                  .arcsin())                    
    )
)

In [29]:
df.fill_nan(None).describe()

statistic,lane_id,lane_name,piers_origin,piers_dest,piers_origin_loc,piers_dest_loc,route,drewery_origin,drewery_dest,drewery_origin_loc,drewery_dest_loc,piers_origin_lat,piers_origin_long,piers_dest_lat,piers_dest_long,drewery_origin_lat,drewery_origin_long,drewery_dest_lat,drewery_dest_long,origin_dist,dest_dist
str,str,str,str,str,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""4151955""","""4151955""","""4151955""","""4151955""",4151955.0,4151955.0,"""4151955""","""4151955""","""4151955""",4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,4151955.0,590144.0,674503.0
"""null_count""","""0""","""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3561811.0,3477452.0
"""mean""",,,,,,,,,,,,35.506514,-95.567675,21.558734,17.817179,26.708728,-16.032856,24.790191,-16.667226,8176.986391,9660.889291
"""std""",,,,,,,,,,,,6.521182,18.219039,25.509602,76.968993,20.629243,94.335958,22.124095,92.169137,4664.585752,4731.577552
"""min""",,,,,,,"""Australia (Melbourne) to US Ea…","""Australia (Melbourne)""","""Argentina (Buenos Aires)""",,,18.466303,-157.858154,-54.806934,-175.201813,-37.81546,-118.243347,-37.81546,-118.243347,0.0,0.0
"""25%""",,,,,,,,,,,,30.185211,-99.018341,8.69812,-60.962078,22.421183,-95.369507,19.053801,-95.369507,5564.232734,5833.774526
"""50%""",,,,,,,,,,,,37.804829,-95.358505,24.871939,10.73897,31.797731,-74.007233,31.082682,-74.007233,5564.232734,9774.441768
"""75%""",,,,,,,,,,,,39.503571,-80.165138,40.43121,97.745308,40.713047,83.272919,40.713047,72.964783,10685.095862,13281.082654
"""max""",,,,,,,"""West Med (Genoa) to US West Co…","""West Med (Genoa)""","""West Med (Genoa)""",,,57.149849,-2.09376,71.970825,179.198257,59.938732,174.709915,59.938732,174.709915,19406.060867,20007.980122


In [None]:
def find_drewery_match(piers_lane_id, df=drewery_loc_lanes_df, df_piers=piers_loc_lanes_df):
    '''
    Finds nearest port to the given piers port from the Drewery locations using haversine distance
    This is a many:1 match of piers:drewery
    '''
    #inspect inputs
    print(piers_lane_id)
    display(df.head())
    display(df_piers.head())
    print(df_piers[df_piers.lane_id == piers_lane_id].piers_origin_loc.iloc[0])
    
    #get distance from piers origin to all drewery origins
    df['origin_dist'] = (
        df.drewery_origin_loc.apply(
            lambda r: geopy.distance.great_circle(r, df_piers.loc[df_piers.lane_id.str.contains(piers_lane_id)].piers_lane_id.iloc[0]).km
            )
    )
    #get distance from piers dest to all drewery dest
    df['dest_dist'] = (
        df.drewery_dest_loc.apply(
            lambda r: geopy.distance.great_circle(r, piers_loc_lanes_df.loc[piers_loc_lanes_df.lane_id.str == piers_lane_id, 'piers_dest_loc'].iloc[0]).km
            )
    )
    #get sum of distances
    df['dist'] = df.origin_dist + df.dest_dist
    #return name and distance of drewery lane 
    return df.sort_values(by='dist').route.iloc[0], df.sort_values(by='dist').dist.iloc[0]


In [None]:
#get nearest drewery port for each piers port
piers_loc_lanes_df['drewery_lane_match'], piers_loc_lanes_df['drewery_lane_match_dist'] = (
    piers_loc_lanes_df.lane_id.apply(lambda r: find_drewery_match(r))
)

## Merge back to main lf

In [None]:
#merge on piers origin
matched_df = (
    pd.merge(piers_lanes_df.to_pandas(), piers_ports_df, how='left', left_on='piers_origin', right_on='piers_ports')
    .rename(columns={'drewery_match':'drewery_origin'})
    .drop('piers_ports', axis=1)
)

#merge on piers dest
matched_df = (
    pd.merge(matched_df, piers_ports_df[['piers_ports', 'drewery_match']], how='left', left_on='piers_dest', right_on='piers_ports')
    .rename(columns={'drewery_match':'drewery_dest'})
    .drop('piers_ports', axis=1)
)

#re-create drewery route name
matched_df['route'] = matched_df.drewery_origin + ' to ' + matched_df.drewery_dest

#drop unneeded cols
matched_df = matched_df[['lane_id', 'lane_name', 'route']]

#drop duplicates
matched_df.drop_duplicates(inplace=True)


In [None]:
#merge piers match to drewery lanes
df = (
    pd.merge(
        drewery_lanes_df.to_pandas(),
        drewery_ports_df[['drewery_port', 'piers_match']],
        how='left',
        left_on='drewery_origin', right_on='drewery_port' 
    )
    .rename(columns={'piers_match':'piers_origin'})
    .drop('drewery_port', axis=1)
)
display(df.head())
df = (
    pd.merge(
        df,
        drewery_ports_df[['drewery_port', 'piers_match']],
        how='left',
        left_on='drewery_dest', right_on='drewery_port' 
    )
    .rename(columns={'piers_match':'piers_dest'})
    .drop('drewery_port', axis=1)
)
display(df.head())
#merge piers_lanes into df
df = (
    pd.merge(
        df,
        piers_lanes_df.select('lane_id', 'piers_origin', 'piers_dest').to_pandas(),
        how='left',
        on=['piers_origin', 'piers_dest']
    )
)

In [None]:
df[df.piers_origin.str.contains('US ')]

In [None]:
drewery_lanes_df.head()

In [None]:
piers_lanes_df.head()

In [None]:
piers_lanes_df.filter(pl.col('piers_dest').cast(pl.Utf8).str.contains('HONG KONG')).filter(pl.col('piers_origin').cast(pl.Utf8).str.contains('NEW YORK'))

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
#name and restrict cols
new_matched_df = (
    df[['route', 'lane_id']]
)
#inspect
new_matched_df.head()

In [None]:
#save lane matches
matched_df.to_csv('../data/rates/lane_matching.csv')
new_matched_df.to_csv('../data/rates/new_lane_matching.csv')

In [None]:
#prep drewery df for merge
df = (
    #convert to polars because I apparently live here now
    pl.DataFrame(drewery_df)
    #choose cols
    .select('route', 'container_type', 'date', 'rate')
    #drop duplicates on relevant cols
    .unique(subset=['route', 'container_type', 'date'])
    #pivot container type
    .pivot('container_type', values='rate')
    #rename
    .rename({
        '40ft Dry':'rate_40',
        '20ft Dry':'rate_20'
    })
    #convert date to dt
    .with_columns(
        pl.col('date').str.to_date(format='%Y-%m')
    )
)

In [None]:
df.sort(by='route').head(20)