# 

# Lane Matching between PIERS and Drewery databases

Problem: PIERS BOL data on ports/lanes and Drewery lane categories do not match

Strategy: 
- geocode ports from both databases
- match on haversine distance to associate PIERS -> Drewery (many:1 matching)
- merge drewery lanes and associated price info into main_lf to allow aggregation and analysis

## Prelims and load data

In [1]:
#preliminaries 
import pandas as pd 
import polars as pl
import numpy as np
import geopy
import geopy.distance
from geopy.geocoders import Bing
from geopy.extra.rate_limiter import RateLimiter

#display settings
pd.set_option('display.max_columns', None)

#enable string cache for polars categoricals
pl.enable_string_cache()

#load main lf from PIERS
main_lf = (
    pl.scan_parquet('../data/main/*.parquet')
    #limit to exports
    .filter(pl.col('direction')=='export')
)

#load drewery data
drewery_df = (
    #load CSV
    pl.read_csv('../data/rates/tidy_rates.csv')
    #filter by US ports
    .filter(pl.col('route').str.contains(' US '))
    #drop lanes containing "via" - these are not coast ports
    .filter(~pl.col('route').str.contains(' via '))
)

In [2]:
main_lf.describe()

statistic,teus,date_raw,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,coast_region,hs_code,carrier_name,carrier_scac,vessel_name,voyage_number,vessel_id,direction,bol_id,year,month,lane_id,lane_name,dest_territory,dest_region,unified_carrier_name,unified_carrier_scac,vessel_owner,primary_cargo,shared_teus,us_port,vessel_port_pair,date,alliance,alliance_member,pc_alliance,cargo_source,vessel_capacity,route,rate_20,rate_40
str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,str,str,f64,f64,str,str,str,str,f64,str,str,f64,str,f64,f64
"""count""",63737455.0,"""63737455""","""0""","""0""","""63737455""","""63737455""","""63737455""","""63737455""","""63737173""","""63736118""","""63612885""","""63737455""","""63737455""","""59354969""",63737455.0,"""63737455""","""63737454""",63737455.0,"""63737455""","""63737455""","""63737455""","""63723810""","""63723810""","""63696852""","""63737455""","""63737455""",63737455.0,63737455.0,"""63737455""","""63737455""","""63737380""","""63737455""",63737455.0,"""63737455""","""63737455""",59237960.0,"""63724294""",72.0,72.0
"""null_count""",0.0,"""0""","""63737455""","""63737455""","""0""","""0""","""0""","""0""","""282""","""1337""","""124570""","""0""","""0""","""4382486""",0.0,"""0""","""1""",0.0,"""0""","""0""","""0""","""13645""","""13645""","""40603""","""0""","""0""",0.0,0.0,"""0""","""0""","""75""","""0""",0.0,"""0""","""0""",4499495.0,"""13161""",63737383.0,63737383.0
"""mean""",3.220728,"""2015-12-20 09:20:55.450497""",,,,,,,,,,,,,9231900.0,,,2015.465131,,,,,,,,,0.702033,1.040767,,,"""2015-12-19 23:55:16.226553""",,0.301566,,,2160.482803,,5237.638889,6357.5
"""std""",5.982657,,,,,,,,,,,,,,474506.524554,,,4.741281,,,,,,,,,,3.692469,,,,,,,,1499.188471,,4256.356139,5200.918865
"""min""",0.01,"""2007-01-01 00:00:00""",,,,,,,,"""-1""",,,"""26 AGUSTOS""","""'44S""",196.0,,"""079A_26004878070""",2007.0,"""200701""",,,,,,,,0.0,0.0,,,"""2007-01-01 00:00:00""","""2M Alliance""",0.0,"""2M Alliance""","""ally""",0.0,,1520.0,1880.0
"""25%""",2.0,"""2012-02-28 00:00:00""",,,,,,,,,,,,,9218686.0,,,2012.0,,,,,,,,,,0.0,,,"""2012-02-27 00:00:00""",,,,,905.147059,,1850.0,2210.0
"""50%""",2.533158,"""2016-02-21 00:00:00""",,,,,,,,,,,,,9315202.0,,,2016.0,,,,,,,,,,0.0,,,"""2016-02-20 00:00:00""",,,,,2036.911765,,2620.0,3250.0
"""75%""",2.533158,"""2019-12-14 00:00:00""",,,,,,,,,,,,,9430868.0,,,2019.0,,,,,,,,,,2.0,,,"""2019-12-13 00:00:00""",,,,,3253.161765,,11480.0,14000.0
"""max""",3729.25,"""2023-12-31 00:00:00""",,,,,,,,"""ddedo""",,,"""ZUMA""","""|SAL5""",9979125.0,,"""zzzz_ZZZZ""",2023.0,"""202312""",,,,,,,,1.0,1123.25,,,"""2023-12-31 00:00:00""","""The Alliance""",1.0,"""The Alliance""","""non-ally""",17889.705882,,11480.0,14000.0


In [3]:
drewery_df.describe()

statistic,Unnamed: 1_level_0,route,container_type,date,rate
str,f64,str,str,str,f64
"""count""",20480.0,"""20480""","""20480""","""20480""",11663.0
"""null_count""",0.0,"""0""","""0""","""0""",8817.0
"""mean""",103494.64917,,,,4232.568807
"""std""",62211.323309,,,,3096.444518
"""min""",0.0,"""Australia (Melbourne) to US Ea…","""20ft Dry""","""2014-11""",670.0
"""25%""",49823.0,,,,2250.0
"""50%""",101430.0,,,,3060.0
"""75%""",154525.0,,,,4950.0
"""max""",218877.0,"""West Med (Genoa) to US West Co…","""40ft Dry""","""2023-09""",23490.0


In [188]:
#get unified regions and territories over lane_ids (NOTE this step will live in oca_data_prep)
main_lf = (
    main_lf
    .with_columns(
        pl.col('origin_territory').drop_nulls().mode().first().over('lane_id').alias('origin_territory'),
        pl.col('origin_region').drop_nulls().mode().first().over('lane_id').alias('origin_region'),
        pl.col('dest_territory').drop_nulls().mode().first().over('lane_id').alias('dest_territory'),
        pl.col('dest_region').drop_nulls().mode().first().over('lane_id').alias('dest_region')
    )
)

In [189]:
#get piers_lanes_df (used to merge back into main_lf)
piers_lanes_df = (
    main_lf
    #select columns
    .select('lane_id', 'lane_name', 'origin_territory', 'departure_port_name', 
            'coast_region', 'dest_territory', 'arrival_port_name', 'direction')
    #group by to get modes (NOTE: territory data is uncommonly messy/incorrect; this step avoids gets around that issue)
    .group_by('direction', 'lane_id')
    .agg(
        pl.all().mode().first()
    )
    #construct origin and destination port names for geocoder
    .with_columns(
        pl.when(pl.col('direction')=='import')
        .then(pl.col('origin_territory').cast(pl.Utf8)+' '+pl.col('departure_port_name').cast(pl.Utf8))
        .otherwise('US Port of '+pl.col('departure_port_name').cast(pl.Utf8))
        .alias('piers_origin'),
        pl.when(pl.col('direction')=='import')
        .then('US Port of '+pl.col('arrival_port_name').cast(pl.Utf8))
        .otherwise(pl.col('dest_territory').cast(pl.Utf8)+' '+pl.col('arrival_port_name').cast(pl.Utf8))
        .alias('piers_dest')
    )
    #drop unnessary cols
    .drop('origin_territory', 'departure_port_name', 'coast_region', 'dest_territory', 'arrival_port_name', 'direction')
    #recast to categorical data
    .cast(pl.Categorical)
    #drop duplicates
    .unique()
    #drop nulls
    .drop_nulls()
    #collect to memory
    .collect()
)

#get piers_ports_df
#convert origin ports to series
piers_ports = (
    piers_lanes_df
    .select('piers_origin')
    .rename({'piers_origin':'piers_ports'})
    .drop_nulls()
    .unique()
    .to_series()
)
#append dest ports
piers_ports_df = (
    pl.DataFrame(
        piers_ports.append(
            piers_lanes_df
            .select('piers_dest')
            .drop_nulls()
            .unique()
            .to_series()
        )
    )
    #cast to strings
    .cast(pl.Utf8)
    #convert to pandas
    .to_pandas()
)

In [190]:
#get drewery_lanes_df
drewery_lanes_df = (
    drewery_df
    .select('route')
    .unique()
    #split route col
    .with_columns(
        pl.col('route').str.split_exact(by=' to ', n=1)
        .alias('split')
    )
    #unnest into separate cols
    .unnest('split')
    #rename
    .rename({
        'field_0':'drewery_origin',
        'field_1':'drewery_dest'
    })
    #drop nulls
    .drop_nulls()
)

#get drewery_ports
#convert origin col to series
drewery_ports = (
    drewery_lanes_df
    .select('drewery_origin')
    .rename({'drewery_origin':'drewery_port'})
    .drop_nulls()
    .unique()
    .to_series()
)
#append dest col
drewery_ports_df = (
    pl.DataFrame(
        drewery_ports.append(
            drewery_lanes_df
            .select('drewery_dest')
            .drop_nulls()
            .unique()
            .to_series()
        )
    )
    #drop non-coast ports
    .filter(~pl.col('drewery_port').str.contains(' via '))
    #convert to pandas
    .to_pandas()
)

## Geocode

In [191]:
def geocoder_trg(locations, bing_rest_api_key='Am19ZYf8qoO0j2DJGJDu6oZJkhtyvG9v9-8zJ-RDowSZ8QIKLMbjDIq0w7qAzSv1', 
                 df_export=False):
    '''
    Converts location inputs to geographic coordinates (decimal degrees format, datum WGS-84) using the Bing REST Services geocoder API 
    INPUTS:
        locations - array-like - the address/es or place name/s to be geocoded.
        bing_rest_api_key - an API key issued by Bing Rest Services. Uses Adam Wilson's by default.
        df_export - boolean - default=False - when True, returns a pandas dataframe containing the 'locations' inputs in the first column, 
                    the latitude in the second column, and the longitude in the third column.  
    RETURNS:
        when df_export = False (default), returns a list of (lat, long) tuples corresponding to the 'locations' input list. Uninterpretable
                    inputs are listed as np.NaN.
        when df_export = True, returns a pandas dataframe containing the 'locations' inputs in the first column, 
                    the latitude in the second column, and the longitude in the third column.
    RELIES ON:
        pandas
        numpy
        geopy
        Bing from geopy.geocoders
        RateLimiter from geopy.extras
    '''
    #define geocoder function
    def geocoder_latlong(loc):
        '''returns latitute and longitude of given location if interpretable by Bing, else NaN'''
        #instantiate Bing client
        geocoder_bing = Bing(bing_rest_api_key)
        #rate limit
        geocoder_bing = RateLimiter(geocoder_bing.geocode, min_delay_seconds=0.5)
        #geocode location
        geoloc = geocoder_bing(loc)
        #return latitude and longitude results 
        if type(geoloc) == geopy.Location:
            return geoloc.latitude, geoloc.longitude
        else:
            return np.NaN, np.NaN
    #coerse locations input to pd.Series
    locations = pd.Series(locations)
    #init df
    df = pd.DataFrame({'locations': locations})
    #apply geocoder to each location 
    df[['lat', 'long']] = df.apply(lambda row: geocoder_latlong(row), axis=1, result_type='expand')
    #create coordinate list
    coord_list = [coord if ~np.isnan(coord[0]) else np.NaN for coord in list(zip(df.lat, df.long))]
    #return results 
    if df_export:
        return df
    elif len(df)==1:
        return coord_list[0]
    else:
        return coord_list

In [192]:
#%%script echo skipping #api calls are limited; only execute when necessary

#geocode drewery ports
drewery_ports_df['drewery_port_loc'] = (
    drewery_ports_df.drewery_port
    .apply(lambda r: geocoder_trg(r))
    .dropna()
)

#geocode piers ports
piers_ports_df['piers_port_loc'] = (
    piers_ports_df.piers_ports
    .apply(lambda r: geocoder_trg(r))
    .dropna()
)

#save geolocations
drewery_ports_df.to_parquet('../data/misc/drewery_port_geolocations.parquet')
piers_ports_df.to_parquet('../data/misc/piers_port_geolocations.parquet')

In [193]:
%%script echo skipping

#load previously geolocated data (with backflips because why does anyone use csvs when parquet exists...)
#NOTE parquet files now available
drewery_ports_df = (
    pl.read_csv('../data/misc/drewery_port_geolocations.csv')
    .with_columns(
        pl.col('drewery_port_loc').str.strip_chars('()').str.split(', ')
    )
    .with_columns(
        pl.col('drewery_port_loc').list.get(0).cast(pl.Float64).alias('lat'),
        pl.col('drewery_port_loc').list.get(1).cast(pl.Float64).alias('long')
    )
    .drop_nulls()
    .with_columns(
        drewery_port_loc=pl.concat_list("lat", "long")
    )
    .select('drewery_port', 'drewery_port_loc')
    .to_pandas()
)

piers_ports_df = (
    pl.read_csv('../data/misc/piers_port_geolocations.csv')
    #correct us port naming convention
    .with_columns(
        pl.when(pl.col('piers_ports').str.contains('US Port of'))
        .then(pl.col('piers_ports'))
        .otherwise(
            pl.when(pl.col('piers_ports').str.contains('Port of'))
            .then(('US '+pl.col('piers_ports')))
            .otherwise(pl.col('piers_ports'))
        )
        .alias('piers_ports')
    )
    #correct str coords to list of floats
    .with_columns(
        pl.col('piers_port_loc').str.strip_chars('()').str.split(', ')
    )
    .with_columns(
        pl.col('piers_port_loc').list.get(0).cast(pl.Float64).alias('lat'),
        pl.col('piers_port_loc').list.get(1).cast(pl.Float64).alias('long')
    )
    .drop_nulls()
    .with_columns(
        piers_port_loc=pl.concat_list("lat", "long")
    )
    .select('piers_ports', 'piers_port_loc')
    .to_pandas()
)

skipping


## Match on Haversine Distance

In [194]:
def find_drewery_match(piers_port_loc, df=drewery_ports_df):
    '''
    Finds nearest port to the given piers port from the Drewery locations using haversine distance
    This is a many:1 match of piers:drewery
    INPUT:
        piers_port - lat/long tuple of the port to be matched
        drewery_ports_df - a dataframe with port names in the first col and lat/long tuples in the second col
    OUTPUT:
        drewery_port - the name of the nearest drewery port  
    '''
    #get distance from piers port to all drewery ports
    df['dist'] = (
        df.drewery_port_loc.apply(lambda r: geopy.distance.great_circle(r, tuple(piers_port_loc)).km)
    )
    #return name of nearest drewery port
    return df.sort_values(by='dist').iloc[0][0]


In [195]:

def find_one_to_one_match(drewery_port_loc, df=piers_ports_df):
    '''
    Finds nearest port to the given drewery port from the piers locations using haversine distance
    This is the 1:1 match of piers:drewery
    INPUT:
        piers_port - lat/long tuple of the port to be matched
        drewery_ports_df - a dataframe with port names in the first col and lat/long tuples in the second col
    OUTPUT:
        drewery_port - the name of the nearest drewery port  
    '''
    #get distance from piers port to all drewery ports
    df['dist'] = (
        df.piers_port_loc.apply(lambda r: geopy.distance.great_circle(r, drewery_port_loc).km)
    )
    #return name of nearest drewery port
    return df.sort_values(by='dist').iloc[0][0]

In [216]:
#drop missing values
piers_ports_df.dropna(inplace=True)
drewery_ports_df.dropna(inplace=True)

In [217]:
#get nearest drewery port for each piers port
piers_ports_df['drewery_match'] = (
    piers_ports_df.piers_port_loc.apply(lambda r: find_drewery_match(r))
)

  return df.sort_values(by='dist').iloc[0][0]


In [218]:
#get nearest piers port for each drewery port
drewery_ports_df['piers_match'] = (
    drewery_ports_df.drewery_port_loc.apply(lambda r: find_one_to_one_match(r))
)

  return df.sort_values(by='dist').iloc[0][0]


In [219]:
drewery_ports_df.head()

Unnamed: 0,drewery_port,drewery_port_loc,dist,piers_match
0,Korea (Busan),"(35.09700012, 129.00810242)",9092.116793,REPUBLIC OF KOREA BUSAN
1,Chile (San Antonio),"(-33.58086014, -71.61323547)",9505.224251,CHILE SAN ANTONIO
2,South Africa (Durban),"(-29.88188934, 30.98084259)",17365.630839,REP. OF SOUTH AFRICA DURBAN
3,Colombia (Cartagena),"(10.39999771, -75.5)",5535.846308,COLOMBIA CARTAGENA
4,UK (Felixstowe),"(51.96366119, 1.35173857)",8600.495711,UNITED KINGDOM FELIXSTOWE


## Merge back to main lf

In [220]:
#merge on piers origin
matched_df = (
    pd.merge(piers_lanes_df.to_pandas(), piers_ports_df, how='left', left_on='piers_origin', right_on='piers_ports')
    .rename(columns={'drewery_match':'drewery_origin'})
    .drop('piers_ports', axis=1)
)

#merge on piers dest
matched_df = (
    pd.merge(matched_df, piers_ports_df[['piers_ports', 'drewery_match']], how='left', left_on='piers_dest', right_on='piers_ports')
    .rename(columns={'drewery_match':'drewery_dest'})
    .drop('piers_ports', axis=1)
)

#re-create drewery route name
matched_df['route'] = matched_df.drewery_origin + ' to ' + matched_df.drewery_dest

#drop unneeded cols
matched_df = matched_df[['lane_id', 'lane_name', 'route']]

#drop duplicates
matched_df.drop_duplicates(inplace=True)


In [221]:
#merge piers match to drewery lanes
df = (
    pd.merge(
        drewery_lanes_df.to_pandas(),
        drewery_ports_df[['drewery_port', 'piers_match']],
        how='left',
        left_on='drewery_origin', right_on='drewery_port' 
    )
    .rename(columns={'piers_match':'piers_origin'})
    .drop('drewery_port', axis=1)
)
df = (
    pd.merge(
        df,
        drewery_ports_df[['drewery_port', 'piers_match']],
        how='left',
        left_on='drewery_dest', right_on='drewery_port' 
    )
    .rename(columns={'piers_match':'piers_dest'})
    .drop('drewery_port', axis=1)
)
#merge piers_lanes into df
df = (
    pd.merge(
        df,
        piers_lanes_df.select('lane_id', 'piers_origin', 'piers_dest').to_pandas(),
        how='left',
        on=['piers_origin', 'piers_dest']
    )
)

In [223]:
df.describe()

Unnamed: 0,route,drewery_origin,drewery_dest,piers_origin,piers_dest,lane_id
count,90,90,90,90,90,62
unique,89,36,5,36,5,62
top,Colombia (Cartagena) to US East Coast (New York),Central China (Shanghai),US East Coast (New York),CANADA SHANGHAI,US Port of NEW YORK,52051_5301
freq,2,4,35,4,35,1


In [233]:
#name and restrict cols
new_matched_df = (
    df[['route', 'lane_id']]
)
#inspect
new_matched_df.head()

Unnamed: 0,route,lane_id
0,Central China (Shanghai) to US Mid West Coast ...,
1,New Zealand (Auckland) to US East Coast (New Y...,
2,South Africa (Durban) to US Gulf Coast (Houston),79113_5301
3,India (Nhava Sheva) to US East Coast (New York),53300_4601
4,Mexico (Manzanillo) to US West Coast (Los Ange...,


In [234]:
#save lane matches
matched_df.to_csv('../data/rates/lane_matching.csv')
new_matched_df.to_csv('../data/rates/new_lane_matching.csv')

In [None]:
#prep drewery df for merge
df = (
    #convert to polars because I apparently live here now
    pl.DataFrame(drewery_df)
    #choose cols
    .select('route', 'container_type', 'date', 'rate')
    #drop duplicates on relevant cols
    .unique(subset=['route', 'container_type', 'date'])
    #pivot container type
    .pivot('container_type', values='rate')
    #rename
    .rename({
        '40ft Dry':'rate_40',
        '20ft Dry':'rate_20'
    })
    #convert date to dt
    .with_columns(
        pl.col('date').str.to_date(format='%Y-%m')
    )
)

In [None]:
df.sort(by='route').head(20)