# Data Preparation for Ocean Carrier Alliances Project

This notebook loads, cleans, and prepares the data used to analyse containerized maritime freight carrier alliances. The primary data comes from S&P's PIERS BOL database, which is processed via the separate PIERS Data Project. 

See the github repo and the README for more detail. 

In [1]:
#preliminaries 
import pandas as pd #v2.1.3
import polars as pl #v0.20.18
import plotly_express as px #v0.4.1 
import datetime as dt
from sklearn.cluster import HDBSCAN
import time


#display settings
pd.set_option('display.max_columns', None)

#enable string cache for polars categoricals
pl.enable_string_cache()

## PIERS BOL Data

This project uses data from the PIERS Data Project's bill of lading database. The initial code block loads the relevant columns from this database into separate Polars LazyFrames for the import and export data, and subsequent blocks address the various issues in the data. 

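Note: The first phase of our project is focused on PNW producers, so the data can be limited to West Coast ports only. The corresponding `coast_region` filter is currently commented out in the loading cell below, so BOLs from all coasts are loaded. 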

In [2]:
#load PIERS bol data lazyframes

def _add_lane_name(lf):
    '''
    Adds a human-readable 'lane_name' column ("Departure Port — Arrival Port") to a BOL lazyframe.
    The port names used are the most frequent non-null departure and arrival port names
    recorded for each lane_id, so every BOL on the same lane receives the same label.
    INPUTS:
        lf - polars lazyframe containing lane_id, departure_port_name, and arrival_port_name
    OUTPUT:
        the lazyframe with an added categorical lane_name column
    '''
    return (
        lf
        .with_columns(
            #find most commonly used departure port name for a given lane_id
            pl.col('departure_port_name').drop_nulls().mode().first().over('lane_id').alias('best_departure_port_name'),
            #find most commonly used arrival port name for a given lane_id
            pl.col('arrival_port_name').drop_nulls().mode().first().over('lane_id').alias('best_arrival_port_name')
        )
        .with_columns(
            (pl.col('best_departure_port_name').cast(pl.Utf8)+' — '+pl.col('best_arrival_port_name').cast(pl.Utf8))
            .str.to_titlecase()
            .cast(pl.Categorical)
            .alias('lane_name')
        )
        #drop the intermediate helper columns
        .drop('best_departure_port_name', 'best_arrival_port_name')
    )

#commented-out entries in the select lists below are available in the raw data but unused here;
#uncomment a name to carry that column through the pipeline
imports_lf = _add_lane_name(
    pl.scan_parquet('data/piers_raw/imports/*.parquet', parallel='columns')
    #drop unused columns 
    .select(
        #'weight',
        #'weight_unit',
        #'qty',
        #'qty_type',
        'teus',
        #'value_est',
        'date',
        #'container_piece_count',
        #'commod_short_desc_qty',
        'origin_territory',
        'origin_region',
        'arrival_port_code',
        'arrival_port_name',
        'departure_port_code',
        'departure_port_name',
        #'dest_final',
        'coast_region',
        #'clearing_district',
        #'place_receipt',
        #'shipper_name',
        #'shipper_address',
        #'consignee_name',
        #'consignee_address',
        #'notify_party1_name',
        #'notify_party1_address',
        #'notify_party2_name',
        #'notify_party2_address',
        #'commod_desc_raw',
        #'container_id_marks',
        #'marks_desc',
        'hs_code',
        #'joc_code',
        #'commod_short_desc',
        #'container_ids',
        'carrier_name',
        'carrier_scac',
        'vessel_name',
        'voyage_number',
        #'precarrier',
        'vessel_id',
        #'inbond_code',
        #'transport_mode',
        #'bol_number',
        'direction',
        'bol_id',
        'year',
        'month',
        'lane_id'
    )
    #filter for west coast
    #NOTE(review): this filter is disabled, so all coasts are loaded — confirm this is intended
    #.filter(pl.col('coast_region')=='WEST')
)

exports_lf = _add_lane_name(
    pl.scan_parquet('data/piers_raw/exports/piers_exports_raw.parquet', parallel='columns') 
    #drop unused columns
    .select(
        #'shipper',
        #'shipper_address',
        #'weight',
        #'weight_unit',
        #'qty',
        #'quantity_type',
        'teus',
        'carrier_name',
        'carrier_scac',
        'vessel_name',
        'voyage_number',
        #'bol_number',
        'vessel_id',
        #'value_est',
        'departure_port_code',
        'departure_port_name',
        #'container_ids',
        #'container_piece_count',
        'coast_region',
        #'commod_desc_raw',
        #'commod_short_desc',
        'hs_code',
        #'joc_code',
        #'commod_short_desc_qty',
        'date',
        #'origin',
        'dest_territory',
        'dest_region',
        'arrival_port_code',
        'arrival_port_name',
        'direction',
        'bol_id',
        'year',
        'month',
        'lane_id'
    )
    #filter for west coast
    #NOTE(review): this filter is disabled, so all coasts are loaded — confirm this is intended
    #.filter(pl.col('coast_region')=='WEST')
)

In [3]:
#project functions

#impute zero/missing volumes with the column mean
def fill_volume(lf):
    '''Ad hoc function: recodes zero TEUs as missing, then fills missing TEUs using polars' mean fill strategy.'''
    #zeros are turned into nulls first so that they are imputed together with the genuine nulls
    teus_filled = pl.col('teus').replace(0, None).fill_null(strategy='mean')
    #the weight and qty columns would be treated the same way if they were loaded:
    #   pl.col('weight').replace(0,None).fill_null(strategy='mean')
    #   pl.col('qty').replace(0,None).fill_null(strategy='mean')
    return lf.with_columns(teus_filled)

#plotly graph inspecting nulls over time by group
def nulls_over_time_plotly(data_lf, group_var, time_var, value_var, title=False):
    '''
    Plots proportion of null values over time by group.
    INPUTS:
        data_lf - polars lazyframe containing the relevant data
        group_var - str - the name of the column by which to group
        time_var - str - the name of the time column (e.g., year, month) over which values will be counted
        value_var - str - the name of the column containing the variable in question
        title (default=False) - str - the title of the graph; a generic title is used when falsy
    OUTPUT:
        None - the plotly express figure is displayed via fig.show() and not returned
    DEPENDS ON:
        polars
        plotly express 
    '''
    df = (
        #select relevant columns
        data_lf.select([group_var, time_var, value_var])
        #group by, creating null count and non-null count cols
        .group_by(group_var, time_var)
        .agg([pl.col(value_var).null_count().alias('null_count'),
                pl.col(value_var).count().alias('count')])
        #compute proportion null (nulls / all rows in the group) and fill new column
        .with_columns((pl.col('null_count')/(pl.col('count')+pl.col('null_count'))).alias('null_percent'))
        #cast group col to string to allow sensible ordering of legend
        .cast({group_var:pl.Utf8})
        #sort by date (to allow proper visualization of lines) and group (for legend ordering) 
        .sort(time_var, group_var)
    ).collect()
    #plot
    fig = px.line(
        data_frame=df,
        x=time_var, y='null_percent',
        color=group_var,
        #the y axis is a proportion, so the default title and axis label say so
        title=title if title else 'Proportion of nulls over time by group.',
        labels={'null_percent': 'Proportion null'}
    )
    fig.show()

#fill nulls within groups that have a single known value
def fill_nulls_by_group(data_lf, group_vars, val_var):
    '''Fills nulls in val_var within each group, but only for groups whose non-null values are all identical.
    Groups with zero or with several distinct non-null values are left untouched.
    INPUTS:
        data_lf - polars lazyframe containing the relevant data
        group_vars - iterable - the names of the columns by which groups will be created
        val_var - string - the name of the column in which nulls will be filled
    OUTPUT:
        the resultant lazyframe 
    DEPENDS ON:
        polars - current version written in polars 0.20.1
    '''
    #distinct non-null values of the target column (evaluated per group via .over below)
    known_values = pl.col(val_var).drop_nulls().unique(maintain_order=True)
    #true for every row belonging to a group with exactly one distinct non-null value
    single_valued_group = known_values.len().over(group_vars) == 1
    #the value to fill with: the group's first (and, where used, only) distinct value
    group_value = known_values.first().over(group_vars)

    return data_lf.with_columns(
        pl.when(single_valued_group)
        .then(pl.col(val_var).fill_null(group_value))
        #ambiguous or empty groups keep their original values
        .otherwise(pl.col(val_var))
    )

#identify the primary carrier of each vessel-month
def add_primary_carrier(lf):
    '''
    Ad hoc function that labels each BOL with the primary carrier of its vessel and flags cargo sharing.
    Adds two columns:
        vessel_owner - the unified_carrier_scac with the largest TEU total on that vessel in that month
        primary_cargo - bool - whether the BOL's own carrier equals vessel_owner
    Both columns are null where vessel_id is null.
    '''
    vessel_unknown = pl.col('vessel_id').is_null()

    #step 1: TEU total for each vessel-month-carrier combination
    with_totals = lf.with_columns(
        pl.col('teus').sum()
        .over('vessel_id', 'month', 'unified_carrier_scac')
        .alias('sum_teus')
    )
    #step 2: within each vessel-month, take the carrier with the largest total
    with_owner = with_totals.with_columns(
        pl.col('unified_carrier_scac')
        .sort_by('sum_teus', descending=True)
        .drop_nulls().first()
        .over('vessel_id', 'month')
        .alias('vessel_owner')
    )
    #step 3: flag BOLs issued by that primary carrier
    with_flag = with_owner.with_columns(
        (pl.col('unified_carrier_scac')==pl.col('vessel_owner'))
        .alias('primary_cargo')
    )
    #step 4: without a vessel id the two derived columns are meaningless, so blank them out
    masked = with_flag.with_columns(
        pl.when(vessel_unknown).then(pl.lit(None)).otherwise(pl.col('vessel_owner')).alias('vessel_owner'),
        pl.when(vessel_unknown).then(pl.lit(None)).otherwise(pl.col('primary_cargo')).alias('primary_cargo')
    )
    #step 5: remove the temporary total
    return masked.drop('sum_teus')

#plot proportion of shared cargo over time
def sharing_over_time_plotly(data_lf, group_var, include_missing_vessels=True, limit=10, title=False):
    '''
    Plots proportion of shared cargo over time (months) by group_var.
    INPUTS:
        data_lf - polars lazyframe containing the relevant data 
            (must include the month, primary_cargo, and teus columns)
        group_var - str - the name of the column by which to group
        include_missing_vessels - bool - default=True, when False, drops missing vessel_ids
        limit - int - default=10, only the `limit` groups with the largest total TEUs are plotted
        title (default=False) - str - the title of the graph; a generic title is used when falsy
    OUTPUT:
        None - the plotly express figure is displayed via fig.show() and not returned
    DEPENDS ON:
        polars
        plotly express 
    '''
    #apply the vessel filter once so the ranking and the plotted data use the same BOLs
    source_lf = data_lf if include_missing_vessels else data_lf.drop_nulls('vessel_id')

    #limit categories to the groups with the most TEUs
    top_groups = (
        source_lf.group_by(group_var)
        .agg(pl.col('teus').sum())
        .sort('teus', descending=True)
        .select(group_var)
        .limit(limit)
        .collect()
        .to_series()
        .cast(pl.Utf8)
    )

    df = (
        #select relevant columns
        source_lf.select([group_var, 'month', 'primary_cargo', 'teus'])
        #sum total and primary-carrier teus over each group-month
        .group_by(group_var, 'month')
        .agg(
            #multiplying by the boolean keeps only teus carried by the primary carrier
            (pl.col('teus')*pl.col('primary_cargo')).sum().alias('total_primary'),
            pl.col('teus').sum().alias('total_teus')
        )
        #create proportion shared
        .with_columns((1-(pl.col('total_primary')/pl.col('total_teus'))).alias('prop_shared'))
        #cast group col to string to allow sensible ordering of legend
        .cast({group_var:pl.Utf8})
        #sort by date (to allow proper visualization of lines) and group (for legend ordering) 
        .sort('month', group_var)
    ).collect()

    #plot
    fig = px.line(
        #the month strings are parsed with the '%Y%m' format for a proper time axis
        data_frame=df.filter(pl.col(group_var).is_in(top_groups)).with_columns(pl.col('month').str.to_datetime('%Y%m')),
        x='month', y='prop_shared',
        color=group_var,
        title=title if title else 'Proportion of shared cargo over time.',
        labels={
            'prop_shared':'Proportion of cargo from non-primary carrier',
            'month':'Month'
        }
    )
    fig.show()

def _impute_pair_dates(pair_1d, pair, min_cluster_size=50):
    '''
    Clusters the dates of a single vessel-lane pair and imputes one date per cluster.
    INPUTS
        pair_1d - polars DataFrame with a single 'date' column, one row per TEU 
            handled on that date, ordered by date
        pair - str - the vessel-lane pair label attached to the result
        min_cluster_size - int - passed through to HDBSCAN
    OUTPUTS
        polars DataFrame with columns date, date_imputed, and vessel_lane_pair (one row per date),
        or None when the pair is skipped
    '''
    #skip vessel-lane pairs with fewer than 2 date occurrences
    if len(pair_1d) < 2:
        return None
    #find minimum number of occurrences of a single date (needed for HDBSCAN param)
    min_sample = pair_1d.group_by('date').agg(pl.col('date').count().alias('count')).min().row(0)[1]
    #skip pairs containing a date group with nothing to count
    if min_sample == 0:
        return None
    #instantiate clusterer
    #TODO: find a dynamic way of selecting these parameters
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_sample)
    #get clusters
    clusterer.fit(pair_1d)
    return (
        pair_1d
        #add cluster column
        .with_columns(
            pl.Series(name='cluster', values=clusterer.labels_)
        )
        #add imputed date column
        .with_columns(
            #when date matches the mode of each cluster
            pl.when(pl.col('date') == pl.col('date').mode().first().over('cluster'))
            #fill with that date, otherwise fill with null
            .then(pl.col('date'))
            .otherwise(pl.lit(None))
            #forward fill the imputed date up to the mode of the next cluster
            .forward_fill()
            #backward fill the first part of first cluster
            .backward_fill()
            #name column
            .alias('date_imputed')
        )
        #groupby date to simplify
        .group_by('date')
        .agg(pl.col('date_imputed').first())
        #add pair label
        .with_columns(pl.lit(pair).alias('vessel_lane_pair').cast(pl.Categorical))
    )

def cluster_dates(lf, direction, samples=None, min_cluster_size=50, seed=None):
    '''
    Finds arrival/departure date using the following algorithm:
        1. Create 1-D dataframe of dates for each vessel-lane pair, 
            with one date occurrence per TEU processed on that date
        2. Find clusters of dates using SciKitLearn's HDBSCAN
        3. Assign mode date of each cluster as the arrival/departure date
        4. Assign any bols with dates occurring between the modes as arriving/departing
            on the date of the preceding mode.
        5. Join imputed arrival/departure dates into main lazyframe. 
    INPUTS
        lf - a polars LazyFrame containing the relevant data
        direction - 'import' or 'export' - indicating the source data; the imputed 
            column is named date_arrival for imports and date_departure for exports
        samples - int - number of vessel-lane pairs to randomly sample (default None = all pairs)
        min_cluster_size - int - HDBSCAN min_cluster_size parameter (default 50)
        seed - int - optional seed making the random sample reproducible
    OUTPUTS
        lf - the original lazyframe with the added vessel_lane_pair column and the imputed dates 
    RAISES
        ValueError - if direction is neither 'import' nor 'export'
    '''
    #validate direction up front so a bad argument fails before the long clustering loop
    imputed_names = {'import': 'date_arrival', 'export': 'date_departure'}
    if direction not in imputed_names:
        raise ValueError('direction must equal "import" or "export"')
    #create vessel_lane_pair column in main lf
    lf = (
        lf.with_columns(
            (pl.col('vessel_id').cast(pl.Utf8)+'_'+pl.col('lane_id').cast(pl.Utf8))
            .cast(pl.Categorical)
            .alias('vessel_lane_pair')
        )
    )
    #collect relevant columns from lf
    begin_collect = time.time()
    df = (
        lf.group_by('date', 'vessel_lane_pair')
        #get sum of TEUs on each date 
        .agg(pl.col('teus').sum().alias('sum_teus'))
        #drop missing vessel-lane pairs
        .drop_nulls(subset='vessel_lane_pair')
        #sort by date (the forward/backward fills in the imputation rely on this order)
        .sort('date')
        .collect()
    )
    print('clustering data collected; time = {:.2f} minutes'.format((time.time() - begin_collect)/60))
    #select the vessel-lane pairs to cluster
    pairs = df.select('vessel_lane_pair').unique()
    if samples:
        pairs = pairs.sample(samples, seed=seed)
    pairs = pairs.to_series()
    #per-pair results are gathered in a list and concatenated once after the loop
    pair_frames = []
    #loop through vessel-lane pairs
    print('Looping through vessel-lane pairs')
    begin_block = time.time()
    for i, pair in enumerate(pairs):
        if i%1000 == 0:
            begin_block = time.time()
        #make single-column dataframe of dates where each date corresponds to a single TEU handled on that day 
        pair_1d = (
            df.filter(pl.col('vessel_lane_pair')==pair)
            .select('date', pl.col('sum_teus').ceil())
            #explode dates by each teu 
            .select(pl.exclude('sum_teus').repeat_by('sum_teus').explode())
        )
        pair_df = _impute_pair_dates(pair_1d, pair, min_cluster_size)
        if pair_df is not None:
            pair_frames.append(pair_df)
        #print status update (also reached when the current pair was skipped)
        if (i+1)%1000 == 0:
            print('{:,} pairs clustered. The previous 1000 pairs took {:.2f} minutes.'.format(i+1, (time.time()-begin_block)/60))
    if pair_frames:
        pairs_df = pl.concat(pair_frames, how='vertical')
    else:
        #nothing could be clustered: use an empty frame so the join below still adds the (all-null) column
        pairs_df = pl.DataFrame(schema={
            'date': df.schema['date'],
            'date_imputed': df.schema['date'],
            'vessel_lane_pair': pl.Categorical
        })
    #rename imputed dates based on direction
    pairs_df = pairs_df.rename({'date_imputed': imputed_names[direction]})
    #join imputed dates to main lf
    pairs_lf = pairs_df.lazy()
    lf = (
        lf.join(pairs_lf, on=['date', 'vessel_lane_pair'], how='left')
    )
    print('Total time to cluster dates: {:.2f} hours'.format((time.time()-begin_collect)/3600))
    return lf

### Carrier names and Standard Carrier Alpha Codes (SCAC)

Carrier names are often long strings of inconsistent nature (e.g. "Maersk", "MAERSK LINE", "A.P. Moller Maersk", etc.), and SCAC codes can change over time for the same carrier. To address these issues, we simplify carrier names to the most commonly used name string for a given SCAC, and we simplify SCAC codes to the most recent SCAC used for a given carrier name. 

As carrier alliances apply only to containerized freight, we also drop instances of bulk cargo, which are coded in this data as SCAC = "BULK"

In [4]:
#clean carrier names and scac codes

def _unify_carriers(lf):
    '''
    Drops bulk cargo and adds unified carrier name/SCAC columns to a BOL lazyframe.
    Adds two columns:
        unified_carrier_name - the most commonly used carrier_name for the BOL's carrier_scac
        unified_carrier_scac - the most recent (by date) carrier_scac used under that unified name
    INPUTS:
        lf - polars lazyframe containing carrier_scac, carrier_name, and date
    OUTPUT:
        the filtered lazyframe, sorted by date descending, with the two added columns
    '''
    return (
        lf
        #drop bulk carriers
        #NOTE(review): BOLs with a null carrier_scac do not satisfy this predicate and are
        #presumably dropped as well — confirm that is intended
        .filter(pl.col('carrier_scac')!='BULK')
        #sort by date, newest first, so that .first() below picks the most recent scac
        .sort('date', descending=True)
        #get most commonly used carrier name for each scac
        .with_columns(
            pl.col('carrier_name').drop_nulls().mode().first().over('carrier_scac')
            .alias('unified_carrier_name')
        )
        #get most recent scac for each unified carrier name
        .with_columns(
            pl.col('carrier_scac').drop_nulls().first().over('unified_carrier_name')
            .alias('unified_carrier_scac')
        )
    )

imports_lf = _unify_carriers(imports_lf)
exports_lf = _unify_carriers(exports_lf)

### Tanker Transfer Ports

Some BOLs in the imports data list offshore tanker transfer ports as their origin. Since these are not relevant to containerized carrier alliances, we drop them from the database. 

In [5]:
#keep only BOLs whose foreign port code does not start with '999'
#(the foreign port is the departure port for imports and the arrival port for exports)
imports_lf = imports_lf.filter(~pl.col('departure_port_code').cast(pl.Utf8).str.starts_with('999'))
exports_lf = exports_lf.filter(~pl.col('arrival_port_code').cast(pl.Utf8).str.starts_with('999'))

### Missing Data

The PIERS BOL data contains many missing values. Some of these, e.g. TEUs, are missing in a pattern that is systematically related to date. The issue of missing volume data is an open question to S&P, although it has been many weeks with no response from them. In the meantime, we fill missing volume data with the mean from the rest of the dataset. (Note: if no help is to come from S&P, a nearest neighbor imputation may yield better results.)

In [None]:
#stack the volume columns of both directions and plot the share of missing TEUs per month
lf = pl.concat(
    [frame.select('direction', 'month', 'teus') for frame in (imports_lf, exports_lf)]
)
nulls_over_time_plotly(
    data_lf=lf,
    group_var='direction',
    time_var='month',
    value_var='teus',
    title='Missing volume data over time.')

In [6]:
#impute missing (and zero) TEUs in both directions using fill_volume
imports_lf, exports_lf = fill_volume(imports_lf), fill_volume(exports_lf)

#### Missing Vessel info

A substantial portion of BOLs do not include vessel names or IDs. Note that there is perfect correlation between missing vessel names and missing vessel IDs. 

In [None]:
#stack the vessel name columns of both directions and plot the share of missing names per month
lf = pl.concat(
    [frame.select('direction', 'month', 'vessel_name') for frame in (imports_lf, exports_lf)]
)

nulls_over_time_plotly(lf, 'direction', 'month', 'vessel_name', title='Proportion of Missing Vessel Names over time.')

Since our analysis concerns the practice of carriers sharing cargo with other carriers on a single vessel, we drop missing vessels.

In [7]:
#remove BOLs that have no vessel id
imports_lf, exports_lf = (
    imports_lf.drop_nulls('vessel_id'),
    exports_lf.drop_nulls('vessel_id'),
)

We also drop bols with missing port data for the same reason. 

In [8]:
#remove BOLs that lack either port code
imports_lf, exports_lf = (
    imports_lf.drop_nulls(['arrival_port_code', 'departure_port_code']),
    exports_lf.drop_nulls(['arrival_port_code', 'departure_port_code']),
)

## Sharing Over Time

In [9]:
#label each BOL with its vessel's primary carrier and the primary_cargo flag
imports_lf, exports_lf = add_primary_carrier(imports_lf), add_primary_carrier(exports_lf)

In [None]:
#proportion of shared export cargo per month, one line per departure port
sharing_over_time_plotly(data_lf=exports_lf, group_var='departure_port_name')

## Voyage Identification

We define a 'voyage' as a single vessel's trip from the port of departure to the port of arrival, i.e., a single vessel on a single *lane*. Sadly, the voyage IDs listed on BOLs are not consistent with this conception of a voyage (for a given vessel, lane, and date, the "voyage id" is far from unique), and the date listed on each BOL does not necessarily correspond to the actual arrival date of the ship (i.e., the date data are noisy). We address this problem in two steps. 

First, we compute route distances between each foreign/domestic port pair using [SeaRoute](https://github.com/eurostat/searoute). Using this distance we determine an estimated minimum number of days required to sail from one port to the other. This gives us a means of differentiating between multiple dates that correspond to the same port visit on the one hand, and two separate port visits on the other. For example, if a BOL dated July 15 shows a vessel delivering a set of containers from Hong Kong to San Francisco, and a different BOL shows the same vessel delivering a different set of containers from Hong Kong to San Francisco on July 20th, we know that the vessel could not have visited Hong Kong between those port visits. Thus, we can cluster dates corresponding to a single port visit whenever the dates are close enough to preclude a visit to the other port. 

Second, we employ this minimum "turn" (from one domestic port to a foreign port and back) time as one of the parameters of a Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN) algorithm to group port visits together into a single voyage. 

In [None]:
%%script echo skipping
#load searoute data and derive the travel-time and join-key columns in one chain
route_distances_df = (
    pl.read_csv('data/route_distances.csv')
    .with_columns(
        #minimum round trip time in days, assuming speed of 25kt (46km/h):
        #km / (km per hour) / (hours per day), doubled for the return leg
        route_min_days=((pl.col('distKM')/46/24)*2),
        #key for joining to exports lf (US port first)
        exports_lane_name=(
            (pl.col('us_port_name').cast(pl.Utf8)+' — '+pl.col('foreign_port_name').cast(pl.Utf8))
            .str.to_titlecase().cast(pl.Categorical)
        ),
        #key for joining to imports lf (foreign port first)
        imports_lane_name=(
            (pl.col('foreign_port_name').cast(pl.Utf8)+' — '+pl.col('us_port_name').cast(pl.Utf8))
            .str.to_titlecase().cast(pl.Categorical)
        ),
    )
)

Due to the large imports dataset, merging route distances directly is beyond the scope of available resources. As a work-around, the data is merged on a year-by-year basis and saved to parquet, then re-loaded. 

In [None]:
%%script echo skipping

#join route distances onto the imports one year at a time, writing each year to its own parquet file
years = pl.arange(2005,2025, eager=True)
for year in years:
    print('Collecting {} dataframe and joining distances...'.format(year))
    #materialize a single year so that the join fits in memory
    df = imports_lf.filter(pl.col('year')==year).collect()
    df = df.join(
        route_distances_df.select('route_min_days', 'imports_lane_name'),
        left_on='lane_name',
        right_on='imports_lane_name',
        how='left',
    )
    print('Writing {} data to parquet...'.format(year))
    df.write_parquet(file='data/imports/imports_{}.parquet'.format(year))
print('Imports data written to parquet')

In [None]:
%%script echo skipping
#attach the minimum round-trip days to each export BOL via its lane name
exports_lf = exports_lf.join(
    route_distances_df.lazy().select('route_min_days', 'exports_lane_name'),
    left_on='lane_name',
    right_on='exports_lane_name',
    how='left',
)

In [None]:
%%script echo skipping
#point imports_lf at the per-year parquet files, then summarize the joined column
imports_lf = pl.scan_parquet('data/imports/*.parquet')
imports_lf.select(pl.col('route_min_days')).collect().describe()

#### Route minimum days column glitch

The script to join the route distances/days to the imports data seems to work, but as of 4/5 attempting to load the column as above throws a Panic Exception: `ComputeError(ErrString("validity mask length must match the number of values"))`. This merits further investigation, but for now we will move on without utilizing this parameter in the clustering algorithm. 

Another note: after joining the route distances to the dataframe, the clustering algorithm quickly maxes out memory, even with a 10 bol sample and even without attempting to incorporate the distances into hdbscan. The reason for this is unknown, but clustering the data *before* joining in the route distances works as expected.  

### HDBSCAN Clustering 



In [None]:
#impute arrival dates for the imports
imports_lf = cluster_dates(lf=imports_lf, direction='import')

In [10]:
#impute departure dates for the exports
exports_lf = cluster_dates(lf=exports_lf, direction='export')

clustering data collected; time = 0.19 minutes
Looping through vessel-lane pairs
2,000 pairs clustered. The previous 1000 pairs took 0.03 minutes.
3,000 pairs clustered. The previous 1000 pairs took 0.06 minutes.
4,000 pairs clustered. The previous 1000 pairs took 0.16 minutes.
5,000 pairs clustered. The previous 1000 pairs took 0.04 minutes.
6,000 pairs clustered. The previous 1000 pairs took 0.10 minutes.
7,000 pairs clustered. The previous 1000 pairs took 0.05 minutes.
8,000 pairs clustered. The previous 1000 pairs took 0.38 minutes.
9,000 pairs clustered. The previous 1000 pairs took 0.22 minutes.
10,000 pairs clustered. The previous 1000 pairs took 0.12 minutes.
11,000 pairs clustered. The previous 1000 pairs took 0.04 minutes.
12,000 pairs clustered. The previous 1000 pairs took 0.10 minutes.
13,000 pairs clustered. The previous 1000 pairs took 0.05 minutes.
14,000 pairs clustered. The previous 1000 pairs took 0.05 minutes.
15,000 pairs clustered. The previous 1000 pairs took 0.0

In [None]:
#summary statistics of the imports lazyframe, shown as the cell output
imports_lf.describe()

## Write to Parquet

In [None]:
#write imports to clean parquet, one file per year

#NOTE(review): if imports_lf was re-pointed at 'data/imports/*.parquet' by the (currently skipped)
#reload cell, this loop would overwrite files it is still scanning — confirm before enabling that cell

#get years
years = pl.arange(2005,2025, eager=True)

start = time.time()

for year in years:
    print('Collecting {} dataframe...'.format(year))
    #materialize a single year at a time
    df = imports_lf.filter(pl.col('year')==year).collect()
    print('Writing {} data to parquet...'.format(year))
    df.write_parquet(file='data/imports/imports_{}.parquet'.format(year))
print('Imports data written to parquet')
runtime = time.time() - start
print('Total time to write imports: {:.2f} hours'.format(runtime/3600))

In [11]:
#write exports to clean parquet (collected and written as a single file)
start = time.time()
print('Collecting exports data...')
df = exports_lf.collect()
print('Writing exports data to parquet...')
df.write_parquet(file='data/exports/exports.parquet')
#drop the reference to the collected frame once it is on disk
del df
print('Exports data written to parquet.')
runtime = time.time() - start
print('Total time to write exports: {:.2f} hours'.format(runtime/3600))

Collecting exports data...
Writing exports data to parquet...
Exports data written to parquet.
Total time to write exports: 0.02 hours


## Limitations

- voyage identification
    - Ideally, a voyage would be defined by a vessel visiting a set of departure ports (e.g. Hong Kong, Beijing, and Tokyo) to pick up cargo and then delivering it to a set of arrival ports (e.g. Seattle, San Francisco, and Los Angeles). This way, we could measure the relative volumes from each carrier actually on the ship during the main transit rather than only the relative volumes related to the specific lane. 
- missing data
    - volume data missing from the first half of the dataset is problematic, and our current method of filling with the mean links volumes to the number of BOLs. 