# PIERS Imports ETL

This notebook extracts Bill of Lading (BoL) data from CSV files downloaded from S&P Global's PIERS BoL database. Transformations are limited to setting appropriate datatypes for storage, adding year, month, and direction columns (plus `bol_id` and `lane_id` identifier columns), and dropping duplicate rows. The data is then saved as .parquet files, one per arrival year.

In [1]:
#import libraries
import polars as pl #v0.20.7

#switch on the polars global string cache before any Categorical column is
#created (it is switched off again at the end of the notebook)
pl.enable_string_cache()

#dtypes used when reading the raw csv files, keyed by the original PIERS
#column headers
imports_dtypes = {
    #measures
    'Weight': pl.Float64,
    'Weight Unit': pl.Categorical,
    'Quantity': pl.Float64,
    'Quantity Type': pl.Categorical,
    'TEUs': pl.Float64,
    'Estimated Value': pl.Float64,
    #read as text; the ETL cell parses it into a datetime
    'Arrival Date': pl.Utf8,
    'Container Piece Count': pl.Int32,
    'Quantity of Commodity Short Description': pl.Utf8,
    #origin, ports and destination
    'Territory of Origin': pl.Categorical,
    'Region of Origin': pl.Categorical,
    'Port of Arrival Code': pl.Categorical,
    'Port of Arrival': pl.Categorical,
    'Port of Departure Code': pl.Categorical,
    'Port of Departure': pl.Categorical,
    'Final Destination': pl.Categorical,
    'Coastal Region': pl.Categorical,
    'Clearing District': pl.Categorical,
    'Place of Receipt': pl.Categorical,
    #shipper, consignee and notify parties
    'Shipper': pl.Utf8,
    'Shipper Address': pl.Utf8,
    'Consignee': pl.Utf8,
    'Consignee Address': pl.Utf8,
    'Notify Party': pl.Utf8,
    'Notify Party Address': pl.Utf8,
    'Also Notify Party': pl.Utf8,
    'Also Notify Party Address': pl.Utf8,
    #commodity, marks and container fields
    'Raw Commodity Description': pl.Utf8,
    'Marks Container Number': pl.Utf8,
    'Marks Description': pl.Utf8,
    'HS Code': pl.Utf8,
    'JOC Code': pl.Utf8,
    'Commodity Short Description': pl.Utf8,
    'Container Number': pl.Utf8,
    #carrier, vessel and remaining fields
    'Carrier': pl.Categorical,
    'SCAC': pl.Categorical,
    'Vessel Name': pl.Utf8,
    'Voyage Number': pl.Utf8,
    'Pre Carrier': pl.Float64,
    'IMO Number': pl.Int32,
    'Inbond Code': pl.Float64,
    'Mode of Transport': pl.Categorical,
    'Bill of Lading Number': pl.Utf8,
}

#map each raw PIERS column header to its snake_case column name
#NOTE: the insertion order of this dict is also the column order of the
#output, because the ETL cell selects columns via .values()
import_colnames_dict = {
    #measures
    'Weight': 'weight',
    'Weight Unit': 'weight_unit',
    'Quantity': 'qty',
    'Quantity Type': 'qty_type',
    'TEUs': 'teus',
    'Estimated Value': 'value_est',
    'Arrival Date': 'date',
    'Container Piece Count': 'container_piece_count',
    'Quantity of Commodity Short Description': 'commod_short_desc_qty',
    #origin, ports and destination
    'Territory of Origin': 'origin_territory',
    'Region of Origin': 'origin_region',
    'Port of Arrival Code': 'arrival_port_code',
    'Port of Arrival': 'arrival_port_name',
    'Port of Departure Code': 'departure_port_code',
    'Port of Departure': 'departure_port_name',
    'Final Destination': 'dest_final',
    'Coastal Region': 'coast_region',
    'Clearing District': 'clearing_district',
    'Place of Receipt': 'place_receipt',
    #shipper, consignee and notify parties
    'Shipper': 'shipper_name',
    'Shipper Address': 'shipper_address',
    'Consignee': 'consignee_name',
    'Consignee Address': 'consignee_address',
    'Notify Party': 'notify_party1_name',
    'Notify Party Address': 'notify_party1_address',
    'Also Notify Party': 'notify_party2_name',
    'Also Notify Party Address': 'notify_party2_address',
    #commodity, marks and container fields
    'Raw Commodity Description': 'commod_desc_raw',
    'Marks Container Number': 'container_id_marks',
    'Marks Description': 'marks_desc',
    'HS Code': 'hs_code',
    'JOC Code': 'joc_code',
    'Commodity Short Description': 'commod_short_desc',
    'Container Number': 'container_ids',
    #carrier, vessel and remaining fields
    'Carrier': 'carrier_name',
    'SCAC': 'carrier_scac',
    'Vessel Name': 'vessel_name',
    'Voyage Number': 'voyage_number',
    'Pre Carrier': 'precarrier',
    'IMO Number': 'vessel_id',
    'Inbond Code': 'inbond_code',
    'Mode of Transport': 'transport_mode',
    'Bill of Lading Number': 'bol_number',
}

#define schema
#every renamed column is re-cast to the same dtype it was read with, so the
#schema is derived from imports_dtypes (raw header -> dtype) and
#import_colnames_dict (raw header -> new name) instead of keeping a third
#hand-written copy that could drift out of sync with the other two
#a header present in imports_dtypes but missing from import_colnames_dict
#raises KeyError here, which surfaces the mismatch immediately
imports_schema = {
    import_colnames_dict[raw_name]: dtype
    for raw_name, dtype in imports_dtypes.items()
}

#define years
#arrival years to process: 2005 through 2023 (the end bound is exclusive)
#a built-in range yields the same ints as the polars Series did and is all
#the ETL loop needs
years = range(2005, 2024)

## ETL

In [2]:
#ETL
#for each arrival year: pull that year's rows out of the raw csv files, clean
#and type them, add derived columns, and write one parquet file

#the scan/rename/reorder part of the query does not depend on the year, so the
#lazy plan is defined once here; only the year filter changes per iteration
imports_lazy = (
    #scan csvs
    pl.scan_csv('data/raw_csv/imports/*.csv', dtypes=imports_dtypes)
    #rename columns
    .rename(import_colnames_dict)
    #reorder columns
    .select(import_colnames_dict.values())
)

for year in years:
    print(f'Collecting observations from {year}...')
    df = (
        imports_lazy
        #filter by year: 'date' is still a YYYYMMDD string at this point
        .filter(pl.col('date').str.starts_with(str(year)))
        #collect
        .collect()
    )
    df = (
        #cast entire df to string so the same cleanup applies to every column
        df.cast(pl.Utf8)
        #strip whitespace and replace empty str with null
        .with_columns(pl.all().str.strip_chars().replace('',None))
        #recast appropriately
        .cast(imports_schema)
        #drop duplicates
        .unique()
        #cast date col to datetime
        .with_columns(pl.col('date').str.to_datetime('%Y%m%d'))
        .with_columns(
            #create direction column
            pl.lit('import').cast(pl.Categorical).alias('direction'),
            #create bol_id as '<scac>_<bol_number>'; carrier_scac is Categorical,
            #so cast it to string explicitly (as lane_id does below) before
            #filling nulls and concatenating
            (pl.col('carrier_scac').cast(pl.Utf8).fill_null('')+'_'+pl.col('bol_number'))
            .alias('bol_id'),
            #extract year
            pl.col('date').dt.year().alias('year'),
            #extract month (e.g., '202304')
            pl.col('date').dt.strftime('%Y%m').alias('month'),
            #create lane_id as '<departure_port_code>_<arrival_port_code>'
            (pl.col('departure_port_code').cast(pl.Utf8)+'_'+pl.col('arrival_port_code').cast(pl.Utf8))
            .cast(pl.Categorical)
            .alias('lane_id'),
            #convert zero volume values to null
            pl.col('teus').replace(0,None),
            pl.col('weight').replace(0,None),
            pl.col('qty').replace(0,None)
        )
    )
    print(f'{year} dataframe collected. \nWriting to parquet...')
    #write file to parquet
    df.write_parquet(file=f'data/raw_parquet/imports/piers_imports_{year}.parquet')
    #drop the reference so this year's frame can be freed before the next collect
    del df

print('\nImports ETL complete.')

Collecting observations from 2005...
2005 pldf collected. 
Writing to parquet...
Collecting observations from 2006...
2006 pldf collected. 
Writing to parquet...
Collecting observations from 2007...
2007 pldf collected. 
Writing to parquet...
Collecting observations from 2008...
2008 pldf collected. 
Writing to parquet...
Collecting observations from 2009...
2009 pldf collected. 
Writing to parquet...
Collecting observations from 2010...
2010 pldf collected. 
Writing to parquet...
Collecting observations from 2011...
2011 pldf collected. 
Writing to parquet...
Collecting observations from 2012...
2012 pldf collected. 
Writing to parquet...
Collecting observations from 2013...
2013 pldf collected. 
Writing to parquet...
Collecting observations from 2014...
2014 pldf collected. 
Writing to parquet...
Collecting observations from 2015...
2015 pldf collected. 
Writing to parquet...
Collecting observations from 2016...
2016 pldf collected. 
Writing to parquet...
Collecting observations from

In [3]:
#close string cache
#turn the polars global string cache (enabled in the first cell) back off;
#this cell runs after the ETL cell that writes the yearly parquet files
pl.disable_string_cache()