# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. Data is extracted from CSV files downloaded from the Global Trade Analytics Suite, assigned appropriate datatypes, concatenated into a single dataframe, and written to an Apache Parquet file for storage.

In [1]:
#import libraries
import pandas as pd
import os
import time

#display settings: do not truncate the column list when rendering wide dataframes
pd.options.display.max_columns = None

## Extract and Transform

Read each CSV file into a pandas DataFrame with appropriate dtypes.

Note: column dtypes are defined in a dictionary (`import_dtype_dict`) and assigned within `read_csv`.

In [2]:
#define path to the directory holding the raw CSV files
path = 'data/raw/'
#get list of data files, ignoring hidden files and subdirectories.
#os.listdir returns entries in arbitrary order, so the list is sorted to keep
#the processing order (and the row order of the combined dataframe) the same
#on every run.
datafiles = sorted(
    file for file in os.listdir(path)
    if not file.startswith('.') and os.path.isfile(os.path.join(path, file))
)
#init filenumber (1-based file counter)
filenumber = 1
#define new col names: maps each header in the raw CSV files to the
#snake_case column name used in the dataframe
import_colnames_dict = {
    #measures
    'Weight': 'weight',
    'Weight Unit': 'weight_unit',
    'Quantity': 'qty',
    'Quantity Type': 'qty_type',
    'TEUs': 'teus',
    'Estimated Value': 'value_est',
    'Arrival Date': 'date_arrival',
    'Container Piece Count': 'container_piece_count',
    'Quantity of Commodity Short Description': 'commod_short_desc_qty',
    #geography
    'Territory of Origin': 'origin_territory',
    'Region of Origin': 'origin_region',
    'Port of Arrival Code': 'arrival_port_code',
    'Port of Arrival': 'arrival_port_name',
    'Port of Departure Code': 'departure_port_code',
    'Port of Departure': 'departure_port_name',
    'Final Destination': 'dest_final',
    'Coastal Region': 'coast_region',
    'Clearing District': 'clearing_district',
    'Place of Receipt': 'place_receipt',
    #parties
    'Shipper': 'shipper_name',
    'Shipper Address': 'shipper_address',
    'Consignee': 'consignee_name',
    'Consignee Address': 'consignee_address',
    'Notify Party': 'notify_party1_name',
    'Notify Party Address': 'notify_party1_address',
    'Also Notify Party': 'notify_party2_name',
    'Also Notify Party Address': 'notify_party2_address',
    #commodity and container
    'Raw Commodity Description': 'commod_desc_raw',
    'Marks Container Number': 'container_id_marks',
    'Marks Description': 'marks_desc',
    'HS Code': 'hs_code',
    'JOC Code': 'joc_code',
    'Commodity Short Description': 'commod_short_desc',
    'Container Number': 'container_ids',
    #carrier and vessel
    'Carrier': 'carrier_name',
    'SCAC': 'carrier_scac',
    'Vessel Name': 'vessel_name',
    'Voyage Number': 'vessel_id',
    'Pre Carrier': 'precarrier',
    'IMO Number': 'imo_num',
    'Inbond Code': 'inbond_code',
    'Mode of Transport': 'transport_mode',
    'Bill of Lading Number': 'bol_id',
}
#define dtypes: maps each header in the raw CSV files to the dtype that
#read_csv should assign to that column
import_dtype_dict = {
    'Weight': 'float64',
    'Weight Unit': 'category',
    'Quantity': 'float64',
    'Quantity Type': 'category',
    'TEUs': 'float64',
    'Estimated Value': 'float64',
    #NOTE(review): plain int64 cannot hold missing values, so read_csv will
    #raise if either of these two columns has a blank cell - confirm they
    #are always populated
    'Arrival Date': 'int64',
    'Container Piece Count': 'int64',
    'Quantity of Commodity Short Description': 'object',
    'Territory of Origin': 'category',
    'Region of Origin': 'category',
    'Port of Arrival Code': 'category',
    'Port of Arrival': 'category',
    'Port of Departure Code': 'category',
    'Port of Departure': 'category',
    'Final Destination': 'category',
    'Coastal Region': 'category',
    'Clearing District': 'category',
    'Place of Receipt': 'category',
    'Shipper': 'object',
    'Shipper Address': 'object',
    'Consignee': 'object',
    'Consignee Address': 'object',
    'Notify Party': 'object',
    'Notify Party Address': 'object',
    'Also Notify Party': 'object',
    'Also Notify Party Address': 'object',
    'Raw Commodity Description': 'object',
    'Marks Container Number': 'object',
    'Marks Description': 'object',
    'HS Code': 'category',
    'JOC Code': 'category',
    'Commodity Short Description': 'object',
    'Container Number': 'object',
    'Carrier': 'category',
    'SCAC': 'category',
    'Vessel Name': 'object',
    'Voyage Number': 'object',
    #NOTE(review): float64 for these code/name-like columns presumably
    #reflects empty or numeric-only columns in the exports - TODO confirm
    'Pre Carrier': 'float64',
    'IMO Number': 'float64',
    'Inbond Code': 'float64',
    'Mode of Transport': 'category',
    'Bill of Lading Number': 'object',
}
#define category variable cols (renamed column names of the columns
#read with the 'category' dtype)
catcols = [
    'weight_unit',
    'qty_type',
    'origin_territory',
    'origin_region',
    'arrival_port_code',
    'arrival_port_name',
    'departure_port_code',
    'departure_port_name',
    'dest_final',
    'coast_region',
    'clearing_district',
    'place_receipt',
    'hs_code',
    'joc_code',
    'carrier_name',
    'carrier_scac',
    'transport_mode',
]

In [3]:
print('Extracting CSV files...\n', 'Files to process: ', len(datafiles), '\n')

#extract each csv to a clean dataframe. The frames are collected in a list and
#combined once after the loop: concatenating onto a growing dataframe inside
#the loop re-copies all previously loaded rows for every file, and checking
#for an existing imports_df made re-running this cell append duplicate rows.
file_dfs = []
for filenumber, filename in enumerate(datafiles, start=1):
    start = time.time()
    print('Extracting file ', filenumber, '...')
    #read csv with appropriate dtypes
    file_df = pd.read_csv(os.path.join(path, filename), dtype=import_dtype_dict)
    #rename columns
    file_df = file_df.rename(columns=import_colnames_dict)
    #unpack strings to list objects
    file_df['container_ids'] = file_df['container_ids'].str.split()
    file_df['commod_short_desc_qty'] = file_df['commod_short_desc_qty'].str.split(pat=';')
    file_df['commod_short_desc'] = file_df['commod_short_desc'].str.split(pat=',')
    #recast dates to datetime
    file_df['date_arrival'] = pd.to_datetime(file_df['date_arrival'].astype(str), format='%Y%m%d')
    file_dfs.append(file_df)
    end = time.time()
    print('Extraction complete.\n', 'Time: {} sec \n'.format(end-start))

#fail with a clear message instead of an opaque error further down
if not file_dfs:
    raise FileNotFoundError('No data files found in {}'.format(path))

#create category unions across all files and assign the union to each df col,
#so the categorical dtypes match and survive the concat
for col in catcols:
    catunion = pd.api.types.union_categoricals([df[col] for df in file_dfs])
    for df in file_dfs:
        df[col] = pd.Categorical(df[col], categories=catunion.categories)

#concat to main df; ignore_index gives one unique row index instead of
#repeating each file's 0..n labels
imports_df = pd.concat(file_dfs, ignore_index=True)
#release the per-file frames
del file_dfs, file_df, df

Extracting CSV files...
 Files to process:  39 

Extracting file  1 ...
Extraction complete.
 Time: 30.30369281768799 sec 

Extracting file  2 ...
Extraction complete.
 Time: 38.606818199157715 sec 

Extracting file  3 ...
Extraction complete.
 Time: 20.177419900894165 sec 

Extracting file  4 ...
Extraction complete.
 Time: 33.34604597091675 sec 

Extracting file  5 ...
Extraction complete.
 Time: 34.10702681541443 sec 

Extracting file  6 ...
Extraction complete.
 Time: 33.43861198425293 sec 

Extracting file  7 ...
Extraction complete.
 Time: 20.76675009727478 sec 

Extracting file  8 ...
Extraction complete.
 Time: 38.86877512931824 sec 

Extracting file  9 ...
Extraction complete.
 Time: 47.7787709236145 sec 

Extracting file  10 ...
Extraction complete.
 Time: 47.76598501205444 sec 

Extracting file  11 ...
Extraction complete.
 Time: 36.88781189918518 sec 

Extracting file  12 ...
Extraction complete.
 Time: 68.1502730846405 sec 

Extracting file  13 ...
Extraction complete.
 Ti

: 

In [None]:
# inspect output: show the first rows, then print the dataframe summary
head_preview = imports_df.head()
display(head_preview)
imports_df.info()

## Load

In [None]:
#save to parquet file without the row index (requires fastparquet dependency)
imports_df.to_parquet(
    'data/piers_imports.parquet',
    engine='fastparquet',
    index=False,
)

#delete imports df now that it has been written to disk
del imports_df

In [None]:
#read the saved parquet file back into a dataframe and print its summary
imports_df = pd.read_parquet(
    'data/piers_imports.parquet',
    engine='fastparquet',
)
imports_df.info()