# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. 

In [None]:
#import libraries
import pandas as pd
import os

In [None]:
#display options: never truncate the column list when a dataframe is rendered
pd.options.display.max_columns = None

## Extract and Transform

Read each raw CSV file into a pandas DataFrame, unpack the delimited string columns into lists, and cast the remaining columns to appropriate dtypes.

In [None]:
def piers_imports_extractor(data, engine='pyarrow'):
    '''
    Extracts from downloaded PIERS csv files and performs initial cleaning
    INPUT:
        data - str - the csv file to be extracted, including the path from current directory
        engine - str - parser engine handed to pd.read_csv (default 'pyarrow')
    OUTPUT:
        df - pandas dataframe with the delimited text columns unpacked to lists,
             'Arrival Date' as datetime (NaT where the date is missing),
             'Quantity' as a downcast numeric and the label columns as categoricals
    RAISES:
        KeyError - if one of the expected columns is missing from the file
        ValueError - if an arrival date does not match the YYYYMMDD format
    '''
    #columns stored as categoricals; kept in one place so the list is not repeated
    categorical_cols = [
        'Weight Unit', 'Quantity Type', 'Territory of Origin', 'Region of Origin',
        'Port of Arrival Code', 'Port of Arrival', 'Port of Departure Code', 'Port of Departure',
        'Final Destination', 'Coastal Region', 'Clearing District', 'Place of Receipt',
        'Shipper', 'Carrier', 'SCAC', 'Mode of Transport',
    ]
    #read csv file
    df = pd.read_csv(data, engine=engine) # using the pyarrow engine engages more cpu cores
    #unpack strings to list objects
    df['Container Number'] = df['Container Number'].str.split()
    df['Quantity of Commodity Short Description'] = df['Quantity of Commodity Short Description'].str.split(pat=';')
    df['Commodity Short Description'] = df['Commodity Short Description'].str.split(pat=',')
    #recast dates to datetime
    arrival = df['Arrival Date']
    if pd.api.types.is_float_dtype(arrival):
        #a date column with blanks is parsed as float (20200115.0); go through nullable
        #integers so the text is '20200115' again and the blanks end up as NaT
        arrival = arrival.astype('Int64').astype('string')
    else:
        arrival = arrival.astype(str)
    df['Arrival Date'] = pd.to_datetime(arrival, format='%Y%m%d')
    #recast to int
    df['Quantity'] = pd.to_numeric(df['Quantity'], downcast='integer')
    #recast to categorical dtypes
    df = df.astype({col: 'category' for col in categorical_cols})
    return df

In [None]:
#define path
path = 'data/raw/'
#get list of data files, ignoring any hidden files in directory
#os.listdir returns entries in arbitrary order, so sort to keep the row order reproducible
datafiles = sorted(file for file in os.listdir(path) if not file.startswith('.'))
if not datafiles:
    #fail here with a clear message rather than later on a missing-column lookup
    raise FileNotFoundError(f'no data files found in {path}')

#extract each csv to a clean dataframe, collecting them so they are concatenated only once
#(concatenating inside the loop re-copies every previously loaded row on each pass)
file_dfs = []
for filename in datafiles:
    print(filename)
    file_dfs.append(piers_imports_extractor(path+filename))
#ignore_index gives the combined frame one unique running index instead of
#repeating each file's own 0..n-1 row labels
imports_df = pd.concat(file_dfs, ignore_index=True)
#release the per-file frames now that their rows live in imports_df
del file_dfs

#recast to categorical dtypes
#concat falls back to object dtype when the per-file categories differ, so cast again
categorical_cols = [
    'Weight Unit', 'Quantity Type', 'Territory of Origin', 'Region of Origin',
    'Port of Arrival Code', 'Port of Arrival', 'Port of Departure Code', 'Port of Departure',
    'Final Destination', 'Coastal Region', 'Clearing District', 'Place of Receipt',
    'Shipper', 'Carrier', 'SCAC', 'Mode of Transport',
]
imports_df = imports_df.astype({col: 'category' for col in categorical_cols})

In [None]:
#preview the first five rows of the combined dataframe
display(imports_df.head(n=5))
#summarize column dtypes, non-null counts and memory usage
imports_df.info()

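Note for future optimization: build a dictionary mapping column names to dtypes and pass it to `read_csv` through its `dtype` argument, so the columns are typed while the file is read instead of being recast afterwards.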

## Load

In [None]:
#write the combined dataframe out as a parquet file
imports_df.to_parquet(path='data/piers_imports.parquet')

#drop the notebook's reference to the dataframe now that it is saved
del imports_df