# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. Data is extracted from CSV files downloaded from the Global Trade Analytics Suite, assigned appropriate datatypes, concatenated into a single dataframe, and loaded to an Apache Parquet file for storage.

In [1]:
#import libraries
import pandas as pd
import os

#display settings: never truncate the column list when rendering a dataframe
pd.options.display.max_columns = None

## Extract and Transform

Read from csv into a pandas dataframe with appropriate dtypes

Note for future optimization: build a dictionary of column dtypes and assign within read_csv. 

In [2]:
def piers_imports_extractor(data):
    '''
    Reads one downloaded PIERS csv file and applies first-pass dtype cleaning
    INPUT:
        data - str - the csv file to be read, including the path from current directory
    OUTPUT:
        df - pandas dataframe with delimited strings unpacked to lists and the
             date, quantity and categorical columns recast
    '''
    #delimited string columns mapped to their separator (None splits on whitespace)
    list_delimiters = {
        'Container Number': None,
        'Quantity of Commodity Short Description': ';',
        'Commodity Short Description': ',',
    }
    #columns stored as pandas categoricals
    category_columns = (
        'Weight Unit', 'Quantity Type', 'Territory of Origin', 'Region of Origin',
        'Port of Arrival Code', 'Port of Arrival', 'Port of Departure Code', 'Port of Departure',
        'Final Destination', 'Coastal Region', 'Clearing District', 'Place of Receipt',
        'Shipper', 'Carrier', 'SCAC', 'Mode of Transport',
    )

    #load the file; the pyarrow engine is used so parsing can use more cpu cores
    df = pd.read_csv(data, engine='pyarrow')

    #unpack each delimited string into a list object
    for column, delimiter in list_delimiters.items():
        df[column] = df[column].str.split(pat=delimiter)

    #dates are parsed from their YYYYMMDD text form
    arrival_text = df['Arrival Date'].astype(str)
    df['Arrival Date'] = pd.to_datetime(arrival_text, format='%Y%m%d')

    #shrink quantity to the smallest integer dtype that holds its values
    df['Quantity'] = pd.to_numeric(df['Quantity'], downcast='integer')

    #recast one column at a time to categorical
    for column in category_columns:
        df[column] = df[column].astype('category')

    return df

In [3]:
#define path
path = 'data/raw/'
#get list of data files, ignoring any hidden files in directory
#sorted because os.listdir returns names in arbitrary order, which would make
#the row order of the combined dataframe differ between runs
datafiles = sorted(file for file in os.listdir(path) if not file.startswith('.'))
if not datafiles:
    #fail early with a clear message instead of a KeyError from the recast below
    raise FileNotFoundError(f'no data files found in {path}')

#extract each csv to a clean dataframe, then concat once;
#concatenating inside the loop re-copies every accumulated row on each iteration
file_dfs = [piers_imports_extractor(os.path.join(path, filename)) for filename in datafiles]
#NOTE: each file keeps its own index, so index labels repeat in the combined dataframe
imports_df = pd.concat(file_dfs)
#release the per-file dataframes now that their rows live in imports_df
del file_dfs

#recast to categorical dtypes; the cast is repeated after concat because
#categoricals with differing categories do not stay categorical when combined
category_columns = [
    'Weight Unit', 'Quantity Type', 'Territory of Origin', 'Region of Origin',
    'Port of Arrival Code', 'Port of Arrival', 'Port of Departure Code', 'Port of Departure',
    'Final Destination', 'Coastal Region', 'Clearing District', 'Place of Receipt',
    'Shipper', 'Carrier', 'SCAC', 'Mode of Transport',
]
for column in category_columns:
    imports_df[column] = imports_df[column].astype('category')

In [4]:
# preview the first five rows, then print the column / dtype summary
display(imports_df.head(n=5))
imports_df.info()

Unnamed: 0,Weight,Weight Unit,Quantity,Quantity Type,TEUs,Estimated Value,Arrival Date,Container Piece Count,Quantity of Commodity Short Description,Territory of Origin,Region of Origin,Port of Arrival Code,Port of Arrival,Port of Departure Code,Port of Departure,Final Destination,Coastal Region,Clearing District,Place of Receipt,Shipper,Shipper Address,Consignee,Consignee Address,Notify Party,Notify Party Address,Also Notify Party,Also Notify Party Address,Raw Commodity Description,Marks Container Number,Marks Description,HS Code,JOC Code,Commodity Short Description,Container Number,Carrier,SCAC,Vessel Name,Voyage Number,Pre Carrier,IMO Number,Inbond Code,Mode of Transport,Bill of Lading Number
0,9384.0,KG,920,PCS,2.0,26087.0,2023-01-31,1,[920],CHINA (MAINLAND),NORTH EAST ASIA,5,VANCOUVER BC,58023.0,BUSAN,,WEST,,QINGDAO,QINGDAO TIANYI NO 11 WOODWORK,,JYSK LINEN N FURNITURE,25 KING EDWARD ST,,,,,WOODEN JEWELRY CABINET ORDER NUMBER CD 124 528,DESCRIPTION PRODUCT DIMENSIONCOLOR MATERIAL N...,40,940350,7275000,"[FURNITURE, FIXTURES; NOS (* 7275)]",[MEDU9352303],MEDITERRANEAN SHIPPING COMPANY,MEDU,MSC DARWIN VI,UK251,,9200689.0,,MARITIME,SHKK156975611601
1,11416.0,KG,583,CTN,1.0,30138.0,2023-01-31,1,[583],CHINA (MAINLAND),NORTH EAST ASIA,5,VANCOUVER BC,58023.0,BUSAN,,WEST,,QINGDAO,YANTAI HONGTIAN AUTOPARTS,,UNI SELECT USA,8039 5TH LINE,,,,,BRAKE ROTOR 583CTNS 20PLTS,,20,870830,6922000,"[PARTS; AUTOMOBILE, MOTOR VECHILES]",[MEDU1232702],MEDITERRANEAN SHIPPING COMPANY,MEDU,MSC DARWIN VI,UK251,,9200689.0,,MARITIME,SHKK156976599151
2,21487.0,KG,1500,PKG,2.0,71121.0,2023-01-31,1,[1500],AUSTRALIA,OCEANIA,4601,NEW YORK,22519.0,COLON,,EAST,,MELBOURNE,TRIMBOLI FAMILY WINES,,LIONSTONE INTERNATIONAL,28188 N. BALLARD DRIVE,JF HILLEBRAND USA CUSTOMS DESK RAH,,,,WINE NOT SPARKLING GRAPE MUST WITH FERMT.PRE .,VT.BY ALCOHOL IN 2 L CONT.NO MARKS,40,2204,1673000,[WINE; NOS (* 1671/1674)],[TGHU5172870],CMA-CGM,CMDU,MARFRET GUYANE,0PPTX,,9362334.0,,MARITIME,BQEGAUA279294
3,94080.0,KG,5880,PKG,10.0,311400.0,2023-01-31,5,"[1176, 1176, 1176, 1176, 1176]",NEW ZEALAND,OCEANIA,4601,NEW YORK,24128.0,KINGSTON,,EAST,,NELSON,VINLINK MARLBOROUGH,,DC FLYNT MW SELECTIONS,902 S. DIVISION ST,DC FLYNT MW SELECTIONS,,,,WINE NOT SPARKLING GRAPE MUST WITH FERMT.PRE ....,VT.BY ALCOHOL IN 2 L CONT.NO MARKS VT.BY ALCOH...,40; 40; 40; 40; 40,2204,1673000,[WINE; NOS (* 1671/1674)],"[TEMU8382775, GAOU6853790, CMAU6652090, CMAU64...",CMA-CGM,CMDU,MARFRET GUYANE,0UADG,,9362334.0,,MARITIME,BQEGUSR019686
4,107100.0,KG,75,DRS,8.75,761480.0,2023-01-31,5,"[15, 15, 15, 15, 15]",CHINA (MAINLAND),NORTH EAST ASIA,5,VANCOUVER BC,57035.0,SHANGHAI,,WEST,,SHANGHAI,ZHEJIANG XINAN CHEMICAL INDUSTRIAL,,JET AIR FERRARA,,L E I,456 HUMBER PLACE,,,GLYPHOSATE 540G L SL PRESENT AS GLYPHOSATE PO ...,TASSIUM SALTNM TASSIUM SALTNM TASSIUM SALTNM T...,,380893,4051530,[INSECTICIDE],"[MSDU2444663, MSDU1536852, MEDU3857221, FTAU14...",MEDITERRANEAN SHIPPING COMPANY,MEDU,MSC VANESSA,251A,,9251688.0,,MARITIME,MOSJSZXS042667


<class 'pandas.core.frame.DataFrame'>
Index: 26255532 entries, 0 to 7437503
Data columns (total 43 columns):
 #   Column                                   Dtype         
---  ------                                   -----         
 0   Weight                                   float64       
 1   Weight Unit                              category      
 2   Quantity                                 int32         
 3   Quantity Type                            category      
 4   TEUs                                     float64       
 5   Estimated Value                          float64       
 6   Arrival Date                             datetime64[ns]
 7   Container Piece Count                    int64         
 8   Quantity of Commodity Short Description  object        
 9   Territory of Origin                      category      
 10  Region of Origin                         category      
 11  Port of Arrival Code                     category      
 12  Port of Arrival                 

## Load

In [5]:
#write the combined dataframe out as a parquet file
imports_df.to_parquet(path='data/piers_imports.parquet')

#drop the dataframe name so its memory can be reclaimed
del imports_df