# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. Data is extracted from CSV files downloaded from the Global Trade Analytics Suite, assigned appropriate datatypes, concatendated into a single dataframe, and loaded to an Apache Parquet file for storage.

In [1]:
#import libraries
import pandas as pd
import os
import time
import polars as pl

#display settings
pd.set_option('display.max_columns', None)

## Extract and Transform

Read from csv into a polars dataframe with appropriate dtypes

In [4]:
df = pd.read_csv('data/raw/exports/PIERS export records 2010 01-06 874C48B941A54798B9047F6C51E0EC42.csv', nrows=10)

In [7]:
df.head()

Unnamed: 0,Shipper,Shipper Address,Weight,Weight Unit,Quantity,Quantity Type,TEUs,Carrier,SCAC,Vessel Name,Voyage Number,Bill of Lading Number,IMO Number,Estimated Value,Port of Departure Code,Port of Departure,Container Number,Container Piece Count,Coastal Region,Raw Commodity Description,Commodity Short Description,HS Code,JOC Code,Quantity of Commodity Short Description,Departure Date,U.S. Origin,Destination Territory,Destination Region,Declared Destination Port Code,Declared Destination Port
0,NACA LOGISTICS USA INC,300 MIDDLESEX AVE CARTARET,0,KG,1801.0,CF,0,N Y K LINE,NYKS,NYK DEMETER,1526,6061172270,9337664,0,4601,NEW YORK,NYKU5485475,1,EAST,INSULATION TAPE; ROLLS OF FABRIC; AUTO PTS; NO...,"TAPE; ELECTRICAL BRAID,CANVAS,CLOTH,FABRIC,TEX...",391910 621790 382490 870899 280800 320710 8541...,7905545 3956000 4999997 6922000 4162500 473880...,110; 86; 25; 20; 9; 5; 3; 3; 3; 3; 1; 1; 1; 2,20100630,,CHINA (MAINLAND),NORTH EAST ASIA,57035,SHANGHAI
1,PANTAINER EXPRESS LINE,21112 72ND AVE S,0,KG,0.0,,0,K LINE,KKLU,HANJIN LONDON,107,US0891771,9111383,0,2904,PORTLAND OR,KKFU1437061,1,WEST,NON HAZ SODIUM BENZOATE,SODIUM BENZOATE,291631,4036043,800,20100630,,REPUBLIC OF KOREA,NORTH EAST ASIA,58023,BUSAN
2,P V PEREGAN CORP,28971 PALOS VERDES DR E,0,KG,0.0,CF,0,HANJIN SHIPPING COMPANY LTD,HJSC,BAY BRIDGE,45,PHXO16535700,9463267,0,2811,OAKLAND,HJCU1233571,1,WEST,SINGLE SCRAP METAL,"METAL; CONCENTRATES,RESIDUE,SCRAP",811291,6037000,1,20100630,,HONG KONG,NORTH EAST ASIA,58201,HONG KONG
3,SIMMONDS CANADA CARIBBEAN LINK,9 RUTHLEDGE AVE,0,KG,0.0,CF,0,TROPICAL SHIPPING,TSCW,TROPIC TIDE,862,TSCW9590114,9039028,0,5204,W PALM BCH,FSCU7056551,1,EAST,HOUSEHOLD GOODS INCLUDING CANDRY FOOOS HOUSEHO...,"HOUSEHOLD GOODS, PERSONAL EFFECTS",000797,7970000,7,20100630,,SAINT KITTS AND NEVIS,CARIBBEAN,24835,BASSETERRE
4,LESCHACO INC,15355 VANTAGE PKWY W STE 195,0,KG,0.0,CF,0,MEDITERRANEAN SHIPPING COMPANY,MSCU,MSC ILONA,8,MSCUC6083392,9225641,0,1601,CHARLESTON,EXFU0578350,1,EAST,MENTHOL DL NOT REGULATED,"MEDICINALS, PHARMACEUTICALS; NOS",300490,4400000,1,20100630,,GERMANY,NORTH EUROPE,42870,BREMERHAVEN


In [None]:
export_dytpe_dict = {'Shipper': 'object',
                    'Shipper Address': 'object',
                    'Weight': 'float64',
                    'Weight Unit': 'category',
                    'Quantity': 'float64',
                    'Quantity Type': 'category',
                    'TEUs': 'float64',
                    'Carrier': 'category',
                    'SCAC': 'category',
                    'Vessel Name': 'object',
                    'Voyage Number': 'int32',
                    'Bill of Lading Number': 'object',
                    'IMO Number': 'int32',
                    'Estimated Value': 'float64',
                    'Port of Departure Code': 'int32',
                    'Port of Departure': 'category',
                    'Container Number': 'object',
                    'Container Piece Count': 'int32',
                    'Coastal Region': 'category',
                    'Raw Commodity Description': 'object',
                    'Commodity Short Description': 'object',
                    'HS Code': 'object',
                    'JOC Code': 'object',
                    'Quantity of Commodity Short Description': 'object',
                    'Departure Date': 'int32',
                    'U.S. Origin': 'origin',
                    'Destination Territory': 'category',
                    'Destination Region': 'category',
                    'Declared Destination Port Code': 'int32',
                    'Declared Destination Port': 'category'}

In [11]:
df.columns

Index(['Shipper', 'Shipper Address', 'Weight', 'Weight Unit', 'Quantity',
       'Quantity Type', 'TEUs', 'Carrier', 'SCAC', 'Vessel Name',
       'Voyage Number', 'Bill of Lading Number', 'IMO Number',
       'Estimated Value', 'Port of Departure Code', 'Port of Departure',
       'Container Number', 'Container Piece Count', 'Coastal Region',
       'Raw Commodity Description', 'Commodity Short Description', 'HS Code',
       'JOC Code', 'Quantity of Commodity Short Description', 'Departure Date',
       'U.S. Origin', 'Destination Territory', 'Destination Region',
       'Declared Destination Port Code', 'Declared Destination Port'],
      dtype='object')

In [12]:
export_col_names = ['shipper', 'shipper_address', 'weight', 'weight_unit', 'quantity', 'quantity_type', 'teus', 'carrier_name', 'carrier_scac', 'vessel_name', 'voyage_number', 'bol_id', 'imo_num', 'value_est', 'departure_port_code', 'departure_port_name', 'container_ids', 'container_piece_count', 'coast_region', 'commod_desc_raw', 'commod_short_desc', 'hs_code', 'joc_code', 'commod_short_desc_qty', 'date_departure', 'origin', 'dest_territory', 'dest_region', 'dest_port_code_declared', 'dest_port_name']

In [13]:
export_colnames_dict = dict(list(zip(df.columns, export_col_names)))

In [14]:
export_colnames_dict

{'Shipper': 'shipper',
 'Shipper Address': 'shipper_address',
 'Weight': 'weight',
 'Weight Unit': 'weight_unit',
 'Quantity': 'quantity',
 'Quantity Type': 'quantity_type',
 'TEUs': 'teus',
 'Carrier': 'carrier_name',
 'SCAC': 'carrier_scac',
 'Vessel Name': 'vessel_name',
 'Voyage Number': 'voyage_number',
 'Bill of Lading Number': 'bol_id',
 'IMO Number': 'imo_num',
 'Estimated Value': 'value_est',
 'Port of Departure Code': 'departure_port_code',
 'Port of Departure': 'departure_port_name',
 'Container Number': 'container_ids',
 'Container Piece Count': 'container_piece_count',
 'Coastal Region': 'coast_region',
 'Raw Commodity Description': 'commod_desc_raw',
 'Commodity Short Description': 'commod_short_desc',
 'HS Code': 'hs_code',
 'JOC Code': 'joc_code',
 'Quantity of Commodity Short Description': 'commod_short_desc_qty',
 'Departure Date': 'date_departure',
 'U.S. Origin': 'origin',
 'Destination Territory': 'dest_territory',
 'Destination Region': 'dest_region',
 'Declared D

In [5]:
pldf = (
    pl.scan_csv('data/raw/exports/PIERS export records 2010 01-06 874C48B941A54798B9047F6C51E0EC42.csv', infer_schema_length=10000)
    .collect()
)

df = pl.to_pandas(pldf)

ComputeError: could not parse `1685.62` as dtype `i64` at column 'Weight' (column number 3)

The current offset in the file is 53992750 bytes.

You might want to try:
- increasing `infer_schema_length` (e.g. `infer_schema_length=10000`),
- specifying correct dtype with the `dtypes` argument
- setting `ignore_errors` to `True`,
- adding `1685.62` to the `null_values` list.

Original error: ```remaining bytes non-empty```

In [2]:
#define path
path = 'data/raw/'
#get list of data files, ignoring any hidden files in directory 
datafiles = [file for file in os.listdir(path) if not file.startswith('.')]
#init filenumber
filenumber = 1
#define new col names
import_colnames_dict = {'Weight': 'weight',
                        'Weight Unit': 'weight_unit',
                        'Quantity': 'qty',
                        'Quantity Type': 'qty_type',
                        'TEUs': 'teus',
                        'Estimated Value': 'value_est',
                        'Arrival Date': 'date_arrival',
                        'Container Piece Count': 'container_piece_count',
                        'Quantity of Commodity Short Description': 'commod_short_desc_qty',
                        'Territory of Origin': 'origin_territory',
                        'Region of Origin': 'origin_region',
                        'Port of Arrival Code': 'arrival_port_code',
                        'Port of Arrival': 'arrival_port_name',
                        'Port of Departure Code': 'departure_port_code',
                        'Port of Departure': 'departure_port_name',
                        'Final Destination': 'dest_final',
                        'Coastal Region': 'coast_region',
                        'Clearing District': 'clearing_district',
                        'Place of Receipt': 'place_receipt',
                        'Shipper': 'shipper_name',
                        'Shipper Address': 'shipper_address',
                        'Consignee': 'consignee_name',
                        'Consignee Address': 'consignee_address',
                        'Notify Party': 'notify_party1_name',
                        'Notify Party Address': 'notify_party1_address',
                        'Also Notify Party': 'notify_party2_name',
                        'Also Notify Party Address': 'notify_party2_address',
                        'Raw Commodity Description': 'commod_desc_raw',
                        'Marks Container Number': 'container_id_marks',
                        'Marks Description': 'marks_desc',
                        'HS Code': 'hs_code',
                        'JOC Code': 'joc_code',
                        'Commodity Short Description': 'commod_short_desc',
                        'Container Number': 'container_ids',
                        'Carrier': 'carrier_name',
                        'SCAC': 'carrier_scac',
                        'Vessel Name': 'vessel_name',
                        'Voyage Number': 'vessel_id',
                        'Pre Carrier': 'precarrier',
                        'IMO Number': 'imo_num',
                        'Inbond Code': 'inbond_code',
                        'Mode of Transport': 'transport_mode',
                        'Bill of Lading Number': 'bol_id'}
#define dtypes
import_dtype_dict = {'Weight': 'float64',
            'Weight Unit': 'category',
            'Quantity': 'float64',
            'Quantity Type': 'category',
            'TEUs': 'float64',
            'Estimated Value': 'float64',
            'Arrival Date': 'int64',
            'Container Piece Count': 'int64',
            'Quantity of Commodity Short Description': 'object',
            'Territory of Origin': 'category',
            'Region of Origin': 'category',
            'Port of Arrival Code': 'category',
            'Port of Arrival': 'category',
            'Port of Departure Code': 'category',
            'Port of Departure': 'category',
            'Final Destination': 'category',
            'Coastal Region': 'category',
            'Clearing District': 'category',
            'Place of Receipt': 'category',
            'Shipper': 'object',
            'Shipper Address': 'object',
            'Consignee': 'object',
            'Consignee Address': 'object',
            'Notify Party': 'object',
            'Notify Party Address': 'object',
            'Also Notify Party': 'object',
            'Also Notify Party Address': 'object',
            'Raw Commodity Description': 'object',
            'Marks Container Number': 'object',
            'Marks Description': 'object',
            'HS Code': 'category',
            'JOC Code': 'category',
            'Commodity Short Description': 'object',
            'Container Number': 'object',
            'Carrier': 'category',
            'SCAC': 'category',
            'Vessel Name': 'object',
            'Voyage Number': 'object',
            'Pre Carrier': 'float64',
            'IMO Number': 'float64',
            'Inbond Code': 'float64',
            'Mode of Transport': 'category',
            'Bill of Lading Number': 'object'}
#define category variable cols
catcols = ['weight_unit', 'qty_type', 'origin_territory', 'origin_region', 'arrival_port_code', 
           'arrival_port_name', 'departure_port_code', 'departure_port_name', 'dest_final', 'coast_region', 
           'clearing_district', 'place_receipt', 'hs_code', 'joc_code', 'carrier_name', 'carrier_scac', 
           'transport_mode']

In [3]:
print('Extracting CSV files...\n', 'Files to process: ', len(datafiles), '\n')

#extract from csv to clean dataframes and concat
for filename in datafiles:
    start = time.time()
    print('Extracting file ', filenumber, '...')
    #read csv with appropriate dtypes
    file_df = pd.read_csv(path+filename, dtype=import_dtype_dict)
    #rename columns
    file_df.rename(columns=import_colnames_dict, inplace=True)
    #unpack strings to list objects
    file_df.container_ids = file_df.container_ids.str.split()
    file_df.commod_short_desc_qty = file_df.commod_short_desc_qty.str.split(pat=';')
    file_df.commod_short_desc = file_df.commod_short_desc.str.split(pat=',')
    #recast dates to datetime 
    file_df.date_arrival = pd.to_datetime(file_df.date_arrival.astype(str), format='%Y%m%d') 
    #concat to or create main imports df
    if 'imports_df' in locals():
        #create category unions and assign union to each df col
        for col in catcols:
            catunion = pd.api.types.union_categoricals([imports_df[col], file_df[col]])
            imports_df[col] = pd.Categorical(imports_df[col], categories=catunion.categories)
            file_df[col] = pd.Categorical(file_df[col], categories=catunion.categories)
        #concat to main df
        imports_df = pd.concat([imports_df, file_df])
    else:
        imports_df = file_df 
    del file_df
    end = time.time()
    print('Extraction complete.\n', 'Time: {} sec \n'.format(end-start))
    filenumber += 1

Extracting CSV files...
 Files to process:  39 

Extracting file  1 ...
Extraction complete.
 Time: 30.30369281768799 sec 

Extracting file  2 ...
Extraction complete.
 Time: 38.606818199157715 sec 

Extracting file  3 ...
Extraction complete.
 Time: 20.177419900894165 sec 

Extracting file  4 ...
Extraction complete.
 Time: 33.34604597091675 sec 

Extracting file  5 ...
Extraction complete.
 Time: 34.10702681541443 sec 

Extracting file  6 ...
Extraction complete.
 Time: 33.43861198425293 sec 

Extracting file  7 ...
Extraction complete.
 Time: 20.76675009727478 sec 

Extracting file  8 ...
Extraction complete.
 Time: 38.86877512931824 sec 

Extracting file  9 ...
Extraction complete.
 Time: 47.7787709236145 sec 

Extracting file  10 ...
Extraction complete.
 Time: 47.76598501205444 sec 

Extracting file  11 ...
Extraction complete.
 Time: 36.88781189918518 sec 

Extracting file  12 ...
Extraction complete.
 Time: 68.1502730846405 sec 

Extracting file  13 ...
Extraction complete.
 Ti

: 

In [None]:
# inspect output 
display(imports_df.head())
imports_df.info()

## Load

In [None]:
#save to parquet file
imports_df.to_parquet('data/piers_imports.parquet', index=False, engine='fastparquet') #requires fastparquet dependency  

#delete imports df
del imports_df

In [None]:
imports_df = pd.read_parquet('data/piers_imports.parquet', engine='fastparquet')
imports_df.info()