# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. 

In [17]:
#import libraries
import pandas as pd
import os

In [18]:
# display settings: show every column when a DataFrame is rendered (None removes the column limit)
pd.options.display.max_columns = None

## Extract

In [19]:
# Optional check, left disabled: read only the header row (nrows=0) of the raw file and
# list its column names, to confirm that their names and order match across downloads.
# pd.read_csv('data/raw/PIERS import records 2023-02-01 to 2023-09-05 CB5380DE33CB4169A676AD839290E38E.csv', header=0, nrows=0).columns.tolist()

In [20]:
# Load the raw PIERS import-records export into a DataFrame.
# NOTE: in the final function this read is meant to become a with/as statement.
raw_csv_path = 'data/raw/PIERS import records 2023-02-01 to 2023-09-05 CB5380DE33CB4169A676AD839290E38E.csv'
df = pd.read_csv(filepath_or_buffer=raw_csv_path)

In [21]:
# summarise the raw frame (row count, column names, dtypes) before any transformation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7437504 entries, 0 to 7437503
Data columns (total 43 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   Weight                                   float64
 1   Weight Unit                              object 
 2   Quantity                                 float64
 3   Quantity Type                            object 
 4   TEUs                                     float64
 5   Estimated Value                          float64
 6   Arrival Date                             int64  
 7   Container Piece Count                    int64  
 8   Quantity of Commodity Short Description  object 
 9   Territory of Origin                      object 
 10  Region of Origin                         object 
 11  Port of Arrival Code                     int64  
 12  Port of Arrival                          object 
 13  Port of Departure Code                   int64  
 14  Port of Departure 

## Transform

Note: the rename below is positional – new names are assigned to columns by position, not matched by name – so it is only correct if every downloaded file has exactly the same columns in exactly the same order.

In [22]:
# Replace the raw export headers with pythonic column names.

# column_names.csv is parsed with its first line as the header, so the replacement
# names are taken from the first data row beneath it.
name_table = pd.read_csv('column_names.csv')
columnnames = name_table.iloc[0].tolist()
# Assignment is positional: the n-th name is given to the n-th column of df.
df.columns = columnnames

In [23]:
#unpack delimited strings to list objects

def _split_text(value, sep=None):
    """Split ``value`` on ``sep`` when it is a string; return anything else unchanged.

    ``sep=None`` splits on runs of whitespace, exactly like ``str.split()``.
    Non-string values (NaN, or a list produced by an earlier run) pass through as-is.
    """
    return value.split(sep) if isinstance(value, str) else value


# Column -> delimiter used inside that column's packed string (None = whitespace).
list_column_separators = {
    'container_numbers': None,
    'commodity_description_short_quantities': ';',
    'commodity_descriptions_short': ',',
}

# Splitting element-wise, and only while a value is still a string, keeps this cell
# safe to re-run: `.str.split` applied to already-split (list) values yields NaN,
# so a second execution would silently blank out all three columns.
for column, sep in list_column_separators.items():
    df[column] = df[column].map(lambda value, sep=sep: _split_text(value, sep))

In [24]:
# TODO: expand commodity descriptions and their associated quantities (open question – approach not yet decided)

In [25]:
# peek at the arrival dates before conversion: at this point they are integers in YYYYMMDD form
df.date_arrival.head()

0    20230905
1    20230905
2    20230905
3    20230905
4    20230905
Name: date_arrival, dtype: int64

In [26]:
#recast dates to datetime
# date_arrival starts out as integers such as 20230905, so each value is rendered as
# text and parsed with an explicit YYYYMMDD format.
# The dtype guard makes the cell safe to re-run: once the column is datetime64,
# astype(str) would produce 'YYYY-MM-DD' strings that no longer match the format,
# and pd.to_datetime would raise.
if not pd.api.types.is_datetime64_any_dtype(df['date_arrival']):
    df['date_arrival'] = pd.to_datetime(df['date_arrival'].astype(str), format='%Y%m%d')

In [27]:
# Downcast quantity from float to a compact integer dtype.
# NOTE(review): downcast='integer' appears to be best-effort - if the values cannot be
# held as integers the column presumably keeps its current dtype; confirm with df.info().
df['quantity'] = pd.to_numeric(df['quantity'], downcast='integer')


In [28]:
# Columns to store with pandas' categorical dtype.
# The list is written once and used on both sides of the assignment below.
categorical_columns = [
    'weight_unit',
    'quantity_type',
    'origin_territory',
    'origin_region',
    'port_arrival_code',
    'port_arrival_name',
    'port_departure_code',
    'port_departure_name',
    'destination_final',
    'coastal_region',
    'clearing_district',
    'place_of_receipt',
    'shipper_name',
    'carrier_name',
    'carrier_scac',
    'transport_mode',
]
df[categorical_columns] = df[categorical_columns].astype('category')

In [29]:
# re-inspect the frame to confirm the new column names and the recast dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7437504 entries, 0 to 7437503
Data columns (total 43 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   weight                                  float64       
 1   weight_unit                             category      
 2   quantity                                int32         
 3   quantity_type                           category      
 4   teus                                    float64       
 5   value_est                               float64       
 6   date_arrival                            datetime64[ns]
 7   container_piece_count                   int64         
 8   commodity_description_short_quantities  object        
 9   origin_territory                        category      
 10  origin_region                           category      
 11  port_arrival_code                       category      
 12  port_arrival_name                       ca

## Load

In [30]:
# Persist the transformed frame as a Parquet file.
parquet_path = 'data/piers_imports.parquet'
df.to_parquet(path=parquet_path)
