# PIERS Container BOL Data ETL 

This notebook builds an ETL pipeline for S&P Global's PIERS data. 

In [4]:
#import libraries
import pandas as pd
import os
#from dask import dataframe as dd 

In [5]:
#display options: never truncate the column list when rendering a dataframe
pd.options.display.max_columns = None

## Extract

Read each CSV file into a pandas DataFrame with appropriate column names and dtypes.

In [6]:
def piers_imports_extractor(data, columnnames):
    '''
    Extracts from a downloaded PIERS csv file and performs initial cleaning
    INPUT:
        data - str - the csv file to be extracted, including the path from current directory
        columnnames - list - the names to be assigned to each column, in file order
    OUTPUT:
        df - pandas dataframe with appropriate column names and dtypes
    RAISES:
        ValueError - if len(columnnames) differs from the number of columns in the file
        KeyError - if columnnames lacks one of the columns recast below
    '''
    #columns recast to categorical dtype; kept in one place so the
    #selection and the assignment target cannot drift apart
    category_columns = [
        'weight_unit', 'quantity_type', 'origin_territory', 'origin_region',
        'port_arrival_code', 'port_arrival_name', 'port_departure_code',
        'port_departure_name', 'destination_final', 'coastal_region',
        'clearing_district', 'place_of_receipt', 'shipper_name',
        'carrier_name', 'carrier_scac', 'transport_mode',
    ]
    #read csv file
    df = pd.read_csv(data)
    #assign pythonic column names
    df.columns = columnnames
    #unpack strings to list objects
    #container numbers are split on whitespace (str.split default)
    df['container_numbers'] = df['container_numbers'].str.split()
    df['commodity_description_short_quantities'] = (
        df['commodity_description_short_quantities'].str.split(pat=';')
    )
    df['commodity_descriptions_short'] = df['commodity_descriptions_short'].str.split(pat=',')
    #recast dates to datetime; values are parsed as YYYYMMDD after casting to str
    df['date_arrival'] = pd.to_datetime(df['date_arrival'].astype(str), format='%Y%m%d')
    #recast to the smallest integer dtype that holds every value
    df['quantity'] = pd.to_numeric(df['quantity'], downcast='integer')
    #recast to categorical dtypes
    df[category_columns] = df[category_columns].astype('category')
    return df

In [7]:
#location of the raw export to load
path = 'data/raw/'
filename = 'PIERS import records 2023-02-01 to 2023-09-05 CB5380DE33CB4169A676AD839290E38E.csv'

#pythonic column names: the first data row of column_names.csv, as a list
columnnames = list(pd.read_csv('column_names.csv').iloc[0])

#trial run of the extractor on the single file above
file_df = piers_imports_extractor(path + filename, columnnames)

In [8]:
#print a summary of the extracted dataframe (index range, columns, dtypes)
file_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7437504 entries, 0 to 7437503
Data columns (total 43 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   weight                                  float64       
 1   weight_unit                             category      
 2   quantity                                int32         
 3   quantity_type                           category      
 4   teus                                    float64       
 5   value_est                               float64       
 6   date_arrival                            datetime64[ns]
 7   container_piece_count                   int64         
 8   commodity_description_short_quantities  object        
 9   origin_territory                        category      
 10  origin_region                           category      
 11  port_arrival_code                       category      
 12  port_arrival_name                       ca

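Note for future optimization: build a dictionary mapping column names to dtypes and pass it to `pd.read_csv` via the `dtype` argument, so columns are typed while the file is parsed instead of being recast afterwards.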

## Transform

Concatenate the dataframes extracted from each file into a single dataframe.

**TODO:** this step is not implemented yet.



## Load

In [None]:
#save to parquet file
#df.to_parquet('data/piers_imports.parquet')
