In [1]:
#import libraries
import pandas as pd
import os
import polars as pl

# Show every column when a pandas frame is rendered (no column truncation).
pd.options.display.max_columns = None

# Turn on the global polars string cache used by Categorical columns.
pl.enable_string_cache()

## Extract and Transform 

Steps:

- Read raw CSV files
- Set dtypes
- Rename columns
- Split multi-value string columns into lists and parse the departure date
- Concatenate into the main dataframe

In [2]:
# Instantiate the column-name mapping: raw CSV header -> snake_case name
# applied to the loaded frame via `.rename(...)`.
export_colnames_dict = {
    # shipper
    'Shipper': 'shipper',
    'Shipper Address': 'shipper_address',
    # weight / quantity measures
    'Weight': 'weight',
    'Weight Unit': 'weight_unit',
    'Quantity': 'quantity',
    'Quantity Type': 'quantity_type',
    'TEUs': 'teus',
    # carrier, vessel and voyage identifiers
    'Carrier': 'carrier_name',
    'SCAC': 'carrier_scac',
    'Vessel Name': 'vessel_name',
    'Voyage Number': 'voyage_number',
    'Bill of Lading Number': 'bol_id',
    'IMO Number': 'imo_num',
    'Estimated Value': 'value_est',
    # departure port
    'Port of Departure Code': 'departure_port_code',
    'Port of Departure': 'departure_port_name',
    # containers
    'Container Number': 'container_ids',
    'Container Piece Count': 'container_piece_count',
    'Coastal Region': 'coast_region',
    # commodity descriptions and codes
    'Raw Commodity Description': 'commod_desc_raw',
    'Commodity Short Description': 'commod_short_desc',
    'HS Code': 'hs_code',
    'JOC Code': 'joc_code',
    'Quantity of Commodity Short Description': 'commod_short_desc_qty',
    # departure date, origin and destination
    'Departure Date': 'date_departure',
    'U.S. Origin': 'origin',
    'Destination Territory': 'dest_territory',
    'Destination Region': 'dest_region',
    'Declared Destination Port Code': 'dest_port_code_declared',
    'Declared Destination Port': 'dest_port_name',
}

# Instantiate the dtypes mapping: raw CSV header -> polars dtype, passed to
# the CSV reader. Keys are the raw headers because the dtypes are applied
# before the columns are renamed.
# NOTE: the identifier is spelled "dytpe"; it is kept as-is because the
# loading cell refers to it by this name.
export_dytpe_dict = {
    'Shipper': pl.Utf8,
    'Shipper Address': pl.Utf8,
    'Weight': pl.Float64,
    'Weight Unit': pl.Categorical,
    'Quantity': pl.Float64,
    'Quantity Type': pl.Categorical,
    'TEUs': pl.Float64,
    'Carrier': pl.Categorical,
    'SCAC': pl.Categorical,
    'Vessel Name': pl.Utf8,
    'Voyage Number': pl.Utf8,
    'Bill of Lading Number': pl.Utf8,
    'IMO Number': pl.Int32,
    'Estimated Value': pl.Float64,
    'Port of Departure Code': pl.Categorical,
    'Port of Departure': pl.Categorical,
    # read as a string; split into a list after loading
    'Container Number': pl.Utf8,
    'Container Piece Count': pl.Int32,
    'Coastal Region': pl.Categorical,
    'Raw Commodity Description': pl.Utf8,
    # the next four are read as strings; split into lists after loading
    'Commodity Short Description': pl.Utf8,
    'HS Code': pl.Utf8,
    'JOC Code': pl.Utf8,
    'Quantity of Commodity Short Description': pl.Utf8,
    # read as a string; parsed to a datetime after loading
    'Departure Date': pl.Utf8,
    'U.S. Origin': pl.Utf8,
    'Destination Territory': pl.Categorical,
    'Destination Region': pl.Categorical,
    'Declared Destination Port Code': pl.Int32,
    'Declared Destination Port': pl.Categorical,
}

In [3]:
# Directory holding the raw export CSV files.
path = 'data/raw/exports/'

# List the directory once. Only regular *.csv files are kept (stray entries
# such as .DS_Store or .ipynb_checkpoints would otherwise be handed to the CSV
# reader), and the names are sorted so the row order of the combined frame is
# reproducible regardless of the order the OS returns directory entries in.
export_files = sorted(
    name for name in os.listdir(path)
    if name.lower().endswith('.csv')
    and os.path.isfile(os.path.join(path, name))
)
total_files = len(export_files)

# Per-file frames are gathered here and concatenated once after the loop.
# Concatenating onto a growing frame inside the loop can re-copy every
# previously loaded row on each iteration.
file_frames = []

for filenumber, file in enumerate(export_files, start=1):
    #create polars dataframe from file
    file_pldf = (
        #scan csv with explicit dtypes (keyed by the raw header names)
        pl.scan_csv(os.path.join(path, file), dtypes=export_dytpe_dict)
        #rename columns to snake_case
        .rename(export_colnames_dict)
        #split delimited strings to lists
        .with_columns([pl.col('container_ids').str.split(' '),
                       pl.col('hs_code').str.split(' '),
                       pl.col('joc_code').str.split(' '),
                       pl.col('commod_short_desc').str.split(','),
                       pl.col('commod_short_desc_qty').str.split(';')])
        #set departure date to datetime (source format is YYYYMMDD)
        .with_columns(pl.col('date_departure').str.to_datetime('%Y%m%d'))
        #collect scan
        .collect())
    file_frames.append(file_pldf)
    print('File {} of {} complete.'.format(filenumber, total_files))

#concat all files into the main pldf in a single pass; pl.concat rejects an
#empty list, so fall back to an empty frame when no CSV files were found
exports_pldf = pl.concat(file_frames) if file_frames else pl.DataFrame()
    

File 1 of 38 complete.
File 2 of 38 complete.
File 3 of 38 complete.
File 4 of 38 complete.
File 5 of 38 complete.
File 6 of 38 complete.
File 7 of 38 complete.
File 8 of 38 complete.
File 9 of 38 complete.
File 10 of 38 complete.
File 11 of 38 complete.
File 12 of 38 complete.
File 13 of 38 complete.
File 14 of 38 complete.
File 15 of 38 complete.
File 16 of 38 complete.
File 17 of 38 complete.
File 18 of 38 complete.
File 19 of 38 complete.
File 20 of 38 complete.
File 21 of 38 complete.
File 22 of 38 complete.
File 23 of 38 complete.
File 24 of 38 complete.
File 25 of 38 complete.
File 26 of 38 complete.
File 27 of 38 complete.
File 28 of 38 complete.
File 29 of 38 complete.
File 30 of 38 complete.
File 31 of 38 complete.
File 32 of 38 complete.
File 33 of 38 complete.
File 34 of 38 complete.
File 35 of 38 complete.
File 36 of 38 complete.
File 37 of 38 complete.
File 38 of 38 complete.


## Load

Save dataframe to parquet file.

In [4]:
# Persist the combined exports frame to a single parquet file.
parquet_path = 'data/exports_complete.parquet'
exports_pldf.write_parquet(parquet_path)