In [1]:
#import libraries
import glob
from pathlib import Path

import polars as pl

#enable the global string cache so categorical columns created from separate
#frames share one string-to-code mapping (see polars StringCache docs)
pl.enable_string_cache()

In [2]:
#single source of truth for the export columns, one row per raw csv column:
#(raw csv header, cleaned column name, polars dtype)
#row order defines the insertion order of both dicts built below
_export_column_spec = [
    ('Shipper', 'shipper', pl.Utf8),
    ('Shipper Address', 'shipper_address', pl.Utf8),
    ('Weight', 'weight', pl.Float64),
    ('Weight Unit', 'weight_unit', pl.Categorical),
    ('Quantity', 'qty', pl.Float64),
    ('Quantity Type', 'quantity_type', pl.Categorical),
    ('TEUs', 'teus', pl.Float64),
    ('Carrier', 'carrier_name', pl.Categorical),
    ('SCAC', 'carrier_scac', pl.Categorical),
    ('Vessel Name', 'vessel_name', pl.Utf8),
    ('Voyage Number', 'voyage_number', pl.Utf8),
    ('Bill of Lading Number', 'bol_number', pl.Utf8),
    ('IMO Number', 'vessel_id', pl.Int32),
    ('Estimated Value', 'value_est', pl.Float64),
    ('Port of Departure Code', 'departure_port_code', pl.Categorical),
    ('Port of Departure', 'departure_port_name', pl.Categorical),
    ('Container Number', 'container_ids', pl.Utf8),
    ('Container Piece Count', 'container_piece_count', pl.Int32),
    ('Coastal Region', 'coast_region', pl.Categorical),
    ('Raw Commodity Description', 'commod_desc_raw', pl.Utf8),
    ('Commodity Short Description', 'commod_short_desc', pl.Utf8),
    ('HS Code', 'hs_code', pl.Utf8),
    ('JOC Code', 'joc_code', pl.Utf8),
    ('Quantity of Commodity Short Description', 'commod_short_desc_qty', pl.Utf8),
    ('Departure Date', 'date', pl.Utf8),
    ('U.S. Origin', 'origin', pl.Utf8),
    ('Destination Territory', 'dest_territory', pl.Categorical),
    ('Destination Region', 'dest_region', pl.Categorical),
    ('Declared Destination Port Code', 'arrival_port_code', pl.Categorical),
    ('Declared Destination Port', 'arrival_port_name', pl.Categorical),
]

#instantiate column name dict: raw csv header -> cleaned column name
export_colnames_dict = {
    raw_name: clean_name for raw_name, clean_name, _dtype in _export_column_spec
}

#instantiate schema: raw csv header -> polars dtype
export_schema = {
    raw_name: dtype for raw_name, _clean_name, dtype in _export_column_spec
}

## Extract and Transform 

Steps:

- Read raw CSV files
- Set dtypes
- Rename columns
- Drop duplicates
- Replace empty strings with null
- ~~Split strings to lists where applicable~~ (see note in README.md)
- Recast departure date string to datetime

In [3]:
#set path
path = 'data/raw_csv/exports/'
#set files and order
export_files = sorted(glob.glob(path+'*'))
#fail fast: with no input files the main lazyframe would never be (re)built, so
#later cells would raise NameError or silently reuse a stale exports_lf
if not export_files:
    raise FileNotFoundError(f'no export files found in {path}')

#per-file lazyframes, concatenated once after the loop
export_lfs = []
for export_file in export_files:
    #scan csv (infer_schema_length=0 reads every column as a string)
    ex_lf = pl.scan_csv(export_file, infer_schema_length=0)
    #coerce new format to match old format
    if ('US Origin' in ex_lf.columns):
        ex_lf = ex_lf.rename({'US Origin':'U.S. Origin'})
    #process data
    ex_lf = (
        ex_lf
        #strip whitespace and replace empty str with null
        .with_columns(pl.all().str.strip_chars().replace('',None))
        #cast dtypes
        .cast(export_schema)
        #drop duplicates
        .unique()
        #rename columns
        .rename(export_colnames_dict)
        #reorder columns
        .select(export_colnames_dict.values())
        #set departure date to datetime
        .with_columns(pl.col('date').str.to_datetime('%Y%m%d'))
        .with_columns(
            #create direction column
            pl.lit('export').cast(pl.Categorical).alias('direction'),
            #create bol_id
            (pl.col('carrier_scac').fill_null('')+'_'+pl.col('bol_number')).alias('bol_id'),
            #extract year
            pl.col('date').dt.year().alias('year'),
            #extract month (e.g., '202304')
            pl.col('date').dt.strftime('%Y%m').alias('month'),
            #create lane_id
            (pl.col('departure_port_code').cast(pl.Utf8)+'_'+pl.col('arrival_port_code').cast(pl.Utf8))
            .cast(pl.Categorical)
            .alias('lane_id'),
            #convert zero volume values to null
            pl.col('teus').replace(0,None), 
            pl.col('weight').replace(0,None),
            pl.col('qty').replace(0,None)
        )
    )
    #keep file frame for the final concat
    export_lfs.append(ex_lf)

#create main lazyframe with a single concat instead of re-wrapping it once per file
exports_lf = pl.concat(export_lfs, how='diagonal')

In [4]:
#final pass over the combined lazyframe: drop duplicate rows, then collect into a dataframe
exports_df = exports_lf.unique().collect()

## Load

Save dataframe to parquet file.

In [5]:
#target parquet file
parquet_path = Path('data/raw_parquet/exports/piers_exports_raw.parquet')
#create the output directory if missing so the write does not fail after the collect
parquet_path.parent.mkdir(parents=True, exist_ok=True)
#write dataframe to parquet
exports_df.write_parquet(parquet_path)