In [12]:
#import libraries
import pandas as pd
import os
import polars as pl

# Display settings: show every column when a pandas frame is rendered
# (None removes the column limit).
pd.options.display.max_columns = None

# Enable the global string cache for polars Categorical columns.
# NOTE(review): presumably required so that Categorical columns read from
# separate files can be concatenated later in the notebook - confirm against
# the polars documentation for the installed version.
pl.enable_string_cache()

## Extract and Transform 

Steps:

- Read raw CSV files
- Set dtypes
- Rename columns
- Concatenate into the main DataFrame

In [13]:
# Instantiate the column-name mapping: raw CSV header -> snake_case name
# used in the combined frame. Insertion order follows the raw file layout.
export_colnames_dict = {
    # shipper
    'Shipper': 'shipper',
    'Shipper Address': 'shipper_address',
    # weight / quantity measures
    'Weight': 'weight',
    'Weight Unit': 'weight_unit',
    'Quantity': 'quantity',
    'Quantity Type': 'quantity_type',
    'TEUs': 'teus',
    # carrier, vessel and voyage identifiers
    'Carrier': 'carrier_name',
    'SCAC': 'carrier_scac',
    'Vessel Name': 'vessel_name',
    'Voyage Number': 'voyage_number',
    'Bill of Lading Number': 'bol_id',
    'IMO Number': 'imo_num',
    'Estimated Value': 'value_est',
    # departure port
    'Port of Departure Code': 'departure_port_code',
    'Port of Departure': 'departure_port_name',
    # containers
    'Container Number': 'container_ids',
    'Container Piece Count': 'container_piece_count',
    'Coastal Region': 'coast_region',
    # commodity descriptions and codes
    'Raw Commodity Description': 'commod_desc_raw',
    'Commodity Short Description': 'commod_short_desc',
    'HS Code': 'hs_code',
    'JOC Code': 'joc_code',
    'Quantity of Commodity Short Description': 'commod_short_desc_qty',
    # date, origin and destination
    'Departure Date': 'date_departure',
    'U.S. Origin': 'origin',
    'Destination Territory': 'dest_territory',
    'Destination Region': 'dest_region',
    'Declared Destination Port Code': 'dest_port_code_declared',
    'Declared Destination Port': 'dest_port_name',
}

# Instantiate the dtype mapping: raw CSV header -> polars dtype used when
# scanning the files. Keys are the raw headers (before renaming).
export_dytpe_dict = {
    # shipper
    'Shipper': pl.Utf8,
    'Shipper Address': pl.Utf8,
    # weight / quantity measures
    'Weight': pl.Float64,
    'Weight Unit': pl.Categorical,
    'Quantity': pl.Float64,
    'Quantity Type': pl.Categorical,
    'TEUs': pl.Float64,
    # carrier, vessel and voyage identifiers
    'Carrier': pl.Categorical,
    'SCAC': pl.Categorical,
    'Vessel Name': pl.Utf8,
    'Voyage Number': pl.Utf8,
    'Bill of Lading Number': pl.Utf8,
    'IMO Number': pl.Int32,
    'Estimated Value': pl.Float64,
    # departure port
    'Port of Departure Code': pl.Int32,
    'Port of Departure': pl.Categorical,
    # containers
    'Container Number': pl.Utf8,
    'Container Piece Count': pl.Int32,
    'Coastal Region': pl.Categorical,
    # commodity descriptions and codes (codes are read as text, not numbers)
    'Raw Commodity Description': pl.Utf8,
    'Commodity Short Description': pl.Utf8,
    'HS Code': pl.Utf8,
    'JOC Code': pl.Utf8,
    'Quantity of Commodity Short Description': pl.Utf8,
    # the departure date is read as an integer, not parsed into a date type
    'Departure Date': pl.Int32,
    # origin and destination
    'U.S. Origin': pl.Utf8,
    'Destination Territory': pl.Categorical,
    'Destination Region': pl.Categorical,
    'Declared Destination Port Code': pl.Int32,
    'Declared Destination Port': pl.Categorical,
}

In [14]:
path = 'data/raw/exports/'

# Only pick up CSV files, in sorted order. os.listdir returns every directory
# entry in arbitrary order, so stray non-CSV entries (hidden files, checkpoint
# folders) would otherwise be passed to the CSV reader and the row order of
# the combined frame would not be reproducible between runs.
csv_files = sorted(name for name in os.listdir(path)
                   if name.lower().endswith('.csv'))

# Build one lazy query per file: scan with the declared dtypes, then rename
# the raw headers to the snake_case column names. Nothing is read yet.
lazy_frames = [
    pl.scan_csv(os.path.join(path, name), dtypes=export_dytpe_dict)
    .rename(export_colnames_dict)
    for name in csv_files
]

# Concatenate all files in a single step and collect once, instead of
# re-concatenating the growing frame inside a loop (which re-copies the
# accumulated data for every file). With no input files the result is an
# empty DataFrame, matching what the loop-based version produced.
if lazy_frames:
    exports_pldf = pl.concat(lazy_frames).collect()
else:
    exports_pldf = pl.DataFrame()
    

## Load

Save the combined DataFrame to a Parquet file.

In [15]:
# Write the combined exports frame to a single Parquet file.
exports_pldf.write_parquet('data/exports_complete.parquet')

In [16]:
# Preview the first rows. Take the head in polars *before* converting, so only
# the preview rows (not the entire combined frame) are converted to pandas.
exports_pldf.head().to_pandas()

Unnamed: 0,shipper,shipper_address,weight,weight_unit,quantity,quantity_type,teus,carrier_name,carrier_scac,vessel_name,voyage_number,bol_id,imo_num,value_est,departure_port_code,departure_port_name,container_ids,container_piece_count,coast_region,commod_desc_raw,commod_short_desc,hs_code,joc_code,commod_short_desc_qty,date_departure,origin,dest_territory,dest_region,dest_port_code_declared,dest_port_name
0,,,0.0,KG,3431.0,CS,0.0,EVERGREEN LINE,EVER,EVER REFINE,313,425634088239,9061124.0,0.0,4601.0,NEW YORK,EMCU9328596 INKU2226914,2,EAST,,,190590,,3431,20060602,,PANAMA,CENTRAL AMERICA,22519.0,COLON PA
1,,,0.0,KG,3.0,UNT,0.0,EVERGREEN LINE,EVER,EVER REACH,334,400620087671,9088122.0,0.0,1303.0,BALTIMORE,FSCU9507850,1,EAST,,,870390,,3,20061107,,PANAMA,CENTRAL AMERICA,22519.0,COLON PA
2,,,0.0,KG,156.0,CS,0.0,ORIENT OVERSEAS CONTAINER LINE,OOCL,OOCL LONG BEACH,24,1020208270,9243409.0,0.0,2709.0,LONG BEACH,,0,WEST,,,7985,,156; 3,20060711,,HONG KONG,NORTH EAST ASIA,58201.0,HONG KONG
3,,,0.0,KG,1.0,UNT,0.0,SALLAUM LINE S A L,SALQ,REPUBBLICA DI AMA,806,S301124909,8521218.0,0.0,1303.0,BALTIMORE,GCNU1036340,1,EAST,,,870390,,1,20061107,,NIGERIA,AFRICA,75367.0,LAGOS
4,,,0.0,KG,20.0,CYL,0.0,N Y K LINE,NYKL,LUDWIGSHAFEN EXPR,88,358533789,8902577.0,0.0,2709.0,LONG BEACH,NYKU2838199,1,WEST,,,290330,,20,20060224,,JAPAN,NORTH EAST ASIA,58895.0,YOKOHAMA
