In [1]:
#import libraries
import pandas as pd
import os
import sys
import time
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

#start a local Dask cluster with 10 workers and connect a client to it
cluster = LocalCluster(n_workers=10)
client = Client(cluster)

#show every column when a dataframe is displayed
pd.options.display.max_columns = None

# To do:
- read CSVs
    - set datatypes and column names during read
- concat dataframes
    - use `union_categoricals` to preserve categories
- drop duplicates


In [2]:
#define path
path = 'data/raw/'
#get list of data files, ignoring any hidden files in directory
#os.listdir returns entries in arbitrary order, so sort them to keep the file
#numbering and the processing order identical on every run
datafiles = sorted(file for file in os.listdir(path) if not file.startswith('.'))
#init filenumber
filenumber = 1
#map each raw CSV header to its new column name
#(insertion order matters: it is reused to set the final column order)
import_colnames_dict = {
    'Weight': 'weight',
    'Weight Unit': 'weight_unit',
    'Quantity': 'qty',
    'Quantity Type': 'qty_type',
    'TEUs': 'teus',
    'Estimated Value': 'value_est',
    'Arrival Date': 'date_arrival',
    'Container Piece Count': 'container_piece_count',
    'Quantity of Commodity Short Description': 'commod_short_desc_qty',
    'Territory of Origin': 'origin_territory',
    'Region of Origin': 'origin_region',
    'Port of Arrival Code': 'arrival_port_code',
    'Port of Arrival': 'arrival_port_name',
    'Port of Departure Code': 'departure_port_code',
    'Port of Departure': 'departure_port_name',
    'Final Destination': 'dest_final',
    'Coastal Region': 'coast_region',
    'Clearing District': 'clearing_district',
    'Place of Receipt': 'place_receipt',
    'Shipper': 'shipper_name',
    'Shipper Address': 'shipper_address',
    'Consignee': 'consignee_name',
    'Consignee Address': 'consignee_address',
    'Notify Party': 'notify_party1_name',
    'Notify Party Address': 'notify_party1_address',
    'Also Notify Party': 'notify_party2_name',
    'Also Notify Party Address': 'notify_party2_address',
    'Raw Commodity Description': 'commod_desc_raw',
    'Marks Container Number': 'container_id_marks',
    'Marks Description': 'marks_desc',
    'HS Code': 'hs_code',
    'JOC Code': 'joc_code',
    'Commodity Short Description': 'commod_short_desc',
    'Container Number': 'container_ids',
    'Carrier': 'carrier_name',
    'SCAC': 'carrier_scac',
    'Vessel Name': 'vessel_name',
    'Voyage Number': 'vessel_id',
    'Pre Carrier': 'precarrier',
    'IMO Number': 'imo_num',
    'Inbond Code': 'inbond_code',
    'Mode of Transport': 'transport_mode',
    'Bill of Lading Number': 'bol_id',
}
#dtype to apply to each raw CSV column when it is read (keyed by raw header)
import_dtype_dict = {
    'Weight': 'float64',
    'Weight Unit': 'category',
    'Quantity': 'float64',
    'Quantity Type': 'category',
    'TEUs': 'float64',
    'Estimated Value': 'float64',
    'Arrival Date': 'int64',
    'Container Piece Count': 'int64',
    'Quantity of Commodity Short Description': 'object',
    'Territory of Origin': 'category',
    'Region of Origin': 'category',
    'Port of Arrival Code': 'category',
    'Port of Arrival': 'category',
    'Port of Departure Code': 'category',
    'Port of Departure': 'category',
    'Final Destination': 'category',
    'Coastal Region': 'category',
    'Clearing District': 'category',
    'Place of Receipt': 'category',
    'Shipper': 'object',
    'Shipper Address': 'object',
    'Consignee': 'object',
    'Consignee Address': 'object',
    'Notify Party': 'object',
    'Notify Party Address': 'object',
    'Also Notify Party': 'object',
    'Also Notify Party Address': 'object',
    'Raw Commodity Description': 'object',
    'Marks Container Number': 'object',
    'Marks Description': 'object',
    'HS Code': 'category',
    'JOC Code': 'category',
    'Commodity Short Description': 'object',
    'Container Number': 'object',
    'Carrier': 'category',
    'SCAC': 'category',
    'Vessel Name': 'object',
    'Voyage Number': 'object',
    'Pre Carrier': 'float64',
    'IMO Number': 'float64',
    'Inbond Code': 'float64',
    'Mode of Transport': 'category',
    'Bill of Lading Number': 'object',
}
#renamed columns that hold categorical variables
catcols = [
    'weight_unit',
    'qty_type',
    'origin_territory',
    'origin_region',
    'arrival_port_code',
    'arrival_port_name',
    'departure_port_code',
    'departure_port_name',
    'dest_final',
    'coast_region',
    'clearing_district',
    'place_receipt',
    'hs_code',
    'joc_code',
    'carrier_name',
    'carrier_scac',
    'transport_mode',
]
#new column names, in mapping order, used later to reorder the columns
import_colnames = [new_name for new_name in import_colnames_dict.values()]

In [3]:
#extract every raw csv, clean it, save it to parquet and gather the cleaned
#frames so they can be combined into one Dask dataframe (imports_df)
print('Extracting CSV files...\n', 'Files to process: ', len(datafiles), '\n')

#make sure the output directory exists before the first write
parquet_dir = 'data/clean_parquet/'
os.makedirs(parquet_dir, exist_ok=True)

#cleaned frames are collected here and concatenated once after the loop, so
#re-running this cell rebuilds imports_df instead of appending to a stale one
#NOTE(review): every cleaned frame stays in client memory until the concat
cleaned_frames = []

#enumerate supplies the file number, so the count restarts at 1 on every run
for filenumber, file in enumerate(datafiles, start=1):
    start = time.time()
    print('Extracting file {}...'.format(filenumber))
    #read csv with appropriate dtypes
    file_df = dd.read_csv(os.path.join(path, file), dtype=import_dtype_dict,
                          assume_missing=True, sample=1000)

    #load into pandas and rename columns
    file_df = file_df.compute().rename(columns=import_colnames_dict)
    #unpack strings to list objects
    file_df['container_ids'] = file_df['container_ids'].str.split()
    file_df['commod_short_desc_qty'] = file_df['commod_short_desc_qty'].str.split(pat=';')
    file_df['commod_short_desc'] = file_df['commod_short_desc'].str.split(pat=',')
    #recast dates to datetime
    file_df['date_arrival'] = pd.to_datetime(file_df['date_arrival'].astype(str), format='%Y%m%d')
    #reorder columns
    file_df = file_df[import_colnames]
    #keep the cleaned frame for the final concat
    cleaned_frames.append(file_df)
    #save file_df
    extract = time.time()
    print('File extracted. That took {} sec.'.format(extract-start))
    print('Saving file {} to parquet...'.format(filenumber))
    #swap whatever extension the source file has for .parquet
    parquet_name = os.path.splitext(file)[0] + '.parquet'
    file_df.to_parquet(os.path.join(parquet_dir, parquet_name))
    end = time.time()
    print('File ETL complete.\n', 'Total time for file {}: {} sec \n'.format(filenumber, end-start))

#combine all cleaned frames into one Dask dataframe; wrapping each frame with
#from_pandas keeps imports_df a Dask collection even when only one file exists
if cleaned_frames:
    imports_df = dd.concat([dd.from_pandas(frame, npartitions=1) for frame in cleaned_frames])
else:
    print('No data files found in {}; imports_df was not built.'.format(path))

Extracting CSV files...
 Files to process:  39 

Extracting file 1...
File extracted. That took 15.014738082885742 sec.
Saving file 1 to parquet...
File ETL complete.
 Total time for file 1: 23.72644305229187 sec 

Extracting file 2...
File extracted. That took 63.04529690742493 sec.
Saving file 2 to parquet...
File ETL complete.
 Total time for file 2: 73.83171582221985 sec 

Extracting file 3...
File extracted. That took 19.983909130096436 sec.
Saving file 3 to parquet...
File ETL complete.
 Total time for file 3: 25.05823516845703 sec 

Extracting file 4...
File extracted. That took 35.92933797836304 sec.
Saving file 4 to parquet...
File ETL complete.
 Total time for file 4: 44.57157897949219 sec 

Extracting file 5...
File extracted. That took 36.683330059051514 sec.
Saving file 5 to parquet...
File ETL complete.
 Total time for file 5: 46.062987089157104 sec 

Extracting file 6...
File extracted. That took 34.82046389579773 sec.
Saving file 6 to parquet...
File ETL complete.
 Tota



File extracted. That took 76.64041018486023 sec.
Saving file 16 to parquet...
File ETL complete.
 Total time for file 16: 82.74497318267822 sec 

Extracting file 17...
File extracted. That took 36.422752141952515 sec.
Saving file 17 to parquet...
File ETL complete.
 Total time for file 17: 45.33340811729431 sec 

Extracting file 18...
File extracted. That took 29.241694927215576 sec.
Saving file 18 to parquet...
File ETL complete.
 Total time for file 18: 36.390482902526855 sec 

Extracting file 19...
File extracted. That took 21.168065071105957 sec.
Saving file 19 to parquet...
File ETL complete.
 Total time for file 19: 26.004690885543823 sec 

Extracting file 20...




File extracted. That took 106.75309801101685 sec.
Saving file 20 to parquet...
File ETL complete.
 Total time for file 20: 113.77377104759216 sec 

Extracting file 21...
File extracted. That took 34.01321196556091 sec.
Saving file 21 to parquet...
File ETL complete.
 Total time for file 21: 42.328327894210815 sec 

Extracting file 22...
File extracted. That took 16.641629934310913 sec.
Saving file 22 to parquet...
File ETL complete.
 Total time for file 22: 20.52099609375 sec 

Extracting file 23...
File extracted. That took 48.01070284843445 sec.
Saving file 23 to parquet...
File ETL complete.
 Total time for file 23: 59.97862887382507 sec 

Extracting file 24...
File extracted. That took 52.51041293144226 sec.
Saving file 24 to parquet...
File ETL complete.
 Total time for file 24: 64.71740698814392 sec 

Extracting file 25...




File extracted. That took 137.00891494750977 sec.
Saving file 25 to parquet...
File ETL complete.
 Total time for file 25: 145.89261603355408 sec 

Extracting file 26...
File extracted. That took 41.684616804122925 sec.
Saving file 26 to parquet...
File ETL complete.
 Total time for file 26: 51.329452991485596 sec 

Extracting file 27...
File extracted. That took 33.806188106536865 sec.
Saving file 27 to parquet...
File ETL complete.
 Total time for file 27: 40.7286262512207 sec 

Extracting file 28...
File extracted. That took 24.715368032455444 sec.
Saving file 28 to parquet...
File ETL complete.
 Total time for file 28: 30.003836154937744 sec 

Extracting file 29...
File extracted. That took 41.24799990653992 sec.
Saving file 29 to parquet...
File ETL complete.
 Total time for file 29: 50.97440505027771 sec 

Extracting file 30...
File extracted. That took 35.29223322868347 sec.
Saving file 30 to parquet...
File ETL complete.
 Total time for file 30: 44.899818897247314 sec 

Extract



File extracted. That took 189.98041200637817 sec.
Saving file 31 to parquet...
File ETL complete.
 Total time for file 31: 197.5816900730133 sec 

Extracting file 32...
File extracted. That took 38.540515184402466 sec.
Saving file 32 to parquet...
File ETL complete.
 Total time for file 32: 47.92301416397095 sec 

Extracting file 33...
File extracted. That took 22.093156814575195 sec.
Saving file 33 to parquet...
File ETL complete.
 Total time for file 33: 27.177011966705322 sec 

Extracting file 34...
File extracted. That took 28.94852089881897 sec.
Saving file 34 to parquet...
File ETL complete.
 Total time for file 34: 35.03355002403259 sec 

Extracting file 35...
File extracted. That took 38.395867109298706 sec.
Saving file 35 to parquet...
File ETL complete.
 Total time for file 35: 47.65315389633179 sec 

Extracting file 36...
File extracted. That took 25.810106992721558 sec.
Saving file 36 to parquet...
File ETL complete.
 Total time for file 36: 31.064847946166992 sec 

Extract

In [6]:
#inspect imports_df
#.compute() materialises the combined dataframe built in the extraction loop,
#then .info() prints a summary of the result
imports_df.compute().info()

: 

In [5]:
# Close the Dask client and cluster
# (both calls are commented out, so the client and cluster created at the top
# of the notebook keep running; uncomment them to shut down when finished)
#client.close()
#cluster.close()