In [1]:
#import libraries
import pandas as pd
import os
import time

#display settings
pd.set_option('display.max_columns', None)

# To do:
- read csvs
    - set datatypes and column names during read
- 
- concat dataframes
    - use union categoricals to preserve categories
- drop duplicates


In [2]:
#define path
path = 'data/raw/'
#get list of data files, ignoring any hidden files in directory 
datafiles = [file for file in os.listdir(path) if not file.startswith('.')]
#init filenumber
filenumber = 1
#define new col names
import_colnames_dict = {'Weight': 'weight',
                        'Weight Unit': 'weight_unit',
                        'Quantity': 'qty',
                        'Quantity Type': 'qty_type',
                        'TEUs': 'teus',
                        'Estimated Value': 'value_est',
                        'Arrival Date': 'date_arrival',
                        'Container Piece Count': 'container_piece_count',
                        'Quantity of Commodity Short Description': 'commod_short_desc_qty',
                        'Territory of Origin': 'origin_territory',
                        'Region of Origin': 'origin_region',
                        'Port of Arrival Code': 'arrival_port_code',
                        'Port of Arrival': 'arrival_port_name',
                        'Port of Departure Code': 'departure_port_code',
                        'Port of Departure': 'departure_port_name',
                        'Final Destination': 'dest_final',
                        'Coastal Region': 'coast_region',
                        'Clearing District': 'clearing_district',
                        'Place of Receipt': 'place_receipt',
                        'Shipper': 'shipper_name',
                        'Shipper Address': 'shipper_address',
                        'Consignee': 'consignee_name',
                        'Consignee Address': 'consignee_address',
                        'Notify Party': 'notify_party1_name',
                        'Notify Party Address': 'notify_party1_address',
                        'Also Notify Party': 'notify_party2_name',
                        'Also Notify Party Address': 'notify_party2_address',
                        'Raw Commodity Description': 'commod_desc_raw',
                        'Marks Container Number': 'container_id_marks',
                        'Marks Description': 'marks_desc',
                        'HS Code': 'hs_code',
                        'JOC Code': 'joc_code',
                        'Commodity Short Description': 'commod_short_desc',
                        'Container Number': 'container_id',
                        'Carrier': 'carrier_name',
                        'SCAC': 'carrier_scac',
                        'Vessel Name': 'vessel_name',
                        'Voyage Number': 'vessel_id',
                        'Pre Carrier': 'precarrier',
                        'IMO Number': 'imo_num',
                        'Inbond Code': 'inbond_code',
                        'Mode of Transport': 'transport_mode',
                        'Bill of Lading Number': 'bol_id'}
#define dtypes
import_dtype_dict = {'Weight': 'float64',
            'Weight Unit': 'category',
            'Quantity': 'float64',
            'Quantity Type': 'category',
            'TEUs': 'float64',
            'Estimated Value': 'float64',
            'Arrival Date': 'int64',
            'Container Piece Count': 'int64',
            'Quantity of Commodity Short Description': 'object',
            'Territory of Origin': 'category',
            'Region of Origin': 'category',
            'Port of Arrival Code': 'category',
            'Port of Arrival': 'category',
            'Port of Departure Code': 'category',
            'Port of Departure': 'category',
            'Final Destination': 'category',
            'Coastal Region': 'category',
            'Clearing District': 'category',
            'Place of Receipt': 'category',
            'Shipper': 'object',
            'Shipper Address': 'object',
            'Consignee': 'object',
            'Consignee Address': 'object',
            'Notify Party': 'object',
            'Notify Party Address': 'object',
            'Also Notify Party': 'object',
            'Also Notify Party Address': 'object',
            'Raw Commodity Description': 'object',
            'Marks Container Number': 'object',
            'Marks Description': 'object',
            'HS Code': 'category',
            'JOC Code': 'category',
            'Commodity Short Description': 'object',
            'Container Number': 'object',
            'Carrier': 'category',
            'SCAC': 'category',
            'Vessel Name': 'object',
            'Voyage Number': 'object',
            'Pre Carrier': 'float64',
            'IMO Number': 'float64',
            'Inbond Code': 'float64',
            'Mode of Transport': 'category',
            'Bill of Lading Number': 'object'}
#define category variable cols
catcols = ['weight_unit', 'qty_type', 'origin_territory', 'origin_region', 'arrival_port_code', 
           'arrival_port_name', 'departure_port_code', 'departure_port_name', 'dest_final', 'coast_region', 
           'clearing_district', 'place_receipt', 'hs_code', 'joc_code', 'carrier_name', 'carrier_scac', 
           'transport_mode']

In [3]:
#extract from csv to clean dataframes and concat
for filename in datafiles:
    start = time.time()
    #read csv with appropriate dtypes
    file_df = pd.read_csv(path+filename, dtype=import_dtype_dict, nrows=1000)
    #rename columns
    file_df.rename(columns=import_colnames_dict, inplace=True)
    #concat to or create main imports df
    if 'imports_df' in locals():
        #create category unions and assign union to each df col
        for col in catcols:
            catunion = pd.api.types.union_categoricals([imports_df[col], file_df[col]])
            imports_df[col] = pd.Categorical(imports_df[col], categories=catunion.categories)
            file_df[col] = pd.Categorical(file_df[col], categories=catunion.categories)
        #concat to main df
        imports_df = pd.concat([imports_df, file_df])
    else:
        imports_df = file_df 
    del file_df
    end = time.time()
    filenumber += 1

In [4]:
imports_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39000 entries, 0 to 999
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   weight                 39000 non-null  float64 
 1   weight_unit            39000 non-null  category
 2   qty                    39000 non-null  float64 
 3   qty_type               33627 non-null  category
 4   teus                   39000 non-null  float64 
 5   value_est              39000 non-null  float64 
 6   date_arrival           39000 non-null  int64   
 7   container_piece_count  39000 non-null  int64   
 8   commod_short_desc_qty  39000 non-null  object  
 9   origin_territory       38680 non-null  category
 10  origin_region          38680 non-null  category
 11  arrival_port_code      38985 non-null  category
 12  arrival_port_name      38985 non-null  category
 13  departure_port_code    38817 non-null  category
 14  departure_port_name    38817 non-null  catego