# Exploratory Analysis of Import Data

This notebook explores the PIERS Bill of Lading data, obtained from S&P's Global Trade Analytics Suite. See the README.md file for more info on the overall project, data pre-processing, and column definitions. 

In [2]:
#import libraries
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly_express as px

#display settings
pd.set_option('display.max_columns', None)
%matplotlib inline

#enable string cache for polars categoricals
pl.enable_string_cache()

## Basic Summary Stats

This sections presents basic summary statistics on each variable in the dataset. See the [discussion below regarding known issues with the dataset](#known-data-issues)

In [3]:
#create lazyframes, get shape and basic counts

#set paths
imports_path = 'data/clean/imports/'
exports_path = 'data/'

#get schema and col names
imports_schema = pl.read_parquet_schema(source= imports_path+'piers_imports_2005.parquet')
imports_colnames = imports_schema.keys()
exports_schema = pl.read_parquet_schema(source=exports_path+'piers_exports_complete.parquet')
exports_colnames = exports_schema.keys()

#init lazy dataframes, creating 2-digit hs codes (for now) and year column. 
imports_lzdf = (
    pl.scan_parquet(imports_path+'*.parquet', parallel='columns')
    .with_columns([pl.col('hs_code').str.slice(0, length=2).alias('hs_2d'),
                   pl.col('date_arrival').dt.year().alias('year')])
    )
exports_lzdf = (
    pl.scan_parquet(exports_path+'piers_exports_complete.parquet', parallel='columns')
    .with_columns([pl.col('hs_code').str.slice(0, length=2).alias('hs_2d'),
                   pl.col('date_departure').dt.year().alias('year')])
    )

#get number of observations
imports_n = imports_lzdf.select(pl.count()).collect().item()
exports_n = exports_lzdf.select(pl.count()).collect().item()
print('The imports dataset has {:,} rows and {} columns.'.format(imports_n, len(imports_schema)))
print('The exports dataset has {:,} rows and {} columns.'.format(exports_n, len(exports_schema)))

The imports dataset has 178,875,896 rows and 44 columns.
The exports dataset has 68,769,489 rows and 31 columns.


In [4]:
#view head of imports dataframe
print('Imports dataframe preview:')
imports_lzdf.limit(n=3).collect()

Imports dataframe preview:


weight,weight_unit,qty,qty_type,teus,value_est,date_arrival,container_piece_count,commod_short_desc_qty,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,dest_final,coast_region,clearing_district,place_receipt,shipper_name,shipper_address,consignee_name,consignee_address,notify_party1_name,notify_party1_address,notify_party2_name,notify_party2_address,commod_desc_raw,container_id_marks,marks_desc,hs_code,joc_code,commod_short_desc,container_ids,carrier_name,carrier_scac,vessel_name,voyage_id,precarrier,vessel_id,inbond_code,transport_mode,bol_number,bol_id,hs_2d,year
f64,cat,f64,cat,f64,f64,datetime[μs],i32,str,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,cat,cat,str,str,f64,i32,f64,cat,str,str,str,i32
0.0,"""KG""",19.0,"""PCS""",0.0,0.0,2005-12-15 00:00:00,1,"""19""","""SWEDEN""","""NORTH EUROPE""","""4601""","""NEW YORK""","""42879""","""HAMBURG""",""" ""","""EAST""",,"""MALMO""","""ORDER""","""""","""ORDER""","""2258 ALLEN STR…","""ORDER""","""2258 ALLEN STR…",,,,"""""","""LC""","""870899""","""""","""""","""PONU7791697""","""P&O NEDLLOYD""","""PONL""","""P&O NEDLLOYD Y…","""5449""",,9252993,,,"""HBGSF10809""","""PONL_HBGSF1080…","""87""",2005
0.0,"""KG""",329.0,"""CTN""",0.0,0.0,2005-10-12 00:00:00,1,"""329""","""CHINA (MAINLAN…","""NORTH EAST ASI…","""2709""","""LONG BEACH""","""57035""","""SHANGHAI""",""" ""","""WEST""","""LOS ANGELES, C…","""SHANGHAI""","""CARNIVAL""","""21F E 1 ZHENGH…","""ELEGANCE PRINT…","""YIP ST""","""GRAMTER INTERN…","""11222 S LA CIE…",,,,"""""","""LC""","""852990""","""""","""""","""FSCU7037551""","""CHINA NATIONAL…","""CNFT""","""TRADE FOISON""","""35""",,9158525,,,"""0509SGA017F""","""CNFT_0509SGA01…","""85""",2005
0.0,"""KG""",14.0,"""CTN""",0.0,0.0,2005-08-29 00:00:00,1,"""14""","""SPAIN""","""MEDITERRANEAN""","""4601""","""NEW YORK""","""47061""","""BARCELONA""",""" ""","""EAST""",,"""BARCELONA""","""EMPRESA TEXTIL…","""""","""ATHLETA""","""1450 TECHNOLOG…","""OZ ARCHITECTS""","""6621 N SCOTTSD…",,,,"""""","""LC""","""620630""","""""","""""","""CCLU6360140""","""CHINA SHIPPING…","""CSCN""","""ZIM VIRGINIA""","""12""",,9231808,,,"""BCNLE2687B""","""CSCN_BCNLE2687…","""62""",2005


In [5]:
#view head of exports dataframe 
print('Exports dataframe preview:')
exports_lzdf.limit(n=3).collect()

Exports dataframe preview:


shipper,shipper_address,weight,weight_unit,quantity,quantity_type,teus,carrier_name,carrier_scac,vessel_name,voyage_number,bol_number,vessel_id,value_est,departure_port_code,departure_port_name,container_ids,container_piece_count,coast_region,commod_desc_raw,commod_short_desc,hs_code,joc_code,commod_short_desc_qty,date_departure,origin,dest_territory,dest_region,dest_port_code_declared,dest_port_name,bol_id,hs_2d,year
str,str,f64,cat,f64,cat,f64,cat,cat,str,str,str,i32,f64,cat,cat,str,i32,cat,str,str,str,str,str,datetime[μs],str,cat,cat,cat,cat,str,str,i32
,,0.0,"""KG""",17.0,"""UNT""",0.0,"""HOEGH UGLAND A…","""AUTO""","""OCEAN SPIRIT""","""93""","""03102""",8321747,0.0,"""1803""","""JACKSONVILLE""","""""",0,"""EAST""",,"""""","""870390""","""""","""1; 2; 1; 2; 1;…",2005-12-26 00:00:00,""" ""","""SAUDI ARABIA""","""MIDDLE EAST""","""51721""","""JEDDAH""","""AUTO_03102""","""87""",2005
,,0.0,"""KG""",1080.0,"""CS""",0.0,"""CROWLEY LINER …","""CRLS""","""LA REINA""","""57""","""JAXS5M138856""",7802136,0.0,"""1111""","""PENNSAUKEN""","""CMCZ835185""",1,"""EAST""",,"""""","""220290""","""""","""1080""",2005-12-16 00:00:00,""" ""","""PUERTO RICO""","""CARIBBEAN""","""90309""","""SAN JUAN""","""CRLS_JAXS5M138…","""22""",2005
,,0.0,"""KG""",5.0,"""PCS""",0.0,"""HANJIN SHIPPIN…","""HJSC""","""HANJIN YANTIAN…","""1""","""LGBA05312805""",9295218,0.0,"""2709""","""LONG BEACH""","""SCZU7962271""",1,"""WEST""",,"""""","""847290""","""""","""5""",2005-12-21 00:00:00,""" ""","""TAIWAN""","""NORTH EAST ASI…","""58309""","""KAOHSIUNG""","""HJSC_LGBA05312…","""84""",2005


In [6]:
#Summary stats for Imports

#init df and get stats labels column
imports_summarystats_df = imports_lzdf.select(pl.first()).collect().describe().select(pl.first()).to_pandas()
#loop through columns and get descriptive stats
for column in imports_colnames:
    imports_summarystats_df[column] = imports_lzdf.select(pl.col(column)).collect().describe().select(column).to_pandas()
#display
print('Summary Stats - Imports')
imports_summarystats_df

Summary Stats - Imports


Unnamed: 0,describe,weight,weight_unit,qty,qty_type,teus,value_est,date_arrival,container_piece_count,commod_short_desc_qty,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,dest_final,coast_region,clearing_district,place_receipt,shipper_name,shipper_address,consignee_name,consignee_address,notify_party1_name,notify_party1_address,notify_party2_name,notify_party2_address,commod_desc_raw,container_id_marks,marks_desc,hs_code,joc_code,commod_short_desc,container_ids,carrier_name,carrier_scac,vessel_name,voyage_id,precarrier,vessel_id,inbond_code,transport_mode,bol_number,bol_id
0,count,178875900.0,178875896.0,178875900.0,150054988.0,178875900.0,178875900.0,178875896,178875900.0,178875850,178004511.0,178004511.0,178842823.0,178842823.0,178399200.0,178399200.0,178875896.0,178767698.0,138267806.0,178028936.0,177152819,149352161,177316770,158522570,131500774,147349739,11190643,80229284,153344894,178875896,178875896,178875850,178875850,178875850,178875896,178743544.0,178875872.0,167484536,178875896,0.0,167394300.0,45999800.0,54698436.0,178875896,178875896
1,null_count,0.0,0.0,0.0,28820908.0,0.0,0.0,0,0.0,46,871385.0,871385.0,33073.0,33073.0,476696.0,476696.0,0.0,108198.0,40608090.0,846960.0,1723077,29523735,1559126,20353326,47375122,31526157,167685253,98646612,25531002,0,0,46,46,46,0,132352.0,24.0,11391360,0,178875896.0,11481590.0,132876100.0,124177460.0,0,0
2,mean,47406.86,,1203.557,,1.28577,78087.32,,1.440886,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9338100.0,306.6917,,,
3,std,1721627.0,,409020.4,,3.112699,2853759.0,,3.409732,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,285028.8,1059.749,,,
4,min,0.0,,-74561090.0,,0.0,0.0,2005-01-01 00:00:00,0.0,,,,,,,,,,,,,,,,,,& OPTCO,,,,,,,,,,,,,,0.0,0.0,,$70050200001,-1_CSHSE0009996
5,25%,0.0,,1.0,,0.0,0.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9248112.0,0.0,,,
6,50%,1466.0,,38.0,,0.22,6571.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9332884.0,0.0,,,
7,75%,15010.0,,400.0,,2.0,45526.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9462706.0,0.0,,,
8,max,6172936000.0,,3531460000.0,,1730.75,9981402000.0,2023-11-30 00:00:00,999.0,9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9...,,,,,,,,,,,“COOPERATIVA OPERAI CAVATORI DEL BOTTICINO,ÞORRASALIR 13,C MCCRACKEN,ŠILUTES STR 9,C MCCRACKEN,ŠILUTES STR 9,ZYP COATINGS,co RMI Titanium Company LLC 208, WORKING GLOVES,}No MarksNo MarksNo Marks,; ; ; ; ; ; ; ; ; ; ; ,ddedo,Y-,"ZIRCONIUM; SAND,FLOUR,SPONGE,POWDER VALVES; EN...",vTA1108,,,ZYGI,^144,,9975363.0,7122.0,,_YLC158932,zimu_ZIMUZHJ0000952


In [7]:
#Summary stats for Exports

#NOTE at the moment the exports dataset may fit in memory, in which case the below code could be accomplished more efficiently
#by executing the following line; however, the main code below should run even when the dataset does not fit in memory. 
#exports_lzdf.collect().describe()

#init df and get stats labels column
exports_summarystats_df = exports_lzdf.select(pl.first()).collect().describe().select(pl.first()).to_pandas()
#loop through columns and get descriptive stats
for column in exports_colnames:
    exports_summarystats_df[column] = exports_lzdf.select(pl.col(column)).collect().describe().select(column).to_pandas()
#display
print('Summary Stats - Exports')
exports_summarystats_df

Summary Stats - Exports


Unnamed: 0,describe,shipper,shipper_address,weight,weight_unit,quantity,quantity_type,teus,carrier_name,carrier_scac,vessel_name,voyage_number,bol_number,vessel_id,value_est,departure_port_code,departure_port_name,container_ids,container_piece_count,coast_region,commod_desc_raw,commod_short_desc,hs_code,joc_code,commod_short_desc_qty,date_departure,origin,dest_territory,dest_region,dest_port_code_declared,dest_port_name,bol_id
0,count,52901485,51534703,68769490.0,68769489.0,68769490.0,61810793.0,68769490.0,68595453.0,68768115.0,63384893,67625608,68769456,63231150.0,68769490.0,68720145.0,68720145.0,68769489,68769490.0,68769013.0,53168182,68769437,68769437,68769437,68769437,68769489,68769489.0,68279599.0,68279599.0,68299282.0,68299282.0,68769456
1,null_count,15868004,17234786,0.0,0.0,0.0,6958696.0,0.0,174036.0,1374.0,5384596,1143881,33,5538337.0,0.0,49344.0,49344.0,0,0.0,476.0,15601307,52,52,52,52,0,0.0,489890.0,489890.0,470207.0,470207.0,33
2,mean,,,115206.7,,39653.11,,1.672979,,,,,,9159625.0,141566.4,,,,1.923483,,,,,,,,,,,,,
3,std,,,2375519.0,,3471335.0,,5.518306,,,,,,543876.9,6466967.0,,,,11.0981,,,,,,,,,,,,,
4,min,,,0.0,,-1467123000.0,,0.0,,,,,,196.0,0.0,,,,0.0,,,,,,,2005-01-01 00:00:00,,,,,,079A_26004878070
5,25%,,,0.0,,0.0,,0.0,,,,,,9143568.0,0.0,,,,1.0,,,,,,,,,,,,,
6,50%,,,136.08,,13.0,,0.0,,,,,,9295220.0,408.0,,,,1.0,,,,,,,,,,,,,
7,75%,,,19196.96,,178.0,,2.0,,,,,,9398424.0,46271.0,,,,1.0,,,,,,,,,,,,,
8,max,SUNSET TRANSPORTATION SERVICE,,952998100.0,,3716633000.0,,3729.25,,,xin yang shan,|SAL5,PE10518042,9980629.0,10000000000.0,,,~POPLAR AVE |SAVALY762976 _38197 WFHU5153172 W...,4136.0,,ÙTEEL GUARD RAIL; ÙTEEL GUARD RAILS,"ZIRCONIUM; SAND,FLOUR,SPONGE,POWDER ZIRCONIUM ...",ddedo,9669000 7746000 1301000 6830000,9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9...,2023-11-30 00:00:00,,,,,,zzzz_ZZZZ


In [8]:
#Counts of top imports categorical columns

#get categorical cols
imports_catcols_lzdf = (
    imports_lzdf.select(pl.col(pl.Categorical))
)
#init top cats df
imports_topcats_df = pl.DataFrame()

for colname in imports_catcols_lzdf.columns:
    col_counts_df = (
        #select column
        imports_catcols_lzdf.select(pl.col(colname))
        .collect()
        .to_series()
        #get top 10 value counts
        .value_counts()
        .sort('count', descending=True)
        .limit(10)
        #rename count column
        .rename({'count': colname + '_count'})
    )
    #stack dfs horizontally 
    imports_topcats_df = pl.concat([imports_topcats_df, col_counts_df], how='horizontal')

print('Counts of top 10 categories for each categorical variable in imports database:')
imports_topcats_df

Counts of top 10 categories for each categorical variable in imports database:


weight_unit,weight_unit_count,qty_type,qty_type_count,origin_territory,origin_territory_count,origin_region,origin_region_count,arrival_port_code,arrival_port_code_count,arrival_port_name,arrival_port_name_count,departure_port_code,departure_port_code_count,departure_port_name,departure_port_name_count,dest_final,dest_final_count,coast_region,coast_region_count,clearing_district,clearing_district_count,place_receipt,place_receipt_count,carrier_name,carrier_name_count,carrier_scac,carrier_scac_count,transport_mode,transport_mode_count
cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32
"""KG""",178875896.0,"""CTN""",50685096,"""CHINA (MAINLAN…",75748421,"""NORTH EAST ASI…",100044029,"""2704""",36437064,"""LOS ANGELES""",36437064,"""57035""",23761723,"""SHANGHAI""",23761723,""" """,178875896.0,"""WEST""",92073665.0,"""LOS ANGELES, C…",67153221,"""SHANGHAI""",23256000,"""MEDITERRANEAN …",17750989,"""MAEU""",13023524,,124177460.0
,,"""CM""",33078826,"""VIETNAM""",7628769,"""SOUTH EAST ASI…",18718236,"""4601""",32461038,"""NEW YORK""",32461038,"""57078""",21377427,"""YANTIAN""",21377427,,,"""EAST""",74496467.0,,40608090,"""YANTIAN""",19061963,"""MAERSK LINE""",15450322,"""EGLV""",12540853,"""MARITIME""",54698436.0
,,,28820908,"""INDIA""",7420825,"""NORTH EUROPE""",18549213,"""2709""",30379775,"""LONG BEACH""",30379775,"""58023""",11126231,"""BUSAN""",11126231,,,"""GULF""",10576612.0,"""SAVANNAH, GEOR…",11073275,"""NINGBO""",8266716,"""EVERGREEN LINE…",13210626,"""CMDU""",11848743,,
,,"""PKG""",20376899,"""TAIWAN""",7093428,"""INDIAN SUB CON…",11088257,"""1703""",10870895,"""SAVANNAH""",10870895,"""57020""",10980717,"""HONG KONG""",10007635,,,"""CARIBBEAN""",1486746.0,"""SEATTLE, WASHI…",10626872,"""BUSAN""",5767191,"""CMA-CGM""",12767736,"""HLCU""",10899595,,
,,"""PCS""",15728895,"""GERMANY""",6640084,"""MEDITERRANEAN""",10919966,"""1401""",7834038,"""NORFOLK""",7834038,"""58201""",10007635,"""NINGBO""",8621898,,,"""GREAT LAKES""",134208.0,"""HOUSTON/GALVES…",7989565,"""QINGDAO""",5668080,"""HAPAG LLOYD""",12391309,"""COSU""",9183898,,
,,"""X""",11428859,"""REPUBLIC OF KO…",6393647,"""CENTRAL AMERIC…",5113232,"""5301""",7591326,"""HOUSTON""",7591326,"""58309""",8261696,"""KAOHSIUNG""",8261696,,,,108198.0,"""NORFOLK, VIRGI…",7903877,"""HONG KONG""",5633841,"""CHINA OCEAN SH…",10347199,"""MEDU""",9068195,,
,,"""BXS""",3473802,"""HONG KONG""",5644361,"""WEST COAST SOU…",4142811,"""2811""",6626522,"""OAKLAND""",6626522,"""55976""",5882736,"""SINGAPORE""",5882736,,,,,"""MIAMI, FLORIDA…",7076741,"""HO CHI MINH""",4585227,"""ORIENT OVERSEA…",9546039,"""OOLU""",8323800,,
,,"""CS""",3347845,"""ITALY""",5447553,"""EAST COAST SOU…",3047243,"""0005""",6471974,"""VANCOUVER BC""",6471974,"""42870""",5008439,"""BREMERHAVEN""",5008439,,,,,"""SAN FRANCISCO,…",6665016,"""XIAMEN""",4164686,"""AMERICAN PRESI…",8122323,"""ONEY""",8070853,,
,,"""BGS""",1593183,"""JAPAN""",5163461,"""CARIBBEAN""",2718638,"""1601""",6468246,"""CHARLESTON""",6468246,"""57047""",4596130,"""QINGDAO""",4596130,,,,,"""CHARLESTON, S.…",6468460,"""KAOHSIUNG""",3066893,"""OCEAN NETWORK …",8069530,"""MSCU""",7268936,,
,,"""UNT""",1403260,"""THAILAND""",3295124,"""OCEANIA""",1597398,"""3002""",5738754,"""TACOMA""",5738754,"""57069""",4184543,"""XIAMEN""",4184543,,,,,"""PHILADELPHIA, …",3047661,"""BREMERHAVEN""",2436295,"""YANG MING LINE…",6814942,"""HDMU""",6874957,,


In [9]:
#Counts of top exports categorical columns

#get categorical cols
exports_catcols_lzdf = (
    exports_lzdf.select(pl.col(pl.Categorical))
)
#init top cats df
exports_topcats_df = pl.DataFrame()

for colname in exports_catcols_lzdf.columns:
    col_counts_df = (
        #select column
        exports_catcols_lzdf.select(pl.col(colname))
        .collect()
        .to_series()
        #get top 10 value counts
        .value_counts()
        .sort('count', descending=True)
        .limit(10)
        #rename count column
        .rename({'count': colname + '_count'})
    )
    #stack dfs horizontally 
    exports_topcats_df = pl.concat([exports_topcats_df, col_counts_df], how='horizontal')

print('Counts of top 10 categories for each categorical variable in exports database:')
exports_topcats_df

Counts of top 10 categories for each categorical variable in exports database:


weight_unit,weight_unit_count,quantity_type,quantity_type_count,carrier_name,carrier_name_count,carrier_scac,carrier_scac_count,departure_port_code,departure_port_code_count,departure_port_name,departure_port_name_count,coast_region,coast_region_count,dest_territory,dest_territory_count,dest_region,dest_region_count,dest_port_code_declared,dest_port_code_declared_count,dest_port_name,dest_port_name_count
cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32,cat,u32
"""KG""",68769489.0,"""CF""",20052121,"""MEDITERRANEAN …",5793495,"""HAPL""",4861095,"""4601""",9016653,"""NEW YORK""",9016653,"""EAST""",41638045.0,"""CHINA (MAINLAN…",5577757,"""NORTH EAST ASI…",17045863,"""90309""",3911195,"""SAN JUAN""",3911708
,,,6958696,"""HAPAG LLOYD""",4861294,"""MLSL""",3539362,"""2704""",6071495,"""LOS ANGELES""",6071495,"""WEST""",18675268.0,"""PUERTO RICO""",3941260,"""CARIBBEAN""",13873527,"""58023""",2606295,"""BUSAN""",2606295
,,"""UNT""",6389123,"""MAERSK LINE""",4856652,"""MSCU""",3280334,"""5301""",5271211,"""HOUSTON""",5271206,"""GULF""",8002937.0,"""JAPAN""",3796480,"""NORTH EUROPE""",8987532,"""58201""",2430256,"""HONG KONG""",2430256
,,"""PCS""",6019382,"""CMA-CGM""",3296996,"""CMDU""",2930458,"""2709""",5249455,"""LONG BEACH""",5249455,"""CARIBBEAN""",436480.0,"""REPUBLIC OF KO…",3083205,"""CENTRAL AMERIC…",5586530,"""57035""",2193297,"""SHANGHAI""",2193297
,,"""PKG""",4463101,"""SEABOARD MARIN…",3223745,"""SMLU""",2575867,"""1803""",4646304,"""JACKSONVILLE""",4646304,"""GREAT LAKES""",16283.0,"""HONG KONG""",2430461,"""SOUTH EAST ASI…",3902742,"""42305""",2192089,"""ANTWERP""",2192089
,,"""CTN""",2784205,"""TROPICAL SHIPP…",2818095,"""MDSC""",2513240,"""5203""",4522768,"""PT EVERGLADES""",4522768,,476.0,"""BELGIUM""",2306736,"""MEDITERRANEAN""",3486244,"""58309""",1658597,"""KAOHSIUNG""",1658597
,,"""PLT""",2730319,"""ORIENT OVERSEA…",2720267,"""TRSL""",2308826,"""1703""",4481005,"""SAVANNAH""",4481005,,,"""BAHAMAS""",2299915,"""AFRICA""",3342850,"""42157""",1619013,"""ROTTERDAM""",1619013
,,"""CS""",2713884,"""CROWLEY LINER …",2705052,"""CAMN""",2102643,"""5201""",4120195,"""MIAMI""",4120195,,,"""TAIWAN""",2156880,"""WEST COAST SOU…",2615873,"""42870""",1433094,"""BREMERHAVEN""",1433094
,,"""CM""",2504807,"""EVERGREEN LINE…",2316248,"""EGLV""",2093878,"""1401""",4083423,"""NORFOLK""",4083423,,,"""GERMANY""",2073918,"""EAST COAST SOU…",2601738,"""58886""",1408627,"""TOKYO""",1408627
,,"""BGS""",2113310,"""AMERICAN PRESI…",2282398,"""OOLU""",1976522,"""2811""",3501350,"""OAKLAND""",3501350,,,"""BRAZIL""",1726356,"""MIDDLE EAST""",2442534,"""55976""",1347657,"""SINGAPORE""",1352763


## Exploratory Stats and Visuals

This section presents basic comparisons between variables and other exploratory analysis. See the [discussion below regarding known issues with the dataset](#known-data-issues)

In [10]:
def count2w(data_lzdf, groupby_colname, counts_colnames, normalize=False, reference_colname=False, limit=50):
    '''
    Creates a dataframe showing two-way counts of the variables passed. 
    INPUT:
        data_lzdf - a polars lazyframe containing the relevant data
        groupby_colname - the column name to group by
        counts_colnames - a list of column names
        normalize - boolean - default=False - when True, values in the dataframe will be divided row-wise by the reference column.
        reference_colname - the reference column. The resultant dataframe will be sorted (descending) by this column.
                    note: reference_colname must appear in the counts_colnames list, and must be specified when normalize=True.
    OUTPUT:
        count2w_df - a polars dataframe with the normalized two-way counts for each variable
    Depends on:
        polars
    '''
    counts2w_df = (
        data_lzdf
        .group_by(groupby_colname)
        .agg([
            pl.col(col).count() for col in counts_colnames
        ])
        .sort(groupby_colname if not reference_colname else reference_colname, descending=True)
        .limit(limit)
        #normalize by reference column if normalize = True
        .select([groupby_colname, *counts_colnames] if not normalize else [pl.col(groupby_colname)]+[pl.col(col)/pl.col(reference_colname) for col in counts_colnames])
        .collect()
    )
    return counts2w_df

In [11]:
count2w(
    data_lzdf = imports_lzdf.with_columns(pl.col('teus').replace(0, None)), 
    groupby_colname= 'year',
    counts_colnames= ['arrival_port_name', 'teus', 'container_piece_count', 'origin_territory', 'hs_code', 'carrier_scac', 'bol_id'],
    normalize=True, 
    reference_colname= 'bol_id'
).sort('year', descending=True)

year,arrival_port_name,teus,container_piece_count,origin_territory,hs_code,carrier_scac,bol_id
i32,f64,f64,f64,f64,f64,f64,f64
2023,1.0,0.92426,1.0,0.999845,0.999999,1.0,1.0
2022,1.0,0.915275,1.0,0.999858,0.999999,1.0,1.0
2021,1.0,0.934605,1.0,0.999897,0.999999,1.0,1.0
2020,0.999883,0.915505,1.0,0.999861,1.0,1.0,1.0
2019,0.999997,0.979339,1.0,0.9997,1.0,1.0,1.0
2018,0.999982,0.969603,1.0,0.999791,1.0,1.0,1.0
2017,0.999978,0.972842,1.0,0.999731,1.0,0.999998,1.0
2016,0.999996,0.975272,1.0,0.999787,1.0,1.0,1.0
2015,0.999996,0.9687,1.0,0.999791,1.0,1.0,1.0
2014,0.999996,0.21693,1.0,0.999848,1.0,1.0,1.0


In [12]:
weeklycounts_df = count2w(
                    data_lzdf = (
                        imports_lzdf.with_columns([
                            #create week column
                            pl.col('date_arrival').dt.week().alias('week'),
                            #replace 0 values with null
                            pl.col('teus').replace(0,None)
                            ])
                    ),
                    groupby_colname= 'week',
                    counts_colnames= ['arrival_port_name', 'teus', 'container_piece_count', 'origin_territory', 'hs_code', 'carrier_scac', 'bol_id'],
                    reference_colname= 'bol_id',
                    normalize=True
                ).sort('week')
weeklycounts_df

week,arrival_port_name,teus,container_piece_count,origin_territory,hs_code,carrier_scac,bol_id
i8,f64,f64,f64,f64,f64,f64,f64
1,0.999681,0.589272,1.0,0.994864,1.0,1.0,1.0
2,0.999718,0.598917,1.0,0.994513,1.0,1.0,1.0
3,0.999733,0.587576,1.0,0.994769,0.999999,1.0,1.0
4,0.999761,0.599749,1.0,0.994739,1.0,1.0,1.0
5,0.999767,0.572541,1.0,0.994357,1.0,1.0,1.0
6,0.999771,0.573732,1.0,0.994132,1.0,1.0,1.0
7,0.999795,0.589613,1.0,0.995192,1.0,1.0,1.0
8,0.999694,0.58017,1.0,0.995355,1.0,1.0,1.0
9,0.999755,0.587003,1.0,0.995235,0.999999,1.0,1.0
11,0.999709,0.594995,1.0,0.996227,1.0,1.0,1.0


In [13]:
count2w(imports_lzdf, 
        groupby_colname='arrival_port_name', 
        counts_colnames=['teus', 'container_piece_count', 'origin_territory', 'hs_code', 'carrier_scac', 'bol_id'], 
        normalize=True,
        reference_colname='bol_id', 
        limit=50)

arrival_port_name,teus,container_piece_count,origin_territory,hs_code,carrier_scac,bol_id
cat,f64,f64,f64,f64,f64,f64
"""LOS ANGELES""",1.0,1.0,0.994696,1.0,0.999999,1.0
"""NEW YORK""",1.0,1.0,0.996438,1.0,1.0,1.0
"""LONG BEACH""",1.0,1.0,0.99438,1.0,1.0,1.0
"""SAVANNAH""",1.0,1.0,0.995334,1.0,1.0,1.0
"""NORFOLK""",1.0,1.0,0.995169,1.0,1.0,1.0
"""HOUSTON""",1.0,1.0,0.997686,0.999999,1.0,1.0
"""OAKLAND""",1.0,1.0,0.995726,1.0,1.0,1.0
"""VANCOUVER BC""",1.0,1.0,0.991612,1.0,1.0,1.0
"""CHARLESTON""",1.0,1.0,0.997309,1.0,1.0,1.0
"""TACOMA""",1.0,1.0,0.996677,1.0,1.0,1.0


In [14]:
def groupby_summarystats(data_lzdf, group_var, stats_vars, sort_var=False, limit=100):
    '''
    Calculates summary stats of each variable of interest for each group in a polars lazyframe. 
    INPUT:
        data_lzdf - Polars LazyFrame - lazyframe containing the relevant data
        group_var - Str - the name of the column to group by
        stats_vars - List - the names of the columns of interest
        sort_var - Str - default=False - when False, the resultant dataframe will be sorted by the
                                        group_by variable. Note that polars categories can have unexpected
                                        ordernal values. 
                                        when True, sorts the resultant dataframe by the sum of
                                        the named variable. Note sort_var must be an element of
                                        stats_vars. 
        limit - UInt - default=100 - limits the number of rows in the resultant dataframe. 
    OUTPUT:
        groupby_summarystats_df - Polars DataFrame - a dataframe containing the summary stats for each group
    '''
    agg_list = []
    for var in stats_vars:
        agg_list.append(pl.col(var).sum().alias(var+'_sum'))
        agg_list.append(pl.col(var).mean().alias(var+'_mean'))
        agg_list.append(pl.col(var).median().alias(var+'_median'))
        agg_list.append(pl.col(var).std().alias(var+'_std'))
    groupby_summarystats_df = (
        data_lzdf.group_by(group_var)
        .agg(agg_list)
        .sort(group_var if not sort_var else sort_var+'_sum', descending=True)
        .limit(limit)
        .collect()
    )
    return groupby_summarystats_df

In [15]:
for var in ['arrival_port_name', 'hs_2d', 'origin_territory', 'carrier_scac']:
    display(
        groupby_summarystats(imports_lzdf, group_var=var, stats_vars=['teus', 'qty', 'weight'], sort_var='teus', limit=20),
    )

arrival_port_name,teus_sum,teus_mean,teus_median,teus_std,qty_sum,qty_mean,qty_median,qty_std,weight_sum,weight_mean,weight_median,weight_std
cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""LOS ANGELES""",43232000.0,1.186491,0.17,2.882541,33260000000.0,912.813452,47.0,99228.850363,472050000000.0,12955.12938,676.81834,470004.665002
"""LONG BEACH""",35980000.0,1.184344,0.15,2.830877,27946000000.0,919.898084,41.0,407137.881272,562220000000.0,18506.320553,498.0,921026.343611
"""NEW YORK""",35211000.0,1.084712,0.2,2.315223,38162000000.0,1175.637777,27.0,743413.733776,787170000000.0,24249.585526,1159.63012,832358.476212
"""SAVANNAH""",20136000.0,1.852258,1.0,3.551344,10531000000.0,968.755445,62.0,73145.892985,266100000000.0,24478.643014,6728.50024,1913100.0
"""NORFOLK""",12241000.0,1.562563,1.0,3.109838,13086000000.0,1670.416013,36.0,269903.611943,142320000000.0,18167.156945,4789.0,247067.53935
"""HOUSTON""",12128000.0,1.597652,1.0,3.284618,9117800000.0,1201.084732,24.0,225378.209296,786850000000.0,103651.364253,7106.44,2503600.0
"""CHARLESTON""",9897900.0,1.530226,0.33,3.698971,6026700000.0,931.742806,22.0,97806.912735,168290000000.0,26018.489352,2442.0,737131.000937
"""OAKLAND""",8429100.0,1.272025,0.13,3.001839,7410800000.0,1118.348962,34.0,108617.183998,89018000000.0,13433.556286,212.0,91091.553095
"""VANCOUVER BC""",7452300.0,1.151468,0.1,3.442873,5934900000.0,917.018315,44.0,136419.329683,118780000000.0,18353.253032,220.462,755827.546249
"""TACOMA""",7066700.0,1.231398,0.22,3.063,4968700000.0,865.812308,43.0,76114.378627,83507000000.0,14551.35122,1000.89748,265317.646865


hs_2d,teus_sum,teus_mean,teus_median,teus_std,qty_sum,qty_mean,qty_median,qty_std,weight_sum,weight_mean,weight_median,weight_std
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""94""",30140000.0,1.739,1.75,2.931633,10750000000.0,620.276307,96.0,105979.572802,183100000000.0,10564.564851,4561.35878,126208.683421
"""84""",19396000.0,1.30896,0.19,3.17998,7390700000.0,498.77489,12.0,80961.130434,201160000000.0,13575.707401,1470.0,425563.93604
"""39""",14532000.0,1.460363,0.67,2.832705,13559000000.0,1362.632585,46.0,124776.029783,165160000000.0,16597.408743,3782.0,219997.900988
"""85""",13824000.0,1.333044,0.17,3.477817,11429000000.0,1102.113013,30.0,143283.047276,115870000000.0,11173.711428,643.65,85225.909426
"""87""",11881000.0,1.306297,0.13,4.067554,10239000000.0,1125.83977,14.0,55345.238245,209550000000.0,23040.945408,1233.0,204013.057786
"""95""",10138000.0,1.223469,0.22,2.778954,8886000000.0,1072.340888,75.0,161039.917808,66955000000.0,8079.985669,801.0,140897.522246
"""08""",8404800.0,2.165947,1.76,7.414266,6603500000.0,1701.756355,800.0,47548.968926,134250000000.0,34595.789771,18904.0,169811.452872
"""73""",8273100.0,1.109481,0.18,2.546828,5405800000.0,724.964053,24.0,75475.084523,211500000000.0,28363.393926,1947.27,871227.12976
"""40""",8123500.0,1.960235,1.0,3.784231,5794600000.0,1398.254096,75.0,189318.364751,82039000000.0,19796.435227,8088.0,84873.318714
"""44""",6348700.0,1.759296,0.67,3.358957,5511500000.0,1527.294798,31.0,1112800.0,110430000000.0,30600.48978,5209.97,382956.40391


origin_territory,teus_sum,teus_mean,teus_median,teus_std,qty_sum,qty_mean,qty_median,qty_std,weight_sum,weight_mean,weight_median,weight_std
cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""CHINA (MAINLAN…",100820000.0,1.330998,0.29,2.764761,58186000000.0,768.146567,60.0,282567.396913,924080000000.0,12199.286745,1810.0,227421.942575
"""VIETNAM""",14799000.0,1.939942,1.68,3.360133,5743600000.0,752.887371,108.0,40594.239155,122340000000.0,16036.210199,5747.0,267553.308749
"""REPUBLIC OF KO…",8785600.0,1.374109,0.18,3.569255,3728100000.0,583.086979,15.0,102109.411232,260050000000.0,40672.976146,1135.3793,970277.780688
"""GERMANY""",7610100.0,1.146079,0.15,3.349716,6908800000.0,1040.474939,3.0,81033.444105,127540000000.0,19207.399255,961.21432,312697.095673
"""INDIA""",7540500.0,1.016131,0.4,1.734018,7157600000.0,964.528409,24.0,228422.530447,160480000000.0,21625.24215,3279.0,696882.334229
"""TAIWAN""",6370300.0,0.898059,0.13,2.17557,7599200000.0,1071.303575,42.0,69391.58104,94973000000.0,13388.835148,514.0,604063.571323
"""THAILAND""",6202600.0,1.882367,1.0,3.813933,5403100000.0,1639.722359,84.0,83881.375137,70773000000.0,21478.045965,6860.0,162132.622701
"""JAPAN""",5887700.0,1.140255,0.0,4.150479,11996000000.0,2323.341629,15.0,71936.432805,168800000000.0,32691.750071,0.0,686482.277142
"""ITALY""",5199400.0,0.95444,0.18,2.241003,6094600000.0,1118.780226,6.0,872033.414099,105290000000.0,19327.120361,815.0,606403.538311
"""INDONESIA""",3976300.0,1.220913,0.2,2.458622,3994000000.0,1226.345269,58.0,120792.664935,86001000000.0,26406.360216,1071.44532,894617.226697


carrier_scac,teus_sum,teus_mean,teus_median,teus_std,qty_sum,qty_mean,qty_median,qty_std,weight_sum,weight_mean,weight_median,weight_std
cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""MEDU""",21921000.0,2.417314,2.0,3.650818,12446000000.0,1372.480331,313.0,85969.340258,187870000000.0,20717.168207,14177.0,39361.657355
"""CMDU""",21762000.0,1.836644,1.0,3.526268,9133700000.0,770.858375,58.0,16552.068703,228280000000.0,19266.225316,8761.0,50286.703932
"""MAEU""",21328000.0,1.637615,1.0,3.382876,13891000000.0,1066.637725,53.0,159236.117762,203890000000.0,15655.881032,3562.52,47263.112132
"""EGLV""",18450000.0,1.471226,0.5,3.155951,12592000000.0,1004.110749,69.0,109926.405661,156070000000.0,12445.191129,3216.0,36205.621946
"""HLCU""",17677000.0,1.621839,1.0,3.148705,8048200000.0,738.395064,8.0,156237.541118,194510000000.0,17845.443027,6290.0,49692.309791
"""COSU""",15559000.0,1.694179,1.0,3.520617,8413800000.0,916.143013,61.0,720320.170371,128320000000.0,13972.774008,5038.0,33674.856757
"""ONEY""",15355000.0,1.902564,1.13,3.4498,8714600000.0,1079.757494,151.0,19300.318746,98559000000.0,12211.708286,5872.0,25470.923965
"""OOLU""",11746000.0,1.411077,0.58,2.501289,7478500000.0,898.443565,36.0,103875.254284,96478000000.0,11590.66679,3510.0,27296.840063
"""MSCU""",9755300.0,1.34205,0.19,3.61807,1441300000.0,198.283843,5.0,45882.953243,179040000000.0,24631.325542,826.7325,78987.105787
"""HDMU""",8340700.0,1.213205,0.2,2.70031,5946300000.0,864.915127,2.0,211609.164859,75750000000.0,11018.266235,1119.94696,109033.120262


In [69]:
def groupsum(data_lzdf, groupby_colname, sum_colnames, reference_colname=False, limit=20):
    '''
    Creates a dataframe showing sums of the variables passed by group. 
    INPUT:
        data_lzdf - a polars lazyframe containing the relevant data
        groupby_colname - the column name to group by
        sum_colnames - a list of column names
        normalize - boolean - default=False - when True, values in the dataframe will be divided row-wise by the reference column.
        reference_colname - the reference column. The resultant dataframe will be sorted (descending) by this column.
                    note: reference_colname must appear in the counts_colnames list, and must be specified when normalize=True.
    OUTPUT:
        count2w_df - a polars dataframe with the normalized two-way counts for each variable
    Depends on:
        polars
    '''
    groupsum_df = (
        data_lzdf
        .group_by(groupby_colname)
        .agg([
            pl.col(col).sum() for col in sum_colnames
        ])
        .sort(groupby_colname if not reference_colname else reference_colname, descending=True)
        .limit(limit)
        .collect()
    )
    return groupsum_df

In [109]:
def box_sums_by_time(data_lzdf, value_var, group_var, time_var, limit=10, title=''):
    '''
    Plots box and whisker diagram of the sums of the given variable for each group over time. 
    INPUT:
        data_lzdf - polars lazyframe - the relevant data
        value_var - str - the column name containing the variable of interest
        group_var - str - the column name by which the data will be grouped
        time_var - str - the column name containing the time category
        limit - int - default=10 - limits the number of categories presented
        title - str - the title of the box and whisker diagram. 
    OUTPUT:
        Displays the figure
    NOTES:
        - resulting figure is sorted by median value for each category
    DEPENDS ON:
        polars, plotly express 
    '''
    top_cats = (
        data_lzdf.select([group_var, value_var])
        .group_by(group_var)
        .agg(pl.col(value_var).sum())
        .sort(value_var, descending=True)
        .limit(limit)
        .collect()
    ).select(group_var).to_series().to_list()
    box_df = (
        data_lzdf.select([value_var, group_var, time_var])
        .filter(pl.col(group_var).is_in(top_cats))
        .group_by(time_var, group_var)
        .agg(pl.col(value_var).sum())
        .collect()
    )
    #get median values for ordering
    medians = (
        box_df.group_by(group_var)
        .agg(pl.col(value_var).median())
        .sort(value_var)
    ).to_series().to_list()
    fig = px.box(
            data_frame=box_df,
            y=group_var,
            x=value_var,
            title=title
        )
    fig.update_yaxes(categoryorder='array', categoryarray=medians)
    fig.show()

In [110]:
def box_counts_by_time(data_lzdf, value_var, group_var, time_var, limit=10, title=''):
    '''
    Plots box and whisker diagram of the counts of the given variable for each group over time. 
    INPUT:
        data_lzdf - polars lazyframe - the relevant data
        value_var - str - the column name containing the variable of interest
        group_var - str - the column name by which the data will be grouped
        time_var - str - the column name containing the time category
        limit - int - default=10 - limits the number of categories presented
        title - str - the title of the box and whisker diagram. 
    OUTPUT:
        Displays the figure
    NOTES:
        - resulting figure is sorted by median value for each category
    DEPENDS ON:
        polars, plotly express 
    '''
    #get top categories
    top_cats = (
        data_lzdf.select([group_var, value_var])
        .group_by(group_var)
        .agg(pl.col(value_var).count())
        .sort(value_var, descending=True)
        .limit(limit)
        .collect()
    ).select(group_var).to_series().to_list()
    #make df for box plots
    box_df = (
        data_lzdf.select([value_var, group_var, time_var])
        .filter(pl.col(group_var).is_in(top_cats))
        .group_by(time_var, group_var)
        .agg(pl.col(value_var).count())
        .cast({value_var: pl.Int32})
        .collect()
    )
    #get median values for ordering - only necessary because plotly's categoryorder='median ascending' does not act as expected 
    medians = (
        box_df.group_by(group_var)
        .agg(pl.col(value_var).median())
        .sort(value_var)
    ).to_series().to_list()
    fig = px.box(
            data_frame=box_df,
            y=group_var,
            x=value_var,
            title=title
        )
    fig.update_yaxes(categoryorder='array', categoryarray=medians)
    fig.show()

In [112]:
box_sums_by_time(
    data_lzdf=imports_lzdf.with_columns([
        pl.col('date_arrival').dt.strftime('%Y%m').alias('month_year'),
        pl.col('teus').replace(0,None)]),
    value_var='teus',
    group_var='hs_2d',
    time_var='month_year',
    title='TEUs per Month by HS Code'
)

In [113]:
box_counts_by_time(
    data_lzdf=imports_lzdf.with_columns(pl.col('date_arrival').dt.strftime('%Y%m').alias('month_year')),
    value_var='bol_id',
    group_var='hs_2d',
    time_var='month_year',
    title='Number of BOLs per Month by HS Code'
)

In [115]:
box_counts_by_time(
    data_lzdf=imports_lzdf.with_columns(pl.col('date_arrival').dt.strftime('%Y%m').alias('month_year')),
    value_var='bol_id',
    group_var='arrival_port_name',
    time_var='month_year',
    limit=15,
    title='Number of BOLs Imported per Month by Port'
)

In [23]:
def volumes_over_time_plotly(data_lzdf, time_var, group_var, volume_var, title=''):
    volume_df = (
        data_lzdf.select([time_var, group_var, volume_var])
        .group_by(time_var, group_var)
        .sum()
        .sort(time_var)
        .collect()
    )
    fig = px.line(
        data_frame=volume_df,
        x=time_var, y=volume_var,
        color=group_var,
        title=title
    )
    fig.show()

In [24]:
volumes_over_time_plotly(imports_lzdf, 'year', 'arrival_port_name', 'teus', title='Total TEUs imported by arrival port')





In [25]:
volumes_over_time_plotly(exports_lzdf, 'year', 'departure_port_name', 'teus', title='Total TEUs exported by port.')





In [26]:
volumes_over_time_plotly(imports_lzdf, 'year', 'arrival_port_name', 'weight', title='Total Quantity imported by arrival port')





See Known Issues section below for discussion of quantity types.

In [27]:
volumes_over_time_plotly(imports_lzdf.with_columns(pl.col('date_arrival').dt.week().alias('week')), 'week', 'hs_2d', 'teus', title='Weekly Volume (TEU) by HS Code')

## Known Data Issues

The below cells inspect known, unresolved issues in the database such as duplicated Bills of Lading, inconsistent carrier names/codes, etc. 

Known issues to be resolved:
- duplicate bols
- missing weight and teu data pre2015
- carrier names
- quantity types are quite varied 


### Duplicate BOLs 

In [28]:
#count unique bol_scac IDs
import_bols_unique_n = imports_lzdf.select(pl.col('bol_id')).unique().select(pl.count()).collect().item()
export_bols_unique_n = exports_lzdf.select(pl.col('bol_id')).unique().select(pl.count()).collect().item()

print('{:,} out of {:,} rows ({:.2f}%) in the imports dataset contain duplicated BoLs.'.format(imports_n-import_bols_unique_n, imports_n, (imports_n-import_bols_unique_n)/imports_n*100))
print('{:,} out of {:,} rows ({:.2f}%) in the exports dataset contain duplicated BoLs.'.format(exports_n-export_bols_unique_n, exports_n, (exports_n-export_bols_unique_n)/exports_n*100))

1,158,525 out of 178,875,896 rows (0.65%) in the imports dataset contain duplicated BoLs.
3,488,195 out of 68,769,489 rows (5.07%) in the exports dataset contain duplicated BoLs.


Possible reasons:
- data entry errors
- aggregations of some kind by S&P
- ???

### Missing weight and TEU data 

- weight data is inconsisent prior to 2017
    - inexplicably spiking 2015-2017 
    - inexplicably low 2013-2014
    - entirely missing prior to 2013

- TEU data is similarly strange
    - large jump 2021-2022
    - inexplicably low 2013-2014
    - entirely missing prior to 2013

We are pursuing this with S&P

### Carrier Names 

In [29]:
carriers_df = (imports_lzdf.select([pl.col('carrier_scac'), 
                                    pl.col('carrier_name')])
               .unique() 
               .sort('carrier_scac', descending=True)
               .collect()
               )

scac_unique = (
    imports_lzdf.select(pl.col('carrier_scac'))
    .unique()
    .cast(pl.Utf8)
    .sort('carrier_scac')
    .collect()
)
#scac_unique.write_csv('scac_unique.csv')

In [30]:
carriers_df.describe()

describe,carrier_scac,carrier_name
str,str,str
"""count""","""4383""","""3993"""
"""null_count""","""1""","""391"""
"""mean""",,
"""std""",,
"""min""",,
"""25%""",,
"""50%""",,
"""75%""",,
"""max""",,


In [31]:
dupscacs_df = (
    carriers_df.drop_nulls(subset='carrier_name')
    .filter(pl.col('carrier_scac').is_duplicated())
)

dupnames_df = (
    carriers_df.drop_nulls(subset='carrier_scac')
    .filter(pl.col('carrier_name').is_duplicated())
    .sort('carrier_name')
)

Discussion:
- There appear to be very few (~200 out of 178M) duplications of SCAC codes based on different spelling or naming of carriers.
- SCAC codes that need to be addressed:
    - 'BULK' (~50 rows)
    - '-1' (1 row)
    - 'ZZZZ' (a carrier scac corresponding to entirely missing carrier name data in this database)
    - these could be coded as null values in the ETL step

# Older code - likely to be dropped

## Value and Volumes by Year:

In [32]:
#get year col
pldf['year'] = pldf.date_arrival.dt.to_period('Y')
#group value and volume by year
activityacrosstime_df = pldf[['year', 'teus', 'value_est']].groupby('year').sum()
#plot
sns.barplot(data=activityacrosstime_df, x='year', y='value_est');
plt.title('Total Value of Imports Over Time')
plt.xticks(rotation=45);

NameError: name 'pldf' is not defined

In [None]:
#plot
sns.barplot(data=activityacrosstime_df, x='year', y='teus');
plt.title('Total Volume (TEUs) of Imports Over Time')
plt.xticks(rotation=45);

I guess value and volume records weren't kept before ~2015 ?!?

In [None]:
#group value and volume by year
until2012_df = pldf[pldf['year']< pd.Period(2013)]
until2012_df = until2012_df[['year', 'teus', 'value_est']].groupby('year').sum()
#plot
sns.barplot(data=until2012_df, x='year', y='teus');
plt.title('Total Volume (TEUs) of Imports Over Time')
plt.xticks(rotation=45);

Must be missing data here?

In [None]:
#get year col
pldf['year'] = pldf.date_arrival.dt.to_period('Y')
#group value and volume by year
activityacrosstime_df = pldf[['year', 'container_piece_count']].groupby('year').sum()
#plot
sns.barplot(data=activityacrosstime_df, x='year', y='container_piece_count');
plt.title('Total Volume (Container Piece Count) of Imports Over Time')
plt.xticks(rotation=45);

In [None]:
pldf.head()

## Carriers Over Time

In [None]:
carriersovertime_df = pldf[['year', 'carrier_scac']].groupby('year').nunique()

In [None]:
sns.barplot(data=carriersovertime_df, x='year', y='carrier_scac');
plt.title('Number of Unique Carriers Over Time');
plt.xticks(rotation=45);

Was there really a spike in carriers in 2010 and 2012? Or does this indicate changes in the way SCAC codes are assigned?

### Market share of the 50 largest carriers by estimated value 

In [None]:
#get largest carriers
carriers_df = pldf[pldf.year > pd.Period(2014)]
carriers_df = carriers_df[['year', 'carrier_scac', 'value_est']].groupby(['year', 'carrier_scac']).sum()

In [None]:
carriers_df.columns = ['value_usd']
carriers_df.sort_values('value_usd', ascending=False, inplace=True)
carriers_df.sort_values('year', inplace=True)

In [None]:
carriers_df.reset_index(inplace=True)
carriers_df.head()