# Exploratory Analysis of Import Data

This notebook explores the PIERS Bill of Lading data, obtained from S&P's Global Trade Analytics Suite. See the README.md file for more info on the overall project, data pre-processing, and column definitions. 

In [1]:
#import libraries
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns


#display settings
#show every column when a wide pandas frame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
#render matplotlib figures inline in the notebook output
%matplotlib inline

#enable string cache for polars categoricals
#NOTE(review): presumably needed so categorical columns read from separate parquet
#files can be combined/compared -- confirm against the polars version in use
pl.enable_string_cache()

## Basic Summary Stats

In [2]:
#folders holding the parquet files (relative to the notebook's working directory)
imports_path = 'data/clean/imports/'
exports_path = 'data/'

#imports: schema and column names are read from the 2005 file only
#NOTE(review): assumes every imports parquet file shares this schema -- TODO confirm
imports_schema = pl.read_parquet_schema(imports_path + 'piers_imports_2005.parquet')
imports_colnames = imports_schema.keys()
#all parquet files in the imports folder are scanned lazily as one frame
imports_lzdf = pl.scan_parquet(imports_path + '*.parquet', parallel='columns')

#exports: a single parquet file provides both the schema and the lazy frame
exports_schema = pl.read_parquet_schema(exports_path + 'piers_exports_complete.parquet')
exports_colnames = exports_schema.keys()
exports_lzdf = pl.scan_parquet(exports_path + 'piers_exports_complete.parquet', parallel='columns')

#row counts are computed lazily so the data never has to be materialized
imports_n = (
    imports_lzdf
    .select(pl.count())
    .collect()
    .item()
)
exports_n = (
    exports_lzdf
    .select(pl.count())
    .collect()
    .item()
)

#report the shape of each dataset
imports_ncols = len(imports_schema)
exports_ncols = len(exports_schema)
print('The imports dataset has {:,} rows and {} columns.'.format(imports_n, imports_ncols))
print('The exports dataset has {:,} rows and {} columns.'.format(exports_n, exports_ncols))

The imports dataset has 178,875,896 rows and 44 columns.
The exports dataset has 68,769,489 rows and 31 columns.


In [3]:
#preview the first three rows of the imports data (only these rows are collected)
imports_lzdf.head(3).collect()

weight,weight_unit,qty,qty_type,teus,value_est,date_arrival,container_piece_count,commod_short_desc_qty,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,dest_final,coast_region,clearing_district,place_receipt,shipper_name,shipper_address,consignee_name,consignee_address,notify_party1_name,notify_party1_address,notify_party2_name,notify_party2_address,commod_desc_raw,container_id_marks,marks_desc,hs_code,joc_code,commod_short_desc,container_ids,carrier_name,carrier_scac,vessel_name,voyage_id,precarrier,vessel_id,inbond_code,transport_mode,bol_number,bol_id
f64,cat,f64,cat,f64,f64,datetime[μs],i32,str,cat,cat,cat,cat,cat,cat,cat,cat,cat,cat,str,str,str,str,str,str,str,str,str,str,str,cat,cat,str,str,cat,cat,str,str,f64,i32,f64,cat,str,str
0.0,"""KG""",0.0,"""-1""",0.0,0.0,2005-10-12 00:00:00,0,"""0""","""UNITED KINGDOM…","""NORTH EUROPE""","""1601""","""CHARLESTON""","""41351""","""THAMESPORT""",""" ""","""EAST""","""CHARLESTON, S.…","""BRIDGWATER""","""FORMASTERCONTA…","""""","""FORMASTERCONTA…","""""","""FORMASTERCONTA…","""""",,,,"""""","""""","""009669""","""""","""""","""""","""HAPAG LLOYD""","""HAPL""","""CP LIBERATOR""","""8039""",,8415952,,,"""LIV050931677""","""HAPL_LIV050931…"
0.0,"""KG""",1832.0,"""CTN""",0.0,0.0,2005-03-30 00:00:00,1,"""1832""","""CHINA (MAINLAN…","""NORTH EAST ASI…","""2709""","""LONG BEACH""","""58023""","""BUSAN""",""" ""","""WEST""","""LOS ANGELES, C…","""TIANJIN""","""ORDER""","""""","""ORDER""","""""","""ORDER""","""""",,,,"""""","""LC""","""721590""","""""","""""","""TRLU6266377""","""MEDITERRANEAN …","""MDSC""","""MSC JENNY""","""510""",,8709169,,,"""UMSCHH51015F""","""MDSC_UMSCHH510…"
0.0,"""KG""",3500.0,"""PKG""",0.0,0.0,2005-01-26 00:00:00,2,"""3500""","""SPAIN""","""MEDITERRANEAN""","""4909""","""SAN JUAN""","""47094""","""VALENCIA""",""" ""","""CARIBBEAN""",,"""VALENCIA""","""PERFUMERIA GAL…","""AVDA DE MADRID…","""LOUIS GARRATON…","""""","""LOUIS GARRATON…","""""",,,,"""""","""ZZ; ZZ""","""330300""","""""","""""","""CTEU2003642 CT…","""CIA. TRANSATLA…","""CTES""","""TMM SINALOA""","""32""",,8406286,,,"""VLOPP0407045""","""CTES_VLOPP0407…"


In [4]:
#preview the first three rows of the exports data (only these rows are collected)
exports_lzdf.head(3).collect()

shipper,shipper_address,weight,weight_unit,quantity,quantity_type,teus,carrier_name,carrier_scac,vessel_name,voyage_number,bol_number,vessel_id,value_est,departure_port_code,departure_port_name,container_ids,container_piece_count,coast_region,commod_desc_raw,commod_short_desc,hs_code,joc_code,commod_short_desc_qty,date_departure,origin,dest_territory,dest_region,dest_port_code_declared,dest_port_name,bol_id
str,str,f64,cat,f64,cat,f64,cat,cat,str,str,str,i32,f64,cat,cat,str,i32,cat,str,str,str,str,str,datetime[μs],str,cat,cat,i32,cat,str
,,0.0,"""KG""",17.0,"""UNT""",0.0,"""HOEGH UGLAND A…","""AUTO""","""OCEAN SPIRIT""","""93""","""03102""",8321747,0.0,"""1803""","""JACKSONVILLE""","""""",0,"""EAST""",,"""""","""870390""","""""","""1; 2; 1; 2; 1;…",2005-12-26 00:00:00,""" ""","""SAUDI ARABIA""","""MIDDLE EAST""",51721,"""JEDDAH""","""AUTO_03102"""
,,0.0,"""KG""",3120.0,"""PKG""",0.0,"""MAERSK LINE""","""MLSL""","""SEA LAND INNOV…","""513""","""SJ1724618""",7820851,0.0,"""3126""","""ANCHORAGE""","""GLDU0487348 MA…",5,"""WEST""",,"""""","""030510""","""""","""3120""",2005-09-15 00:00:00,""" ""","""JAPAN""","""NORTH EAST ASI…",58895,"""YOKOHAMA""","""MLSL_SJ1724618…"
,,0.0,"""KG""",1.0,"""LOT""",0.0,"""P&O NEDLLOYD""","""PONL""","""OOCL NINGBO""","""6636""","""LNAM30127882""",9256482,0.0,"""2709""","""LONG BEACH""","""INBU3009947""",1,"""WEST""",,"""""","""4805""","""""","""1""",2005-12-14 00:00:00,""" ""","""SINGAPORE""","""SOUTH EAST ASI…",55976,"""SINGAPORE""","""PONL_LNAM30127…"


In [5]:
#init df and get stats labels column
#describe() returns the same statistic labels (count, null_count, mean, ...) whatever
#the number of rows, so a single row is enough here; this avoids collecting an entire
#column of the full dataset only to read the labels
imports_summarystats_df = (
    imports_lzdf.select(pl.first())
    .limit(1)
    .collect()
    .describe()
    .select(pl.first())
    .to_pandas()
)
#loop through columns and get descriptive stats
#each column is collected and described on its own, so only one column is held in
#memory at a time
for column in imports_colnames:
    column_stats = imports_lzdf.select(pl.col(column)).collect().describe()
    #assign as a Series (not a one-column frame) so the column assignment is explicit
    imports_summarystats_df[column] = column_stats.select(column).to_pandas()[column]
#display
imports_summarystats_df

Unnamed: 0,describe,weight,weight_unit,qty,qty_type,teus,value_est,date_arrival,container_piece_count,commod_short_desc_qty,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,dest_final,coast_region,clearing_district,place_receipt,shipper_name,shipper_address,consignee_name,consignee_address,notify_party1_name,notify_party1_address,notify_party2_name,notify_party2_address,commod_desc_raw,container_id_marks,marks_desc,hs_code,joc_code,commod_short_desc,container_ids,carrier_name,carrier_scac,vessel_name,voyage_id,precarrier,vessel_id,inbond_code,transport_mode,bol_number,bol_id
0,count,178875900.0,178875896.0,178875900.0,150054988.0,178875900.0,178875900.0,178875896,178875900.0,178875850,178004511.0,178004511.0,178842823.0,178842823.0,178399200.0,178399200.0,178875896.0,178767698.0,138267806.0,178028936.0,177152819,149352161,177316770,158522570,131500774,147349739,11190643,80229284,153344894,178875896,178875896,178875850.0,178875850.0,178875850,178875896,178743544.0,178875872.0,167484536,178875896,0.0,167394300.0,45999800.0,54698436.0,178875896,178875896
1,null_count,0.0,0.0,0.0,28820908.0,0.0,0.0,0,0.0,46,871385.0,871385.0,33073.0,33073.0,476696.0,476696.0,0.0,108198.0,40608090.0,846960.0,1723077,29523735,1559126,20353326,47375122,31526157,167685253,98646612,25531002,0,0,46.0,46.0,46,0,132352.0,24.0,11391360,0,178875896.0,11481590.0,132876100.0,124177460.0,0,0
2,mean,47406.86,,1203.557,,1.28577,78087.32,,1.440886,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9338100.0,306.6917,,,
3,std,1721627.0,,409020.4,,3.112699,2853759.0,,3.409732,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,285028.8,1059.749,,,
4,min,0.0,,-74561090.0,,0.0,0.0,2005-01-01 00:00:00,0.0,,,,,,,,,,,,,,,,,,& OPTCO,,,,,,,,,,,,,,0.0,0.0,,$70050200001,-1_CSHSE0009996
5,25%,0.0,,1.0,,0.0,0.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9248112.0,0.0,,,
6,50%,1466.0,,38.0,,0.22,6571.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9332884.0,0.0,,,
7,75%,15010.0,,400.0,,2.0,45526.0,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9462706.0,0.0,,,
8,max,6172936000.0,,3531460000.0,,1730.75,9981402000.0,2023-11-30 00:00:00,999.0,9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9...,,,,,,,,,,,“COOPERATIVA OPERAI CAVATORI DEL BOTTICINO,ÞORRASALIR 13,C MCCRACKEN,ŠILUTES STR 9,C MCCRACKEN,ŠILUTES STR 9,ZYP COATINGS,co RMI Titanium Company LLC 208, WORKING GLOVES,}No MarksNo MarksNo Marks,; ; ; ; ; ; ; ; ; ; ; ,,,"ZIRCONIUM; SAND,FLOUR,SPONGE,POWDER VALVES; EN...",vTA1108,,,ZYGI,^144,,9975363.0,7122.0,,_YLC158932,zimu_ZIMUZHJ0000952


In [6]:
#NOTE at the moment the exports dataset may fit in memory, in which case the below code could be accomplished more efficiently
#by executing the following line; however, the main code below should run even when the dataset does not fit in memory. 
#exports_lzdf.collect().describe()

#init df and get stats labels column
#describe() returns the same statistic labels (count, null_count, mean, ...) whatever
#the number of rows, so a single row is enough here; this avoids collecting an entire
#column of the full dataset only to read the labels
exports_summarystats_df = (
    exports_lzdf.select(pl.first())
    .limit(1)
    .collect()
    .describe()
    .select(pl.first())
    .to_pandas()
)
#loop through columns and get descriptive stats
#each column is collected and described on its own, so only one column is held in
#memory at a time
for column in exports_colnames:
    column_stats = exports_lzdf.select(pl.col(column)).collect().describe()
    #assign as a Series (not a one-column frame) so the column assignment is explicit
    exports_summarystats_df[column] = column_stats.select(column).to_pandas()[column]
#display
exports_summarystats_df

Unnamed: 0,describe,shipper,shipper_address,weight,weight_unit,quantity,quantity_type,teus,carrier_name,carrier_scac,vessel_name,voyage_number,bol_number,vessel_id,value_est,departure_port_code,departure_port_name,container_ids,container_piece_count,coast_region,commod_desc_raw,commod_short_desc,hs_code,joc_code,commod_short_desc_qty,date_departure,origin,dest_territory,dest_region,dest_port_code_declared,dest_port_name,bol_id
0,count,52901485,51534703,68769490.0,68769489.0,68769490.0,61810793.0,68769490.0,68595453.0,68768115.0,63384893,67625608,68769456,63231150.0,68769490.0,68720145.0,68720145.0,68769489,68769490.0,68769013.0,53168182,68769437,68769437,68769437,68769437,68769489,68769489.0,68279599.0,68279599.0,68299280.0,68299282.0,68769456
1,null_count,15868004,17234786,0.0,0.0,0.0,6958696.0,0.0,174036.0,1374.0,5384596,1143881,33,5538337.0,0.0,49344.0,49344.0,0,0.0,476.0,15601307,52,52,52,52,0,0.0,489890.0,489890.0,470207.0,470207.0,33
2,mean,,,115206.7,,39653.11,,1.672979,,,,,,9159625.0,141566.4,,,,1.923483,,,,,,,,,,,49071.75,,
3,std,,,2375519.0,,3471335.0,,5.518306,,,,,,543876.9,6466967.0,,,,11.0981,,,,,,,,,,,19060.1,,
4,min,,,0.0,,-1467123000.0,,0.0,,,,,,196.0,0.0,,,,0.0,,,,,,,2005-01-01 00:00:00,,,,0.0,,079A_26004878070
5,25%,,,0.0,,0.0,,0.0,,,,,,9143568.0,0.0,,,,1.0,,,,,,,,,,,33700.0,,
6,50%,,,136.08,,13.0,,0.0,,,,,,9295220.0,408.0,,,,1.0,,,,,,,,,,,52051.0,,
7,75%,,,19196.96,,178.0,,2.0,,,,,,9398424.0,46271.0,,,,1.0,,,,,,,,,,,58201.0,,
8,max,SUNSET TRANSPORTATION SERVICE,,952998100.0,,3716633000.0,,3729.25,,,xin yang shan,|SAL5,PE10518042,9980629.0,10000000000.0,,,~POPLAR AVE |SAVALY762976 _38197 WFHU5153172 W...,4136.0,,ÙTEEL GUARD RAIL; ÙTEEL GUARD RAILS,"ZIRCONIUM; SAND,FLOUR,SPONGE,POWDER ZIRCONIUM ...",ddedo,9669000 7746000 1301000 6830000,9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9; 9...,2023-11-30 00:00:00,,,,99930.0,,zzzz_ZZZZ


## Known Data Issues

The cells below inspect known, unresolved issues in the database, such as duplicated Bills of Lading (BoLs) and inconsistent carrier names/codes.

### Duplicate BOLs 

In [7]:
#number of distinct bol_id values in each dataset
import_bols_unique_n = (
    imports_lzdf
    .select(pl.col('bol_id'))
    .unique()
    .select(pl.count())
    .collect()
    .item()
)
export_bols_unique_n = (
    exports_lzdf
    .select(pl.col('bol_id'))
    .unique()
    .select(pl.count())
    .collect()
    .item()
)

#rows in excess of the distinct bol_id count, and their share of all rows
import_dup_rows_n = imports_n - import_bols_unique_n
import_dup_rows_pct = import_dup_rows_n / imports_n * 100
export_dup_rows_n = exports_n - export_bols_unique_n
export_dup_rows_pct = export_dup_rows_n / exports_n * 100

print('{:,} out of {:,} rows ({:.2f}%) in the imports dataset contain duplicated BoLs.'.format(import_dup_rows_n, imports_n, import_dup_rows_pct))
print('{:,} out of {:,} rows ({:.2f}%) in the exports dataset contain duplicated BoLs.'.format(export_dup_rows_n, exports_n, export_dup_rows_pct))

1,158,525 out of 178,875,896 rows (0.65%) in the imports dataset contain duplicated BoLs.
3,488,195 out of 68,769,489 rows (5.07%) in the exports dataset contain duplicated BoLs.


Possible reasons:
- data entry errors
- aggregations of some kind by S&P
- other causes not yet identified (TODO: investigate)

### Carrier Names 

In [56]:
#every distinct (carrier_scac, carrier_name) pair, ordered by scac descending
carriers_df = (
    imports_lzdf
    .select(['carrier_scac', 'carrier_name'])
    .unique()
    .sort('carrier_scac', descending=True)
    .collect()
)

#distinct scac codes, cast to strings before sorting
scac_unique = (
    imports_lzdf
    .select('carrier_scac')
    .unique()
    .cast(pl.Utf8)
    .sort('carrier_scac')
    .collect()
)
#scac_unique.write_csv('scac_unique.csv')

In [53]:
#summary statistics for the carrier frame (non-null and null counts per column)
carriers_df.describe()

describe,carrier_scac,carrier_name
str,str,str
"""count""","""4383""","""3993"""
"""null_count""","""1""","""391"""
"""mean""",,
"""std""",,
"""min""",,
"""25%""",,
"""50%""",,
"""75%""",,
"""max""",,


In [58]:
#scac codes that appear with more than one carrier name
#(rows without a name are removed first, then duplicated codes are kept)
dupscacs_df = (
    carriers_df
    .filter(pl.col('carrier_name').is_not_null())
    .filter(pl.col('carrier_scac').is_duplicated())
)

#carrier names that appear under more than one scac code
#(rows without a code are removed first, then duplicated names are kept)
dupnames_df = (
    carriers_df
    .filter(pl.col('carrier_scac').is_not_null())
    .filter(pl.col('carrier_name').is_duplicated())
    .sort('carrier_name')
)

Discussion:
- There appear to be very few (~200 out of 178M) duplications of SCAC codes based on different spelling or naming of carriers.
- SCAC codes that need to be addressed:
    - 'BULK' (~50 rows)
    - '-1' (1 row)
    - 'ZZZZ' (a carrier scac corresponding to entirely missing carrier name data in this database)
    - these could be coded as null values in the ETL step

# Older code - likely to be dropped

## Value and Volumes by Year:

In [None]:
#NOTE(review): pldf is not defined in any visible cell -- presumably a pandas frame of
#the imports data; confirm before running
#add the arrival year as a yearly period
pldf['year'] = pldf['date_arrival'].dt.to_period('Y')
#yearly totals of TEUs and estimated value
activityacrosstime_df = pldf.groupby('year')[['teus', 'value_est']].sum()
#bar chart of total estimated value per year
sns.barplot(x='year', y='value_est', data=activityacrosstime_df);
plt.title('Total Value of Imports Over Time')
plt.xticks(rotation=45);

In [None]:
#bar chart of total TEUs per year, from the yearly totals computed earlier
sns.barplot(x='year', y='teus', data=activityacrosstime_df);
plt.title('Total Volume (TEUs) of Imports Over Time')
plt.xticks(rotation=45);

It appears that value and volume records were not kept before ~2015 (TODO: confirm against the data documentation).

In [None]:
#yearly totals restricted to years before 2013
until2012_df = (
    pldf.loc[pldf['year'] < pd.Period(2013), ['year', 'teus', 'value_est']]
    .groupby('year')
    .sum()
)
#bar chart of total TEUs per year for that period
sns.barplot(x='year', y='teus', data=until2012_df);
plt.title('Total Volume (TEUs) of Imports Over Time')
plt.xticks(rotation=45);

The low totals here suggest missing data for these years (TODO: verify).

In [None]:
#add the arrival year as a yearly period
pldf['year'] = pldf['date_arrival'].dt.to_period('Y')
#yearly totals of container piece counts
activityacrosstime_df = pldf.groupby('year')[['container_piece_count']].sum()
#bar chart of total container piece count per year
sns.barplot(x='year', y='container_piece_count', data=activityacrosstime_df);
plt.title('Total Volume (Container Piece Count) of Imports Over Time')
plt.xticks(rotation=45);

In [None]:
#preview the first rows of pldf
#NOTE(review): pldf is not defined in any visible cell -- confirm where it comes from
pldf.head()

## Carriers Over Time

In [None]:
#number of distinct carrier scac codes observed in each year
carriersovertime_df = pldf.groupby('year')[['carrier_scac']].nunique()

In [None]:
#bar chart of the number of distinct carrier scac codes per year
sns.barplot(x='year', y='carrier_scac', data=carriersovertime_df);
plt.title('Number of Unique Carriers Over Time');
plt.xticks(rotation=45);

Was there really a spike in carriers in 2010 and 2012? Or does this indicate changes in the way SCAC codes are assigned?

### Market share of the 50 largest carriers by estimated value 

In [None]:
#get largest carriers
carriers_df = pldf[pldf.year > pd.Period(2014)]
carriers_df = carriers_df[['year', 'carrier_scac', 'value_est']].groupby(['year', 'carrier_scac']).sum()

In [None]:
#rename the single aggregated column
carriers_df.columns = ['value_usd']
#order by year (ascending), then by value (descending) within each year.
#A single multi-key sort is used because two successive sort_values calls with the
#default (non-stable) algorithm do not guarantee that the value ordering survives the
#later sort on year. 'year' may be given by name whether it is a column or an index level.
carriers_df = carriers_df.sort_values(['year', 'value_usd'], ascending=[True, False])

In [None]:
#move the index levels back into regular columns, then preview the result
carriers_df = carriers_df.reset_index()
carriers_df.head()