# Ocean Carrier Alliances: Analysis and Modeling 

This notebook analyzes the data processed in the "oca_data_prep" notebook; see the [repo](https://github.com/epistemetrica/Ocean-Carrier-Alliances-Project/blob/main/analysis/oca_data_prep.ipynb) for full details. 

In [1]:
#preliminaries

#preliminaries 
import pandas as pd #v2.1.3
import numpy as np
import polars as pl #v1.1.0
import plotly_express as px #v0.4.1 
import plotly.graph_objects as go
from datetime import datetime
import statsmodels.api as sm
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

#enable string cache for polars categoricals
pl.enable_string_cache()

#load data to lazyframe NOTE modify this step once unified data is finished clustering
exports_lf = pl.scan_parquet('../data/exports/exports.parquet') 
imports_lf = pl.scan_parquet('../data/imports/*.parquet')
#create main lf
main_lf = pl.concat([imports_lf, exports_lf], how='diagonal')

## Data Summary 

In [6]:
display(main_lf.limit(5).collect())
main_lf.describe()

teus,date,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,coast_region,hs_code,carrier_name,carrier_scac,vessel_name,voyage_number,vessel_id,direction,bol_id,year,month,lane_id,lane_name,unified_carrier_name,unified_carrier_scac,vessel_owner,primary_cargo,vessel_lane_pair,date_arrival,dest_territory,dest_region,date_departure
f64,datetime[μs],cat,cat,cat,cat,cat,cat,cat,str,cat,cat,str,str,i32,cat,str,i32,str,cat,cat,cat,cat,cat,bool,cat,datetime[μs],cat,cat,datetime[μs]
2.198764,2005-12-31 00:00:00,"""PHILIPPINES""","""SOUTH EAST ASIA""","""2704""","""LOS ANGELES""","""58309""","""KAOHSIUNG""","""WEST""","""730721""","""HYUNDAI""","""HYMM""","""APL CHINA""","""97""",9074389,"""import""","""HYMM_62321214L12""",2005,"""200512""","""58309_2704""","""Kaohsiung — Los Angeles""","""HYUNDAI""","""HDMU""","""APLU""",False,"""9074389_58309_2704""",2005-12-31 00:00:00,,,
2.198764,2005-12-31 00:00:00,"""MALAYSIA""","""SOUTH EAST ASIA""","""1601""","""CHARLESTON""","""22519""","""COLON PA""","""EAST""","""400121""","""EVERGREEN LINE""","""EVER""","""EVER DAINTY""","""200""",9134232,"""import""","""EVER_090550113466""",2005,"""200512""","""22519_1601""","""Colon Pa — Charleston""","""EVERGREEN LINE""","""SLCC""","""SLCC""",True,"""9134232_22519_1601""",2005-12-31 00:00:00,,,
2.198764,2005-12-31 00:00:00,"""CHINA (MAINLAND)""","""NORTH EAST ASIA""","""2704""","""LOS ANGELES""","""57078""","""YANTIAN""","""WEST""","""009669""","""HATSU MARINE LTD""","""HTML""","""HATSU EXCEL""","""344""",9241322,"""import""","""HTML_149502055056""",2005,"""200512""","""57078_2704""","""Yantian — Los Angeles""","""HATSU MARINE LTD""","""HTML""","""SLCC""",False,"""9241322_57078_2704""",2005-12-31 00:00:00,,,
2.198764,2005-12-31 00:00:00,"""CHINA (MAINLAND)""","""NORTH EAST ASIA""","""2704""","""LOS ANGELES""","""57069""","""XIAMEN""","""WEST""","""640590""","""HYUNDAI""","""HYMM""","""APL CHINA""","""97""",9074389,"""import""","""HYMM_615737517""",2005,"""200512""","""57069_2704""","""Xiamen — Los Angeles""","""HYUNDAI""","""HDMU""","""APLU""",False,"""9074389_57069_2704""",2005-12-31 00:00:00,,,
2.198764,2005-12-31 00:00:00,"""CHINA (MAINLAND)""","""NORTH EAST ASIA""","""2704""","""LOS ANGELES""","""57069""","""XIAMEN""","""WEST""","""732393""","""ORIENT OVERSEAS CONTAINER LINE""","""OOCL""","""NYK ATHENA""","""26""",9247766,"""import""","""OOCL_96352730""",2005,"""200512""","""57069_2704""","""Xiamen — Los Angeles""","""ORIENT OVERSEAS CONTAINER LINE""","""SMMB""","""NYKS""",False,"""9247766_57069_2704""",2005-12-31 00:00:00,,,


statistic,teus,date,origin_territory,origin_region,arrival_port_code,arrival_port_name,departure_port_code,departure_port_name,coast_region,hs_code,carrier_name,carrier_scac,vessel_name,voyage_number,vessel_id,direction,bol_id,year,month,lane_id,lane_name,unified_carrier_name,unified_carrier_scac,vessel_owner,primary_cargo,vessel_lane_pair,date_arrival,dest_territory,dest_region,date_departure
str,f64,str,str,str,str,str,str,str,str,str,str,str,str,str,f64,str,str,f64,str,str,str,str,str,str,f64,str,str,str,str,str
"""count""",237242716.0,"""237242716""","""170344621""","""170344621""","""237242716""","""237242716""","""237242716""","""237242716""","""237135817""","""237241270""","""236978373""","""237242716""","""237242716""","""233594720""",237242716.0,"""237242716""","""237240807""",237242716.0,"""237242716""","""237242716""","""237242716""","""237200871""","""237242716""","""237242716""",237242716.0,"""237242716""","""170844507""","""66354759""","""66354759""","""66329578"""
"""null_count""",0.0,"""0""","""66898095""","""66898095""","""0""","""0""","""0""","""0""","""106899""","""1446""","""264343""","""0""","""0""","""3647996""",0.0,"""0""","""1909""",0.0,"""0""","""0""","""0""","""41845""","""0""","""0""",0.0,"""0""","""66398209""","""170887957""","""170887957""","""170913138"""
"""mean""",2.604211,"""2015-11-12 09:32:47.660812""",,,,,,,,,,,,,9291200.0,,,2015.363832,,,,,,,0.631104,,"""2016-05-03 06:53:29.688093""",,,"""2014-07-29 18:53:10.259862"""
"""std""",3.791839,,,,,,,,,,,,,,387539.223857,,,5.347143,,,,,,,,,,,,
"""min""",0.01,"""2005-01-01 00:00:00""",,,,,,,,"""-1""",,,"""102 SUNG SHIN""","""#c""",196.0,,"""-1_CSHSE0009996""",2005.0,"""200501""",,,,,,0.0,,"""2005-01-01 00:00:00""",,,"""2005-01-01 00:00:00"""
"""25%""",2.0,"""2011-09-02 00:00:00""",,,,,,,,,,,,,9232759.0,,,2011.0,,,,,,,,,"""2012-06-17 00:00:00""",,,"""2009-10-02 00:00:00"""
"""50%""",2.198764,"""2016-04-05 00:00:00""",,,,,,,,,,,,,9320257.0,,,2016.0,,,,,,,,,"""2016-11-08 00:00:00""",,,"""2014-07-19 00:00:00"""
"""75%""",2.65,"""2020-08-06 00:00:00""",,,,,,,,,,,,,9450648.0,,,2020.0,,,,,,,,,"""2020-12-15 00:00:00""",,,"""2019-02-15 00:00:00"""
"""max""",3729.25,"""2024-03-31 00:00:00""",,,,,,,,"""ddedo""",,,"""xin yang shan""","""|SAL5""",9993688.0,,"""zzzz_ZZZZ""",2024.0,"""202403""",,,,,,1.0,,"""2024-03-31 00:00:00""",,,"""2024-03-31 00:00:00"""


In [7]:
#load sum format
sum_df = pl.read_excel('tables/summary_format.xlsx')

#force main lf to match new main format (NOTE drop this step after polishing oca_data_prep)
main_lf = (
    main_lf
    #unify date to single column
    .drop('date')
    .with_columns(
        pl.when(pl.col('direction')=='import')
        .then(pl.col('date_arrival'))
        .otherwise(pl.col('date_departure'))
        .alias('date')
    )
    #add alliance column
    .with_columns(pl.lit(None).alias('alliance'))
    #add vessel capacity col
    .with_columns(pl.lit(None).alias('vessel_cap'))
    #add primary carrier alliance col
    .with_columns(pl.lit(None).alias('pc_alliance'))
    #add rate col
    .with_columns(pl.lit(None).alias('rate'))
    #add within alliance region col
    .with_columns(pl.lit(None).alias('in_alliance_region'))
)

#create description table from main lf
desc_df = (
    main_lf
    #add cols to match sum_df format
    .with_columns(
        pl.lit(None).alias('Cargo Data'),
        pl.lit(None).alias('Geographic Data'),
        pl.lit(None).alias('Carrier Data'),
        pl.lit(None).alias('Vessel Data'),
    )
    #select appropriate columns
    .select(
        'Cargo Data', 'bol_id', 'teus', 'rate', 'date', 'hs_code', 'primary_cargo',
        'Geographic Data', 'departure_port_name', 'departure_port_code', 'origin_region', 
        'arrival_port_name', 'arrival_port_code', 'dest_region', 'coast_region', 'lane_id', 'in_alliance_region',
        'Carrier Data', 'unified_carrier_name', 'unified_carrier_scac', 'alliance',
        'Vessel Data', 'vessel_id', 'vessel_cap', 'vessel_owner', 'pc_alliance'
    )
    .describe()
)


In [8]:
desc_df

statistic,Cargo Data,bol_id,teus,rate,date,hs_code,primary_cargo,Geographic Data,departure_port_name,departure_port_code,origin_region,arrival_port_name,arrival_port_code,dest_region,coast_region,lane_id,in_alliance_region,Carrier Data,unified_carrier_name,unified_carrier_scac,alliance,Vessel Data,vessel_id,vessel_cap,vessel_owner,pc_alliance
str,f64,str,f64,f64,str,str,f64,f64,str,str,str,str,str,str,str,str,f64,f64,str,str,f64,f64,f64,f64,str,f64
"""count""",0.0,"""237240807""",237242716.0,0.0,"""237174085""","""237241270""",237242716.0,0.0,"""237242716""","""237242716""","""170344621""","""237242716""","""237242716""","""66354759""","""237135817""","""237242716""",0.0,0.0,"""237200871""","""237242716""",0.0,0.0,237242716.0,0.0,"""237242716""",0.0
"""null_count""",237242716.0,"""1909""",0.0,237242716.0,"""68631""","""1446""",0.0,237242716.0,"""0""","""0""","""66898095""","""0""","""0""","""170887957""","""106899""","""0""",237242716.0,237242716.0,"""41845""","""0""",237242716.0,237242716.0,0.0,237242716.0,"""0""",237242716.0
"""mean""",,,2.604211,,"""2015-11-05 07:43:30.318531""",,0.631104,,,,,,,,,,,,,,,,9291200.0,,,
"""std""",,,3.791839,,,,,,,,,,,,,,,,,,,,387539.223857,,,
"""min""",,"""-1_CSHSE0009996""",0.01,,"""2005-01-01 00:00:00""","""-1""",0.0,,,,,,,,,,,,,,,,196.0,,,
"""25%""",,,2.0,,"""2011-08-24 00:00:00""",,,,,,,,,,,,,,,,,,9232759.0,,,
"""50%""",,,2.198764,,"""2016-03-28 00:00:00""",,,,,,,,,,,,,,,,,,9320257.0,,,
"""75%""",,,2.65,,"""2020-07-29 00:00:00""",,,,,,,,,,,,,,,,,,9450648.0,,,
"""max""",,"""zzzz_ZZZZ""",3729.25,,"""2024-03-31 00:00:00""","""ddedo""",1.0,,,,,,,,,,,,,,,,9993688.0,,,


In [9]:

#transpose desc df
desc_df = (
    desc_df
    .transpose(include_header=True, column_names='statistic')
    #select desired cols in order
    .select('count', 'mean', 'std', 'min', 'max')
)

#add desc_df data to sum_df
sum_df = (
    sum_df
    .with_columns(
        Obs = desc_df.select('count').to_series(),
        Mean = desc_df.select('mean').to_series(),
        Std = desc_df.select('std').to_series(),
        Min = desc_df.select('min').to_series(),
        Max = desc_df.select('max').to_series()
    )
    #drop polars weirdness

)

In [10]:
sum_df

Variable,Type,Source,Obs,Mean,Std,Min,Max,Description
str,str,str,str,str,str,str,str,str
"""Cargo Data""",,,"""0.0""",,,,,
"""BOL ID""","""categorical""","""PIERS""","""237240807""",,,"""-1_CSHSE0009996""","""zzzz_ZZZZ""","""Alphanumeric code uniquely ide…"
"""Volume (TEU)""","""continuous""","""PIERS""","""237242716.0""","""2.6042111641170265""","""3.7918386507790065""","""0.01""","""3729.25""","""The Twenty-foot Equivalent Uni…"
"""Rate (USD)""","""continuous""","""Drewery""","""0.0""",,,,,"""Drewery monthly rate index for…"
"""Date""","""categorical""","""PIERS""","""237174085""","""2015-11-05 07:43:30.318531""",,"""2005-01-01 00:00:00""","""2024-03-31 00:00:00""","""Arrival (imports) or departure…"
…,…,…,…,…,…,…,…,…
"""Vessel Data""",,,"""0.0""",,,,,
"""Vessel ID""","""categorical""","""PIERS""","""237242716.0""","""9291202.401805988""","""387539.22385705466""","""196.0""","""9993688.0""","""IMO code uniquely identifying …"
"""Vessel Capacity""","""continuous""","""US Corp of Eng.""","""0.0""",,,,,"""Total TEUs able to be carried …"
"""Primary Carrier""","""categorical""","""PIERS""","""237242716""",,,,,"""The carrier representing the m…"


In [4]:
#save to csv
sum_df.write_csv('tables/table1_summary.csv')