In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import polars as pl
# pd.set_option('display.max_rows', 7)
# pd.set_option('display.max_columns', None)

def _read_monthly(path):
    """Read a parquet aggregation and add `date` / `year` columns.

    `date` is parsed from the `month` column using the '%Y%m' format and
    `year` is the calendar year of that date.
    """
    frame=pl.read_parquet(path).to_pandas()
    frame['date']=pd.to_datetime(frame['month'],format='%Y%m')
    frame['year']=frame['date'].dt.year
    return frame

# Lane x month x carrier aggregation.
carrier=_read_monthly('../data/usda_aggregations/lane_month_carrier.parquet')

# Lane x month aggregation.
lane=_read_monthly('../data/usda_aggregations/lane_month.parquet')

In [2]:
# Colour used for each alliance label in the stacked bar chart.
cmap=dict([
    ('Non-alliance Carriers','#636EFA'),
    ('The Alliance','#EF553B'),
    ('Ocean Alliance','#00CC96'),
    ('CYKHE','#ABC3FA'),
    ('New World All (NWA)','#FFA15A'),
    ('Grand Alliance IV','#19D3F3'),
    ('G6 Alliance','#FF6692'),
    ('MSC/CMA CGM','#B6E880'),
    ('2M Alliance','#FECB52'),
    ('Ocean Three','#FF97FF'),
    ('CYKH','#17BECF'),
])

# Stacking / legend order of the alliance categories.
catOrders=dict(
    carrier_alliance=[
        'The Alliance',
        'CYKHE',
        'Ocean Alliance',
        'CYKH',
        'Ocean Three',
        '2M Alliance',
        'G6 Alliance',
        'MSC/CMA CGM',
        'New World All (NWA)',
        'Grand Alliance IV',
        'Non-alliance Carriers',
    ]
)

# Total TEUs per year, carrier (scac) and alliance.
annual_teus=(
    carrier
    .groupby(['year','scac','carrier_alliance'],observed=True)['teus']
    .sum()
    .reset_index()
)

fig=px.bar(
    annual_teus,
    x='year',
    y='teus',
    color='carrier_alliance',
    text='scac',
    color_discrete_map=cmap,
    category_orders=catOrders,
    labels={'year':'Year','teus':'TEUs'},
    title='Annual Volumes by Carrier and Alliance',
)
fig.update_layout(
    legend=dict(title='',bgcolor='rgba(0,0,0,0)',orientation='v'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)
fig.write_image('alliance_bar.png')
fig.show()

# data=carrier.groupby(['year','scac','carrier_alliance'],observed=True)['teus'].sum().reset_index()
# data['total']=data.groupby('year')['teus'].transform('sum')
# data['prop_total']=data.teus/data.total
# px.bar(data,
#        x='year',y='prop_total',color='carrier_alliance',text='scac',
#        color_discrete_map=cmap,category_orders=catOrders,
#     labels={'year:'}).update_layout(
#         legend=dict(title='',bgcolor='rgba(0,0,0,0)',orientation='v'),
#         font_size=15,
#         title={'y':.99,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'},
#         margin=dict(l=20, r=20, t=30, b=20)).show()
# px.line(carrier[carrier.groupby(['carrier_alliance','date'],observed=True)['teus'].transform('sum')>22000].groupby(['carrier_alliance','date'],observed=True)['teus'].sum().reset_index().sort_values('date'),
#         x='date',y='teus',color='carrier_alliance',color_discrete_map=cmap,category_orders=catOrders).show()

In [3]:
def _monthly_hhi(frame,by):
    """Industry-wide HHI per month for the market participants in column `by`.

    TEUs are summed per (`date`, `by`); each participant's share of the
    month's total is expressed in percent, squared, and summed within the
    month, so `hhi` is on the 0-10,000 scale. The returned frame keeps one
    row per (`date`, `by`) with columns `teus`, `share`, `share_sq` and the
    month-level `hhi` repeated on every row of that month.
    """
    out=frame.groupby(['date',by],observed=True)['teus'].sum().reset_index()
    out['share']=out.teus/(out.groupby('date')['teus'].transform('sum'))
    out['share_sq']=(out.share*100)**2
    out['hhi']=out.groupby('date').share_sq.transform('sum')
    return out

data1=_monthly_hhi(carrier,'alliance_alt')   # participants = alliances
data2=_monthly_hhi(carrier,'scac')           # participants = individual carriers

# `hhi` is constant within a month, so plot one row per date for BOTH series.
# Previously only the alliance series was de-duplicated; the carrier trace
# carried one repeated point per carrier per month.
carrier_trace=px.line(
    data2.drop_duplicates('date'),
    x='date',y='hhi',
    color_discrete_sequence=px.colors.qualitative.Plotly[1:],
).update_traces(name='Carrier', showlegend = True).data[0]

fig=px.line(
    data1.drop_duplicates('date'),
    x='date',y='hhi',
    title='Market Concentration by Herfindahl-Hirschman Index (HHI, industry-level)',
    labels={'hhi':'Herfindahl-Hirschman Index (HHI)','date':'Date'},
).update_traces(name='Alliance', showlegend = True).add_trace(carrier_trace).update_layout(
    legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
    font_size=15,
    title={'y':.99,
    'x':0.5,
    'xanchor': 'center',
    'yanchor': 'top'},
    margin=dict(l=20, r=20, t=30, b=20))
fig.write_image('hhi_industry_line.png')
fig.show()

In [4]:
# px.bar(carrier[carrier.lane_name.isin(['Long Beach — Busan','Long Beach — Kaohsiung','Miami — Haina','Long Beach — Hong Kong'])].groupby(['year','scac','carrier_alliance'],observed=True)['teus'].sum().reset_index(),
#        x='year',y='teus',color='carrier_alliance',text='scac',
#        color_discrete_map=cmap,category_orders=catOrders,
#     labels={'year':'Year','teus':'TEUs'},title='Annual Volumes by Carrier and Alliance').update_layout(
#         legend=dict(title='',bgcolor='rgba(0,0,0,0)',orientation='v'),
#         font_size=15,
#         title={'y':.99,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'},
#         margin=dict(l=20, r=20, t=30, b=20)).show()

In [5]:
# Mean of every numeric lane column per month, in chronological order.
lane_monthly_mean=(
    lane
    .groupby('date')
    .mean(numeric_only=True)
    .reset_index()
    .sort_values('date')
)

# Carrier-level series, drawn in the second colour of the default palette.
carrier_trace=px.line(
    lane_monthly_mean,
    x='date',
    y='hhi_lane',
    color_discrete_sequence=px.colors.qualitative.Plotly[1:],
).update_traces(name='Carrier',showlegend=True).data[0]

# Alliance-level series is the base figure; the carrier trace is layered on top.
fig=px.line(
    lane_monthly_mean,
    x='date',
    y='hhi_alliance_lane',
    title='Market Concentration by Herfindahl-Hirschman Index (HHI, lane-level)',
    labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','date':'Date'},
)
fig.update_traces(name='Alliance',showlegend=True)
fig.add_trace(carrier_trace)
fig.update_layout(
    legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)
fig.write_image('hhi_lane_line.png')
fig.show()

# px.line(carrier.drop_duplicates(['date','lane_name']).groupby(['date'],observed=True)['hhi_alliance_lane'].mean().reset_index().sort_values('date'),
#         x='date',y='hhi_alliance_lane',
#         title='Market Concentration by Herfindahl-Hirschman Index (HHI, lane-level)',
#         labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','date':'Date'}).update_traces(name='Alliance', showlegend = True).add_trace(
#         px.line(carrier.drop_duplicates(['date','lane_name']).groupby(['date'],observed=True)['hhi_lane'].mean().reset_index().sort_values('date'),
#         x='date',y='hhi_lane',color_discrete_sequence=px.colors.qualitative.Plotly[1:]).update_traces(name='Carrier', showlegend = True).data[0]).update_layout(
#         legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
#         font_size=15,
#         title={'y':.99,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'},
#         margin=dict(l=20, r=20, t=30, b=20))

In [6]:
# Ten lanes with the largest total TEUs (not used by the figure below, which
# plots a hand-picked set of lanes).
teus_by_lane=lane.groupby('lane_name')['teus'].sum()
topLanes=teus_by_lane.sort_values(ascending=False).reset_index()['lane_name'][:10]

selected_lanes=[
    'Long Beach — Busan',
    'Long Beach — Kaohsiung',
    'Miami — Haina',
    'Long Beach — Hong Kong',
]
selected_rows=lane[lane.lane_name.isin(selected_lanes)]

# Alliance-level HHI per selected lane and month, in chronological order.
selected_hhi=(
    selected_rows
    .groupby(['lane_name','date'],observed=True)['hhi_alliance_lane']
    .mean()
    .reset_index()
    .sort_values('date')
)

fig=px.line(
    selected_hhi,
    x='date',
    y='hhi_alliance_lane',
    color='lane_name',
    title='Market Concentration by Herfindahl-Hirschman Index (HHI, lane-level)',
    labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','date':'Date'},
)
fig.update_layout(
    legend=dict(title='',x=1,bgcolor='rgba(0,0,0,0)',orientation='v'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)
fig.write_image('hhi_lanes_line.png')
fig.show()

# px.line(lane[lane.lane_name.isin(topLanes)].groupby('date').mean(numeric_only=True).reset_index().sort_values('date'),
#         x='date',y='hhi_alliance_lane',
#         title='Market Concentration by Herfindahl-Hirschman Index (HHI, lane-level)',
#         labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','date':'Date'}).update_traces(name='Alliance', showlegend = True).add_trace(
#         px.line(lane.groupby('date').mean(numeric_only=True).reset_index().sort_values('date'),
#         x='date',y='hhi_lane',color_discrete_sequence=px.colors.qualitative.Plotly[1:]).update_traces(name='Carrier', showlegend = True).data[0]).update_layout(
#         legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
#         font_size=15,
#         title={'y':.99,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'},
#         margin=dict(l=20, r=20, t=30, b=20))





In [7]:
# Box plot of alliance-level lane HHI for the 25 lanes with the most TEUs,
# with lanes ordered on the y axis by their median HHI.
topLanes=lane.groupby(['lane_name'],observed=True)['teus'].sum().sort_values(ascending=False).reset_index()['lane_name'][:25]

# Rows belonging to the top lanes; shared by the ordering and the plot so the
# two can never disagree about which lanes are included.
top_lane_rows=lane[lane.lane_name.isin(topLanes)]

# FIX: the ordering used to filter on `origin_port_name.isin(topLanes)`, i.e.
# it compared port names against lane names, so the median-based category
# order was not computed from the lanes actually plotted.
order=np.array(
    top_lane_rows
    .groupby('lane_name',observed=True)
    .median(numeric_only=True)
    .reset_index()
    .sort_values('hhi_alliance_lane',ascending=True)['lane_name']
)

fig=px.box(top_lane_rows.sort_values('hhi_alliance_lane'),
       x='hhi_alliance_lane',y='lane_name',height=600,
       title='Market Concentration by Herfindahl-Hirschman Index (HHI, lane-level)',
       labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','hhi_lane':'Herfindahl-Hirschman Index (HHI)','lane_name':'Lane','origin_port_name':'Origin Port','date':'Date'}).update_layout(
           # explicit category order: lanes sorted by median HHI
           yaxis={'categoryorder':'array', 'categoryarray':order},
           legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
        font_size=15,
        title={'y':.99,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
        margin=dict(l=20, r=20, t=30, b=20)
       )
fig.write_image('alliance_lane_bar.png')
fig.show()

In [8]:
# Twenty origin ports with the largest total TEUs.
teus_by_origin=lane.groupby(['origin_port_name'],observed=True)['teus'].sum()
topPorts=teus_by_origin.sort_values(ascending=False).reset_index()['origin_port_name'][:20]

top_origin_rows=lane[lane.origin_port_name.isin(topPorts)]

# Origin ports sorted by their median alliance-level port HHI (ascending).
origin_medians=(
    top_origin_rows
    .groupby('origin_port_name',observed=True)
    .median(numeric_only=True)
    .reset_index()
    .sort_values('hhi_alliance_origin_port',ascending=True)
)
order=np.array(origin_medians['origin_port_name'])

fig=px.box(
    top_origin_rows.sort_values('hhi_alliance_origin_port'),
    x='hhi_alliance_origin_port',
    y='origin_port_name',
    height=600,
    title='Market Concentration by Herfindahl-Hirschman Index (HHI, port-level)',
    labels={'hhi_alliance_origin_port':'Herfindahl-Hirschman Index (HHI)','hhi_lane':'Herfindahl-Hirschman Index (HHI)','lane_name':'Lane','origin_port_name':'Origin Port','date':'Date'},
)
fig.update_layout(
    yaxis=dict(categoryorder='array',categoryarray=order),
    legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)
fig.write_image('alliance_oport_bar.png')
fig.show()

# order=np.array(lane[lane.origin_port_name.isin(topPorts)].groupby('origin_port_name',observed=True).median(numeric_only=True).reset_index().sort_values('hhi_lane',ascending=True)['origin_port_name'])
# px.box(lane[lane.origin_port_name.isin(topPorts)].sort_values('hhi_lane'),
#        x='hhi_lane',y='origin_port_name',height=600,
#        labels={'hhi_alliance_lane':'Herfindahl-Hirschman Index (HHI)','hhi_lane':'Herfindahl-Hirschman Index (HHI)','lane_name':'Lane','origin_port_name':'Origin Port','date':'Date'}).update_layout(
#            yaxis={'categoryorder':'array', 'categoryarray':order},
#            legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
#         font_size=15,
#         title={'y':.99,
#         'x':0.5,
#         'xanchor': 'center',
#         'yanchor': 'top'},
#         margin=dict(l=20, r=20, t=30, b=20)
#        ).show()

In [9]:
# Twenty destination ports with the largest total TEUs.
teus_by_dest=lane.groupby(['dest_port_name'],observed=True)['teus'].sum()
topPorts=teus_by_dest.sort_values(ascending=False).reset_index()['dest_port_name'][:20]

top_dest_rows=lane[lane.dest_port_name.isin(topPorts)]

# Destination ports sorted by their median alliance-level port HHI (ascending).
dest_medians=(
    top_dest_rows
    .groupby('dest_port_name',observed=True)
    .median(numeric_only=True)
    .reset_index()
    .sort_values('hhi_alliance_dest_port',ascending=True)
)
order=np.array(dest_medians['dest_port_name'])

fig=px.box(
    top_dest_rows.sort_values('hhi_alliance_dest_port'),
    x='hhi_alliance_dest_port',
    y='dest_port_name',
    height=600,
    title='Market Concentration by Herfindahl-Hirschman Index (HHI, port-level)',
    labels={'hhi_alliance_dest_port':'Herfindahl-Hirschman Index (HHI)','hhi_lane':'Herfindahl-Hirschman Index (HHI)','lane_name':'Lane','dest_port_name':'Destination Port','date':'Date'},
)
fig.update_layout(
    yaxis=dict(categoryorder='array',categoryarray=order),
    legend=dict(title='',x=.70,bgcolor='rgba(0,0,0,0)',orientation='h'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)
fig.write_image('alliance_dport_bar.png')
fig.show()

In [10]:
# Freight rates: 40ft spot rate per Drewry lane, plus the cross-lane mean.
teus_by_lane=lane.groupby('lane_name')['teus'].sum()
topLanes=teus_by_lane.sort_values(ascending=False).reset_index()['lane_name'][:5]

# One row per (drewery_lane, date) among the five largest lanes.
rate_rows=lane[lane.lane_name.isin(topLanes)].drop_duplicates(['drewery_lane','date'])

per_lane_rates=rate_rows.sort_values('date').dropna(subset='rate_40')
mean_rates=(
    rate_rows
    .groupby('date',observed=True)
    .mean(numeric_only=True)
    .reset_index()
    .sort_values('date')
    .dropna(subset='rate_40')
)

# The trailing spaces in the trace name are part of the original legend text.
mean_trace=px.line(
    mean_rates,
    x='date',
    y='rate_40',
    color_discrete_sequence=px.colors.qualitative.Plotly[5:],
).update_traces(name='Total                                                       ',showlegend=True).data[0]

fig=px.line(
    per_lane_rates,
    x='date',
    y='rate_40',
    color='drewery_lane',
    title='Ocean Container Spot Rates: US Exports',
    height=500,
    labels={'date':'Date','rate_40':'Rate'},
)
fig.add_trace(mean_trace)
fig.update_layout(
    legend=dict(title='',bgcolor='rgba(0,0,0,0)',x=.15,y=-0.15,orientation='h'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=30),
)
fig.write_image('rates_line.png')
fig.show()





In [11]:
# px.scatter(lane[lane.groupby('drewery_lane',observed=True).dist.rank(method='min')==1].dropna(subset='rate_40'),
#            x='rate_40',y='hhi_alliance_lane',trendline='ols',trendline_color_override='red')

In [12]:
import patsy
import statsmodels.api as sm
import warnings
from pandas.errors import SettingWithCopyWarning
warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

# Rate regressions, restricted to the Drewry-matched sample.
#
# Sample: for each `drewery_lane`, keep the rows whose `dist` ranks first
# (ties share rank 1 under method='min') and that have a non-missing `rate_40`.
# An unrestricted all-lanes variant used to sit here as commented-out code
# together with a `reg=lane.copy()` that was overwritten before use; only the
# restricted sample is estimated, so that dead setup has been dropped.
# `.copy()` makes `reg` an independent frame before a column is overwritten.
reg=lane[lane.groupby('drewery_lane',observed=True).dist.rank(method='min')==1].dropna(subset='rate_40').copy()
reg['lane_id']=reg.lane_id.astype(str)  # string-typed, presumably so patsy expands it as a categorical term

stats=['N','df_resid','R2','f_pvalue','llf']
results=pd.DataFrame(data={'var':['hhi_lane', 'hhi_directsub_lanes',
       'hhi_potentialsub_lanes', 'hhi_origin_port', 'hhi_dest_port',
       'hhi_directsub_origin_ports', 'hhi_potentialsub_origin_ports',
       'hhi_directsub_dest_ports', 'hhi_potentialsub_dest_ports',
       'hhi_alliance_lane', 'hhi_alliance_directsub_lanes',
       'hhi_alliance_potentialsub_lanes', 'hhi_alliance_origin_port',
       'hhi_alliance_dest_port', 'hhi_alliance_directsub_origin_ports',
       'hhi_alliance_potentialsub_origin_ports',
       'hhi_alliance_directsub_dest_ports',
       'hhi_alliance_potentialsub_dest_ports',]+stats})

var='rate_40'
hhi_terms='hhi_alliance_lane+hhi_alliance_directsub_lanes+hhi_alliance_origin_port+hhi_alliance_dest_port+hhi_alliance_directsub_origin_ports+hhi_alliance_directsub_dest_ports'

# (formula, merge `how`, merge suffixes) for the two columns of the table:
#   1. with the `teus` control  -> defines the rows of the table (right merge)
#   2. without the `teus` control -> appended as a second column 'rate_40 '
specs=[
    (var+' ~ 1+teus+lane_id+month+'+hhi_terms,'right',['_all','_nearest']),
    (var+' ~ 1+lane_id+month+'+hhi_terms,'left',['',' ']),
]

for f,how,suffixes in specs:
    try:
        y,X = patsy.dmatrices(f,reg,return_type='dataframe')
        model = sm.OLS(y,X,missing='drop').fit()

        # Coefficients with the standard errors and p-values of THIS fit
        # (non-robust), so the three columns are mutually consistent.
        params=pd.DataFrame({'coef':model.params,'se':model.bse,'pvalue':model.pvalues}).reset_index().rename(columns={'index':'var'})

        # Report only the concentration terms and the volume control.
        res=params[(params['var'].str.contains('hhi'))|(params['var']=='teus')].copy()

        # Significance stars use the unrounded p-values: rounding to three
        # decimals first turned e.g. p=0.0096 into 0.01 and cost it a star.
        res[var]=[
            "{:,.3g}".format(coef)+('***' if p<.01 else '**' if p<.05 else '*' if p<.1 else '')
            for coef,p in zip(res['coef'],res['pvalue'])
        ]
        res=res.set_index('var')
        res.loc['N',var]="{:,.0f}".format(model.nobs)
        res.loc['df_resid',var]="{:,.0f}".format(model.df_resid)
        res.loc['R2',var]="{:,.3g}".format(model.rsquared)
        #res.loc['f_pvalue',var]=model.f_pvalue 
        #res.loc['llf',var]=model.llf        
        results=results.merge(res.reset_index()[['var',var]],on='var',how=how,suffixes=suffixes)

    except Exception as e:
        # Best effort: report the failure and continue with the next
        # specification; the exported table will then lack that column.
        print(var)
        print(e)
        print()

#results.to_csv('lane_results_rates.csv',index=False)
# Human-readable column headers and row labels for the exported table.
column_labels={'var':'','rate_40':'Rate','rate_40 ':'Rate'}
row_labels={'teus':'TEUs','hhi_alliance_lane':'HHI on lane ij','hhi_alliance_origin_port':'HHI at origin port i','hhi_alliance_dest_port':'HHI at dest port j','hhi_alliance_directsub_lanes':'HHI on substitute lanes -ij','hhi_alliance_directsub_origin_ports':'HHI on substitute origin ports -i','hhi_alliance_directsub_dest_ports':'HHI on substitute dest ports -j','df_resid':'DF Resid'}
results.fillna('-').rename(columns=column_labels).replace(row_labels).to_csv('rate_results.csv',index=False)

In [13]:
# Lane-level outcome regressions on the full lane panel.
# Identifier columns are cast to str on a private copy of `lane`.
reg=lane.copy()
for id_col in ('lane_id','origin_port_name','dest_port_name'):
    reg[id_col]=reg[id_col].astype(str)

stats=['N','df_resid','R2','f_pvalue','llf']
results=pd.DataFrame(data={'var':['hhi_lane', 'hhi_directsub_lanes',
       'hhi_potentialsub_lanes', 'hhi_origin_port', 'hhi_dest_port',
       'hhi_directsub_origin_ports', 'hhi_potentialsub_origin_ports',
       'hhi_directsub_dest_ports', 'hhi_potentialsub_dest_ports',
       'hhi_alliance_lane', 'hhi_alliance_directsub_lanes',
       'hhi_alliance_potentialsub_lanes', 'hhi_alliance_origin_port',
       'hhi_alliance_dest_port', 'hhi_alliance_directsub_origin_ports',
       'hhi_alliance_potentialsub_origin_ports',
       'hhi_alliance_directsub_dest_ports',
       'hhi_alliance_potentialsub_dest_ports',]+stats})

# Right-hand side shared by every outcome (direct-substitute HHI terms).
# Alternative specification kept for reference: replace the directsub terms with
# hhi_alliance_potentialsub_lanes / _origin_ports / _dest_ports.
rhs='1+teus+month+lane_id+hhi_alliance_lane+hhi_alliance_origin_port+hhi_alliance_dest_port+hhi_alliance_directsub_lanes+hhi_alliance_directsub_origin_ports+hhi_alliance_directsub_dest_ports'

for var in ['num_vessels','lane_capacity','num_foreign_ports','num_foreign_countries', 
            'num_foreign_regions']:
    try:
        f = var+' ~ '+rhs
        y,X = patsy.dmatrices(f,reg,return_type='dataframe')
        model = sm.OLS(y,X,missing='drop').fit(cov_type='HC1')

        # `bse` follows the covariance type of the fit (HC1), so `se` now
        # matches the p-values; previously HC0 errors were stored beside
        # HC1 p-values.
        params=pd.DataFrame({'coef':model.params,'se':model.bse,'pvalue':model.pvalues}).reset_index().rename(columns={'index':'var'})

        # Report only the concentration terms and the volume control; copy
        # so the star column is not assigned onto a slice of `params`.
        res=params[(params['var'].str.contains('hhi'))|(params['var']=='teus')].copy()
        res[var]=[
            "{:,.3g}".format(coef)+('***' if p<.01 else '**' if p<.05 else '*' if p<.1 else '')
            for coef,p in zip(res['coef'],res['pvalue'])
        ]
        res=res.set_index('var')
        res.loc['N',var]="{:,.0f}".format(model.nobs)
        res.loc['df_resid',var]="{:,.0f}".format(model.df_resid)
        res.loc['R2',var]="{:,.3g}".format(model.rsquared)
        #res.loc['f_pvalue',var]=model.f_pvalue 
        #res.loc['llf',var]=model.llf
        results=results.merge(res.reset_index()[['var',var]],on='var',how='right')

    except Exception as e:
        # Best effort: report the failing outcome and move on to the next;
        # the exported table will then lack that column.
        print(var)
        print(e)
        print()

#results.to_csv('lane_results.csv',index=False)
# Human-readable column headers and row labels for the exported table.
column_labels={'var':'','num_vessels':'# Vessels','lane_capacity':'Lane Capacity','num_foreign_ports':'# Foreign Ports','num_foreign_countries':'# Foreign Countries','num_foreign_regions':'# Foreign Regions'}
row_labels={'teus':'TEUs','hhi_alliance_lane':'HHI on lane ij','hhi_alliance_origin_port':'HHI at origin port i','hhi_alliance_dest_port':'HHI at dest port j','hhi_alliance_directsub_lanes':'HHI on substitute lanes -ij','hhi_alliance_directsub_origin_ports':'HHI on substitute origin ports -i','hhi_alliance_directsub_dest_ports':'HHI on substitute dest ports -j','df_resid':'DF Resid'}
results.rename(columns=column_labels).replace(row_labels).to_csv('lane_results.csv',index=False)

KeyboardInterrupt: 

In [14]:
# Summary statistics of whichever `reg` frame is currently in the kernel.
lane_summary=reg.describe()
lane_summary.to_csv('descriptive_lanes.csv')

In [15]:
#add port coordinates
#drewery_ports = pd.read_csv('../Data/drewery_port_geolocations.csv')
# piers_ports = pd.read_csv('../Data/piers_port_geolocations.csv').drop('Unnamed: 0',axis=1)
# piers_ports.piers_ports=piers_ports.piers_ports.str.replace('US Port of ','')
# piers_ports.piers_ports=piers_ports.piers_ports.str.replace('Port of ','')
# lane[['drewery_origin','drewery_dest']]=lane.drewery_lane.str.split(' to ',expand=True)[[0,1]]
#lane.merge(drewery_ports,left_on='drewery_origin',right_on='drewery_port')
# lane=lane.merge(piers_ports.rename(columns={'piers_ports':'origin_port_name','piers_port_loc':'origin_loc'}),on='origin_port_name',how='left').merge(piers_ports.rename(columns={'piers_ports':'dest_port_name','piers_port_loc':'dest_loc'}),on='dest_port_name',how='left')
# lane['olat']=lane.origin_loc.str.split().str[0].str.strip('(').str.strip(',').astype(float)
# lane['olng']=lane.origin_loc.str.split().str[1].str.strip(')').str.strip(',').astype(float)
# carrier=carrier.merge(piers_ports.rename(columns={'piers_ports':'origin_port_name','piers_port_loc':'origin_loc'}),on='origin_port_name',how='left').merge(piers_ports.rename(columns={'piers_ports':'dest_port_name','piers_port_loc':'dest_loc'}),on='dest_port_name',how='left')
# carrier['olat']=carrier.origin_loc.str.split().str[0].str.strip('(').str.strip(',').astype(float)
# carrier['olng']=carrier.origin_loc.str.split().str[1].str.strip(')').str.strip(',').astype(float)
#some bad geolocations...
# piers_ports[piers_ports.piers_ports.str.contains('BOSTON')]
# px.scatter_mapbox(lane.groupby(['origin_port_name','olat','olng','month'],observed=True).sum(numeric_only=True).groupby(['origin_port_name','olat','olng'],observed=True).mean(numeric_only=True).reset_index().dropna(subset=['hhi_lane','teus','olat','olng']),
#                   lat='olat',lon='olng',color='hhi_lane',size='teus', #
#                   zoom=3, #center = {"lat": 47.251076, "lon": -120.740135},
#                   width=800,height=550, mapbox_style='carto-positron',).update_layout(
#                       margin=dict(l=0, r=0, t=0, b=0))

In [16]:
# NOTE(review): `stop` is not defined in any visible cell, so this cell raises
# NameError (see the recorded output). Presumably a deliberate barrier that
# halts "Run All" before the vessel-level cells below - confirm.
stop

NameError: name 'stop' is not defined

In [18]:
# Lane x month x carrier x vessel aggregation, with `date` / `year` derived
# from the `month` column ('%Y%m').
vessel=pl.read_parquet('../data/usda_aggregations/lane_month_carrier_vessel.parquet').to_pandas()
vessel['date']=pd.to_datetime(vessel['month'],format='%Y%m')
vessel['year']=vessel['date'].dt.year

# Alliance of each vessel: take `carrier_alliance` from the rows where the
# carrier (scac) equals the vessel owner, and attach it to every row of the
# same lane / vessel / month as `vessel_alliance` (left join).
vessel_keys=['lane_id','vessel_id','date']
is_owner_row=vessel.scac.astype(str)==vessel.vessel_owner.astype(str)
owner_alliance=(
    vessel.loc[is_owner_row][vessel_keys+['carrier_alliance']]
    .rename(columns={'carrier_alliance':'vessel_alliance'})
)
vessel=vessel.merge(owner_alliance,on=vessel_keys,how='left')

In [19]:
# A vessel is alliance operated if, on that vessel within that month, there is
# more than one carrier from the same alliance and the alliance represents a
# volume share greater than .9.
# Encode the flag as 0/1 and make it consistent for all carrier observations on
# the same vessel in the same month (group maximum).
flag_01=vessel.alliance_operated.replace({True:1,False:0})
vessel['alliance_operated']=flag_01
flag_by_vessel_month=vessel.groupby(['vessel_id','date'])['alliance_operated'].transform('max')

# Temporarily swap the flag for display labels while plotting.
vessel['alliance_operated']=flag_by_vessel_month.replace({0:'Non-Alliance Operated',1:'Alliance Operated'})

# Layout shared by the two figures in this cell.
shared_layout=dict(
    legend=dict(title='',x=.25,bgcolor='rgba(0,0,0,0)',orientation='h'),
    font_size=15,
    title=dict(y=.99,x=0.5,xanchor='center',yanchor='top'),
    margin=dict(l=20,r=20,t=30,b=20),
)

# Figure 1: number of distinct vessel-months per date and operating mode.
vessel_counts=(
    vessel
    .drop_duplicates(['vessel_id','date'])
    .groupby(['date','alliance_operated'])
    .count()
    .reset_index()
)
fig=px.line(
    vessel_counts,
    x='date',
    y='vessel_id',
    labels={'alliance_operated':'','vessel_id':' # of vessels','date':''},
    color='alliance_operated',
    title='Count of Alliance Operated and Non-Alliance Operated Vessels',
)
fig.update_layout(**shared_layout)
fig.write_image('alliance_operated_vessels_line.png')
fig.show()

# Figure 2: TEUs per date and operating mode.
volume_by_mode=vessel.groupby(['date','alliance_operated'])['teus'].sum().reset_index()
fig=px.line(
    volume_by_mode,
    y='teus',
    x='date',
    color='alliance_operated',
    labels={'alliance_operated':'','teus':'TEUs/month','date':''},
    title='Volumes on Alliance Operated and Non-Alliance Operated Vessels',
)
fig.update_layout(**shared_layout)
fig.write_image('alliance_operated_volumes_line.png')
fig.show()

# Restore the flag to booleans for the cells that follow.
vessel['alliance_operated']=vessel['alliance_operated'].replace({'Non-Alliance Operated':False,'Alliance Operated':True})

#px.bar(carrier.groupby(['date','scac'])['teus'].sum().reset_index(),x='date',y='teus',color='scac').show()

In [20]:
import patsy
import statsmodels.api as sm
#vessel-alliance level
stats=['N','df_resid','R2']
results=pd.DataFrame(data={'var':['teus', 'num_foreign_ports','num_foreign_countries', 
            'num_foreign_regions', 'cap_from_vessel','turns','vessel_capacity']+stats})

# Collapse to one row per combination of the grouping keys below (vessel, lane,
# month, alliance flag, vessel alliance and the vessel-level outcomes), summing
# TEUs across the carrier rows that share those keys.
reg=vessel.groupby(['vessel_id','lane_id','month','date','year','alliance_operated','vessel_alliance','num_foreign_ports','num_foreign_countries', 
            'num_foreign_regions','cap_from_vessel','turns','vessel_capacity'],observed=True)['teus'].sum().reset_index()

# String-typed identifiers, presumably so patsy expands them as categorical terms.
for id_col in ('lane_id','vessel_id','vessel_alliance'):
    reg[id_col]=reg[id_col].astype(str)

reg['alliance_operated']=reg.alliance_operated.replace({True:1,False:0})
# A vessel is alliance operated if, on that vessel within that month, there is
# more than one carrier from the same alliance and the alliance represents a
# volume share greater than .9.
# Make this consistent for all observations of the same vessel in the same
# month (may be unnecessary).
reg['alliance_operated']=reg.groupby(['vessel_id','date'],observed=True)['alliance_operated'].transform('max')

# Restrict to vessels whose alliance label is not 'Non-alliance...'.
# `.copy()` makes the subset an independent frame before columns are added
# to it (previously they were assigned onto a filtered slice).
reg=reg[~reg.vessel_alliance.str.contains('Non-alliance')].copy()
# 1 if any vessel of this alliance on this lane in this month is alliance operated (not grouped by vessel)
reg['active_lt']=reg.groupby(['date','lane_id','vessel_alliance'],observed=True)['alliance_operated'].transform('max')
# 1 if this particular vessel is alliance operated
reg['active_vlt']=reg['alliance_operated']

# the control group is the set of carriers during active alliance on non-active lanes,... pre-alliance formation, and post-alliance dissolution
# or the set of carriers on active lane during non-active alliance 
# the treatment group is the set of carriers during active alliance on active lanes (very limited variation)
for var in ['teus', 'num_foreign_ports','num_foreign_countries', 
            'num_foreign_regions','turns','vessel_capacity']:
    # Alternative specification kept for reference:
    #f = var+' ~ 1+vessel_id+vessel_alliance+lane_id:month+active_lt+active_vlt'
    f = var+' ~ 1+vessel_id+vessel_alliance+month+active_lt+active_vlt'
    y,X = patsy.dmatrices(f,reg,return_type='dataframe')
    model = sm.OLS(y,X,missing='drop').fit(cov_type='HC1')

    # `bse` follows the covariance type of the fit (HC1), so `se` matches the
    # p-values; previously HC0 errors were stored beside HC1 p-values.
    params=pd.DataFrame({'coef':model.params,'se':model.bse,'pvalue':model.pvalues}).reset_index().rename(columns={'index':'var'})

    # Report only the two treatment indicators; copy so the star column is
    # not assigned onto a slice of `params`.
    res=params[(params['var'].isin(['active_lt','active_vlt']))].copy()
    res[var]=[
        "{:,.3g}".format(coef)+('***' if p<.01 else '**' if p<.05 else '*' if p<.1 else '')
        for coef,p in zip(res['coef'],res['pvalue'])
    ]
    res=res.set_index('var')
    res.loc['N',var]="{:,.0f}".format(model.nobs)
    res.loc['df_resid',var]="{:,.0f}".format(model.df_resid)
    res.loc['R2',var]="{:,.3g}".format(model.rsquared)
    #res.loc['f_pvalue',var]=model.f_pvalue 
    #res.loc['llf',var]=model.llf
    results=results.merge(res.reset_index()[['var',var]],on='var',how='right')
    print(var)  # progress marker: one line per finished outcome

#results.to_csv('vessel_results.csv',index=False)
reg.describe().to_csv('descriptive_vessels.csv')
# Human-readable column headers and row labels for the exported table.
column_labels={'var':'','teus':'TEUs','vessel_capacity':'Vessel Capacity','turns':'Turns','num_foreign_ports':'# Foreign Ports','num_foreign_countries':'# Foreign Countries','num_foreign_regions':'# Foreign Regions'}
row_labels={'active_lt':'vessel of alliance a is alliance operated on lane ij in time t','active_vlt':'vessel v on lane ij is alliance operated in time t','df_resid':'DF Resid'}
results.rename(columns=column_labels).replace(row_labels).to_csv('vessel_results.csv',index=False)
results
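#Within the same month, on the same lane, a given set of alliance carriers has some vessels that are alliance operated and some that aren't.
#E.g. Maersk and Med operate on Seattle to Tokyo: some vessels together as an alliance, and some vessels separately.
#How are the vessels/services they operate together different from the vessels/services they operate separately?
#alliance operated vessels ship more teus, on smaller vessels, service more ports, service more countries, service more regions
#NOTE(review): in the results table displayed below this cell, the num_foreign_regions and turns coefficients are negative and the active_vlt coefficient on vessel_capacity is positive - confirm the "smaller vessels" and "more regions" statements against the current output.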

teus
num_foreign_ports
num_foreign_countries
num_foreign_regions
turns
vessel_capacity


Unnamed: 0,var,teus,num_foreign_ports,num_foreign_countries,num_foreign_regions,turns,vessel_capacity
0,active_lt,6.59***,0.131***,0.00516,-0.0266***,-0.0127***,-0.26
1,active_vlt,6.98***,0.404***,0.25***,-0.0589***,-0.0146***,1.86***
2,N,1038069,1038069,1038069,1038069,1038069,1038069
3,df_resid,1034530,1034530,1034530,1034530,1034530,1034530
4,R2,0.104,0.357,0.311,0.306,0.217,0.992
