In [10]:
import pandas as pd
import chardet
from plotly.offline import init_notebook_mode
import numpy as np
from numpy import mean
from numpy import std
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import json
import itertools
import geopandas as gpd
import geoplot
import geoplot.crs as gcrs
import math
import scipy
from scipy.signal import find_peaks
from datetime import datetime
from scipy.stats import spearmanr

init_notebook_mode(connected=True)
pio.renderers.default = "plotly_mimetype+notebook"

In [11]:
file_path = r"/Users/iantrout/TIL6022-group_project/Data/Maritime data/US_PortCalls_S_ST202209220924_v1.csv"
with open(file_path, 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
result

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [12]:
file_path1 = '/Users/iantrout/TIL6022-group_project/JOHN_FILES/covid_data.csv'

df = pd.read_csv(file_path1)
df = df.rename({
    'Date_reported': 'date',
    'Country': 'country',
    'New_cases': 'new_cases',
    'Cumulative_cases': 'cumulative_cases'
}, axis=1) 
df = df.drop(labels=[
    'New_deaths', 
    'Cumulative_deaths', 
    'Country_code', 
    'WHO_region'
], axis=1)

df_port = pd.read_csv(file_path,encoding='utf-8')
df_port = df_port.drop(columns=['Period Label','Year','Frequency', 'Frequency Label', 'Economy', 
                                      'CommercialMarket', 'Median time in port (days) Footnote',
                                      'Average age of vessels Footnote', 'Average size (GT) of vessels Footnote',
                                      'Maximum size (GT) of vessels Footnote', 'Average cargo carrying capacity (dwt) per vessel Footnote',
                                      'Maximum cargo carrying capacity (dwt) of vessels Footnote','Average container carrying capacity (TEU) per container ship Footnote',
                                      'Maximum container carrying capacity (TEU) of container ships Footnote'])
df_port.rename(columns = {'Economy Label': 'country', 'CommercialMarket Label': 'Vessel_Type', }, inplace=True)

for i in range(len(df)):
    k=df.iloc[i,0].split('-')
    df.iloc[i,0]=datetime(int(k[0]),int(k[1]),int(k[2]))

df_new = (df.groupby(['country', pd.Grouper(key='date', freq='6M')])
        .max()
        .reset_index())


In [13]:
date_change=[]
for row in df_port['Period']:
    if row == '2018S01' :   date_change.append(datetime(2018,7,31))
    elif row == '2018S02':   date_change.append(datetime(2019,1,31))
    elif row == '2019S01':  date_change.append(datetime(2019,7,31))
    elif row == '2019S02':  date_change.append(datetime(2020,1,31))
    elif row == '2020S01':  date_change.append(datetime(2020,7,31))
    elif row == '2020S02':  date_change.append(datetime(2021,1,31))
    elif row == '2021S01':  date_change.append(datetime(2021,7,31))
    elif row == '2021S02':  date_change.append(datetime(2022,1,31))
    elif row == '2022S01':  date_change.append(datetime(2022,7,31))
    
    else:           date_change.append('Not_Rated')


#df_port = df_port.drop(columns=['Period']) Ian needs this otherwise it's a pain to show the choloropath later. 
df_port['date'] = date_change

In [14]:
df_combined=pd.merge(df_new,df_port,on=['country','date'], how='right') 
#I changed this to be right so that we have pre-covid data
df_combined.head()

Unnamed: 0,country,date,new_cases,cumulative_cases,Period,Vessel_Type,Median time in port (days),Average age of vessels,Average size (GT) of vessels,Maximum size (GT) of vessels,Average cargo carrying capacity (dwt) per vessel,Maximum cargo carrying capacity (dwt) of vessels,Average container carrying capacity (TEU) per container ship,Maximum container carrying capacity (TEU) of container ships
0,World,2018-07-31,,,2018S01,All ships,0.97,18,15222,234006,24074.0,441561.0,3526.0,21413.0
1,World,2018-07-31,,,2018S01,Passenger ships,,21,8978,228081,,,,
2,World,2018-07-31,,,2018S01,Liquid bulk carriers,0.94,13,15470,234006,26871.0,441561.0,,
3,World,2018-07-31,,,2018S01,Container ships,0.69,13,38405,217673,,,3526.0,21413.0
4,World,2018-07-31,,,2018S01,Dry breakbulk carriers,1.12,19,5455,91784,7413.0,138743.0,,


In [15]:
geodata = gpd.read_file("/Users/iantrout/TIL6022-group_project/Data/countries.geojson") # geojson file
geodata.rename(columns = {'Location': 'country', }, inplace=True)

In [16]:
# Merge the two dataframes, using _ID column as key
geo_port = pd.merge(geodata, df_combined, on = 'country')

geo_port.head()

Unnamed: 0,country,ISO_A3,geometry,date,new_cases,cumulative_cases,Period,Vessel_Type,Median time in port (days),Average age of vessels,Average size (GT) of vessels,Maximum size (GT) of vessels,Average cargo carrying capacity (dwt) per vessel,Maximum cargo carrying capacity (dwt) of vessels,Average container carrying capacity (TEU) per container ship,Maximum container carrying capacity (TEU) of container ships
0,Australia,AUS,"MULTIPOLYGON (((158.86573 -54.74993, 158.83823...",2018-07-31,,,2018S01,All ships,1.49,19,25686,168666,78572.0,299688.0,4263.0,8084.0
1,Australia,AUS,"MULTIPOLYGON (((158.86573 -54.74993, 158.83823...",2018-07-31,,,2018S01,Passenger ships,,27,5105,168666,,,,
2,Australia,AUS,"MULTIPOLYGON (((158.86573 -54.74993, 158.83823...",2018-07-31,,,2018S01,Liquid bulk carriers,1.34,7,23585,85496,40187.0,166447.0,,
3,Australia,AUS,"MULTIPOLYGON (((158.86573 -54.74993, 158.83823...",2018-07-31,,,2018S01,Container ships,1.19,12,46778,90449,,,4263.0,8084.0
4,Australia,AUS,"MULTIPOLYGON (((158.86573 -54.74993, 158.83823...",2018-07-31,,,2018S01,Dry breakbulk carriers,1.69,12,15417,54529,21345.0,80500.0,,


now we will merge the other port performance index data with the table above

In [17]:
port_2021_path = r"/Users/iantrout/TIL6022-group_project/Data/The productivity of the ports/Container-Port-Performance-Index-2021 copy.csv"
data_call_path = r"/Users/iantrout/TIL6022-group_project/Data/Maritime data/US_PortCallsArrivals_S_ST202209220927_v1.csv"


#port_index.rename(columns = {'Economy Label': 'country', }, inplace=True)

data_call = pd.read_csv(data_call_path)
port_time = pd.read_csv(port_2021_path,encoding='utf-8')
# using merge function by setting how='outer'to not lose years where there is no common data

#output = pd.merge(port_time, data_call, 
#                   left_on=['Economy Label', "Year"],  
#                   right_on=['Economy Label', "Year"],
#                   how='left')

#column_names = ['Port Name', 'CommercialMarket Label']
#output.drop_duplicates(subset=column_names, keep='first', inplace=True)
#output = output.drop(output[output["CommercialMarket Label"] != "Container ships"].index)
#output = output.drop(output[output["Economy Label"] == "World"].index)
#output.sort_values("Statistic Approach Rank")
#output.sort_values("Year")

#output.head()
#using a for if-else and a for loop, I'm going to

#filtered by year 2020 and 2021 

In [18]:
# the higher the index value, the better it is; CAUTION 
#though since the index values go into the negative side too for worst.
port_time = port_time.drop(columns=['Total\nPoints', "Administrative Approach Rank", 'Economy Label', 'Port Name'])
port_time.set_index('Economy Label.1', inplace=True)
port_time
for country in 
port_time.groupby('Economy Label.1').mean()

SyntaxError: invalid syntax (338959456.py, line 6)

# Part II

we start by understanding how many countries we have data for and for that we will plot a world map for All ship types combined

In [99]:
#df['text'] = geo_port['Location'] + '<br>' + \
   # 'Passenger ships ' + geo_port['Passenger ships'] + ' Dairy ' + geo_port['dairy'] + '<br>' + \
   # 'Fruits ' + geo_port['total fruits'] + ' Veggies ' + geo_port['total veggies'] + '<br>' + \
   # 'Wheat ' + geo_port['wheat'] + ' Corn ' + geo_port['corn'] 
geo_port_all_vessels= geo_port[
    (geo_port.Vessel_Type == 'All ships')
]

fig = px.choropleth(geo_port_all_vessels, 
                    locations="ISO_A3",
                    color="Median time in port (days)", 
                    hover_name="country",
                    range_color=(0, 2),
                    animation_frame="Period",
                    #text=df['text'], # hover text
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()


now we can't only infer information from graphs, so we will calculate the peaks and valleys of the COVID data and the port data to see if there is a match based on serveral values of the port data (avg age of the vessel, average size of the vessel, average time in port)

In [100]:
# Variables from COVID data 
activity_1 = 'new_cases'
activity_3 = 'cumulative_cases'

# Varaibles from Maritime data 
activity_5 = 'Median time in port (days)'
#activity_6 = 'port index value'
#activity_7 = 'port calls'
activity_8 = 'Average age of vessels'
activity_10 = 'Average size (GT) of vessels'
activity_9 = 'Vessel_Type'

# Common variables
country_1 = 'China'
country_2 = 'Australia'
country_3 = 'United States'


activities_story_1 = [activity_8, activity_5, activity_10]
#activities_story_2 = [activity_1, activity_2, activity_6]
#activities_story_3 = [activity_5, activity_2]


In [101]:
# first, I'm going to define a function to be able to select the different vessels in a list for each country for a specific time period
def data_highs(data, acitivity, **kwargs):

    diff_1 = data[activity].diff(periods = -1)
    diff_2 = data[activity].diff(periods = 1)
    
    peaks = []
    for i in range(len(diff_1)):
        if diff_1[i] > 0 and diff_2[i] > 0:
            peaks.append(int(i))          
            
    return peaks

# And do the same for the valleys
def data_lows(data, activity, **kwargs):

    diff_1 = data[activity].diff(periods = -1)
    diff_2 = data[activity].diff(periods = 1)

    valleys = []
    for i in range(len(diff_1)):
        if diff_1[i] < 0 and diff_2[i] < 0:
            valleys.append(int(i))          
            
    return valleys

In [19]:
# Then I start the figure and create several dictionaries that are necessary. The peaks and valleys dictionaries are for the graphs and the date dictionaries are for the next steps
fig_1 = go.Figure()

peaks_dict_1 = {}
valleys_dict_1 = {}
peaks_date_dict_1 = {}
valleys_date_dict_1 = {}

# I create a dataframe that contains only the data for the world and reset the indices for it
geo_port_all_vessels = geo_port[(geo_port.Vessel_Type == 'All ships')]
geo_port_all_vessels = geo_port_all_vessels[(geo_port_all_vessels.country == 'World')]
geo_port_all_vessels.reset_index(inplace=True)

# I find the peaks and valleys and add them to the dictionaries
for activity in activities_story_1:
    max_ind = data_highs(geo_port_all_vessels, activity)
    peaks_dict_1[activity]=max_ind

    min_ind = data_lows(geo_port_all_vessels,activity)
    valleys_dict_1[activity]=min_ind
    
    # Then I turn them into dataframes to be able to use the dates for the graphs, and for the date dictionaries
    df_max_1 = geo_port_all_vessels.iloc[max_ind]
    df_min_1 = geo_port_all_vessels.iloc[min_ind]

# The date dictionaries are filled with the dates of the peaks and the valleys
    peaks_date_dict_1[activity] = df_max_1['date']
    valleys_date_dict_1[activity] = df_min_1['date']

NameError: name 'activities_story_1' is not defined

In [103]:
fig_2 = go.Figure()
fig_2 = make_subplots(rows=3,cols=1)
x1 = geo_port_all_vessels['date']
y1 = geo_port_all_vessels[activity_5]
x2 = df_max_1['date']
y2 = df_max_1[activity_5]
x3 = df_min_1['date']
y3 = df_min_1[activity_5]
x4 = geo_port_all_vessels['date']
y4 = geo_port_all_vessels[activity_8]
x5 = df_max_1['date']
y5 = df_max_1[activity_8]
x6 = df_min_1['date']
y6 = df_min_1[activity_8]
x7 = geo_port_all_vessels['date']
y7 = geo_port_all_vessels[activity_10]
x8 = df_max_1['date']
y8 = df_max_1[activity_10]
x9 = df_min_1['date'] #it was Period Label
y9 = df_min_1[activity_10]


fig_2.append_trace(go.Scatter(x=x1,y=y1,name=activity_5),row=1,col=1)
fig_2.append_trace(go.Scatter(x=x2,y=y2,mode='markers',name='peaks ' + activity_5),row=1,col=1)
fig_2.append_trace(go.Scatter(x=x3,y=y3,mode='markers',name='valleys ' + activity_5),row=1,col=1)
fig_2.append_trace(go.Scatter(x=x4,y=y4,name=activity_8),row=2,col=1)
fig_2.append_trace(go.Scatter(x=x5,y=y5,mode='markers',name='peaks ' + activity_8),row=2,col=1)
fig_2.append_trace(go.Scatter(x=x6,y=y6,mode='markers',name='valleys ' + activity_8),row=2,col=1)
fig_2.append_trace(go.Scatter(x=x7,y=y7,name=activity_10),row=3,col=1)
fig_2.append_trace(go.Scatter(x=x8,y=y8,mode='markers',name='peaks ' + activity_10),row=3,col=1)
fig_2.append_trace(go.Scatter(x=x9,y=y9,mode='markers',name='valleys ' + activity_10),row=3,col=1)


fig_2.update_layout(title='Trends in vessel port time, age, and size thru the years for Australia')

fig_2.show()

Spearman’s Correlation

Two variables may be related by a nonlinear relationship, such that the relationship is stronger or weaker across the distribution of the variables.

Further, the two variables being considered may have a non-Gaussian distribution.

In this case, the Spearman’s correlation coefficient (named for Charles Spearman) can be used to summarize the strength between the two data samples.

We will compare the data set of median time in ports with the COVID new cases to see if there is a correlation. 

In [118]:
data1 = geo_port.loc['new_cases']
data2 = geo_port.loc['Median time in port (days)']
coef, p = spearmanr(data1, data2)
print('Spearmans correlation coefficient: %.3f' % coef)
# interpret the significance
alpha = 0.05
if p > alpha:
	print('Samples are uncorrelated (fail to reject H0) p=%.3f' % p)
else:
	print('Samples are correlated (reject H0) p=%.3f' % p)

KeyError: 'new_cases'

In addition, we will do a world to world comparision of COVID cases versus port times: 

## Part III - Data visualisation

For this last part, we're going to visually show the effect that COVID had on vessel times so that users can see how ports have been impacted by COVID and thus has also impacted the logistics system as a whole by: 

We're going to look at regions and look at the semi annual trend by vessel type 

pie chart showing the proportions of the commodity shipped

World map showing the change in port call times over the years 

Comparing covid high periods vs low periods with port call times 

Interpreting the results 

First, We show our variables for this part.

We want to show the COVID data with the port time (worldwide)

Now we will show over the years from 2018, the number of port calls by region

In [9]:
 fig = px.histogram(df_ports, y="Location", x="Median time in port (days)", orientation= "h",
             animation_frame="Period Label", 
             #range_x=[0,4000000000], 
                color="Location",)
fig.update_yaxes(categoryorder='sum ascending')

fig.show()

NameError: name 'df_ports' is not defined

In [None]:
# load dataset
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/volcano.csv")

# create figure
fig = go.Figure()

# Add surface trace
fig.add_trace(go.Surface(z=df.values.tolist(), colorscale="Viridis"))

# Update plot sizing
fig.update_layout(
    width=800,
    height=900,
    autosize=False,
    margin=dict(t=0, b=0, l=0, r=0),
    template="plotly_white",
)


# Add dropdown
fig.update_layout(
    updatemenus=[
        dict(
            buttons=list([
                dict(
                    args=["type", "surface"],
                    label="Asia",
                    method="restyle"
                ),
                dict(
                    args=["type", "heatmap"],
                    label="America",
                    method="restyle"
                ),
                dict(
                    args=["type", "heatmap"],
                    label="Africa",
                    method="restyle"
                )
            ]),
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ]
)

# Add annotation
fig.update_layout(
    annotations=[
        dict(text="Countries:", showarrow=False,
        x=0, y=1.085, yref="paper", align="left")
    ]
)

fig.show()

In [None]:
pie = px.pie(df_new, values="occurance", names="Sectors", title="sector wise composition")
pie.show()
#https://www.youtube.com/watch?v=s_iEvTBSBfA
sunburst=px.sunburst(df_path=['Sectors', 'regions'],values='volume transported')
sunburst.show()