In [2]:
import pickle
import pandas as pd
import numpy as np
from API import K
import requests
import json
import googlemaps

In [3]:
#import dataframe and drop nans
df_2021 = pd.read_csv('data/2021/2021_combined.csv', dtype={'Data':np.datetime64})
#drop nans
df_2021 = df_2021.loc[df_2021.SK_CODE.isnull() != True]

In [4]:
# group trips by origin, destination and category
df_t = df_2021.groupby(by=['UNLO_herkomst', 'UNLO_bestemming', 'SK_CODE']).count().reset_index().sort_values(
    by='Jaarmaand', ascending=False).iloc[:, :4].reset_index(drop=True)

In [5]:
# give columns english names
df_t.rename(
    columns={'UNLO_herkomst': 'origin', 'UNLO_bestemming': 'destination', 'SK_CODE': 'type', 'Unnamed: 0': 'trips'},
    inplace=True)

In [6]:
# subset inland trips
df_t = df_t.loc[(df_t.origin.str.contains('NL')) & (df_t.destination.str.contains('NL'))]
df_t.reset_index(inplace=True, drop=True)

In [7]:
#create dict to fill with total number of trips on path for each ship type
df_t_dict = {}
# now fill this dict
# loop over data frame
for i in range(len(df_t)):
    #define temp variables for ease
    origin = df_t['origin'][i]
    destination = df_t['destination'][i]
    ship_type = df_t['type'][i]
    trip_count = df_t['trips'][i]
    #only make new column if trip and its inverse are not in dict yet. Otherwise append.
    if (origin, destination, ship_type) in df_t_dict.keys():
        df_t_dict[(origin, destination, ship_type)] += trip_count
    elif (destination, origin, ship_type) in df_t_dict.keys():
        df_t_dict[(destination, origin, ship_type)] += trip_count
    else:
        df_t_dict[(origin, destination, ship_type)] = trip_count
df_t_dict

{('NLAMS', 'NLRTM', 'M8'): 3510,
 ('NLAMS', 'NLRTM', 'M12'): 2076,
 ('NLRTM', 'NLMOE', 'M8'): 1584,
 ('NLRTM', 'NLRTM', 'M8'): 715,
 ('NLAMS', 'NLRTM', 'M9'): 1316,
 ('NLRTM', 'NLWLK', 'M3'): 973,
 ('NLOOS', 'NLTLB', 'M8'): 688,
 ('NLRTM', 'NLAMS', 'M6'): 881,
 ('NLAER', 'NLAMS', 'BI'): 824,
 ('NLDHR', 'NLDHR', 'M0'): 411,
 ('NLTLB', 'NLRTM', 'M8'): 623,
 ('NLRTM', 'NLOOS', 'M8'): 727,
 ('NLHAR', 'NLWTE', 'M12'): 690,
 ('NLHGL', 'NLRTM', 'M8'): 652,
 ('NLRTM', 'NLVLI', 'M8'): 658,
 ('NLRTM', 'NLRTM', 'M12'): 363,
 ('NLMOE', 'NLOOS', 'M8'): 615,
 ('NLVEG', 'NLRTM', 'M6'): 575,
 ('NLAMS', 'NLRTM', 'M11'): 666,
 ('NLMOE', 'NLAPN', 'M8'): 604,
 ('NLVEG', 'NLRTM', 'M7'): 581,
 ('NLAMS', 'NLMOE', 'M8'): 588,
 ('NLRTM', 'NLMEP', 'M8'): 568,
 ('NLSTD', 'NLSTD', 'M0'): 282,
 ('NLAMS', 'NLLMR', 'BIIL-1'): 553,
 ('NLNIJ', 'NLWAS', 'M8'): 508,
 ('NLWSP', 'NLAER', 'M6'): 510,
 ('NLDOR', 'NLRTM', 'M8'): 459,
 ('NLAMS', 'NLHAS', 'M8'): 436,
 ('NLRTM', 'NLDZL', 'M8'): 415,
 ('NLRTM', 'NLTIE', 'M8'): 4

## Compute vessel sum per route
Idea: make similar dict as above first, but then with total amount of trips per route. Later on: add column for each ship.
This will result in a column for each ship type and an entry with the amount of ships of that type that have travelled.

In [7]:
# now only sort by origin and destination
all_trips_ranked = df_2021.groupby(by=['UNLO_herkomst', 'UNLO_bestemming']).count().reset_index().sort_values(
    by='Jaarmaand', ascending=False).iloc[:, :3].reset_index(drop=True)

#again, rename columns
all_trips_ranked.rename(
    columns={'UNLO_herkomst': 'origin', 'UNLO_bestemming': 'destination', 'SK_CODE': 'type', 'Unnamed: 0': 'trips'},
    inplace=True)

#remove non nl
all_trips_ranked = all_trips_ranked.loc[(all_trips_ranked.origin.str.contains('NL')) & (all_trips_ranked.destination.str.contains('NL'))]

#reset index to be able to loop over all values later on
all_trips_ranked.reset_index(inplace=True, drop=True)

In [8]:
# make dict to store trips
trip_dict_main = {}

# now fill this dict, same procedure as above
for i in range(len(all_trips_ranked)):
    origin = all_trips_ranked['origin'][i]
    destination = all_trips_ranked['destination'][i]
    trip_count = all_trips_ranked['trips'][i]

    if (origin, destination) in trip_dict_main.keys():
        trip_dict_main[(origin, destination)] += trip_count
    elif (destination, origin) in trip_dict_main.keys():
        trip_dict_main[(destination, origin)] += trip_count
    else:
        trip_dict_main[(origin, destination)] = trip_count

# now sort values
trip_dict_main = dict(sorted(trip_dict_main.items(), key=lambda item: item[1], reverse=True))

In [9]:
trip_dict_main

{('NLRTM', 'NLAMS'): 9943,
 ('NLRTM', 'NLMOE'): 2686,
 ('NLRTM', 'NLVLI'): 2153,
 ('NLAMS', 'NLAER'): 2119,
 ('NLRTM', 'NLRTM'): 1794,
 ('NLVEG', 'NLRTM'): 1484,
 ('NLTLB', 'NLRTM'): 1318,
 ('NLLEY', 'NLAMS'): 1292,
 ('NLOSS', 'NLRTM'): 1145,
 ('NLZWO', 'NLAMS'): 1118,
 ('NLUTC', 'NLRTM'): 1042,
 ('NLRTM', 'NLKAM'): 1036,
 ('NLHGL', 'NLRTM'): 1035,
 ('NLLMR', 'NLAMS'): 1005,
 ('NLRTM', 'NLWLK'): 987,
 ('NLAMS', 'NLMOE'): 971,
 ('NLAER', 'NLRTM'): 945,
 ('NLRTM', 'NLAPN'): 945,
 ('NLTNZ', 'NLVLI'): 926,
 ('NLDOR', 'NLRTM'): 911,
 ('NLAMS', 'NLDEV'): 847,
 ('NLGOR', 'NLRTM'): 843,
 ('NLRTM', 'NLNIJ'): 817,
 ('NLBZM', 'NLRTM'): 814,
 ('NLWSP', 'NLAER'): 794,
 ('NLRTM', 'NLOOS'): 785,
 ('NLRTM', 'NLTNZ'): 779,
 ('NLVLI', 'NLAMS'): 769,
 ('NLRTM', 'NLMEP'): 768,
 ('NLOOS', 'NLTLB'): 737,
 ('NLNIJ', 'NLWAS'): 725,
 ('NLKAM', 'NLAMS'): 721,
 ('NLIJM', 'NLMOE'): 718,
 ('NLHAR', 'NLWTE'): 714,
 ('NLKGZ', 'NLAMS'): 696,
 ('NLBON', 'NLRTM'): 680,
 ('NLMOE', 'NLAPN'): 667,
 ('NLDOR', 'NLAMS'): 662

In [10]:
# make dataframe dict with a key for each column
trip_dict_df_main = {'origin': [], 'destination': [], 'trip_count': []}
for i in df_2021.SK_CODE.unique():
    trip_dict_df_main[i] = []

# now fill this dict
for key, value in trip_dict_main.items():
    # append values from dict a
    trip_dict_df_main['origin'].append(key[0])
    trip_dict_df_main['destination'].append(key[1])
    trip_dict_df_main['trip_count'].append(value)
    #now append values from other dict, for each ship type
    for ship_type in df_2021.SK_CODE.unique():
        #if ships have travelled from A to B, append this value
        if (key[0], key[1], ship_type) in df_t_dict.keys():
            trip_dict_df_main[ship_type].append(df_t_dict[(key[0], key[1], ship_type)])
        elif (key[1], key[0], ship_type) in df_t_dict.keys():
            trip_dict_df_main[ship_type].append(df_t_dict[(key[1], key[0], ship_type)])
        else:
            # no? Then no ships of this type have travelled this path, append 0
            trip_dict_df_main[ship_type].append(0)

In [11]:
df_main = pd.DataFrame.from_dict(trip_dict_df_main)

In [12]:
df_main

Unnamed: 0,origin,destination,trip_count,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,9943,2076,3510,0,427,102,1316,0,...,5,5,11,7,39,4,0,0,0,0
1,NLRTM,NLMOE,2686,70,1584,0,56,2,144,0,...,2,0,1,0,1,0,0,0,0,0
2,NLRTM,NLVLI,2153,190,658,0,140,10,414,0,...,0,1,2,2,0,0,0,1,0,0
3,NLAMS,NLAER,2119,6,132,0,7,2,0,0,...,0,37,5,1,0,3,0,1,1,0
4,NLRTM,NLRTM,1794,363,715,0,164,15,92,1,...,4,0,0,0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10588,NLBZM,NLVSN,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10589,NLBZM,NLVER,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10590,NLBZM,NLSMB,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10591,NLBZM,NLSHH,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
#remove entries with same origin and destination
df_main = df_main.loc[df_main.origin!=df_main.destination]

In [14]:
df_main

Unnamed: 0,origin,destination,trip_count,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,9943,2076,3510,0,427,102,1316,0,...,5,5,11,7,39,4,0,0,0,0
1,NLRTM,NLMOE,2686,70,1584,0,56,2,144,0,...,2,0,1,0,1,0,0,0,0,0
2,NLRTM,NLVLI,2153,190,658,0,140,10,414,0,...,0,1,2,2,0,0,0,1,0,0
3,NLAMS,NLAER,2119,6,132,0,7,2,0,0,...,0,37,5,1,0,3,0,1,1,0
5,NLVEG,NLRTM,1484,0,1,0,0,0,0,0,...,0,0,17,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10588,NLBZM,NLVSN,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10589,NLBZM,NLVER,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10590,NLBZM,NLSMB,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10591,NLBZM,NLSHH,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
#save dataframe
# pickle.dump( df_main, open( "data/df_trips_per_path.p", "wb" ) )
df_main = pickle.load( open( "data/df_trips_per_path.p", "rb" ) )

## Compute vessel sum per route and per hour (for ABM)
only for top 100 origin destination pairs

In [9]:
#generate hour
df_2021['hour'] = df_2021.Date.apply(lambda x: int(x.split()[1][:2]))

In [10]:
# now only sort by origin and destination
all_trips_ranked = df_2021.groupby(by=['UNLO_herkomst', 'UNLO_bestemming', 'hour', 'SK_CODE']).count().reset_index().sort_values(
    by='Jaarmaand', ascending=False).iloc[:, :5].reset_index(drop=True)

#again, rename columns
all_trips_ranked.rename(
    columns={'UNLO_herkomst': 'origin', 'UNLO_bestemming': 'destination', 'SK_CODE': 'type', 'Unnamed: 0': 'trips'},
    inplace=True)

#remove non nl
all_trips_ranked = all_trips_ranked.loc[
    (all_trips_ranked.origin.str.contains('NL')) & (all_trips_ranked.destination.str.contains('NL'))]

#reset index to be able to loop over all values later on
all_trips_ranked.reset_index(inplace=True, drop=True)

In [11]:
all_trips_ranked

Unnamed: 0,origin,destination,hour,type,trips
0,NLAPN,NLMOE,8,M8,100
1,NLHAR,NLWTE,8,M12,100
2,NLAMS,NLRTM,14,M8,98
3,NLRTM,NLAMS,9,M8,98
4,NLAMS,NLRTM,11,M8,95
...,...,...,...,...,...
90533,NLAMS,NLVLA,10,M5,1
90534,NLAMS,NLVLA,10,M9,1
90535,NLAMS,NLVLA,11,M4,1
90536,NLAMS,NLVLA,12,M12,1


In [12]:
# make dict to store trips
trip_dict_main = {}

# now fill this dict, same procedure as above
for i in range(len(all_trips_ranked)):
    origin = all_trips_ranked['origin'][i]
    destination = all_trips_ranked['destination'][i]
    trip_count = all_trips_ranked['trips'][i]
    hour = all_trips_ranked['hour'][i]
    ship_type = all_trips_ranked['type'][i]

    if (origin, destination, ship_type, hour) in trip_dict_main.keys():
        trip_dict_main[(origin, destination, ship_type, hour)] += trip_count
    elif (destination, origin, ship_type, hour) in trip_dict_main.keys():
        trip_dict_main[(destination, origin, ship_type, hour)] += trip_count
    else:
        trip_dict_main[(origin, destination, ship_type, hour)] = trip_count
# now sort values
trip_dict_main = dict(sorted(trip_dict_main.items(), key=lambda item: item[1], reverse=True))



In [13]:
trip_dict_main

{('NLRTM', 'NLAMS', 'M8', 12): 186,
 ('NLRTM', 'NLAMS', 'M8', 9): 181,
 ('NLRTM', 'NLAMS', 'M8', 19): 180,
 ('NLAMS', 'NLRTM', 'M8', 18): 175,
 ('NLAMS', 'NLRTM', 'M8', 17): 171,
 ('NLAMS', 'NLRTM', 'M8', 11): 170,
 ('NLAMS', 'NLRTM', 'M8', 16): 169,
 ('NLAMS', 'NLRTM', 'M8', 14): 165,
 ('NLRTM', 'NLAMS', 'M8', 15): 157,
 ('NLRTM', 'NLAMS', 'M8', 10): 153,
 ('NLAMS', 'NLRTM', 'M8', 22): 148,
 ('NLRTM', 'NLAMS', 'M8', 20): 147,
 ('NLRTM', 'NLAMS', 'M8', 8): 146,
 ('NLRTM', 'NLAMS', 'M8', 4): 144,
 ('NLRTM', 'NLAMS', 'M8', 6): 141,
 ('NLRTM', 'NLAMS', 'M8', 7): 136,
 ('NLRTM', 'NLAMS', 'M8', 21): 130,
 ('NLAMS', 'NLRTM', 'M8', 5): 129,
 ('NLAMS', 'NLRTM', 'M12', 18): 124,
 ('NLAMS', 'NLRTM', 'M8', 2): 122,
 ('NLAMS', 'NLRTM', 'M12', 19): 121,
 ('NLWTE', 'NLHAR', 'M12', 12): 119,
 ('NLAMS', 'NLRTM', 'M12', 14): 117,
 ('NLRTM', 'NLAMS', 'M8', 3): 117,
 ('NLAMS', 'NLRTM', 'M8', 13): 114,
 ('NLHAR', 'NLWTE', 'M12', 8): 111,
 ('NLAMS', 'NLRTM', 'M8', 1): 110,
 ('NLAMS', 'NLRTM', 'M8', 23): 11

In [14]:
# now only keep entries of trip_dict_main that are part of top 100 OD pairs
top100 = df_main.head(100)
top100.reset_index(drop=True, inplace=True)
ordes = []
for i in range(len(top100)):
    ordes.append((top100['origin'][i], top100['destination'][i]))

trip_dict_subset = {i:trip_dict_main[i] for i in trip_dict_main.keys() if (i[0],i[1]) in ordes}

In [15]:
len(trip_dict_subset.keys())

6923

In [16]:
df_main.loc[(df_main.origin == 'NLRTM')&(df_main.destination == 'NLAMS')].trip_count.item()

9943

In [17]:
# make dataframe dict wih a key for each column
trip_dict_df_main = {'origin': [], 'destination': [], 'hour':[]}
for i in df_2021.SK_CODE.unique():
    trip_dict_df_main[i] = []

# now fill this dict
for key in ordes:
    # append values from dict a
    for hour in list(np.arange(0,24,1)):
        trip_dict_df_main['origin'].append(key[0])
        trip_dict_df_main['destination'].append(key[1])
        trip_dict_df_main['hour'].append(hour)
        #now append values from other dict, for each ship types
        for ship_type in df_2021.SK_CODE.unique():
            #if ships have travelled from A to B, append this value
            if (key[0], key[1], ship_type, hour) in trip_dict_main.keys():
                trip_dict_df_main[ship_type].append(trip_dict_main[(key[0], key[1], ship_type, hour)])
            elif (key[1], key[0], ship_type, hour) in trip_dict_main.keys():
                trip_dict_df_main[ship_type].append(trip_dict_main[(key[1], key[0], ship_type, hour)])
            else:
                # no? Then no ships of this type have travelled this path, append 0
                trip_dict_df_main[ship_type].append(0)
df_main_hourly = pd.DataFrame.from_dict(trip_dict_df_main)
df_main_hourly

Unnamed: 0,origin,destination,hour,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,0,91,109,0,18,2,45,0,...,0,0,0,0,1,0,0,0,0,0
1,NLRTM,NLAMS,1,78,110,0,17,3,55,0,...,0,0,0,0,0,0,0,0,0,0
2,NLRTM,NLAMS,2,52,122,0,10,3,46,0,...,1,0,0,0,0,0,0,0,0,0
3,NLRTM,NLAMS,3,52,117,0,9,4,52,0,...,1,0,0,0,0,0,0,0,0,0
4,NLRTM,NLAMS,4,55,144,0,14,1,72,0,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,NLAMS,NLZWI,19,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2396,NLAMS,NLZWI,20,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2397,NLAMS,NLZWI,21,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2398,NLAMS,NLZWI,22,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_main_hourly.insert(2, 'trip_count', df_main_hourly.iloc[:, 3:].sum(axis=1))

In [21]:
df_main_hourly

Unnamed: 0,origin,destination,trip_count,hour,M12,M8,BII-6b,M10,BIIa-1,M9,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,323,0,91,109,0,18,2,45,...,0,0,0,0,1,0,0,0,0,0
1,NLRTM,NLAMS,302,1,78,110,0,17,3,55,...,0,0,0,0,0,0,0,0,0,0
2,NLRTM,NLAMS,275,2,52,122,0,10,3,46,...,1,0,0,0,0,0,0,0,0,0
3,NLRTM,NLAMS,303,3,52,117,0,9,4,52,...,1,0,0,0,0,0,0,0,0,0
4,NLRTM,NLAMS,367,4,55,144,0,14,1,72,...,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,NLAMS,NLZWI,17,19,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2396,NLAMS,NLZWI,12,20,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2397,NLAMS,NLZWI,13,21,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2398,NLAMS,NLZWI,7,22,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
pickle.dump( df_main_hourly, open( "ABM/own_work/data/df_trips_per_path_hourly.p", "wb" ) )

## Generate random data

In [26]:
df_chance = df_main.copy()

In [27]:
df_chance

Unnamed: 0,origin,destination,trip_count,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,9943,2076,3510,0,427,102,1316,0,...,5,5,11,7,39,4,0,0,0,0
1,NLRTM,NLMOE,2686,70,1584,0,56,2,144,0,...,2,0,1,0,1,0,0,0,0,0
2,NLRTM,NLVLI,2153,190,658,0,140,10,414,0,...,0,1,2,2,0,0,0,1,0,0
3,NLAMS,NLAER,2119,6,132,0,7,2,0,0,...,0,37,5,1,0,3,0,1,1,0
5,NLVEG,NLRTM,1484,0,1,0,0,0,0,0,...,0,0,17,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10588,NLBZM,NLVSN,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10589,NLBZM,NLVER,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10590,NLBZM,NLSMB,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10591,NLBZM,NLSHH,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
type_list = list(df_main.columns[3:])

In [29]:
for i in type_list:
    df_chance[i] = df_chance[i]/df_chance.trip_count

In [31]:
# pickle.dump( df_chance, open( "data/probability_df.p", "wb" ) )
# df_chance = pickle.load(  open( "data/probability_df.p", "rb" ) )

In [32]:
df_chance

Unnamed: 0,origin,destination,trip_count,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,9943,0.208790,0.353012,0.0,0.042945,0.010258,0.132354,0.0,...,0.000503,0.000503,0.001106,0.000704,0.003922,0.000402,0.0,0.000000,0.000000,0.0
1,NLRTM,NLMOE,2686,0.026061,0.589724,0.0,0.020849,0.000745,0.053611,0.0,...,0.000745,0.000000,0.000372,0.000000,0.000372,0.000000,0.0,0.000000,0.000000,0.0
2,NLRTM,NLVLI,2153,0.088249,0.305620,0.0,0.065026,0.004645,0.192290,0.0,...,0.000000,0.000464,0.000929,0.000929,0.000000,0.000000,0.0,0.000464,0.000000,0.0
3,NLAMS,NLAER,2119,0.002832,0.062294,0.0,0.003303,0.000944,0.000000,0.0,...,0.000000,0.017461,0.002360,0.000472,0.000000,0.001416,0.0,0.000472,0.000472,0.0
5,NLVEG,NLRTM,1484,0.000000,0.000674,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.011456,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10588,NLBZM,NLVSN,1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
10589,NLBZM,NLVER,1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
10590,NLBZM,NLSMB,1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
10591,NLBZM,NLSHH,1,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0


In [33]:
test_100 = df_chance.head(100)

In [34]:
test_100

Unnamed: 0,origin,destination,trip_count,M12,M8,BII-6b,M10,BIIa-1,M9,BII-6l,...,C4,B04,M0,C2l,BII-2L,B02,C1b,C2b,B01,C1l
0,NLRTM,NLAMS,9943,0.208790,0.353012,0.0,0.042945,0.010258,0.132354,0.0,...,0.000503,0.000503,0.001106,0.000704,0.003922,0.000402,0.0,0.000000,0.000000,0.0
1,NLRTM,NLMOE,2686,0.026061,0.589724,0.0,0.020849,0.000745,0.053611,0.0,...,0.000745,0.000000,0.000372,0.000000,0.000372,0.000000,0.0,0.000000,0.000000,0.0
2,NLRTM,NLVLI,2153,0.088249,0.305620,0.0,0.065026,0.004645,0.192290,0.0,...,0.000000,0.000464,0.000929,0.000929,0.000000,0.000000,0.0,0.000464,0.000000,0.0
3,NLAMS,NLAER,2119,0.002832,0.062294,0.0,0.003303,0.000944,0.000000,0.0,...,0.000000,0.017461,0.002360,0.000472,0.000000,0.001416,0.0,0.000472,0.000472,0.0
5,NLVEG,NLRTM,1484,0.000000,0.000674,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.011456,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,NLHTB,NLBON,324,0.000000,0.003086,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
102,NLKGZ,NLZAA,319,0.000000,0.000000,0.0,0.000000,0.661442,0.000000,0.0,...,0.000000,0.015674,0.009404,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
103,NLRTM,NLLID,311,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0
104,NLRTM,NLSLU,310,0.009677,0.661290,0.0,0.148387,0.000000,0.019355,0.0,...,0.000000,0.000000,0.000000,0.003226,0.000000,0.000000,0.0,0.000000,0.000000,0.0


In [35]:
def random_vessel_generator(df_prob):
    # create dict to store random prob based values later on
    main_dict = {i:[] for i in df_prob.columns}

    # loop over all rows of the probability dataframe
    for i in range(len(df_prob)):
        # copy origin, destination and count from original df
        for x in df_prob.columns[:3]:
            main_dict[x].append(df_prob.iloc[i,:][x])

        # find probability, items to pick, and the number of vessels to generate in total
        prob = list(df_prob.iloc[i, 3:].values)
        to_pick = list(df_prob.columns)
        to_pick = to_pick[3:]
        count = df_prob['trip_count'][i]

        # generate random vessels
        rand_vessels = np.random.choice(a=to_pick, size=count, replace=True, p=prob)
        unique, counts = np.unique(rand_vessels, return_counts=True)
        temp_dict = dict(zip(unique,counts))

        # append amount of random generated vessels right dict list
        for key in list(main_dict.keys())[3:]:
            if key in temp_dict.keys():
                main_dict[key].append(temp_dict[key])
            else:
                main_dict[key].append(0)

        # now make dict
        df_return = pd.DataFrame.from_dict(main_dict)

    return df_return

In [36]:
import time
%time vs = random_vessel_generator(test_100)

KeyError: 4

## Now translate random vessels fuel consumption value for each path
Idea: take 1 vessel as a basis, and assume that fuel consumption is linearly correlated with average engine power, to be able to use index
Capacity is likely x vessels per day of a certain type for a charging unit, depending on the charging time.
Hence, computing the average amount of that type of vehicles that comes by everyday makes sense.

In [37]:
# generate a random set to calculate path values with
df_gen_vessels = random_vessel_generator(test_100)

KeyError: 4

In [None]:
ship_data = pd.read_excel('data/ship_types.xlsx')
ship_data.fillna(0, inplace=True)

In [None]:
ship_data.loc[:,['RWS-class', 'Factor']]

In [None]:
df_gen_vessels

In [None]:
ship_data = dict(zip(ship_data['RWS-class'],ship_data['Factor']))

In [None]:
def flow_computation(df):
    # create dict to store path based values
    flows = {}
    # loop over data frame
    for i in range(len(df)-1):
        # subset all data ship type data
        a = df.iloc[:,3:]
        # flow is initially 0
        flow = 0
        #add number of ships times specific ship type weighing factor
        for row in a.columns:
            flow += ship_data[row] * a[row][i]
        # store flow, divide by 365 to get daily flow
        flows[(df.origin[i],df.destination[i])]= (flow/365)
    return flows

In [None]:
flows = flow_computation(df_gen_vessels)

In [None]:
flows

In [None]:
#sort flows from large to small
flows = dict(sorted(flows.items(), key=lambda item: item[1], reverse=True))

In [None]:
flows

## Get and clean harbour data NL, Ger and Bel

In [38]:
# now get coordinates harbours, in three files, check encoding frist below
pd.read_csv('data/harbour_codes_coords/2021-2 UNLOCODE CodeListPart1.csv', encoding="ISO-8859-1", header=None, sep=",",
            index_col=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,,AD,,.ANDORRA,,,,,,,,
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,601.0,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,307.0,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,307.0,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,307.0,,4231N 00133E,
...,...,...,...,...,...,...,...,...,...,...,...,...
54611,,FR,ZTG,Zetting,Zetting,57,--3-----,RL,1901.0,,4905N 00708E,
54612,,FR,ZL2,Zilia,Zilia,2B,-----6--,RL,1901.0,,4231N 00854E,
54613,,FR,ZWL,Zinswiller,Zinswiller,67,--3-----,RL,1901.0,,4855N 00735E,
54614,,FR,PCZ,Zoteux,Zoteux,62,-----6--,RL,1001.0,,5037N 00153E,


In [39]:
# now identify all files to combine
from os import walk

f = []
for (dirpath, dirnames, filenames) in walk('data/harbour_codes_coords'):
    f.extend(filenames)
    break

In [40]:

#empty list to store data
li = []

#read everything in as object for now
for filename in f:
    df = pd.read_csv("data/harbour_codes_coords/" + str(filename), sep=',', index_col=None, header=None,
                     encoding="ISO-8859-1")
    print(filename, 'handled')
    li.append(df)

2021-2 UNLOCODE CodeListPart1.csv handled
2021-2 UNLOCODE CodeListPart2.csv handled
2021-2 UNLOCODE CodeListPart3.csv handled


In [41]:
#create df
df_harbour_codes = pd.concat(li, axis=0, ignore_index=True)
df_benelux=df_harbour_codes.copy()
#subset Dutch harbours
# df_benelux = df_harbour_codes.loc[(df_harbour_codes[1] == 'NL')]

# uncomment below to loc benelux, for now only use NL because of scope
# df_benelux = df_harbour_codes.loc[
#     (df_harbour_codes[1] == 'NL') | (df_harbour_codes[1] == 'DE') | (df_harbour_codes[1] == 'BE') | (
#                 df_harbour_codes[1] == 'LU')]

In [42]:
#fillna
df_benelux.fillna(np.nan, inplace=True)

In [43]:
# give appropriate column names
df_benelux.rename(columns={0: 'mutation', 1: 'country', 2: 'city_abbr', 3: 'city_full', 4: 'city_full_2', 5: 'subdivision', 6: 'function', 7:'status', 8:'date', 9:'iata_code', 10: 'coords', 11: 'comments'}, inplace=True)

In [44]:
df_benelux

Unnamed: 0,mutation,country,city_abbr,city_full,city_full_2,subdivision,function,status,date,iata_code,coords,comments
0,,AD,,.ANDORRA,,,,,,,,
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,601.0,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,307.0,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,307.0,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,307.0,,4231N 00133E,
...,...,...,...,...,...,...,...,...,...,...,...,...
115984,,ZW,STH,Southerton,Southerton,,--3-----,RL,201.0,,1751S 03101E,
115985,,ZW,THJ,Thompson Junction,Thompson Junction,,-23-----,RL,701.0,,1800S 02626E,
115986,,ZW,VFA,Victoria Falls,Victoria Falls,,---4----,AI,9501.0,,,
115987,,ZW,ZMZ,Zimbabwe,Zimbabwe,MV,1-3-----,RL,1401.0,,2016S 03055E,


In [45]:
# subset relevant columns
# df_benelux = df_benelux.loc[:,['country','city_abbr','function', 'coords']]

In [46]:
#reset index
df_benelux.reset_index(inplace=True,drop=True)

In [47]:
# drop rows with nan values for necessary columns
# df_benelux.dropna(subset=['country', 'city_abbr', 'coords'], inplace = True)

In [48]:
# # convert columns to be normal strings
# for i in df_benelux.columns:
#     df_benelux[i] = df_benelux[i].astype('|S80')
#     df_benelux[i] = df_benelux[i].apply(lambda x: x.decode('utf-8'))

In [49]:
df_benelux.replace('nan', np.nan)

Unnamed: 0,mutation,country,city_abbr,city_full,city_full_2,subdivision,function,status,date,iata_code,coords,comments
0,,AD,,.ANDORRA,,,,,,,,
1,,AD,ALV,Andorra la Vella,Andorra la Vella,,--34-6--,AI,601.0,,4230N 00131E,
2,,AD,CAN,Canillo,Canillo,,--3-----,RL,307.0,,4234N 00135E,
3,,AD,ENC,Encamp,Encamp,,--3-----,RL,307.0,,4232N 00134E,
4,,AD,ESC,Escaldes-Engordany,Escaldes-Engordany,,--3-----,RL,307.0,,4231N 00133E,
...,...,...,...,...,...,...,...,...,...,...,...,...
115984,,ZW,STH,Southerton,Southerton,,--3-----,RL,201.0,,1751S 03101E,
115985,,ZW,THJ,Thompson Junction,Thompson Junction,,-23-----,RL,701.0,,1800S 02626E,
115986,,ZW,VFA,Victoria Falls,Victoria Falls,,---4----,AI,9501.0,,,
115987,,ZW,ZMZ,Zimbabwe,Zimbabwe,MV,1-3-----,RL,1401.0,,2016S 03055E,


In [50]:
df_benelux['harbour_code'] = df_benelux.country+df_benelux.city_abbr

In [51]:
df_benelux = df_benelux.loc[:,['country','city_abbr','city_full','function','coords','harbour_code']]

In [52]:
# save cleaned data
df_benelux.to_csv('data/cleaned_harbours.csv')