Notebook to convert AIS points into trade routes / journeys

In [1]:
import os
from os.path import join
from glob import glob
import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import Point
import folium.plugins as plugins
import seaborn as sns
from matplotlib import pyplot as plt

import numpy as np
import datetime
from datetime import timedelta

In [2]:
pd.options.display.max_columns = None

In [3]:
ais_dir = join(os.path.expanduser("~"), 'data', 'AIS')
data_dir = join(ais_dir, 'Syria')

In [4]:
data_files = glob(data_dir+"/*.csv")

In [5]:
dfs = [pd.read_csv(f, index_col=0) for f in data_files]

In [6]:
df = pd.concat(dfs)

In [7]:
def substract_seconds(x, y):
    return pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S') - pd.Timedelta(y, unit='s')


def passing_by(df):
    if (df["turn_around_time"] < 3) and (0 < np.abs(df['heading-diff']) < 75):
        return 1
    elif (df["turn_around_time"] < 3) and (np.abs(df['heading-diff']) >= 75):
        return 2
    else:
        return 3

def cumsum_rows(df):
    mask = pd.isna(df).astype(bool)
    # compute cumsum across rows fillna with ffill
    cumulative = df.cumsum(1).fillna(method='ffill', axis=1).fillna(0)
    # get the values of cumulative where nan is True use the same method
    restart = cumulative[mask].fillna(method='ffill', axis=1).fillna(0)
    # set the result
    result = (cumulative - restart)
    result[mask] = np.nan
    return result

def week_format(date0,date1,date_format):
    d1 = datetime.datetime.strptime(str(date0), date_format).date()
    d2 = datetime.datetime.strptime(str(date1), date_format).date()
    d = d1
    step = datetime.timedelta(days=90)

    list_weeks = []
    while d < d2:
        list_weeks.append(d.strftime(date_format))
        d += step
    list_weeks.append(str(date1))
    return list_weeks


def get_list_dates(date0,date1):
    #### get the list of dates to merge later on
    datetime_object1 = datetime.datetime.strptime(date0, '%Y-%m-%d')
    datetime_object2 = datetime.datetime.strptime(date1, '%Y-%m-%d')

    datetime_diff = (datetime_object2-datetime_object1).days

    list_dates = pd.date_range(str(date0), periods=datetime_diff+1, freq='1D')
    list_dates_df = pd.DataFrame({'Date':list_dates.date.astype(str)})
    list_dates_df = list_dates_df.set_index('Date')

    return list_dates_df

In [8]:
df.polygon_name.unique()

array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)

In [9]:
port = "AL LADHIQIYAH"
data = df.loc[df.polygon_name==port].copy()
data_raw = data.copy()
date0 = '2018-12-01'
date1 = '2022-08-31'
country = "Syria"

In [10]:
data = data.loc[~data['nav_status'].isin(['At Anchor'])]

In [11]:
data = data.loc[~data.mmsi.isna()]

In [12]:
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')

In [13]:
len(data), len(data_raw)

(57088, 70023)

In [14]:
data['dt_pos_utc'].head()

90000    2020-08-31 23:06:45
90001    2020-08-31 22:11:15
90002    2020-08-31 15:20:19
90003    2020-08-31 18:10:34
90004    2020-08-31 11:38:01
Name: dt_pos_utc, dtype: object

In [15]:
data.mmsi.value_counts().head()

563044300    3466
477552700    3221
468395000    3022
353150000    2753
271044398    2486
Name: mmsi, dtype: int64

In [16]:
top_mmsi = data.mmsi.value_counts().index[0]

In [46]:
top_mmsi

563044300

In [17]:
data[['Date','Time']] = data.dt_pos_utc.str.split(' ',expand=True)

In [18]:
data['hour'] = pd.to_datetime(data['Time'], format='%H:%M:%S',errors = 'coerce').dt.hour
data['dtg'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])

In [19]:
data = data.sort_values(by=['dtg'])
data_new_subset = data[['mmsi','vessel_type','vessel_type_code','draught','length','width','longitude','latitude','Date','Time','dtg','hour','nav_status','heading','vessel_type_main','vessel_type_sub']]

In [20]:
data_new_subset.head(2)

Unnamed: 0,mmsi,vessel_type,vessel_type_code,draught,length,width,longitude,latitude,Date,Time,dtg,hour,nav_status,heading,vessel_type_main,vessel_type_sub
0,312945000,Cargo,,6.0,171.0,27.0,35.75975,35.526542,2018-12-02,19:31:21,2018-12-02 19:31:21,19,Under Way Using Engine,78.0,Bulk Carrier,
1,312945000,Cargo,,6.0,171.0,27.0,35.75975,35.5265,2018-12-03,07:09:27,2018-12-03 07:09:27,7,Under Way Using Engine,78.0,Bulk Carrier,


In [21]:
### get per day the first and last record per vessel
first_day = data_new_subset.drop_duplicates(subset = ['mmsi','Date'],keep='first')
last_day = data_new_subset.drop_duplicates(subset = ['mmsi','Date'],keep='last')

In [22]:
#### get the list of dates to merge later on
list_dates_df = get_list_dates(date0,date1)

In [23]:
first_day.head(2)

Unnamed: 0,mmsi,vessel_type,vessel_type_code,draught,length,width,longitude,latitude,Date,Time,dtg,hour,nav_status,heading,vessel_type_main,vessel_type_sub
0,312945000,Cargo,,6.0,171.0,27.0,35.75975,35.526542,2018-12-02,19:31:21,2018-12-02 19:31:21,19,Under Way Using Engine,78.0,Bulk Carrier,
1,312945000,Cargo,,6.0,171.0,27.0,35.75975,35.5265,2018-12-03,07:09:27,2018-12-03 07:09:27,7,Under Way Using Engine,78.0,Bulk Carrier,


In [24]:
### First day data processing
merged_time_series_first_select_1  = first_day[['mmsi','Date','dtg']].copy()
merged_time_series_first_select_2  = first_day[['mmsi','vessel_type','Date','dtg','length','width','draught','heading','vessel_type_main','vessel_type_sub']].copy()
merged_time_series_first_select_2['Date'] = merged_time_series_first_select_2['Date'].astype(str)
merged_time_series_first_select_2.rename(columns={'Date':'date-entry', 'draught':'draught-in','heading':'heading-in'}, inplace = True)

In [25]:
### Last day data processing
merged_time_series_last_select_1  = last_day[['mmsi','Date','dtg']].copy()
merged_time_series_last_select_2  = last_day[['mmsi','Date','dtg','draught','heading']].copy()

In [26]:
#### add full date list per mmsi
merged_time_series_first_new = merged_time_series_first_select_1.set_index(['mmsi','Date']).unstack(level =0)
merged_time_series_first_new = pd.concat([merged_time_series_first_new, list_dates_df], axis=1)
merged_time_series_first_new.index.name = 'Date'
merged_time_series_first_new = merged_time_series_first_new.stack().unstack(level = 0)

Example of first time per day for one vessel

In [27]:
merged_time_series_first_new.loc[top_mmsi].dropna()

     Date      
dtg  2019-03-13   2019-03-13 11:54:57
     2019-03-14   2019-03-14 00:34:21
     2019-03-15   2019-03-15 00:46:56
     2019-04-06   2019-04-06 22:14:40
     2019-04-07   2019-04-07 00:01:09
                          ...        
     2022-05-06   2022-05-06 00:02:01
     2022-06-04   2022-06-04 12:30:42
     2022-06-05   2022-06-05 00:23:32
     2022-08-26   2022-08-26 17:16:13
     2022-08-27   2022-08-27 00:20:16
Name: 563044300, Length: 83, dtype: datetime64[ns]

In [28]:
#### add full date list per mmsi
merged_time_series_last_new = merged_time_series_last_select_1.set_index(['mmsi','Date']).unstack(level = 0)
merged_time_series_last_new = pd.concat([merged_time_series_last_new,list_dates_df], axis=1)
merged_time_series_last_new.index.name = 'Date'
merged_time_series_last_new = merged_time_series_last_new.stack().unstack(level = 0)

Example of last time per day for one vessel

In [29]:
merged_time_series_last_new.loc[top_mmsi].dropna()

     Date      
dtg  2019-03-13   2019-03-13 23:55:14
     2019-03-14   2019-03-14 23:43:46
     2019-03-15   2019-03-15 09:45:17
     2019-04-06   2019-04-06 23:48:59
     2019-04-07   2019-04-07 12:21:06
                          ...        
     2022-05-06   2022-05-06 19:59:02
     2022-06-04   2022-06-04 21:44:08
     2022-06-05   2022-06-05 02:50:59
     2022-08-26   2022-08-26 23:41:11
     2022-08-27   2022-08-27 12:28:05
Name: 563044300, Length: 83, dtype: datetime64[ns]

In [30]:
### convert to numeric value to do substraction and then convert back to number of hours
t2 = merged_time_series_first_new.astype('datetime64').astype(int).astype(float)
t1 = merged_time_series_last_new.astype('datetime64').astype(int).astype(float)
time_diff = (t1['dtg']-t2['dtg'])/(3600*1000*1000*1000)
time_diff = time_diff.replace(0,np.nan)

In [31]:
time_diff.loc[top_mmsi].dropna().head(10)

Date
2019-03-13    12.004722
2019-03-14    23.156944
2019-03-15     8.972500
2019-04-06     1.571944
2019-04-07    12.332500
2019-05-11    15.989722
2019-06-08     9.876944
2019-06-09    13.998333
2019-07-07    18.264167
2019-08-07    20.453056
Name: 563044300, dtype: float64

In [33]:
time_diff_new = cumsum_rows(time_diff)

In [34]:
time_diff_new.loc[top_mmsi].dropna().head(10)

Date
2019-03-13    12.004722
2019-03-14    35.161667
2019-03-15    44.134167
2019-04-06     1.571944
2019-04-07    13.904444
2019-05-11    15.989722
2019-06-08     9.876944
2019-06-09    23.875278
2019-07-07    18.264167
2019-08-07    20.453056
Name: 563044300, dtype: float64

In [43]:
### derive the port calls and time between coming in and out
time_diff_new = cumsum_rows(time_diff)
time_diff_new.replace(np.nan,0, inplace = True)
time_diff_new = time_diff_new.diff(axis = 1)
time_diff_new = time_diff_new[time_diff_new < 0]
cols = time_diff_new.columns[:-1]
time_diff_new.drop(time_diff_new.columns[0],axis=1,inplace=True)
time_diff_new.columns = cols
time_diff_new = time_diff_new * -1
time_diff_new = time_diff_new.mask(time_diff_new < 3)

In [45]:
list(time_diff_new.loc[top_mmsi].dropna())

[44.13416666666667,
 13.90444444444445,
 15.989722222222227,
 23.875277777777782,
 18.264166666666668,
 20.453055555555565,
 21.864444444444445,
 19.746944444444438,
 18.496944444444438,
 26.469444444444434,
 27.441666666666663,
 29.80694444444441,
 37.39333333333332,
 20.651388888888903,
 22.425833333333344,
 22.165833333333353,
 24.680555555555543,
 20.355555555555554,
 13.014166666666654,
 20.14833333333337,
 14.635833333333323,
 18.41638888888889,
 21.203333333333376,
 15.094166666666752,
 29.736944444444475,
 26.514999999999986,
 21.404166666666583,
 29.185277777777742,
 19.98249999999996,
 20.251666666666665,
 20.299722222222158,
 32.273333333333426,
 22.478333333333353,
 18.800555555555547,
 22.857222222222276,
 42.46527777777783,
 42.98138888888889,
 20.250833333333276,
 11.681388888888819,
 18.54638888888894]

In [68]:
### get the turnaround time
turn_around_time = time_diff_new.unstack().dropna().reset_index(level = ['mmsi','Date'])
turn_around_time = pd.merge(turn_around_time, merged_time_series_last_select_2, on=['Date','mmsi'])
turn_around_time.rename(columns={"draught": "draught-out","dtg": "datetime-leave", 0: "turn_around_time", 'Date': 'date-leave','heading':'heading-out'}, inplace = True)

In [69]:
turn_around_time.loc[turn_around_time.mmsi==top_mmsi].head()

Unnamed: 0,date-leave,mmsi,turn_around_time,datetime-leave,draught-out,heading-out
24,2019-03-15,563044300,44.134167,2019-03-15 09:45:17,10.2,0.0
36,2019-04-07,563044300,13.904444,2019-04-07 12:21:06,10.1,0.0
63,2019-05-11,563044300,15.989722,2019-05-11 23:03:17,10.2,274.0
93,2019-06-09,563044300,23.875278,2019-06-09 13:59:56,10.2,0.0
115,2019-07-07,563044300,18.264167,2019-07-07 20:25:18,9.9,0.0


In [70]:
turn_around_time['seconds'] = turn_around_time['turn_around_time']*3600
turn_around_time['datetime-entry'] = turn_around_time.apply(lambda row: substract_seconds(row['datetime-leave'], row['seconds']), axis=1, result_type='reduce')
turn_around_time['datetime-entry']  = turn_around_time['datetime-entry'].dt.round('1s')
turn_around_time['date-entry'] = turn_around_time['datetime-entry'].dt.date
turn_around_time['date-entry'] = turn_around_time['date-entry'].astype(str)

In [71]:
turn_around_time.loc[turn_around_time.mmsi==top_mmsi].head()

Unnamed: 0,date-leave,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,seconds,datetime-entry,date-entry
24,2019-03-15,563044300,44.134167,2019-03-15 09:45:17,10.2,0.0,158883.0,2019-03-13 13:37:14,2019-03-13
36,2019-04-07,563044300,13.904444,2019-04-07 12:21:06,10.1,0.0,50056.0,2019-04-06 22:26:50,2019-04-06
63,2019-05-11,563044300,15.989722,2019-05-11 23:03:17,10.2,274.0,57563.0,2019-05-11 07:03:54,2019-05-11
93,2019-06-09,563044300,23.875278,2019-06-09 13:59:56,10.2,0.0,85951.0,2019-06-08 14:07:25,2019-06-08
115,2019-07-07,563044300,18.264167,2019-07-07 20:25:18,9.9,0.0,65751.0,2019-07-07 02:09:27,2019-07-07


In [39]:
### get the final port calls
port_calls = pd.merge(turn_around_time, merged_time_series_first_select_2, on=['date-entry','mmsi'])
port_calls['draught-diff'] = port_calls['draught-out'] - port_calls['draught-in']
port_calls['heading-diff'] = port_calls['heading-out'] - port_calls['heading-in']

In [40]:
### check if vessels are passing by
port_calls['passing'] = port_calls.apply(passing_by, axis = 1)
port_calls = port_calls[port_calls['passing']!= 1]

In [41]:
### add information
port_calls['port-name'] = str(port)
port_calls['country'] = str(country)

In [42]:
port_calls.columns

Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type', 'dtg', 'length', 'width', 'draught-in', 'heading-in',
       'vessel_type_main', 'vessel_type_sub', 'draught-diff', 'heading-diff',
       'passing', 'port-name', 'country'],
      dtype='object')

In [43]:
len(port_calls)

793

In [44]:
port_calls.loc[turn_around_time.mmsi==top_mmsi].head(5)[['mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'datetime-entry','vessel_type', 'length', 'width', 'draught-in', 'heading-in', 'draught-diff', 'heading-diff', 'port-name']]

Unnamed: 0,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,datetime-entry,vessel_type,length,width,draught-in,heading-in,draught-diff,heading-diff,port-name
24,271044633,22.245,2019-03-22 09:29:34,6.2,305.0,2019-03-21 11:14:52,Cargo,151.0,24.0,6.7,86.0,-0.5,219.0,AL LADHIQIYAH
36,257972000,35.153056,2019-04-14 20:20:55,6.3,333.0,2019-04-13 09:11:44,Cargo,200.0,32.0,6.3,18.0,0.0,315.0,AL LADHIQIYAH
63,355267000,56.431111,2019-05-16 17:18:11,5.3,275.0,2019-05-14 08:52:19,Cargo,108.0,18.0,6.5,84.0,-1.2,191.0,AL LADHIQIYAH
93,620575000,8.099722,2019-06-15 15:16:15,6.5,0.0,2019-06-15 07:10:16,Tug,46.0,11.0,6.5,0.0,0.0,0.0,AL LADHIQIYAH
115,622113176,13.907778,2019-07-12 03:50:59,10.4,47.0,2019-07-11 13:56:31,Cargo,184.0,25.0,10.4,92.0,0.0,-45.0,AL LADHIQIYAH


In [45]:
port_calls.loc[turn_around_time.mmsi==top_mmsi].head(5)[['mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'datetime-entry','vessel_type', 'length', 'width', 'draught-in', 'heading-in', 'draught-diff', 'heading-diff', 'port-name']]

Unnamed: 0,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,datetime-entry,vessel_type,length,width,draught-in,heading-in,draught-diff,heading-diff,port-name
24,271044633,22.245,2019-03-22 09:29:34,6.2,305.0,2019-03-21 11:14:52,Cargo,151.0,24.0,6.7,86.0,-0.5,219.0,AL LADHIQIYAH
36,257972000,35.153056,2019-04-14 20:20:55,6.3,333.0,2019-04-13 09:11:44,Cargo,200.0,32.0,6.3,18.0,0.0,315.0,AL LADHIQIYAH
63,355267000,56.431111,2019-05-16 17:18:11,5.3,275.0,2019-05-14 08:52:19,Cargo,108.0,18.0,6.5,84.0,-1.2,191.0,AL LADHIQIYAH
93,620575000,8.099722,2019-06-15 15:16:15,6.5,0.0,2019-06-15 07:10:16,Tug,46.0,11.0,6.5,0.0,0.0,0.0,AL LADHIQIYAH
115,622113176,13.907778,2019-07-12 03:50:59,10.4,47.0,2019-07-11 13:56:31,Cargo,184.0,25.0,10.4,92.0,0.0,-45.0,AL LADHIQIYAH


In [46]:
len(port_calls)

793