Notebook to convert AIS points into trade routes / journeys

In [1]:
import os
from os.path import join
from glob import glob
import pandas as pd
import geopandas as gpd
import folium
from shapely.geometry import Point
import folium.plugins as plugins
import seaborn as sns
from matplotlib import pyplot as plt

import numpy as np
import datetime
from datetime import timedelta

In [2]:
pd.options.display.max_columns = None

In [3]:
ais_dir = join(os.path.expanduser("~"), 'data', 'AIS')
data_dir = join(ais_dir, 'Syria')

In [4]:
data_files = glob(data_dir+"/*.csv")

In [5]:
dfs = [pd.read_csv(f, index_col=0) for f in data_files]

In [6]:
df = pd.concat(dfs)

In [7]:
def substract_seconds(x, y):
    return pd.to_datetime(x, format='%Y-%m-%d %H:%M:%S') - pd.Timedelta(y, unit='s')


def passing_by(df):
    if (df["turn_around_time"] < 3) and (0 < np.abs(df['heading-diff']) < 75):
        return 1
    elif (df["turn_around_time"] < 3) and (np.abs(df['heading-diff']) >= 75):
        return 2
    else:
        return 3

def cumsum_rows(df):
    mask = pd.isna(df).astype(bool)
    # compute cumsum across rows fillna with ffill
    cumulative = df.cumsum(1).fillna(method='ffill', axis=1).fillna(0)
    # get the values of cumulative where nan is True use the same method
    restart = cumulative[mask].fillna(method='ffill', axis=1).fillna(0)
    # set the result
    result = (cumulative - restart)
    result[mask] = np.nan
    return result

def week_format(date0,date1,date_format):
    d1 = datetime.datetime.strptime(str(date0), date_format).date()
    d2 = datetime.datetime.strptime(str(date1), date_format).date()
    d = d1
    step = datetime.timedelta(days=90)

    list_weeks = []
    while d < d2:
        list_weeks.append(d.strftime(date_format))
        d += step
    list_weeks.append(str(date1))
    return list_weeks


def get_list_dates(date0,date1):
    #### get the list of dates to merge later on
    datetime_object1 = datetime.datetime.strptime(date0, '%Y-%m-%d')
    datetime_object2 = datetime.datetime.strptime(date1, '%Y-%m-%d')

    datetime_diff = (datetime_object2-datetime_object1).days

    list_dates = pd.date_range(str(date0), periods=datetime_diff+1, freq='1D')
    list_dates_df = pd.DataFrame({'Date':list_dates.date.astype(str)})
    list_dates_df = list_dates_df.set_index('Date')

    return list_dates_df

In [8]:
df.polygon_name.unique()

array(['AL LADHIQIYAH', 'TARTUS', 'BANIYAS'], dtype=object)

In [128]:
port = "AL LADHIQIYAH"
data = df.loc[df.polygon_name==port].copy()
data_raw = data.copy()
date0 = '2018-12-01'
date1 = '2022-08-31'
country = "Syria"

In [129]:
data = data.loc[~data['nav_status'].isin(['At Anchor'])]

In [130]:
data = data.loc[~data.mmsi.isna()]

In [131]:
data.loc[:, "mmsi"] = data.loc[:, "mmsi"].astype('int')

In [132]:
len(data), len(data_raw)

(57088, 70023)

In [133]:
data['dt_pos_utc'].head()

90000    2020-08-31 23:06:45
90001    2020-08-31 22:11:15
90002    2020-08-31 15:20:19
90003    2020-08-31 18:10:34
90004    2020-08-31 11:38:01
Name: dt_pos_utc, dtype: object

In [134]:
data.mmsi.value_counts().head()

563044300    3466
477552700    3221
468395000    3022
353150000    2753
271044398    2486
Name: mmsi, dtype: int64

In [135]:
top_mmsi = data.mmsi.value_counts().index[0]

In [136]:
data = data.loc[data.mmsi==top_mmsi].copy()

In [137]:
data[['Date','Time']] = data.dt_pos_utc.str.split(' ',expand=True)

In [138]:
data['hour'] = pd.to_datetime(data['Time'], format='%H:%M:%S',errors = 'coerce').dt.hour
data['dtg'] = pd.to_datetime(data['Date'] + ' ' + data['Time'])

In [140]:
data = data.sort_values(by=['dtg'])

In [141]:
# data.head(150)

In [142]:
# data = data.head(150).copy()
data_new_subset = data[['mmsi','vessel_type','vessel_type_code','draught','length','width','longitude','latitude','Date','Time','dtg','hour','nav_status','heading','vessel_type_main','vessel_type_sub']]

In [143]:
data_new_subset.tail(2)

Unnamed: 0,mmsi,vessel_type,vessel_type_code,draught,length,width,longitude,latitude,Date,Time,dtg,hour,nav_status,heading,vessel_type_main,vessel_type_sub
87436,563044300,Cargo,74.0,9.0,210.0,30.0,35.678333,35.535,2022-08-27,12:16:02,2022-08-27 12:16:02,12,Under Way Using Engine,0.0,,
87437,563044300,Cargo,74.0,9.0,210.0,30.0,35.606667,35.535,2022-08-27,12:28:05,2022-08-27 12:28:05,12,Under Way Using Engine,0.0,,


In [144]:
### get per day the first and last record per vessel
first_day = data_new_subset.drop_duplicates(subset = ['mmsi','Date'],keep='first')
last_day = data_new_subset.drop_duplicates(subset = ['mmsi','Date'],keep='last')

In [145]:
date0 = '2019-03-13'
date1 = '2019-04-07'

In [146]:
#### get the list of dates to merge later on
list_dates_df = get_list_dates(date0,date1)

In [148]:
### First day data processing
merged_time_series_first_select_1  = first_day[['mmsi','Date','dtg']].copy()
merged_time_series_first_select_2  = first_day[['mmsi','vessel_type','Date','dtg','length','width','draught','heading','vessel_type_main','vessel_type_sub']].copy()
merged_time_series_first_select_2['Date'] = merged_time_series_first_select_2['Date'].astype(str)
merged_time_series_first_select_2.rename(columns={'Date':'date-entry', 'draught':'draught-in','heading':'heading-in'}, inplace = True)

In [149]:
### Last day data processing
merged_time_series_last_select_1  = last_day[['mmsi','Date','dtg']].copy()
merged_time_series_last_select_2  = last_day[['mmsi','Date','dtg','draught','heading']].copy()

In [155]:
#### add full date list per mmsi
merged_time_series_first_new = merged_time_series_first_select_1.set_index(['mmsi','Date']).unstack(level =0)
merged_time_series_first_new = pd.concat([merged_time_series_first_new, list_dates_df], axis=1)
merged_time_series_first_new.index.name = 'Date'
merged_time_series_first_new = merged_time_series_first_new.stack().unstack(level = 0)

In [156]:
merged_time_series_first_new

Unnamed: 0_level_0,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg,dtg
Date,2019-03-13,2019-03-14,2019-03-15,2019-04-06,2019-04-07,2019-05-11,2019-06-08,2019-06-09,2019-07-07,2019-08-07,2019-09-01,2019-09-02,2019-09-03,2019-09-27,2019-10-24,2019-10-25,2019-11-21,2019-11-22,2019-11-23,2019-12-19,2019-12-20,2019-12-21,2020-01-23,2020-01-24,2020-01-25,2020-02-20,2020-02-21,2020-03-23,2020-03-24,2020-04-18,2020-04-19,2020-05-15,2020-06-12,2020-06-13,2020-07-09,2020-07-10,2020-07-11,2020-08-07,2020-09-03,2020-09-04,2020-10-03,2020-10-04,2020-10-30,2020-10-31,2020-11-27,2020-11-28,2020-12-25,2020-12-26,2021-03-01,2021-03-02,2021-03-28,2021-03-29,2021-05-22,2021-05-23,2021-06-18,2021-06-19,2021-07-17,2021-07-18,2021-08-13,2021-09-10,2021-09-11,2021-10-08,2021-10-09,2021-11-05,2021-11-06,2021-12-04,2021-12-05,2021-12-31,2022-02-06,2022-02-08,2022-02-09,2022-03-05,2022-03-06,2022-03-07,2022-04-08,2022-04-09,2022-04-10,2022-05-05,2022-05-06,2022-06-04,2022-06-05,2022-08-26,2022-08-27
mmsi,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2,Unnamed: 55_level_2,Unnamed: 56_level_2,Unnamed: 57_level_2,Unnamed: 58_level_2,Unnamed: 59_level_2,Unnamed: 60_level_2,Unnamed: 61_level_2,Unnamed: 62_level_2,Unnamed: 63_level_2,Unnamed: 64_level_2,Unnamed: 65_level_2,Unnamed: 66_level_2,Unnamed: 67_level_2,Unnamed: 68_level_2,Unnamed: 69_level_2,Unnamed: 70_level_2,Unnamed: 71_level_2,Unnamed: 72_level_2,Unnamed: 73_level_2,Unnamed: 74_level_2,Unnamed: 75_level_2,Unnamed: 76_level_2,Unnamed: 77_level_2,Unnamed: 78_level_2,Unnamed: 79_level_2,Unnamed: 80_level_2,Unnamed: 81_level_2,Unnamed: 82_level_2,Unnamed: 83_level_2
563044300,2019-03-13 11:54:57,2019-03-14 00:34:21,2019-03-15 00:46:56,2019-04-06 22:14:40,2019-04-07 00:01:09,2019-05-11 07:03:54,2019-06-08 13:59:00,2019-06-09 00:00:02,2019-07-07 02:09:27,2019-08-07 00:52:03,2019-09-01 22:11:35,2019-09-02 07:55:17,2019-09-03 00:00:40,2019-09-27 01:34:26,2019-10-24 21:47:54,2019-10-25 00:00:16,2019-11-21 21:44:33,2019-11-22 00:08:55,2019-11-23 00:18:38,2019-12-19 21:38:41,2019-12-20 00:03:37,2019-12-21 00:15:08,2020-01-23 21:29:44,2020-01-24 00:19:08,2020-01-25 00:36:47,2020-02-20 07:21:06,2020-02-21 00:11:42,2020-03-23 10:14:08,2020-03-24 00:02:41,2020-04-18 02:58:35,2020-04-19 00:02:02,2020-05-15 01:47:16,2020-06-12 03:21:06,2020-06-13 00:14:52,2020-07-09 20:41:35,2020-07-10 06:05:25,2020-07-11 00:21:27,2020-08-07 00:46:01,2020-09-03 11:08:37,2020-09-04 00:11:37,2020-10-03 09:08:10,2020-10-04 00:10:26,2020-10-30 09:04:43,2020-10-31 00:07:08,2020-11-27 11:02:45,2020-11-28 00:25:52,2020-12-25 21:43:31,2020-12-26 07:08:59,2021-03-01 05:12:38,2021-03-02 00:58:14,2021-03-28 00:05:58,2021-03-29 00:00:50,2021-05-22 11:29:55,2021-05-23 01:23:05,2021-06-18 01:03:11,2021-06-19 00:11:13,2021-07-17 16:27:42,2021-07-18 00:06:29,2021-08-13 00:51:25,2021-09-10 18:09:23,2021-09-11 00:25:17,2021-10-08 00:25:27,2021-10-09 00:20:24,2021-11-05 01:24:41,2021-11-06 00:03:14,2021-12-04 16:07:25,2021-12-05 01:11:53,2021-12-31 04:29:07,2022-02-06 03:24:50,2022-02-08 09:56:59,2022-02-09 00:38:15,2022-03-05 11:11:03,2022-03-06 00:07:10,2022-03-07 00:01:49,2022-04-08 16:26:40,2022-04-09 00:07:03,2022-04-10 00:14:24,2022-05-05 23:31:57,2022-05-06 00:02:01,2022-06-04 12:30:42,2022-06-05 00:23:32,2022-08-26 17:16:13,2022-08-27 00:20:16


Example of first time per day for one vessel

In [113]:
#### add full date list per mmsi
merged_time_series_last_new = merged_time_series_last_select_1.set_index(['mmsi','Date']).unstack(level = 0)
merged_time_series_last_new = pd.concat([merged_time_series_last_new,list_dates_df], axis=1)
merged_time_series_last_new.index.name = 'Date'
merged_time_series_last_new = merged_time_series_last_new.stack().unstack(level = 0)

Example of last time per day for one vessel

In [114]:
merged_time_series_last_new

Unnamed: 0_level_0,dtg,dtg,dtg,dtg,dtg
Date,2019-03-13,2019-03-14,2019-03-15,2019-04-06,2019-04-07
mmsi,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
563044300,2019-03-13 23:55:14,2019-03-14 23:43:46,2019-03-15 09:45:17,2019-04-06 23:48:59,2019-04-07 04:55:55


In [115]:
### convert to numeric value to do substraction and then convert back to number of hours
t2 = merged_time_series_first_new.astype('datetime64').astype(int).astype(float)
t1 = merged_time_series_last_new.astype('datetime64').astype(int).astype(float)
time_diff = (t1['dtg']-t2['dtg'])/(3600*1000*1000*1000)
time_diff = time_diff.replace(0,np.nan)

In [116]:
time_diff

Date,2019-03-13,2019-03-14,2019-03-15,2019-04-06,2019-04-07
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
563044300,12.004722,23.156944,8.9725,1.571944,4.912778


In [117]:
time_diff_new = cumsum_rows(time_diff)

In [118]:
time_diff_new

Date,2019-03-13,2019-03-14,2019-03-15,2019-04-06,2019-04-07
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
563044300,12.004722,35.161667,44.134167,45.706111,50.618889


In [80]:
### derive the port calls and time between coming in and out
time_diff_new = cumsum_rows(time_diff)
time_diff_new.replace(np.nan,0, inplace = True)
# time_diff_new = time_diff_new.diff(axis = 1)
# time_diff_new = time_diff_new[time_diff_new < 0]
# cols = time_diff_new.columns[:-1]
# time_diff_new.drop(time_diff_new.columns[0],axis=1,inplace=True)
# time_diff_new.columns = cols
# time_diff_new = time_diff_new * -1
# time_diff_new = time_diff_new.mask(time_diff_new < 3)

In [84]:
time_diff_new.diff(axis = 1)

Date,2019-03-13,2019-03-14,2019-03-15
mmsi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
563044300,,23.156944,8.9725


In [78]:
### get the turnaround time
turn_around_time = time_diff_new.unstack().dropna().reset_index(level = ['mmsi','Date'])
turn_around_time = pd.merge(turn_around_time, merged_time_series_last_select_2, on=['Date','mmsi'])
turn_around_time.rename(columns={"draught": "draught-out","dtg": "datetime-leave", 0: "turn_around_time", 'Date': 'date-leave','heading':'heading-out'}, inplace = True)

In [79]:
turn_around_time.loc[turn_around_time.mmsi==top_mmsi].head()

Unnamed: 0,turn_around_time,mmsi,date-leave,datetime-leave,draught-out,heading-out


In [70]:
turn_around_time['seconds'] = turn_around_time['turn_around_time']*3600
turn_around_time['datetime-entry'] = turn_around_time.apply(lambda row: substract_seconds(row['datetime-leave'], row['seconds']), axis=1, result_type='reduce')
turn_around_time['datetime-entry']  = turn_around_time['datetime-entry'].dt.round('1s')
turn_around_time['date-entry'] = turn_around_time['datetime-entry'].dt.date
turn_around_time['date-entry'] = turn_around_time['date-entry'].astype(str)

In [71]:
turn_around_time.loc[turn_around_time.mmsi==top_mmsi].head()

Unnamed: 0,date-leave,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,seconds,datetime-entry,date-entry
24,2019-03-15,563044300,44.134167,2019-03-15 09:45:17,10.2,0.0,158883.0,2019-03-13 13:37:14,2019-03-13
36,2019-04-07,563044300,13.904444,2019-04-07 12:21:06,10.1,0.0,50056.0,2019-04-06 22:26:50,2019-04-06
63,2019-05-11,563044300,15.989722,2019-05-11 23:03:17,10.2,274.0,57563.0,2019-05-11 07:03:54,2019-05-11
93,2019-06-09,563044300,23.875278,2019-06-09 13:59:56,10.2,0.0,85951.0,2019-06-08 14:07:25,2019-06-08
115,2019-07-07,563044300,18.264167,2019-07-07 20:25:18,9.9,0.0,65751.0,2019-07-07 02:09:27,2019-07-07


In [39]:
### get the final port calls
port_calls = pd.merge(turn_around_time, merged_time_series_first_select_2, on=['date-entry','mmsi'])
port_calls['draught-diff'] = port_calls['draught-out'] - port_calls['draught-in']
port_calls['heading-diff'] = port_calls['heading-out'] - port_calls['heading-in']

In [40]:
### check if vessels are passing by
port_calls['passing'] = port_calls.apply(passing_by, axis = 1)
port_calls = port_calls[port_calls['passing']!= 1]

In [41]:
### add information
port_calls['port-name'] = str(port)
port_calls['country'] = str(country)

In [42]:
port_calls.columns

Index(['date-leave', 'mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'seconds', 'datetime-entry', 'date-entry',
       'vessel_type', 'dtg', 'length', 'width', 'draught-in', 'heading-in',
       'vessel_type_main', 'vessel_type_sub', 'draught-diff', 'heading-diff',
       'passing', 'port-name', 'country'],
      dtype='object')

In [43]:
len(port_calls)

793

In [44]:
port_calls.loc[turn_around_time.mmsi==top_mmsi].head(5)[['mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'datetime-entry','vessel_type', 'length', 'width', 'draught-in', 'heading-in', 'draught-diff', 'heading-diff', 'port-name']]

Unnamed: 0,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,datetime-entry,vessel_type,length,width,draught-in,heading-in,draught-diff,heading-diff,port-name
24,271044633,22.245,2019-03-22 09:29:34,6.2,305.0,2019-03-21 11:14:52,Cargo,151.0,24.0,6.7,86.0,-0.5,219.0,AL LADHIQIYAH
36,257972000,35.153056,2019-04-14 20:20:55,6.3,333.0,2019-04-13 09:11:44,Cargo,200.0,32.0,6.3,18.0,0.0,315.0,AL LADHIQIYAH
63,355267000,56.431111,2019-05-16 17:18:11,5.3,275.0,2019-05-14 08:52:19,Cargo,108.0,18.0,6.5,84.0,-1.2,191.0,AL LADHIQIYAH
93,620575000,8.099722,2019-06-15 15:16:15,6.5,0.0,2019-06-15 07:10:16,Tug,46.0,11.0,6.5,0.0,0.0,0.0,AL LADHIQIYAH
115,622113176,13.907778,2019-07-12 03:50:59,10.4,47.0,2019-07-11 13:56:31,Cargo,184.0,25.0,10.4,92.0,0.0,-45.0,AL LADHIQIYAH


In [45]:
port_calls.loc[turn_around_time.mmsi==top_mmsi].head(5)[['mmsi', 'turn_around_time', 'datetime-leave',
       'draught-out', 'heading-out', 'datetime-entry','vessel_type', 'length', 'width', 'draught-in', 'heading-in', 'draught-diff', 'heading-diff', 'port-name']]

Unnamed: 0,mmsi,turn_around_time,datetime-leave,draught-out,heading-out,datetime-entry,vessel_type,length,width,draught-in,heading-in,draught-diff,heading-diff,port-name
24,271044633,22.245,2019-03-22 09:29:34,6.2,305.0,2019-03-21 11:14:52,Cargo,151.0,24.0,6.7,86.0,-0.5,219.0,AL LADHIQIYAH
36,257972000,35.153056,2019-04-14 20:20:55,6.3,333.0,2019-04-13 09:11:44,Cargo,200.0,32.0,6.3,18.0,0.0,315.0,AL LADHIQIYAH
63,355267000,56.431111,2019-05-16 17:18:11,5.3,275.0,2019-05-14 08:52:19,Cargo,108.0,18.0,6.5,84.0,-1.2,191.0,AL LADHIQIYAH
93,620575000,8.099722,2019-06-15 15:16:15,6.5,0.0,2019-06-15 07:10:16,Tug,46.0,11.0,6.5,0.0,0.0,0.0,AL LADHIQIYAH
115,622113176,13.907778,2019-07-12 03:50:59,10.4,47.0,2019-07-11 13:56:31,Cargo,184.0,25.0,10.4,92.0,0.0,-45.0,AL LADHIQIYAH


In [46]:
len(port_calls)

793