In [23]:
# If running the notebook locally from the CSV file, Spark need not be installed
try:
    import pyspark.sql.functions as F
except ImportError:
    # Spark is optional: only a missing package is tolerated here, any other
    # failure during import is allowed to surface instead of being hidden
    pass

import math
import subprocess

import numpy as np
import pandas as pd
from geopy import distance

from datetime import datetime

import plotly.express as px

import hdbscan

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated
from IPython.display import display, HTML
# Widen the notebook container to 80% of the browser window
display(HTML("<style>.container { width:80% !important; }</style>"))

### Process Spark dataframe
Run this only if on a cloud with Spark, with a processed AIS table. Otherwise, skip down two sections to the section `Load in date of interest extracted from Spark`

In [2]:
def extract_datetime(df, datetime_col='basedatetime'):
    """Add date and time part columns, sliced from a datetime string column.

    Each part is taken by fixed character position, so the column must hold
    text laid out as yyyy-MM-dd HH:mm:ss. The separator characters themselves
    are skipped, so e.g. a 'T' between date and time works as well.

    Input:
        df: Spark dataframe containing the datetime string column
        datetime_col: name of that column (default 'basedatetime')

    Output: Input dataframe with year, month, day, hour, min and sec columns
        added. The new columns hold substrings of the datetime text.
    """
    # (new column name, 1-based start position, length) within the datetime text
    parts = (
        ('year',  1,  4),
        ('month', 6,  2),
        ('day',   9,  2),
        ('hour',  12, 2),
        ('min',   15, 2),
        ('sec',   18, 2),
    )
    for name, start, length in parts:
        df = df.withColumn(name, df[datetime_col].substr(start, length))

    return df

# Load in the AIS databases as a Spark dataframe
ais = spark.table('cmorris.af_vault_ais')

# Simplify the column names using lower. This happens before the datetime
# extraction so that the lowercase 'basedatetime' lookup in extract_datetime
# matches the column name exactly, instead of depending on Spark resolving
# column names case-insensitively if the table uses mixed-case names.
for name in ais.columns:
    ais = ais.withColumnRenamed(name, name.lower())

# Extract the datetime
ais = extract_datetime(ais)

# Keep the dataframe in memory for fast manipulation
ais.cache()

DataFrame[mmsi: string, basedatetime: string, lat: string, lon: string, sog: string, cog: string, heading: string, vesselname: string, imo: string, callsign: string, vesseltype: string, status: string, length: string, width: string, draft: string, cargo: string, filename: string, year: string, month: string, day: string, hour: string, min: string, sec: string]

### Extract date of interest to pandas for local processing

In [3]:
date_of_interest = '2015-01-01'

# Split the date into the string parts compared against the year/month/day columns
year, month, day = date_of_interest.split('-')

# Select that single day in Spark, then pull the result into a local pandas dataframe
on_date = (ais['year'] == year) & (ais['month'] == month) & (ais['day'] == day)
sdf = ais.filter(on_date)
df = sdf.toPandas()

Save to CSV for later use in demo.

In [44]:
# Write the extracted day to disk; index=False leaves the pandas row index out of the file
df.to_csv(f'../data/ais_{date_of_interest}.csv', index=False)


In [69]:
# Save out a compressed version of this to check into the repo to save size
# Pandas is able to directly read compressed CSV files
# check_call returns 0 on success (the same cell output as subprocess.call) but
# raises CalledProcessError when zip exits with an error, so a failed archive
# cannot go unnoticed
subprocess.check_call(['zip', f'../data/ais_{date_of_interest}.csv.zip', f'../data/ais_{date_of_interest}.csv'])

0

### Load in date of interest extracted from Spark
After the initial data extraction, the notebook can be run starting from this step.

In [70]:
date_of_interest = '2015-01-01'

# Read the zipped CSV directly into a pandas dataframe (the path ends in .zip)
df = pd.read_csv(f'../data/ais_{date_of_interest}.csv.zip')

In [50]:
# (rows, columns) of the loaded dataframe
df.shape

(81434, 23)

In [51]:
# Preview the first rows of the loaded dataframe
df.head()

Unnamed: 0,mmsi,basedatetime,lat,lon,sog,cog,heading,vesselname,imo,callsign,...,width,draft,cargo,filename,year,month,day,hour,min,sec
0,366480000,2015-01-01T00:00:27,56.06017,-169.57212,1.9,143.7,157.0,BLUE PACIFIC,IMO6510746,WCX7690,...,9.81,3.31,30.0,/home/cmorris/vault/data/raw/AIS/AIS_ASCII_by_...,2015,1,1,0,0,27
1,366480000,2015-01-01T00:02:18,56.05941,-169.5711,1.7,139.2,157.0,BLUE PACIFIC,IMO6510746,WCX7690,...,9.81,3.31,30.0,/home/cmorris/vault/data/raw/AIS/AIS_ASCII_by_...,2015,1,1,0,2,18
2,366480000,2015-01-01T01:07:18,56.0298,-169.62041,0.4,133.5,165.0,BLUE PACIFIC,IMO6510746,WCX7690,...,9.81,3.31,30.0,/home/cmorris/vault/data/raw/AIS/AIS_ASCII_by_...,2015,1,1,1,7,18
3,366480000,2015-01-01T01:15:04,56.02854,-169.62082,2.3,-174.2,189.0,BLUE PACIFIC,IMO6510746,WCX7690,...,9.81,3.31,30.0,/home/cmorris/vault/data/raw/AIS/AIS_ASCII_by_...,2015,1,1,1,15,4
4,366480000,2015-01-01T01:21:36,56.02772,-169.61993,1.9,154.5,199.0,BLUE PACIFIC,IMO6510746,WCX7690,...,9.81,3.31,30.0,/home/cmorris/vault/data/raw/AIS/AIS_ASCII_by_...,2015,1,1,1,21,36


### Investigate vessel positions

As we investigated the AIS ship tracking data, we became interested in automatically detecting emergent behavior from groups of ships. For instance: Can we automatically detect container ships following a shipping lane? Can we find groups of ships moving together in a convoy, or a fishing fleet working together?

We began investigating traffic from our ETL-ed `AIS` data from January 1, 2015. The ships are all centered off the coast of Alaska, near the Aleutian Islands. First, we wanted an overview of the average ship position each hour, to get a sense of the data:

In [52]:
# Treat unique value combinations of this set of columns as a single entity
vessel = ['imo', 'vesselname', 'mmsi']

# Find average positions each hour per vessel.
# The aggregation is named by string ('mean') rather than passed as np.mean:
# recent pandas versions emit a FutureWarning when NumPy callables are handed
# to groupby.agg, and the string form gives the same result.
dfg = df.groupby(vessel + ['hour']).agg({'lat': 'mean', 'lon': 'mean'})
dfg = dfg.reset_index()

In [53]:
# Preview the hourly per-vessel mean positions
dfg.head()

Unnamed: 0,imo,vesselname,mmsi,hour,lat,lon
0,IMO0598508,BARANOF,366270000,0,53.879893,-166.539235
1,IMO0598508,BARANOF,366270000,1,53.879895,-166.539223
2,IMO0598508,BARANOF,366270000,2,53.925267,-166.514811
3,IMO0598508,BARANOF,366270000,3,54.037642,-166.645957
4,IMO0598508,BARANOF,366270000,4,54.033703,-166.906105


Let's plot to see the distribution of ship positions for each unique ship:

In [54]:
# One marker per vessel-hour, coloured by vessel name; the hover title is the hour
hourly_positions = dfg.sort_values('vesselname')
fig = px.scatter_geo(
    hourly_positions,
    lat='lat',
    lon='lon',
    color='vesselname',
    hover_name='hour',
    height=600,
)

# Title settings passed to the figure layout
title_layout = {
    'text': "Hourly Vessel Positions",
    'x': 0.45,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_layout)
fig.show()

Near the end of the peninsula, seen in our map as the grey trapezoid on the right-hand side, we can see a significant number of ships all following the same path. Investigation of the ships following this path reveals that they are primarily large container and tanker vessels, indicating a shipping lane and a good target for testing our shipping lane identification algorithms.

In the southwestern corner of the map, we see several ships moving along approximately the same course. After direct examination, we identify that the two ships represented by the blue and yellow markers, the `Gulf Valour` and the `Pole`, travel at the same rate alongside each other throughout our sample day. This is the exact type of feature we want for identification of convoys, making this a good test candidate.

Finding groups in large datasets such as this is a great application of clustering analysis. Here, we leveraged `HDBSCAN`, developed by [Campello et al.](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) in 2013. `HDBSCAN` improves upon the widely used density-based clustering algorithm `DBSCAN` by turning it into a hierarchical clustering algorithm, which allows it to discover clusters of varying densities within a dataset. This algorithm has a distinct advantage over more common methods such as k-means clustering, as the analyst does not need to specify the number of clusters beforehand. Additionally, it can find clusters with more complex shapes and varying densities.

### Clustering with HDBSCAN

![kmeans_hdbscan_comparison](img/kmeans_hdbscan_comparison.png)

### Clustering: first, let's try using positions

We then used the latitude and longitude of the ship during each hour to cluster ships together. Because we subset each position by hour, each ship will appear in a cluster each hour that it is present. This means that ships traveling along the same route at different times can end up in the same cluster. When this occurs for a large number of ships across differing hour intervals, we have a strong indication of a shipping lane.

In [55]:
# Cluster the hourly mean positions with HDBSCAN's default settings
position_features = dfg[['lat', 'lon']]
hdb = hdbscan.HDBSCAN()
hdb.fit(position_features)

# Store each row's label as text, e.g. 'cluster 3'
dfg['cluster'] = [f'cluster {label}' for label in hdb.labels_]

In [56]:
# One marker per vessel-hour, coloured by cluster label; hover shows hour and vessel name
clustered_positions = dfg.sort_values('cluster')
fig = px.scatter_geo(
    clustered_positions,
    lat='lat',
    lon='lon',
    color='cluster',
    hover_name='hour',
    hover_data=['vesselname'],
    height=600,
    color_discrete_sequence=px.colors.qualitative.Dark24,
)

# Title settings passed to the figure layout
title_layout = {
    'text': "Hourly Vessel Position Clusters",
    'x': 0.45,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_layout)
fig.show()

### Adding vessel bearings to improve clustering

In [57]:
date_of_interest = '2015-01-01'

# Reload the raw records from the zipped CSV. This is the same file the earlier
# load step reads and the one kept in the repo, so this section also runs when
# only the archive (and not the uncompressed CSV) is present.
df = pd.read_csv(f'../data/ais_{date_of_interest}.csv.zip')

#### Calculate vessel bearings

In [58]:
# Columns whose unique value combinations identify a single vessel
vessel = ['imo', 'vesselname', 'mmsi']

# Order the rows by time within each IMO, then renumber the index to match.
# Sorting by IMO is not strictly needed, but it keeps the index tidy and the
# result easier to inspect
df = df.sort_values(['imo', 'basedatetime']).reset_index(drop=True)

# Pair up lat and lon as tuples, the form the distance function needs
df['latlon'] = list(zip(df['lat'], df['lon']))

# Shift position and timestamp forward by one row within each vessel group,
# which leaves NaN at the first time point of every vessel
df['latlon_previous'] = df.groupby(vessel)['latlon'].shift(1)
df['basedatetime_previous'] = df.groupby(vessel)['basedatetime'].shift(1)

# Drop the rows that have no previous position (i.e. the first timestamp)
has_previous = df['latlon_previous'].notnull()
df = df[has_previous].copy(deep=True)

# Build (previous, current) tuples for position and for timestamp
df['latlon_pair'] = list(zip(df['latlon_previous'], df['latlon']))
df['datetime_pair'] = list(zip(df['basedatetime_previous'], df['basedatetime']))

In [59]:
# Preview the current/previous position and timestamp columns built in the previous cell
df[['mmsi','imo','vesselname','latlon','latlon_previous','basedatetime','basedatetime_previous','latlon_pair','datetime_pair']].head()

Unnamed: 0,mmsi,imo,vesselname,latlon,latlon_previous,basedatetime,basedatetime_previous,latlon_pair,datetime_pair
1,366270000,IMO0598508,BARANOF,"(53.8799, -166.53922)","(53.87989, -166.53923999999998)",2015-01-01T00:01:20,2015-01-01T00:00:12,"((53.87989, -166.53923999999998), (53.8799, -1...","(2015-01-01T00:00:12, 2015-01-01T00:01:20)"
2,366270000,IMO0598508,BARANOF,"(53.87989, -166.53926)","(53.8799, -166.53922)",2015-01-01T00:02:21,2015-01-01T00:01:20,"((53.8799, -166.53922), (53.87989, -166.53926))","(2015-01-01T00:01:20, 2015-01-01T00:02:21)"
3,366270000,IMO0598508,BARANOF,"(53.87989, -166.53922)","(53.87989, -166.53926)",2015-01-01T00:03:30,2015-01-01T00:02:21,"((53.87989, -166.53926), (53.87989, -166.53922))","(2015-01-01T00:02:21, 2015-01-01T00:03:30)"
4,366270000,IMO0598508,BARANOF,"(53.8799, -166.53927)","(53.87989, -166.53922)",2015-01-01T00:04:41,2015-01-01T00:03:30,"((53.87989, -166.53922), (53.8799, -166.53927))","(2015-01-01T00:03:30, 2015-01-01T00:04:41)"
5,366270000,IMO0598508,BARANOF,"(53.87991, -166.53928)","(53.8799, -166.53927)",2015-01-01T00:05:51,2015-01-01T00:04:41,"((53.8799, -166.53927), (53.87991, -166.53928))","(2015-01-01T00:04:41, 2015-01-01T00:05:51)"


In [60]:
# LICENSE: public domain
# https://gist.github.com/jeromer/2005586

def calculate_initial_compass_bearing(pointA, pointB):
    """
    Calculates the initial compass bearing when heading from pointA to pointB.
    The formulae used is the following:
        θ = atan2(sin(Δlong).cos(lat2),
                  cos(lat1).sin(lat2) − sin(lat1).cos(lat2).cos(Δlong))
    :Parameters:
      - `pointA`: The tuple representing the latitude/longitude for the
        first point. Latitude and longitude must be in decimal degrees
      - `pointB`: The tuple representing the latitude/longitude for the
        second point. Latitude and longitude must be in decimal degrees
    :Returns:
      The bearing in degrees, normalized to the range [0, 360)
    :Returns Type:
      float
    :Raises:
      TypeError if either argument is not a tuple. Tuple subclasses such as
      namedtuples are accepted.
    """
    # isinstance (rather than an exact type comparison) lets tuple subclasses through
    if not (isinstance(pointA, tuple) and isinstance(pointB, tuple)):
        raise TypeError("Only tuples are supported as arguments")

    lat1 = math.radians(pointA[0])
    lat2 = math.radians(pointB[0])

    diffLong = math.radians(pointB[1] - pointA[1])

    x = math.sin(diffLong) * math.cos(lat2)
    y = math.cos(lat1) * math.sin(lat2) - (math.sin(lat1)
            * math.cos(lat2) * math.cos(diffLong))

    # math.atan2 returns radians in the range -π to +π
    initial_bearing = math.atan2(x, y)

    # Converted to degrees this is -180° to +180°, which is not what we want
    # for a compass bearing, so shift and wrap it into 0° to 360°
    initial_bearing = math.degrees(initial_bearing)
    compass_bearing = (initial_bearing + 360) % 360

    return compass_bearing

In [61]:
# Bearing from the previous reported position to the current one, per row
df['bearing'] = df['latlon_pair'].map(
    lambda pair: calculate_initial_compass_bearing(pair[0], pair[1])
)

In [62]:
# Preview the computed bearing next to each position report
df[['mmsi','imo','vesselname','basedatetime','lat','lon','bearing']].head()

Unnamed: 0,mmsi,imo,vesselname,basedatetime,lat,lon,bearing
1,366270000,IMO0598508,BARANOF,2015-01-01T00:01:20,53.8799,-166.53922,49.695202
2,366270000,IMO0598508,BARANOF,2015-01-01T00:02:21,53.87989,-166.53926,247.018084
3,366270000,IMO0598508,BARANOF,2015-01-01T00:03:30,53.87989,-166.53922,89.999984
4,366270000,IMO0598508,BARANOF,2015-01-01T00:04:41,53.8799,-166.53927,288.741178
5,366270000,IMO0598508,BARANOF,2015-01-01T00:05:51,53.87991,-166.53928,329.481517


### Aggregate by hour and cluster with lat-lon-bearing

In [63]:
# Average position and bearing for each vessel-hour.
# Bearings are angles, so a plain arithmetic mean is wrong across the 0°/360°
# wrap (350° and 10° would average to 180° instead of 0°). Average the unit
# vectors instead: take the mean sine and mean cosine of the bearings, then
# convert that mean direction back to a compass bearing in degrees.
# Near-opposite bearings within an hour nearly cancel, so the mean direction
# is then poorly defined.
bearing_rad = np.deg2rad(df['bearing'])
hourly = (
    df.assign(bearing_sin=np.sin(bearing_rad), bearing_cos=np.cos(bearing_rad))
      .groupby(vessel + ['hour'])[['lat', 'lon', 'bearing_sin', 'bearing_cos']]
      .mean()
      .reset_index()
)
hourly['bearing'] = np.rad2deg(np.arctan2(hourly['bearing_sin'], hourly['bearing_cos'])) % 360

# Same output columns as before: vessel keys, hour, lat, lon, bearing
dfg = hourly.drop(columns=['bearing_sin', 'bearing_cos'])

In [64]:
# Fit HDBSCAN (default settings) on hourly mean position plus bearing
bearing_features = dfg[['lat', 'lon', 'bearing']]
hdb = hdbscan.HDBSCAN()
hdb.fit(bearing_features)

# Store each row's label as text, e.g. 'cluster 3'
dfg['cluster'] = [f'cluster {label}' for label in hdb.labels_]

# Count the rows in each cluster and attach that count to every row as cluster_size
cluster_counts = dfg['cluster'].value_counts()
cluster_size = cluster_counts.rename_axis('cluster').reset_index(name='cluster_size')
dfg = dfg.merge(cluster_size, on='cluster')

In [65]:
# Plot only the rows whose cluster has more than 5 members
in_sizeable_cluster = dfg['cluster_size'] > 5
fig = px.scatter_geo(
    dfg[in_sizeable_cluster],
    lat='lat',
    lon='lon',
    color='cluster',
    hover_name='hour',
    hover_data=['vesselname'],
    height=600,
    color_discrete_sequence=px.colors.qualitative.Dark24_r,
)

# Title settings passed to the figure layout
title_layout = {
    'text': "Hourly Vessel Position + Bearing Clusters",
    'x': 0.45,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_layout)

fig.show()

In [66]:
# Plot only clusters with more than 10 members, leaving out the 'cluster -1' rows
# (-1 is the label HDBSCAN gives to points it treats as noise)
is_large = dfg['cluster_size'] > 10
is_labelled = dfg['cluster'] != 'cluster -1'
fig = px.scatter_geo(
    dfg[is_large & is_labelled],
    lat='lat',
    lon='lon',
    color='cluster',
    hover_name='hour',
    hover_data=['vesselname'],
    height=600,
    color_discrete_sequence=px.colors.qualitative.Dark24_r,
)

# Title settings passed to the figure layout
title_layout = {
    'text': "Hourly Vessel Position + Bearing Clusters > 10 Members",
    'x': 0.45,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_layout)
fig.show()