# AFV Missing Vessel Identifiers

The following Jupyter notebook visualizes the AIS data between 2009 and 2017. 
- Establish s3 bucket connection to retrieve processed AIS data for 2015, 2016, 2017
- Load parquet files, clean up column names, and prepare data for use in visualizations
- Create visualizations

# Missing metadata from AIS Data

Most of the AIS data contain vessel metadata; however, there are a few records missing vessel metadata between the years of 2015-2017. Roughly 1.7% (N=159,898) of the data points and 9% of the unique vessels between 2015 and 2017 are missing vessel metadata (VesselName, IMO, CallSign, VesselType, Length, Width, Draft, Cargo). The visualizations below are meant to aid the analyst in exploring the data that is missing vessel metadata. 

In [1]:
import boto3
from ipyleaflet import Map, basemaps, basemap_to_tiles
from ipyleaflet import CircleMarker, Heatmap
from ipyleaflet import Marker, MarkerCluster
import ipywidgets as widgets
from ipywidgets import Dropdown, HTML, Layout, VBox
import io
import json
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from random import uniform

# local imports
import helpers


  shapely_geos_version, geos_capi_version_string


In [2]:
%%html
<style>
/* Tab headers inside the Tab widget: do not grow, may shrink, 300px basis. */
.jupyter-widgets.widget-tab > .p-TabBar .p-TabBar-tab {
    flex-grow: 0;
    flex-shrink: 1;
    flex-basis: 300px;
}

/* Light grey, rounded background for widgets tagged via add_class('lbl_bg'). */
.lbl_bg {
    background-color: #F0F0F0;
    border-radius: 4px;
    width: auto;
}
</style>

# Connect to AWS

In [3]:
# Read the AWS key pair from the local credentials file.
with open("../creds/aws_creds.json") as creds_file:
    aws_creds = json.load(creds_file)

# The S3 resource and the low-level S3 client authenticate with the same key pair.
_aws_auth = {
    "aws_access_key_id": aws_creds["access_key_id"],
    "aws_secret_access_key": aws_creds["access_key_secret"],
}
s3_resource = boto3.resource('s3', **_aws_auth)
client = boto3.client('s3', **_aws_auth)


# Process AIS data for all years to produce data for visualizations

In [4]:
# Flip to True to (re)process the yearly AIS data for use in the visualizations;
# this takes a while to run.
process = False
if process:
    helpers.concat_ais_data(s3_resource, client)

# Bucket holding the processed AIS data used for the visualizations.
bucket = 'afv-scenario'


def _fetch_csv(key):
    """Download ``key`` from ``bucket`` and return ``(s3_response, DataFrame)``."""
    s3_response = client.get_object(Bucket=bucket, Key=key)
    frame = pd.read_csv(io.BytesIO(s3_response['Body'].read()))
    return s3_response, frame


long_missing_obj, long_missing_df = _fetch_csv('analytics-products/ais_missing_long.csv')
mmsi_count_obj, mmsi_count_df = _fetch_csv('analytics-products/ais_mmsi_count.csv')
missing_obj, missing_df = _fetch_csv('analytics-products/ais_missing.csv')
hour_obj, hour_df = _fetch_csv('analytics-products/ais_hour.csv')
    
    

# Visualizations of AIS data for a specific year

In [5]:

# Year selector shared by every tab; the figures below are first drawn for its
# initial value and later refreshed by the observer callback.
year = widgets.Dropdown(
    description='Select a year:',
    options=list(range(2009, 2018)),
    value=2009,
    disabled=False,
)

# Annotation text shown when the selected year has nothing to plot.
_no_missing_msg = "There are no data points missing vessel metadata."

# --- g1 / g2: histograms of per-MMSI counts, split on the Missing flag ---
_count_in_year = mmsi_count_df.Year == year.value
_complete_counts = mmsi_count_df.Count[(mmsi_count_df.Missing == 0) & _count_in_year]
_incomplete_counts = mmsi_count_df.Count[(mmsi_count_df.Missing == 1) & _count_in_year]

g1 = go.FigureWidget()
g1.add_histogram()
g1.data[0].x = _complete_counts
g1.layout.title = 'Log-transformed histogram of complete vessel metadata counts by MMSI'
g1.layout.xaxis.title = 'Log-transformed Count'
g1.layout.yaxis.title = 'Bin Total'

g2 = go.FigureWidget()
g2.add_histogram()
g2.data[0].x = _incomplete_counts
g2.layout.title = 'Log-transformed histogram of incomplete vessel metadata counts by MMSI'
g2.layout.xaxis.title = 'Log-transformed Count'
g2.layout.yaxis.title = 'Bin Total'

# The annotation is always created (empty text when there is data) so the
# callback can later update annotations[0] in place.
_g2_is_empty = len(_incomplete_counts) == 0
g2.add_annotation(
    x=1,
    y=2,
    text=_no_missing_msg if _g2_is_empty else "",
    showarrow=False,
    yshift=10,
)
if _g2_is_empty:
    g2.data[0].x = []

# --- g3: bar chart of the Missing column for the selected year ---
_missing_in_year = missing_df.Year == year.value

g3 = go.FigureWidget()
g3.add_bar()
g3.data[0].y = missing_df.Missing[_missing_in_year]
g3.layout.title = 'Frequency of Missing Vessel Metadata Fields by Date'
g3.layout.xaxis.title = 'Date'
g3.layout.yaxis.title = 'Count'
g3.add_class('lbl_bg')

# Same always-present annotation pattern as g2; "empty" here means no row of
# the year has a positive Missing value.
_g3_is_empty = len(missing_df.Missing[_missing_in_year & (missing_df.Missing > 0)]) == 0
g3.add_annotation(
    x=1,
    y=2,
    text=_no_missing_msg if _g3_is_empty else "",
    showarrow=False,
    yshift=10,
)
if _g3_is_empty:
    g3.data[0].y = []

# --- g4: hourly counts plus a reference line two SD below the mean ---
_hour_in_year = hour_df.Year == year.value
_hour_times = hour_df.Time[_hour_in_year]
_hour_counts = hour_df.Count[_hour_in_year]
_two_sd_below_mean = _hour_counts.mean() - 2*_hour_counts.std()

g4 = go.FigureWidget()
g4.add_scatter()
g4.data[0].x = _hour_times
g4.data[0].y = _hour_counts
g4.layout.title = 'Amount of reported vessel data every 60 minutes (green line is 2 SD below mean)'
g4.layout.xaxis.title = 'Time'
g4.layout.yaxis.title = 'Count'

# Horizontal line spanning the year's time range at mean - 2*SD.
g4.add_shape(
    type="line",
    x0=_hour_times.min(),
    y0=_two_sd_below_mean,
    x1=_hour_times.max(),
    y1=_two_sd_below_mean,
    line=dict(
        color="LightSeaGreen",
        width=4,
        dash="dashdot",
    ),
)


def validate():
    """Return True when the year dropdown holds a year from 2009 through 2017."""
    return year.value in range(2009, 2018)


def response(change):
    """Redraw g1-g4 for the year currently selected in the ``year`` dropdown.

    ``change`` is the observer payload; it is not inspected, the current
    ``year.value`` is read instead. Nothing happens when ``validate()`` fails.
    """
    if not validate():
        return

    selected_year = year.value
    empty_notice = "There are no data points missing vessel metadata."

    # g1: counts for MMSIs flagged Missing == 0.
    with g1.batch_update():
        complete_mask = (mmsi_count_df.Missing == 0) & (mmsi_count_df.Year == selected_year)
        g1.data[0].x = mmsi_count_df.Count[complete_mask]

    # g2: counts for MMSIs flagged Missing == 1, or a notice when there are none.
    with g2.batch_update():
        incomplete_mask = (mmsi_count_df.Missing == 1) & (mmsi_count_df.Year == selected_year)
        incomplete_counts = mmsi_count_df.Count[incomplete_mask]
        g2_has_rows = len(incomplete_counts) != 0
        g2.layout.annotations[0].text = "" if g2_has_rows else empty_notice
        g2.data[0].x = incomplete_counts if g2_has_rows else []

    # g3: Missing values for the year, or a notice when none is positive.
    with g3.batch_update():
        year_rows = missing_df.Year == selected_year
        positive_rows = missing_df.Missing[year_rows & (missing_df.Missing > 0)]
        g3_has_rows = len(positive_rows) != 0
        g3.layout.annotations[0].text = "" if g3_has_rows else empty_notice
        g3.data[0].y = missing_df.Missing[year_rows] if g3_has_rows else []

    # g4: hourly series and the mean - 2*SD reference line.
    with g4.batch_update():
        hour_rows = hour_df.Year == selected_year
        hour_times = hour_df.Time[hour_rows]
        hour_counts = hour_df.Count[hour_rows]
        two_sd_below_mean = hour_counts.mean() - 2*hour_counts.std()

        g4.data[0].x = hour_times
        g4.data[0].y = hour_counts
        g4.layout.shapes[0].x0 = hour_times.min()
        g4.layout.shapes[0].x1 = hour_times.max()
        g4.layout.shapes[0].y0 = two_sd_below_mean
        g4.layout.shapes[0].y1 = two_sd_below_mean

            
# Refresh the figures whenever a different year is picked.
year.observe(response, names="value")

# Tab Specs: one title and one container per tab, in display order.
titles = ['Histogram of AIS vessel data',
          'Incomplete AIS data by date',
          'AIS data by hour']
# Every tab shows the year dropdown above its figure(s).
children = [
    widgets.VBox([year, *tab_figures]).add_class('lbl_bg')
    for tab_figures in ([g1, g2], [g3], [g4])
]

# Initialize visualization
viz = widgets.Tab()

# Set Tab Specs
viz.children = children
for tab_index, tab_title in enumerate(titles):
    viz.set_title(tab_index, tab_title)

viz

Tab(children=(VBox(children=(Dropdown(description='Select a year:', options=(2009, 2010, 2011, 2012, 2013, 201…

The visualization in the third tab above (AIS data by hour) is meant to help users identify potential data collection issues or gaps related to a lack of satellite coverage or operational issues regarding one or more satellites. The green line, which is two standard deviations below the mean for the series of data for that year, is meant to provide a quick guide for the user so that she/he can focus on the gaps in data collection.

# When and where do vessels report with missing metadata?

In [6]:

def create_circle_marker(row):
    """Build a small black ``CircleMarker`` for one record of the vessel table.

    ``row`` must expose ``xcoord``, ``ycoord``, ``Time`` and ``Vessel_MMSI``
    attributes. The marker's popup shows the time and the vessel MMSI.
    """
    # NOTE(review): ipyleaflet locations are (latitude, longitude) - confirm
    # that xcoord/ycoord are stored in that order.
    circle_marker = CircleMarker(
        location=(row.xcoord, row.ycoord),
        radius=5,
        weight=1,
        color="black",
        fill_color="black",
    )

    # Popup content: time and MMSI, each rendered with str().
    message = HTML(
        value=f"<b>Time: </b>{row.Time!s}<br><b>Vessel MMSI: </b>{row.Vessel_MMSI!s}"
    )
    circle_marker.popup = message

    return circle_marker


def handle_vessel_select(change):
    """Redraw map ``m`` with clustered markers for the selected vessel.

    ``change`` is the observer payload; it is not inspected, the current
    ``vessel_selector.value`` is read instead.
    """
    # Dropdown options are strings, while the Vessel_MMSI column is compared as int.
    selected_mmsi = int(vessel_selector.value)
    vessel_rows = long_missing_df[long_missing_df['Vessel_MMSI'] == selected_mmsi]

    # Start from a clean map holding only the basemap.
    m.clear_layers()
    m.add_layer(basemap_layer)

    # One marker per record of the vessel, grouped into a single cluster layer.
    vessel_markers = [create_circle_marker(record) for _, record in vessel_rows.iterrows()]
    m.add_layer(MarkerCluster(markers=vessel_markers))

    # Re-center on the median position of the vessel's records.
    m.center = (vessel_rows.xcoord.median(), vessel_rows.ycoord.median())
    m.zoom = 2.5
    

# Dropdown entries: every distinct MMSI in the missing-metadata table, as sorted strings.
vessels = sorted(str(mmsi) for mmsi in long_missing_df['Vessel_MMSI'].unique())

# Starting map: centered on the median position of all records, basemap only.
basemap_layer = basemap_to_tiles(basemaps.OpenStreetMap.Mapnik)
m = Map(
    center=(long_missing_df.xcoord.median(), long_missing_df.ycoord.median()),
    zoom=2,
)
m.clear_layers()
m.add_layer(basemap_layer)

# Widgets displayed above the map.
header = HTML("<h2> <center> Map of Vessels with Missing Metadata</h2>", layout=Layout(height='auto'))
vessel_selector_label = HTML(value="<b> Select the Vessel MMSI: </b>")
vessel_selector = Dropdown(options=vessels, layout=Layout(width='auto'))

# Redraw the markers whenever a different MMSI is chosen.
vessel_selector.observe(handle_vessel_select, names='value')

widgets.VBox([header, vessel_selector_label, vessel_selector, m]).add_class('lbl_bg')


VBox(children=(HTML(value='<h2> <center> Map of Vessels with Missing Metadata</h2>', layout=Layout(height='aut…

In [7]:
# Heatmap

# Year selector for the heatmap.
heatmap_year = widgets.Dropdown(
    options=[2015, 2016, 2017],
    value=2015,
    description='Select a year:',
    disabled=False,
)


def _heatmap_rows(selected_year):
    """Return the rows of ``long_missing_df`` whose Year equals ``selected_year``."""
    return long_missing_df[long_missing_df.Year == selected_year]


def _heatmap_points(selected_year):
    """Return ``[xcoord, ycoord, intensity]`` triples for ``selected_year``.

    The two coordinate columns are zipped directly instead of going through
    ``DataFrame.iterrows()``, which builds a full Series per row only to read
    two values from it.

    NOTE(review): the intensity is drawn with ``uniform(0, 10)``, so the
    weights are random and change on every render - confirm this is intended.
    NOTE(review): ipyleaflet heatmap locations are [latitude, longitude,
    intensity] - confirm xcoord/ycoord are stored in that order.
    """
    rows = _heatmap_rows(selected_year)
    return [[xc, yc, uniform(0, 10)] for xc, yc in zip(rows.xcoord, rows.ycoord)]


def _heatmap_center(selected_year):
    """Return the median ``(xcoord, ycoord)`` of the rows for ``selected_year``."""
    rows = _heatmap_rows(selected_year)
    return (rows.xcoord.median(), rows.ycoord.median())


def heatmap_response(change):
    """Rebuild the heatmap for the year currently selected in ``heatmap_year``.

    ``change`` is the observer payload; it is not inspected, the current
    ``heatmap_year.value`` is read instead.
    """
    selected_year = heatmap_year.value

    # Start from a clean map holding only the basemap, then add the new layer.
    m_heatmap.clear_layers()
    m_heatmap.add_layer(basemap_layer)
    m_heatmap.add_layer(
        Heatmap(locations=_heatmap_points(selected_year), radius=20)
    )

    m_heatmap.center = _heatmap_center(selected_year)
    m_heatmap.zoom = 5


# Initial render for the dropdown's starting year, built with the same helpers
# the callback uses so the two code paths cannot drift apart.
m_heatmap = Map(center=_heatmap_center(heatmap_year.value), zoom=5)
m_heatmap.clear_layers()
m_heatmap.add_layer(basemap_layer)

heatmap_vessel_markers = _heatmap_points(heatmap_year.value)
heatmap = Heatmap(
    locations=heatmap_vessel_markers,
    radius=20
)
m_heatmap.add_layer(heatmap)

heatmap_year.observe(heatmap_response, names="value")
header_heatmap = HTML("<h2> <center> Heatmap of Vessels with Missing Metadata</h2>", layout=Layout(height='auto'))
widgets.VBox([header_heatmap, heatmap_year, m_heatmap]).add_class('lbl_bg')


VBox(children=(HTML(value='<h2> <center> Heatmap of Vessels with Missing Metadata</h2>', layout=Layout(height=…