# Packages

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
from geopandas.tools import sjoin
from shapely.geometry import Point, MultiPolygon, Polygon, MultiPoint, LineString
from shapely import wkt
import matplotlib 
import mapclassify
import time
import folium
import contextily as cx
import json
from copy import copy
import branca.colormap as cm
import movingpandas as mpd

# Import GFW data

In [162]:
%%time
boats = pd.read_csv('../data/raw/GFW/unknown.csv') 
gdf = gpd.GeoDataFrame(boats, geometry=gpd.GeoSeries.from_xy(boats['lon'], boats['lat']), crs=4326)


CPU times: user 5.26 s, sys: 1.3 s, total: 6.56 s
Wall time: 10 s


In [164]:
gdf['boat_id'] = gdf.mmsi.astype('category').cat.codes
gdf['time'] = pd.to_datetime(gdf['timestamp'], unit='s')

In [169]:
# filter gdf by boat_id to exclude any boats for which "is_fishing" is always -1
# group the data by boat_id
grouped = gdf.groupby('boat_id')

# get the minimum value of is_fishing for each group
min_fishing = grouped['is_fishing'].max()

# exclude the boat_id elements for which the minimum value is -1
excluded_ids = min_fishing[min_fishing == -1].index
learn_gdf = gdf[~gdf['boat_id'].isin(excluded_ids)]


# Smaller df for demo

In [228]:
gdf_small = learn_gdf.loc[(learn_gdf['time'] >  ) & (learn_gdf['time'] < '2016-01-03')]
gdf_small[gdf_small['is_fishing'] != 1]

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,geometry,boat_id,time
4359572,2.579933e+12,1.448958e+09,1657944.0,2.170851e+06,0.9,216.300003,75.576134,39.625248,1.0,gfw,POINT (39.62525 75.57613),2,2015-12-01 08:16:16
4359573,2.579933e+12,1.448958e+09,1657944.0,2.170851e+06,0.8,201.500000,75.575974,39.625023,1.0,gfw,POINT (39.62502 75.57597),2,2015-12-01 08:16:47
4359574,2.579933e+12,1.448958e+09,1657944.0,2.170851e+06,0.9,200.500000,75.575935,39.624966,1.0,gfw,POINT (39.62497 75.57594),2,2015-12-01 08:16:56
4359575,2.579933e+12,1.448958e+09,1657944.0,2.170851e+06,1.3,181.800003,75.575867,39.624966,1.0,gfw,POINT (39.62497 75.57587),2,2015-12-01 08:17:07
4359576,2.579933e+12,1.448958e+09,1657944.0,2.170851e+06,1.3,185.000000,75.575821,39.624954,1.0,gfw,POINT (39.62495 75.57582),2,2015-12-01 08:17:16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6806717,2.698965e+14,1.451399e+09,0.0,5.507133e+04,0.0,0.000000,43.310925,13.734748,1.0,gfw,POINT (13.73475 43.31092),115,2015-12-29 14:29:51
6806718,2.698965e+14,1.451400e+09,0.0,5.507133e+04,0.0,0.000000,43.310932,13.734747,1.0,gfw,POINT (13.73475 43.31093),115,2015-12-29 14:46:08
6806719,2.698965e+14,1.451401e+09,0.0,5.507133e+04,0.0,0.000000,43.310944,13.734741,1.0,gfw,POINT (13.73474 43.31094),115,2015-12-29 15:02:30
6806720,2.698965e+14,1.451402e+09,0.0,5.507133e+04,0.0,0.000000,43.310955,13.734722,1.0,gfw,POINT (13.73472 43.31096),115,2015-12-29 15:18:29


In [229]:
# Pretend we only have location and fishing status
gdf_small = gdf_small[['time', 'mmsi', 'boat_id', 'geometry', 'is_fishing', 'source']]

# Construct trajectories and extract features from trajectories

In [231]:
traj_collection = mpd.TrajectoryCollection(gdf_small, 'boat_id', t='time')

In [232]:
traj_collection.add_acceleration(overwrite = True)
traj_collection.add_angular_difference(overwrite = True)
traj_collection.add_direction(overwrite = True)
traj_collection.add_speed(overwrite = True)

In [234]:
traj_gdf = traj_collection.to_point_gdf()

Unnamed: 0_level_0,mmsi,boat_id,geometry,is_fishing,source,acceleration,angular_difference,direction,speed
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-10-30 00:07:06,1.272260e+12,0,POINT (-4.01294 5.28467),-1.0,false_positives,0.000000e+00,0.000000,99.362669,0.000767
2015-10-30 01:25:07,1.272260e+12,0,POINT (-4.01291 5.28466),-1.0,false_positives,0.000000e+00,0.000000,99.362669,0.000767
2015-10-30 01:49:08,1.272260e+12,0,POINT (-4.01292 5.28467),-1.0,false_positives,4.222129e-07,141.001550,318.361119,0.001375
2015-10-30 02:25:10,1.272260e+12,0,POINT (-4.01293 5.28466),-1.0,false_positives,-2.758717e-07,97.296074,221.065045,0.000779
2015-10-30 02:43:09,1.272260e+12,0,POINT (-4.01292 5.28465),-1.0,false_positives,1.074021e-06,85.943015,135.122029,0.001937
...,...,...,...,...,...,...,...,...,...
2016-01-02 15:13:36,2.746770e+14,116,POINT (84.92099 -6.49699),-1.0,false_positives,-3.752785e-05,0.385320,178.871886,2.094468
2016-01-02 15:14:36,2.746770e+14,116,POINT (84.92091 -6.49817),-1.0,false_positives,1.174518e-03,5.199096,184.070981,2.164939
2016-01-02 16:51:37,2.746770e+14,116,POINT (84.84974 -6.55675),-1.0,false_positives,-7.105500e-05,46.282175,230.353156,1.751328
2016-01-02 16:52:15,2.746770e+14,116,POINT (84.84889 -6.55690),-1.0,false_positives,1.976060e-02,29.650207,260.003363,2.502231


# Rolling moments, and exclude missing fishing data

In [236]:
time_window = '5H'

traj_gdf['avg_speed'] = traj_gdf.groupby('boat_id').rolling(time_window).mean('speed')['speed'].tolist()
traj_gdf['avg_acceleration'] = traj_gdf.groupby('boat_id').rolling(time_window).mean('acceleration')['acceleration'].tolist()
traj_gdf['avg_angular_difference'] = traj_gdf.groupby('boat_id').rolling(time_window).mean('angular_difference')['angular_difference'].tolist()
traj_gdf['avg_direction'] = traj_gdf.groupby('boat_id').rolling(time_window).mean('direction')['direction'].tolist()

traj_gdf['median_speed'] = traj_gdf.groupby('boat_id').rolling(time_window).median('speed')['speed'].tolist()
traj_gdf['median_acceleration'] = traj_gdf.groupby('boat_id').rolling(time_window).median('acceleration')['acceleration'].tolist()
traj_gdf['median_angular_difference'] = traj_gdf.groupby('boat_id').rolling(time_window).median('angular_difference')['angular_difference'].tolist()
traj_gdf['median_direction'] = traj_gdf.groupby('boat_id').rolling(time_window).median('direction')['direction'].tolist()

traj_gdf['sd_speed'] = traj_gdf.groupby('boat_id')['speed'].rolling(time_window).std().tolist()
traj_gdf['sd_acceleration'] = traj_gdf.groupby('boat_id')['acceleration'].rolling(time_window).std().tolist()
traj_gdf['sd_angular_difference'] = traj_gdf.groupby('boat_id')['angular_difference'].rolling(time_window).std().tolist()
traj_gdf['sd_direction'] = traj_gdf.groupby('boat_id')['direction'].rolling(time_window).std().tolist()

traj_gdf['max_speed'] = traj_gdf.groupby('boat_id')['speed'].rolling(time_window).max().tolist()
traj_gdf['max_acceleration'] = traj_gdf.groupby('boat_id')['acceleration'].rolling(time_window).max().tolist()
traj_gdf['max_angular_difference'] = traj_gdf.groupby('boat_id')['angular_difference'].rolling(time_window).max().tolist()
traj_gdf['max_direction'] = traj_gdf.groupby('boat_id')['direction'].rolling(time_window).max().tolist()

traj_gdf['min_speed'] = traj_gdf.groupby('boat_id')['speed'].rolling(time_window).min().tolist()
traj_gdf['min_acceleration'] = traj_gdf.groupby('boat_id')['acceleration'].rolling(time_window).min().tolist()
traj_gdf['min_angular_difference'] = traj_gdf.groupby('boat_id')['angular_difference'].rolling(time_window).min().tolist()
traj_gdf['min_direction'] = traj_gdf.groupby('boat_id')['direction'].rolling(time_window).min().tolist()

# See https://stackoverflow.com/questions/37345493/kurtosis-on-groupby-of-pandas-dataframe-doesnt-work for why other moments require further "manual" implementation

In [279]:
lean_traj_gdf = traj_gdf[traj_gdf['is_fishing'] != -1]

# Add local time and spread time variable into month, week, day, hour

In [285]:
from timezonefinder import TimezoneFinder
import pytz 
from datetime import datetime

tf = TimezoneFinder()

lean_traj_gdf['timezone'] = lean_traj_gdf.apply(lambda row: tf.timezone_at(lng=row['geometry'].x, lat=row['geometry'].y), axis=1)


def infer_local_time(gdf, utc_col):
    tz = pytz.timezone('UTC')
    gdf['local_time'] = gdf.apply(lambda row: tz.localize(row[utc_col]).astimezone(pytz.timezone(row['timezone'])).strftime('%Y-%m-%d %H:%M:%S'), axis=1)
    return gdf

lean_traj_gdf['time'] = lean_traj_gdf.index

infer_local_time(lean_traj_gdf, 'time')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


Unnamed: 0_level_0,mmsi,boat_id,geometry,is_fishing,source,acceleration,angular_difference,direction,speed,avg_speed,...,max_acceleration,max_angular_difference,max_direction,min_speed,min_acceleration,min_angular_difference,min_direction,time,timezone,local_time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-12-01 08:16:16,2.579933e+12,2,POINT (39.62525 75.57613),1.0,gfw,2.169643e-05,68.247303,131.338598,0.813120,2.533349,...,0.148261,179.707318,239.469611,0.000000,-5.415739,0.629064,0.000000,2015-12-01 08:16:16,Etc/GMT-3,2015-12-01 11:16:16
2015-12-01 08:16:47,2.579933e+12,2,POINT (39.62502 75.57597),1.0,gfw,-6.512461e-03,67.947403,199.286001,0.611233,2.426565,...,0.148261,179.707318,239.469611,0.000000,-5.415739,0.629064,0.000000,2015-12-01 08:16:47,Etc/GMT-3,2015-12-01 11:16:47
2015-12-01 08:16:56,2.579933e+12,2,POINT (39.62497 75.57594),1.0,gfw,-1.179295e-02,1.202003,200.488004,0.505097,2.301340,...,0.148261,179.707318,239.469611,0.000000,-5.415739,0.629064,0.000000,2015-12-01 08:16:56,Etc/GMT-3,2015-12-01 11:16:56
2015-12-01 08:17:07,2.579933e+12,2,POINT (39.62497 75.57587),1.0,gfw,1.742602e-02,20.488004,180.000000,0.696783,2.216890,...,0.148261,179.707318,239.469611,0.000000,-5.415739,0.629064,0.000000,2015-12-01 08:17:07,Etc/GMT-3,2015-12-01 11:17:07
2015-12-01 08:17:16,2.579933e+12,2,POINT (39.62495 75.57582),1.0,gfw,-1.421493e-02,3.563481,183.563481,0.568849,1.953351,...,0.113803,179.707318,239.469611,0.000000,-5.415739,0.629064,0.000000,2015-12-01 08:17:16,Etc/GMT-3,2015-12-01 11:17:16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-12-29 14:29:51,2.698965e+14,115,POINT (13.73475 43.31092),1.0,gfw,-1.752115e-03,3.131070,188.682470,1.729670,3.101349,...,0.001449,94.458514,311.651926,1.516438,-0.001752,0.758668,114.484696,2015-12-29 14:29:51,Europe/Rome,2015-12-29 15:29:51
2015-12-29 14:46:08,2.698965e+14,115,POINT (13.73475 43.31093),1.0,gfw,-1.769498e-03,166.120343,354.802813,0.000871,2.956162,...,0.001449,166.120343,354.802813,0.000871,-0.001769,0.758668,114.484696,2015-12-29 14:46:08,Europe/Rome,2015-12-29 15:46:08
2015-12-29 15:02:30,2.698965e+14,115,POINT (13.73474 43.31094),1.0,gfw,5.164526e-07,14.795417,340.007396,0.001378,2.867041,...,0.001449,166.120343,354.802813,0.000871,-0.001769,0.758668,114.484696,2015-12-29 15:02:30,Europe/Rome,2015-12-29 16:02:30
2015-12-29 15:18:29,2.698965e+14,115,POINT (13.73472 43.31096),1.0,gfw,7.403828e-07,30.498875,309.508521,0.002088,2.718624,...,0.001449,166.120343,354.802813,0.000871,-0.001769,0.758668,114.484696,2015-12-29 15:18:29,Europe/Rome,2015-12-29 16:18:29


In [286]:
lean_traj_gdf['local_time'] = pd.to_datetime(lean_traj_gdf['local_time'])
lean_traj_gdf.index = lean_traj_gdf['local_time']
lean_traj_gdf['month'] = lean_traj_gdf.index.month
lean_traj_gdf['day'] = lean_traj_gdf.index.day
lean_traj_gdf['hour'] = lean_traj_gdf.index.hour

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

# Missing values and last data transformation

In [288]:
lean_traj_gdf['lon'] = lean_traj_gdf['geometry'].x
lean_traj_gdf['lat'] = lean_traj_gdf['geometry'].y
lean_traj_gdf.drop(columns=['geometry', 'timezone', 'local_time', 'time', 'boat_id', 'source'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lean_traj_gdf.drop(columns=['geometry', 'timezone', 'local_time', 'time', 'boat_id', 'source'], inplace=True)


In [293]:
lean_traj_gdf.fillna(-9999)
lean_traj_gdf.replace(np.nan, -9999, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lean_traj_gdf.replace(np.nan, -9999, inplace=True)


# Learning

In [301]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

y = lean_traj_gdf.is_fishing
X = lean_traj_gdf.drop('is_fishing', axis = 1)

train_X, val_X, train_y, val_y = train_test_split(X,y)

forest_model = RandomForestRegressor()
forest_model.fit(train_X, train_y)
val_preds = forest_model.predict(val_X)

local_time
2015-12-17 01:46:44    1.0
2015-12-23 23:27:35    0.0
2015-12-26 18:41:43    1.0
2015-12-13 17:52:58    1.0
2015-11-11 23:36:50    0.0
                      ... 
2015-12-07 07:30:57    0.0
2015-11-13 21:39:02    0.0
2015-12-13 17:58:17    1.0
2015-12-20 23:49:48    0.0
2015-12-22 01:41:15    1.0
Name: is_fishing, Length: 14666, dtype: float64
0.01829685666590307


In [336]:
mean_absolute_error(val_y, [0]*4889)

0.5283970828062998