In [1]:
%load_ext autoreload
%autoreload 2

# The Three Lords

The goal of this notebook is to get, parse, clean and analyse data from different sources to gain more insight into the activities of the 'Three Lords' of the North Sea: Parlevliet & van der Plas, Cornelis Vrolijk and Van der Zwan & Zonen. 

### Research question



### Methodological issues

1. We have some basic knowledge of fishing activity, or rather fishing efforts. How does that effort translate into what these ships actually offload in ports around the world? There are a couple of approaches we can take. The first is to get a few examples from actual offloading volumes that we can link to fishing efforts and interpolate all values from there. The second one is that we can use the ICES surveys ([DATRAS](https://www.ices.dk/data/data-portals/Pages/DATRAS.aspx)), where exact location and time of catches are known. The problem there is that the surveys are highly standardized and probably don't represent normal fishing activity.
2. Can a relationship be established between fishing efforts and quota rights? 
3. How can we find AIS on/off switching in the GFW data?
4. How do we know the AIS data is complete? Some areas, like the Central North Sea, are a bit sparse when we look at fishing efforts. Could poor AIS reception contribute to that sparsity? 
5. How do we compare the activities of the Three Lords with other fishery companies? Can we establish some baselines for comparisons? 

### Data sources

1. [Global Fishing Watch](https://globalfishingwatch.org): excellent source for fishing efforts, encounters, loitering events and port visits. 
2. [CompanyInfo](https://companyinfo.nl): company information (concern, participations) of the three Dutch companies. 
3. [OpenCorporates](https://opencorporates.com): information on foreign subsidiaries of the Three Lords.
4. NEEDED: quota rights
5. NEEDED: catchment data

In [2]:
# Make sure we can import company data modules

import sys
sys.path.append('../../../companies/src/')

In [266]:
from glob import glob
import pandas as pd
from typing import List
from followthemoney import model
from alephclient.api import AlephAPI
import geopandas as gpd
import requests
import os
import ast
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from . import company_info as ci

load_dotenv('../../.env')

True

In [4]:
PATH = os.environ.get('PATH_FISHERIES')
API_KEY = os.environ.get('GFW_API_KEY')

## Get and parse companies from company info

In [None]:
# Get concerns (full ownership)

concerns = []
for file in glob(PATH + 'companies/raw_concern/*.html'):
    concern = ci.get_concern_html(file)
    concerns.append(concern)
    
concern = pd.concat(concerns).reset_index(drop=True)

In [None]:
# Get participations (from annual reports)

deelnemingen = []
for file in glob(PATH + 'companies/raw/*.har'):
    deelneming = ci.parse_company_info(file, 'participations')
    deelnemingen.append(deelneming)

deelnemingen = pd.concat(deelnemingen)
deelnemingen.drop_duplicates(subset=['country', 'name', 'year', 'percentage'], inplace=True)
deelnemingen.dropna(subset='name', inplace=True)
deelnemingen = deelnemingen[deelnemingen.name != '--'].copy()
deelnemingen.reset_index(drop=True, inplace=True)

In [None]:
concern.to_csv(PATH + 'concern_lords.csv', index=False)
deelnemingen.to_csv(PATH + 'participations_lords.csv', index=False)

## Get Global Fishing Watch data

In [95]:
# Set up GFW environment

datasets = {'fishing_vessels': 'public-global-fishing-vessels:latest',
            'carrier_vessels': 'public-global-carrier-vessels:latest',
            'support_vessels': 'public-global-support-vessels:latest',
            'fishing_efforts': 'public-global-fishing-events:latest',
            'encounters': 'public-global-encounters-events:latest',
            'loitering': 'public-global-loitering-events-carriers:latest',
            'port_visits': 'public-global-port-visits-c2-events:latest'  
            }

endpoints = {'search': 'https://gateway.api.globalfishingwatch.org/v2/vessels/search?query=',
             'advanced_search': 'https://gateway.api.globalfishingwatch.org/v2/vessels/advanced-search?query=',
             'vessels': 'https://gateway.api.globalfishingwatch.org/v2/vessels/',
             'events': 'https://gateway.api.globalfishingwatch.org/v2/events'
             }

headers = {'Authorization': f'Bearer {API_KEY}'}

# Define some functions

def search_vessels(endpoint: str, 
                   dataset: str, 
                   queries: List,
                   query_field: str, 
                   limit: int
                   ) -> List:
    
    if isinstance(queries, list):
        results = []
        for query in tqdm(queries):
            r = requests.get(f'{endpoint}{str(query)}&query-fields={query_field}&datasets={dataset}&limit={str(limit)}&offset=0', headers=headers)
            if r.status_code == 200:
                result = r.json()
                for res in result.get('entries'):
                    if res.get('score') > 20:
                        res.update({'query': query})
                        results.append(res)
            else:
                print(f'could not find data for {query}')
                continue
    else:
        raise ValueError('The query must be a list')

    return results

def get_events(endpoint: str, 
               dataset: str, 
               queries: List, 
               limit: int, 
               start_date: str, 
               end_date: str
               ) -> List:

    results = []
    if isinstance(queries, list):
        for query in tqdm(queries):
            r = requests.get(f"{endpoint}?vessels={query}&datasets={dataset}&limit={str(limit)}&offset=0&start-date={start_date}&end-date={end_date}", headers=headers)
            if r.status_code == 200:
                result = r.json()
                for r in result.get('entries'):
                    results.append(r)
            else:
                print(f'could not find data for {query}')
                continue
    else:
        raise ValueError('The query must be a list')
    
    return results

## Get vessels

In [128]:
# Import company info lords

lords = pd.read_csv(PATH + 'companies/lords/lords_vessels.csv').dropna(subset='imo_number').drop_duplicates(subset='imo_number')
mmsi = pd.read_csv(PATH + 'companies/lords/lords_mmsi.csv').dropna(subset='MMSI').drop_duplicates(subset='MMSI')

In [130]:
cols = ['vessel_name', 'registration_date_start', 'imo_number', 'mmsi_number',
        'owner_name', 'owner_jurisdiction', 'owner_registration_nr','shareholders',
        'percentage_of_share']

lords = lords[cols].copy()

In [131]:
# Search on mmsi, so select rows with mmsi

lords = lords[lords.mmsi_number.notna()].copy()

# Convert mmsi to int

lords.mmsi_number = lords.mmsi_number.astype('int')

# Add all queries together

mmsis = list(set(lords.mmsi_number.to_list() + mmsi.MMSI.tolist()))
len(mmsis)

210

In [132]:
# Search vessels by mmsi

values = []
for key in ['fishing_vessels', 'carrier_vessels', 'support_vessels']:
    value = datasets.get(key)
    values.append(value)

dataset=','.join(values)

vessels = search_vessels(endpoint=endpoints.get('search'),
                         dataset=dataset,
                         queries=mmsis,
                         query_field='mmsi',
                         limit=10)


  0%|          | 0/210 [00:00<?, ?it/s]

100%|██████████| 210/210 [02:11<00:00,  1.60it/s]


In [184]:
# Create dataframe

df_vessels = pd.DataFrame(vessels)
len(df_vessels)

# Merge with company info

df_vessels.mmsi = df_vessels.mmsi.astype('int')

df_vessels = pd.merge(df_vessels,
                   lords,
                   left_on='mmsi',
                   right_on='mmsi_number',
                   how='left'
                   )

# Clean vessels

df_vessels.columns = df_vessels.columns.str.lower()
df_vessels = df_vessels.rename(columns={'flag_x': 'flag_gfw',
                                  'flag_y': 'flag_company_registry',
                                  'id': 'gfw_id'})

# Set date columns to proper datetime

date_cols = [x for x in df_vessels.columns if 'transmissiondate' in x]
for col in date_cols:
    df_vessels[col] = pd.to_datetime(df_vessels[col], format='mixed')

# Create int columns

for col in ['imo', 'imo_number', 'mmsi_number', 'owner_registration_nr']:
    df_vessels[col] = df_vessels[col].astype('Int64')

# Add missing ownership information

df_vessels.shareholders = np.where(df_vessels.shipname.str.contains('shrimp', case=False), 'Cornelis Vrolijk Holding BV', df_vessels.shareholders)
df_vessels.owner_name = np.where(df_vessels.shipname.str.contains('shrimp', case=False), 'ATLANTIC SHRIMPERS LTD', df_vessels.owner_name)

# Add right shareholders to previous mmsi observations

df_vessels.sort_values(by=['owner_name', 'mmsi']).shareholders.ffill(inplace=True)

# Add column with number of mmsi changes in the dataset

df_vessels['mmsi_changes'] = df_vessels.groupby('mmsi')['gfw_id'].transform("count")

A note of caution. There are mmsi records available for the same vessel. We need to search with mmsi, because IMO is often missing and mmsi is always present. The current lists therefore contains some false positives, in this case mmsi of vessels that were not always owned by the lords, but might have been acquired later. We don't know the exact date when a vessel was acquired. I've added an extra column with the number of vessels in the dataset that have the same mmsi. The higher this number, the more switching of mmsi numbers have taken place. That *could* be a sign of changing ownership, but doesn't have to be. If we can't find better information on when certain vessels were acquired, we could choose to use the most recent mmsi data, so that we don't ascribe the wrong activities to the wrong lord.

In [186]:
# Write to csv

df_vessels.to_csv(PATH + 'companies/gfw/vessels.csv', index=False)

## Get events

In [187]:
# Get events

values = []
for key in ['fishing_efforts', 'loitering', 'encounters', 'port_visits']:
    value = datasets.get(key)
    values.append(value)

dataset=','.join(values)

endpoint = endpoints.get('events')

ids = df_vessels.gfw_id.to_list()
results = get_events(endpoint=endpoint, 
                 dataset=dataset, 
                 queries=ids, 
                 limit=99999, 
                 start_date='2012-01-01', 
                 end_date='2023-09-27')

100%|██████████| 448/448 [50:26<00:00,  6.76s/it] 


In [188]:
# Parse results

fishing = []
port_visits = []
loitering = []
encounters = []

for res in results:
    if res.get('type') == 'fishing':
        fishing.append(res)
    elif res.get('type') == 'port_visit':
        port_visits.append(res)
    elif res.get('type') == 'loitering':
        loitering.append(res)
    elif res.get('type') == 'encounter':
        encounters.append(res)

df_fishing = pd.json_normalize(fishing)
df_ports = pd.json_normalize(port_visits)
df_loitering = pd.json_normalize(loitering)
df_encounters = pd.json_normalize(encounters) 

In [189]:
# Write to csv

df_fishing.to_csv(PATH + 'companies/gfw/fishing_efforts.csv', index=False)
df_ports.to_csv(PATH + 'companies/gfw/port_visits.csv', index=False)
df_loitering.to_csv(PATH + 'companies/gfw/loitering.csv', index=False)
df_encounters.to_csv(PATH + 'companies/gfw/encounters.csv', index=False)

## Fishing

In [244]:
# Read fishing data

df_fishing = pd.read_csv(PATH + 'companies/gfw/fishing_efforts.csv')

# Merge with vessel info

fishing = pd.merge(df_fishing,
                   df_vessels,
                   left_on='vessel.id',
                   right_on='gfw_id',
                   how='left')

# Clean dataframe

fishing = fishing.rename(columns={'id_x': 'event_id'})
fishing.owner_jurisdiction = fishing.owner_jurisdiction.str.strip()
fishing.columns = fishing.columns.str.replace('.', '_').str.lower()
fishing.start = fishing.start.apply(lambda x: pd.Timestamp(x))
fishing.end = fishing.end.apply(lambda x: pd.Timestamp(x))
fishing['hours'] = (fishing.end-fishing.start).dt.total_seconds() / 3600

# Create geodataframe

gdf_fishing = gpd.GeoDataFrame(fishing, 
                               geometry=gpd.points_from_xy(x=fishing['position_lon'], 
                                                           y=fishing['position_lat']),
                               crs=4326
                             )

In [258]:
gdf_fishing.columns

Index(['id', 'type', 'start', 'end', 'boundingbox', 'position_lat',
       'position_lon', 'regions_mpa', 'regions_eez', 'regions_rfmo',
       'regions_fao', 'regions_majorfao', 'regions_eez12nm',
       'regions_highseas', 'regions_mpanotakepartial', 'regions_mpanotake',
       'distances_startdistancefromshorekm',
       'distances_enddistancefromshorekm', 'distances_startdistancefromportkm',
       'distances_enddistancefromportkm', 'vessel_id', 'vessel_flag',
       'vessel_name', 'vessel_ssvid', 'vessel_authorizations',
       'fishing_totaldistancekm', 'fishing_averagespeedknots',
       'fishing_averagedurationhours', 'fishing_potentialrisk',
       'fishing_vesselauthorizationstatus', 'callsign',
       'firsttransmissiondate', 'flag', 'geartype', 'gfw_id', 'imo',
       'lasttransmissiondate', 'mmsi', 'msgcount', 'poscount', 'shipname',
       'source', 'vesseltype', 'years', 'dataset', 'score', 'query',
       'vessel_name', 'registration_date_start', 'imo_number', 'mmsi_num

In [269]:
gdf_fishing[gdf_fishing.regions_mpanotake.apply(lambda x: len(ast.literal_eval(x)) > 1)]

Unnamed: 0,id,type,start,end,boundingbox,position_lat,position_lon,regions_mpa,regions_eez,regions_rfmo,...,imo_number,mmsi_number,owner_name,owner_jurisdiction,owner_registration_nr,shareholders,percentage_of_share,mmsi_changes,hours,geometry


In [271]:
gdf_fishing[gdf_fishing.fishing_potentialrisk == True].groupby('vessel_flag').vessel_flag.count().sort_values(ascending=False)

vessel_flag
LTU    917
FRO    847
DEU    573
FRA    525
NLD    372
PRT    134
LVA    112
GBR     48
POL     15
DNK      1
RUS      1
Name: vessel_flag, dtype: int64

In [247]:
gdf_fishing.groupby('vessel_flag').hours.sum().sort_values(ascending=False)

vessel_flag
NGA    395312.599642
FRA    208712.688577
LTU    108164.425648
DEU     79269.111397
NLD     74513.380070
FRO     70945.961162
RUS     49784.642453
GBR     45468.455883
GRL     26645.399167
PRT     16326.224853
POL     14486.539569
BHS     10442.210944
REU      3816.603596
LVA      3103.568056
BES      2477.461944
ITA      1276.069167
DNK       935.229444
NAM       318.113611
Name: hours, dtype: float64

In [None]:
import json

with open(PATH + 'fishing.geojson') as file:
    json.dumps(gdf_fishing.to_json())

In [None]:
encounters = pd.merge(df_encounters,
                      df_vessels,
                      left_on='vessel.id',
                      right_on='id',
                      how='left')

len(encounters)

In [None]:
for item in ['start', 'end']:
    encounters[item] = encounters[item].apply(lambda x: pd.Timestamp(x))

encounters[['start', 'end', 'shipname', 'imo', 'encounter.vessel.name']].imo.value_counts()

In [None]:
gdf = gpd.GeoDataFrame(encounters, geometry=gpd.points_from_xy(x=encounters['position.lon'], y=encounters['position.lat']), crs=4326)

In [None]:
gdf[['shipname', 'imo', 'encounter.vessel.name', 'geometry']].explore()