In [4]:
import pandas as pd
from pathlib import Path
import json
from ast import literal_eval
import plotly.express as px
import pdfplumber
import numpy as np

# Root data directory: the `data` folder located beside the current working directory.
PATH = Path.cwd().parent / 'data'

In [32]:
# Read the vessel dump line by line and split it into two tables:
#   owners  - one row per `registryOwners` record (exact duplicates removed)
#   vessels - one row per `registryInfo` record
owner_records = []
vessel_records = []

with open(PATH.joinpath('raw_gfw', 'api', 'vessels_v3.json'), 'r') as source:
    for line in source:
        # A blank line (e.g. a trailing newline at end of file) cannot be parsed; skip it.
        if not line.strip():
            continue
        # NOTE(review): despite the .json extension each line is parsed as a Python
        # literal; literal_eval rejects JSON-only tokens such as true/false/null -
        # confirm the file really stores Python dict reprs.
        record = literal_eval(line)
        # `.get()` returns None for an absent key, so fall back to an empty list:
        # a record without entries/owners/registry info is skipped instead of
        # raising "TypeError: 'NoneType' object is not iterable".
        for entry in record.get('entries') or []:
            owner_records.extend(entry.get('registryOwners') or [])
            vessel_records.extend(entry.get('registryInfo') or [])

# Type the join key and the validity dates, drop the provenance column, then
# de-duplicate (the drop happens first, exactly as before).
owners = (
    pd.DataFrame(owner_records)
    .assign(
        ssvid=lambda d: d['ssvid'].astype(int),
        dateFrom=lambda d: pd.to_datetime(d['dateFrom']),
        dateTo=lambda d: pd.to_datetime(d['dateTo']),
    )
    .drop(columns='sourceCode')
    .drop_duplicates()
)

# Same typing for the vessel records. No de-duplication is applied here
# (NOTE(review): `geartypes` appears to hold lists, which drop_duplicates cannot hash).
vessels = (
    pd.DataFrame(vessel_records)
    .assign(
        ssvid=lambda d: d['ssvid'].astype(int),
        transmissionDateFrom=lambda d: pd.to_datetime(d['transmissionDateFrom']),
        transmissionDateTo=lambda d: pd.to_datetime(d['transmissionDateTo']),
    )
    .drop(columns='sourceCode')
)

print(f'Found {len(owners)} owners and {len(vessels)} vessels')

Found 1174 owners and 629 vessels


In [53]:
# Preview the first five owner records.
owners.head(n=5)

Unnamed: 0,name,flag,ssvid,dateFrom,dateTo
0,CECILSSON S,ISL,251248110,2012-01-15 16:49:02+00:00,2021-02-04 11:27:00+00:00
1,FRONAPE INTERNATIONAL,NLD,311067800,2012-03-09 01:34:51+00:00,2021-02-13 20:59:49+00:00
2,MORFLOT,RUS,273344600,2012-10-16 00:19:03+00:00,2024-07-31 21:16:43+00:00
3,MORFLOT,RUS,273344633,2018-01-04 21:25:34+00:00,2018-01-05 03:37:39+00:00
4,YARBUNKER,RUS,273343600,2012-06-03 19:25:22+00:00,2024-07-31 23:59:59+00:00


In [33]:
# Outer-join owner and vessel records on `ssvid` so unmatched rows from either side
# are kept. Both frames have a `flag` column, which the merge suffixes as
# flag_x (owners) and flag_y (vessels).
df = owners.merge(vessels, how='outer', on='ssvid')
df.shape[0]

1177

In [38]:
# Keep only the rows that carry a vessel `id`, i.e. drop owner rows that found no
# vessel record in the outer join. `.copy()` detaches the result from the merged frame.
df = df.loc[df['id'].notna()].copy()
df.shape[0]

783

In [44]:
# Distribution of the number of merged rows (non-null ssvid) per IMO number.
px.histogram(df.groupby('imo')['ssvid'].count())

In [45]:
# Rows per IMO sorted from most to fewest; tail(10) therefore shows the ten
# IMO numbers with the fewest rows.
(
    df.groupby('imo')['ssvid']
    .count()
    .sort_values(ascending=False)
    .tail(10)
)

imo
9283291    1
9284130    1
9286023    1
9284582    1
9284594    1
9284726    1
9285835    1
9285847    1
9285859    1
9866380    1
Name: ssvid, dtype: int64

In [51]:
# Spot-check a single vessel in the merged frame (imo is stored as a string).
df.loc[df['imo'] == '9286023']

Unnamed: 0,name,flag_x,ssvid,dateFrom,dateTo,id,flag_y,shipname,nShipname,callsign,imo,latestVesselInfo,transmissionDateFrom,transmissionDateTo,geartypes,lengthM,tonnageGt,vesselInfoReference
117,SARAKINO SHIPPING,LBR,241314000,2014-03-25 17:25:01+00:00,2020-02-24 09:58:02+00:00,8482381c834d2e532bea6dcb19b531c7,GRC,MEGANISI,MEGANISI,SVBY5,9286023,True,2014-03-25 17:25:01+00:00,2020-02-24 09:58:02+00:00,[TANKER],,41526.0,daf73302-48a2-4fd3-be34-e7c580783f94


In [54]:
# Same vessel in the raw vessel table, for comparison with the merged row.
vessels.loc[vessels['imo'] == '9286023']

Unnamed: 0,id,ssvid,flag,shipname,nShipname,callsign,imo,latestVesselInfo,transmissionDateFrom,transmissionDateTo,geartypes,lengthM,tonnageGt,vesselInfoReference
415,8482381c834d2e532bea6dcb19b531c7,241314000,GRC,MEGANISI,MEGANISI,SVBY5,9286023,True,2014-03-25 17:25:01+00:00,2020-02-24 09:58:02+00:00,[TANKER],,41526.0,daf73302-48a2-4fd3-be34-e7c580783f94


## Using pdfplumber to extract tables from the Equasis PDFs

In [232]:
# Turn every table found in the PDFs under `equasis/` into a DataFrame tagged with
# the IMO number read from that PDF's first page. `dfs` collects one frame per table.
dfs = []
for file in PATH.joinpath('equasis').glob('*.pdf'):
    # The context manager closes the PDF's file handle once the file is processed;
    # previously every opened PDF stayed open.
    with pdfplumber.open(file) as pdf:

        # Get imo. Reset per file so a PDF without an 'imo: ' line can never inherit
        # the number of the previously parsed file (or raise NameError on the first).
        imo = None
        for lines in pdf.pages[0].extract_text_lines():
            if isinstance(lines, dict):
                text = lines.get('text') or ''
                if 'imo: ' in text:
                    # No break: as before, the last matching line wins.
                    imo = text.replace('imo: ', '')
        if imo is None:
            print(f'No imo found on the first page of {file.name}; its tables get a missing imo')

        # Get tables (everything after the first page)
        for page in pdf.pages[1:]:
            tables = page.extract_tables()

            for table in tables:
                # An empty table has no header row to read column names from.
                if not table:
                    continue
                # First row is the header; line breaks inside cells become spaces.
                keys = [x.replace('\n', ' ') for x in table[0]]
                # NOTE(review): str() turns an empty cell (None) into the literal
                # string 'None', which a later ffill() will not treat as missing -
                # confirm this is intended.
                tabs = [
                    dict(zip(keys, (str(x).replace('\n', ' ') for x in row)))
                    for row in table[1:]
                ]
                df = pd.DataFrame(tabs)
                df['imo'] = imo
                dfs.append(df)

In [249]:
# Sort the extracted tables into categories based on the header columns they contain.
# A table may land in more than one category if it carries several marker columns.
companies = []
inspections = []
human = []
names = []
flags = []
classifications = []

for df in dfs:
    columns = df.columns
    if 'Company' in columns:
        companies.append(df)
    if 'Detention' in columns:
        inspections.append(df)
    if 'Human element deficiencies' in columns:
        human.append(df)
    if 'Name of ship' in columns:
        names.append(df)
    if 'Flag' in columns and 'Date of effect' in columns:
        flags.append(df)
    # Both columns must be tested explicitly: the previous form
    # `'Classification society' and 'Date of survey' in df.columns` only checked
    # 'Date of survey', because a non-empty string literal is always truthy.
    if 'Classification society' in columns and 'Date of survey' in columns:
        classifications.append(df)

# Stack each category into a single frame. The per-table row index is kept
# (no ignore_index), so index labels repeat across tables.
companies = pd.concat(companies)
inspections = pd.concat(inspections)
human = pd.concat(human)
names = pd.concat(names)
flags = pd.concat(flags)
classifications = pd.concat(classifications)

In [234]:

def clean_dates(df):
    """Parse every date-like column of ``df`` into datetimes, in place.

    A column is treated as a date column when its label, compared as a
    lower-cased string, contains ``'date'``. In those columns the qualifiers
    ``'during '``, ``'since '`` and ``'before '`` are stripped, then the values
    are parsed day-first with mixed formats; values that cannot be parsed
    become ``NaT``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    # str() keeps non-string labels (e.g. integer column names) from raising
    # AttributeError on `.lower()`; string labels behave exactly as before.
    date_columns = [label for label in df.columns if 'date' in str(label).lower()]
    qualifiers = {'during ': '', 'since ': '', 'before ': ''}
    for label in date_columns:
        stripped = df[label].replace(qualifiers, regex=True)
        # errors='coerce' turns anything unparseable into NaT rather than raising.
        df[label] = pd.to_datetime(stripped, format='mixed', dayfirst=True, errors='coerce')
    return df

# Parse the date-like columns of the companies table; clean_dates modifies the
# frame it is given and returns that same frame.
companies = clean_dates(companies)

In [235]:
# Tidy the companies table: drop the `Sources` column and give the four remaining
# columns short names (assigned by position).
companies = (
    companies
    .drop(columns='Sources')
    .set_axis(['company', 'role', 'start_date', 'imo'], axis=1)
)
# Normalise the role text: remove ' T' and put a space around the slash in
# 'manager/' and '/Com'.
companies['role'] = (
    companies['role']
    .str.replace(' T', '')
    .str.replace('manager/', 'manager /')
    .str.replace('/Com', '/ Com')
)
# Run the date parser again now that the date column is called `start_date`.
companies = clean_dates(companies)

In [236]:
# Parse the date columns first (they are detected by their original header text),
# then rename the four columns by position.
names = clean_dates(names).set_axis(
    ['vessel_name', 'start_date', 'source', 'imo'], axis=1
)

In [237]:
# Parse the date columns, then rename the four flag-history columns by position.
flags = clean_dates(flags).set_axis(
    ['flag', 'start_date', 'source', 'imo'], axis=1
)

In [238]:
# Parse the date columns, then rename the four classification columns by position.
classifications = clean_dates(classifications).set_axis(
    ['classification_society', 'date_of_survey', 'source', 'imo'], axis=1
)

In [250]:
# Parse the date columns, then rename the nine inspection columns by position.
inspections = clean_dates(inspections)
inspections.columns = ['authority', 'port', 'date', 'detention', 'PSC_organisation',
                       'inspection_type', 'duration', 'number_of_deficiencies', 'imo']

# Move the old index into an `index` column. Because the source tables were
# concatenated without ignore_index, this value is the row position *within its
# source table* and restarts at 0 for every table.
inspections = inspections.reset_index()

# Forward-fill blank cells from the row above BEFORE sorting, while rows are still
# in the order they were extracted. Filling after the sort (as before) could copy
# values from an unrelated row: sorting on the per-table `index` interleaves the
# rows of different tables that belong to the same vessel.
fill_columns = ['authority', 'port', 'date', 'detention']
inspections[fill_columns] = inspections[fill_columns].ffill()

# Same final ordering as before.
# NOTE(review): for a vessel with several tables this still interleaves their rows;
# sort on a global position instead if the original row order matters.
inspections = inspections.sort_values(['imo', 'index'])

In [252]:
# The helper `index` column was only needed for sorting; remove it.
inspections = inspections.drop(columns='index')

In [253]:
# Preview the first five cleaned inspection rows.
inspections.head(n=5)

Unnamed: 0,authority,port,date,detention,PSC_organisation,inspection_type,duration,number_of_deficiencies,imo
1789,Bulgaria,Varna,2010-08-12,N,Paris MoU,Expanded inspection,0,9.0,8727941
1790,Iran,Bandar Neka,2003-09-14,N,Indian Ocean MoU,Initial inspection,0,,8727941
1791,Iran,Bandar Neka,2002-12-25,N,Indian Ocean MoU,Initial inspection,0,1.0,8727941
282,Ukraine,Izmail,2021-09-15,N,Black Sea MoU,More detailed inspection,0,,8727953
283,Bulgaria,Burgas,2019-05-09,N,Black Sea MoU,Expanded inspection,0,2.0,8727953
