In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from functools import reduce
import geopandas as gpd

# Data directory: the `data` folder inside the parent of the current working directory.
PATH = Path.cwd().parent / 'data'

In [2]:
# Load the processed vessel table that the rest of the notebook builds on.
vessels = pd.read_csv(PATH / 'processed' / 'kse_shadowfleet.csv')


In [None]:
# Monthly columns are recognised by the '-202' fragment in their header.
cols = [column for column in vessels.columns if '-202' in column]
id_cols = ['imo', 'vessel_name', 'tanker_size', 'buildyear']


def _summarise_commodity(frame, commodity, total_col, month_cols):
    """Reduce one commodity's rows to the identifier columns plus two summaries.

    `total_col` receives the row-wise sum of `month_cols`; `months_uninsured`
    counts how many of `month_cols` are non-null in each row.
    """
    subset = frame[frame.commodity == commodity].copy()
    subset[total_col] = subset[month_cols].sum(axis=1)
    subset['months_uninsured'] = subset[month_cols].notna().sum(axis=1)
    return subset[id_cols + [total_col, 'months_uninsured']].copy()


crude = _summarise_commodity(vessels, 'crude', 'crude_total', cols)
products = _summarise_commodity(vessels, 'oil products', 'products_total', cols)

# Outer merge keeps vessels that appear under only one commodity. The crude
# month count wins when both exist; the products count fills the gaps.
vessels = (
    crude.merge(products, on=id_cols, how='outer')
    .assign(months_uninsured_x=lambda d: d.months_uninsured_x.fillna(d.months_uninsured_y))
    .drop(columns='months_uninsured_y')
    .rename(columns={'months_uninsured_x': 'months_uninsured'})
    .astype({'months_uninsured': int})
)

uninsured = pd.read_csv(PATH.joinpath('processed', 'uninsured.csv'))

# After sorting by sanction date, keeping the first row per IMO keeps the
# row with the earliest `earliest_sanction_date`.
vessels = (
    vessels.merge(uninsured, on='imo', how='left')
    .sort_values(by='earliest_sanction_date')
    .drop_duplicates(subset='imo', keep='first')
)
len(vessels)


In [None]:
# Preview the first rows of the vessel table built so far.
vessels.head()

## Add ownership data

In [5]:
# Load ownership records, substitute defaults for missing country / end date,
# and order the rows by start date.
owners = (
    pd.read_csv(PATH / 'processed' / 'company_vessels_final.csv')
    .fillna({'country': 'Unknown', 'end_date': '2022-01-01'})
    .sort_values(by='start_date')
)

In [6]:
# Each role label matched against `owners.role` is paired with the prefix used
# for the output columns derived from it.
roles = ['Registered owner', 'Ship manager / Commercial manager', 'ISM Manager']
col_names = ['ownership', 'shipmanager', 'ism_manager']
temp = []
for r, n in zip(roles, col_names):
    role_rows = owners[owners.role == r]

    # Number of records per vessel for this role.
    # NOTE(review): no date filter is applied here although the column is
    # called "*_after_2022" - presumably the input file is already restricted;
    # confirm.
    temp_owners = role_rows.groupby('imo').size().reset_index()
    temp_owners.columns = ['imo', f'{n}_changes_after_2022']
    temp.append(temp_owners)

    # Comma-separated countries of those records, in the row order of `owners`.
    temp_owners = role_rows\
                         .groupby('imo')\
                         .agg({'country': lambda x: ', '.join(x)})\
                         .reset_index()
    temp_owners.columns = ['imo', f'{n}_jurisdictions_after_2022']
    temp.append(temp_owners)

# Merge once, after all per-role tables have been collected. This used to sit
# inside the loop and was recomputed on every iteration.
ownership = reduce(lambda x, y: pd.merge(x, y, on='imo', how='outer'), temp)

In [None]:
# Attach the ownership summary; the left join keeps every vessel row.
vessels = vessels.merge(ownership, on='imo', how='left')
len(vessels)

## Add inspections

In [8]:
# Load the inspection records.
inspections = pd.read_csv(PATH / 'processed' / 'owners_inspections.csv')

In [None]:
# Show the column labels of the inspections table.
inspections.columns

In [10]:
# Row filters shared by the four per-vessel summaries below.
# NOTE(review): `date` is compared against a string literal - assumes ISO
# 'YYYY-MM-DD' text or a datetime column; confirm.
after_2022 = inspections.date >= '2022-01-01'
detained = inspections.detention == 'Y'

# Detentions: number of inspection rows flagged 'Y', per vessel.
detentions_after_2022 = inspections[detained & after_2022]\
                                    .groupby('imo')\
                                    .size()\
                                    .reset_index(name='detentions_after_2022')

detentions = inspections[detained]\
                        .groupby('imo')\
                        .size()\
                        .reset_index(name='detentions')

# Deficiencies: total of `number_of_deficiencies` per vessel. This used
# `.size()`, which counted inspection rows instead of adding up the
# deficiencies recorded on them.
# NOTE(review): assumes `number_of_deficiencies` is numeric - confirm.
deficiencies_after_2022 = inspections[after_2022]\
                                       .groupby('imo')\
                                       .number_of_deficiencies\
                                       .sum()\
                                       .reset_index(name='deficiencies_after_2022')

deficiencies = inspections.groupby('imo')\
                            .number_of_deficiencies\
                            .sum()\
                            .reset_index(name='deficiencies')

In [11]:
# Outer-merge the four per-vessel tables on `imo`, left to right, so a vessel
# present in any of them is kept.
inspections = detentions
for counts in (detentions_after_2022, deficiencies, deficiencies_after_2022):
    inspections = inspections.merge(counts, on='imo', how='outer')

In [12]:
# Missing values left by the outer merge become 0, then the four count columns
# are cast to integers.
inspections = inspections.fillna(0).astype({
    'detentions_after_2022': int,
    'deficiencies_after_2022': int,
    'detentions': int,
    'deficiencies': int,
})

In [None]:
# Attach the inspection summary; the left join keeps every vessel row.
vessels = vessels.merge(inspections, on='imo', how='left')
len(vessels)

In [14]:
# Name records starting on or after 2022-01-01.
name = pd.read_csv(PATH / 'processed' / 'owners_names.csv')
name = name.loc[name.start_date >= '2022-01-01'].copy()

names_by_imo = name.groupby('imo')
# Number of name records per vessel.
name_changes = names_by_imo.size().reset_index(name='name_changes_after_2022')
# Comma-separated list of the names on those records.
names = names_by_imo.agg(names_after_2022=('vessel_name', lambda x: ', '.join(x))).reset_index()
names = names.merge(name_changes, on='imo', how='outer')

vessels = vessels.merge(names, on='imo', how='left')

In [15]:
# Flag records starting on or after 2022-01-01.
flags = pd.read_csv(PATH / 'processed' / 'owners_flags.csv')
flags = flags.loc[flags.start_date >= '2022-01-01'].copy()

flags_by_imo = flags.groupby('imo')
# Number of flag records per vessel.
flag_changes = flags_by_imo.size().reset_index(name='flag_changes_after_2022')
# Comma-separated list of the flags on those records.
flags = flags_by_imo.agg(flags_after_2022=('flag', lambda x: ', '.join(x))).reset_index()
flags = flags.merge(flag_changes, on='imo', how='outer')

vessels = vessels.merge(flags, on='imo', how='left')

## Add events to the dataset

In [16]:
# Load the AIS records used for the gap statistics.
ais = pd.read_parquet(PATH / 'processed' / 'ais.parquet')

In [17]:
# Per vessel: number of AIS records and the sum of their `gap_durationhours`.
gaps = ais.groupby('imo').agg(ais_gaps=('imo', 'count'),
                              ais_gap_total_hours=('gap_durationhours', 'sum')).reset_index()

# The same two statistics, restricted to records starting in 2022 or later.
gaps_2022 = ais[ais.start >= '2022-01-01']\
                .groupby('imo')\
                .agg(ais_gaps_after_2022=('imo', 'count'),
                     ais_gap_total_hours_after_2022=('gap_durationhours', 'sum'))\
                .reset_index()

gaps = pd.merge(gaps, gaps_2022, on='imo', how='outer')

# Vessels without a post-2022 record get 0 instead of NaN.
gaps.ais_gaps_after_2022 = gaps.ais_gaps_after_2022.fillna(0).astype(int)
gaps.ais_gaps = gaps.ais_gaps.fillna(0).astype(int)
# Round before casting. The old `round(x.astype(int))` truncated the hours
# first, so the rounding step did nothing.
gaps.ais_gap_total_hours = gaps.ais_gap_total_hours.fillna(0).round().astype(int)
gaps.ais_gap_total_hours_after_2022 = gaps.ais_gap_total_hours_after_2022.fillna(0).round().astype(int)
gaps.imo = gaps.imo.astype(int)

In [18]:
# Load port visits and normalise `start` to 'YYYY-MM-DD' strings so it can be
# compared against date literals.
ports = pd.read_parquet(PATH / 'processed' / 'ports.parquet')
ports['start'] = pd.to_datetime(ports['start']).dt.strftime('%Y-%m-%d')

In [19]:
countries = ['RUS', 'TUR', 'IND']
temps = []

# Two time windows, each keyed by the suffix used in the output column name.
# "before" is bounded below at 2019-02-01.
windows = {
    'after_2022': ports.start >= '2022-01-01',
    'before_2022': (ports.start < '2022-01-01') & (ports.start >= '2019-02-01'),
}

for country in countries:
    in_country = ports.port_visit_startAnchorage_flag == country
    for suffix, in_window in windows.items():
        # Number of port-visit rows per vessel for this country and window.
        visit_counts = ports[in_window & in_country]\
            .groupby('imo')\
            .size()\
            .reset_index(name=f'{country}_port_visits_{suffix}')
        temps.append(visit_counts)

In [20]:
# Outer-merge all per-country tables on `imo`, left to right, then turn the
# gaps into zeros and cast every column (including `imo`) to int.
visits = temps[0]
for frame in temps[1:]:
    visits = visits.merge(frame, on='imo', how='outer')
visits = visits.fillna(0).astype(int)

In [21]:
# Load loitering events and normalise `start` to 'YYYY-MM-DD' strings so it
# can be compared against date literals.
loitering = pd.read_parquet(PATH / 'processed' / 'loitering.parquet')
loitering['start'] = pd.to_datetime(loitering['start']).dt.strftime('%Y-%m-%d')

In [22]:
# Event counts per vessel for 2019-02-01..2021-12-31 and for 2022 onwards.
in_baseline = (loitering.start < '2022-01-01') & (loitering.start >= '2019-02-01')
temp = loitering[in_baseline]\
    .groupby('imo')\
    .size()\
    .reset_index(name='loitering_events_before_2022')
temp_2022 = loitering[loitering.start >= '2022-01-01']\
    .groupby('imo')\
    .size()\
    .reset_index(name='loitering_events_after_2022')

# Combine both windows; vessels missing from one of them get 0 there.
loitering = (
    temp.merge(temp_2022, on='imo', how='outer')
    .fillna(0)
    .astype({
        'loitering_events_before_2022': int,
        'loitering_events_after_2022': int,
        'imo': int,
    })
)

In [23]:
# Load the STS track points, turn lon/lat into point geometries and tag each
# point with the STS area polygon it falls within (left join: points outside
# every polygon are kept without an area).
sts = pd.read_parquet(PATH.joinpath('processed', 'sts_tracks.parquet'))
sts = gpd.GeoDataFrame(sts, geometry=gpd.points_from_xy(sts.lon, sts.lat), crs='EPSG:4326')
sts_locations = gpd.read_file(PATH.joinpath('geo', 'sts_locations.geojson'), crs='EPSG:4326')
sts = gpd.sjoin(sts, sts_locations, how='left', predicate='within')
sts.drop(['index_right', 'geometry'], axis=1, inplace=True)
sts.rename(columns={'Name': 'sts_area'}, inplace=True)
# Normalise vessel names (upper case, no spaces) so they can be matched
# against the name lookup further down.
sts.name = sts.name.str.upper().str.replace(' ', '').str.strip()

# Days since the previous row of the same vessel in the same area.
sts['time_diff_days'] = sts.groupby(['name', 'sts_area']).timestamp.diff().dt.total_seconds() / (60*60*24)
# A gap of more than 7 days starts a new group id.
# NOTE(review): the counter runs over the whole frame in its current row
# order, not per vessel - confirm the input ordering makes this one id per visit.
sts['group'] = (sts.time_diff_days > 7).cumsum()
# NOTE(review): rows dropped by the grouping (e.g. missing sts_area)
# presumably come back as NaN here and end up in group 0 - confirm.
sts['group'] = sts.groupby(['name', 'sts_area', 'group']).group.ffill().fillna(0).astype(int)
sts.sort_values(by=['group', 'timestamp'], inplace=True)

# Days between the first and last timestamp of each (group, vessel) pair.
time_range = sts.groupby(['group', 'name']).agg(min_timestamp=('timestamp', 'min'), max_timestamp=('timestamp', 'max'))
time_range = (time_range['max_timestamp'] - time_range['min_timestamp']).dt.total_seconds() / (60*60*24)
sts = pd.merge(sts, time_range.reset_index(), on=['group', 'name'], how='left')
# The merged duration series is unnamed, so rename the last column.
sts.rename(columns={sts.columns[-1]: 'time_range'}, inplace=True)


# The first and last row of every group become its start and end events.
sts_start = sts.drop_duplicates(subset='group', keep='first').copy()
sts_end = sts.drop_duplicates(subset='group', keep='last').copy()
sts_start['type'] = 'possible_sts_start'
sts_end['type'] = 'possible_sts_end'

sts = pd.concat([sts_start, sts_end]).reset_index(drop=True)

sts.sort_values(by=['group', 'timestamp'], inplace=True)
# Keep events from 2022 onwards whose group spans more than one day.
sts.query('timestamp >= "2022-01-01" & time_range > 1', inplace=True)

sts = sts[['timestamp', 'lat', 'lon', 'sts_area', 'name', 'type', 'group']].copy()
sts.rename(columns={'timestamp': 'date'}, inplace=True)

sts.date = pd.to_datetime(sts.date).dt.date
# Compare against the labels actually assigned above. The old check for
# 'sts_start' never matched, so every event was described as "exited".
sts['description'] = sts.apply(
    lambda row: (
        f'Vessel {row["name"]} entered {row.sts_area.upper()}'
        if row['type'] == 'possible_sts_start'
        else f'Vessel {row["name"]} exited {row.sts_area.upper()}'
    ),
    axis=1,
)

sts.sort_values(by=['group', 'date'], inplace=True)
sts.drop_duplicates(subset=['date', 'name', 'sts_area', 'type'], inplace=True)
# Keep only groups that still have more than one row.
sts = sts[sts.groupby('group').group.transform('count') > 1].reset_index(drop=True)

# Map the normalised vessel names to IMO numbers.
n = pd.read_csv(PATH.joinpath('processed', 'owners_names.csv'))
n.vessel_name = n.vessel_name.str.upper().str.replace(' ', '').str.strip()

# De-duplicate the lookup so a (name, imo) pair listed more than once does
# not multiply the matching events.
sts = pd.merge(sts,
                n[['vessel_name', 'imo']].drop_duplicates(),
                left_on='name',
                right_on='vessel_name',
                how='left')

# Events whose vessel name has no match in the lookup are dropped.
sts.dropna(subset='vessel_name', inplace=True)

sts = sts[['date', 'imo', 'description', 'type']].copy()

sts.imo = sts.imo.astype(int)
sts.date = pd.to_datetime(sts.date).dt.date

In [24]:
# Number of STS event rows per vessel.
sts_events = sts.groupby('imo').size().reset_index(name='sts_events')

In [30]:
# Write the event-level STS table, without the index column.
sts.to_csv(PATH / 'db' / 'sts_events.csv', index=False)

In [25]:
# Outer-merge the four event summaries on `imo`, left to right, then attach
# the result to the vessel table with a left join.
events = gaps
for frame in (visits, loitering, sts_events):
    events = events.merge(frame, on='imo', how='outer')
vessels = vessels.merge(events, on='imo', how='left')

In [26]:
# Count columns that came in through left joins. A vessel without a match in
# a lookup has NaN there, which is replaced by 0.
cols = ['ownership_changes_after_2022', 'shipmanager_changes_after_2022', 'name_changes_after_2022',
         'flag_changes_after_2022',
       'ism_manager_changes_after_2022', 'detentions',
       'detentions_after_2022', 'deficiencies', 'deficiencies_after_2022',
       'ais_gaps', 'ais_gap_total_hours', 'ais_gaps_after_2022',
       'ais_gap_total_hours_after_2022', 'RUS_port_visits_after_2022',
       'RUS_port_visits_before_2022', 'TUR_port_visits_after_2022',
       'TUR_port_visits_before_2022', 'IND_port_visits_after_2022',
       'IND_port_visits_before_2022', 'loitering_events_before_2022',
       'loitering_events_after_2022', 'sts_events']

vessels[cols] = vessels[cols].fillna(0).astype(int)
# Both totals come out of the outer merge of the crude and products tables, so
# either can be missing. Only `products_total` used to be filled, which left
# NaN in `crude_total` for vessels that carried products only.
vessels.fillna({'crude_total': 0, 'products_total': 0}, inplace=True)

In [None]:
# Preview the first rows of the vessel table before it is exported.
vessels.head()

In [28]:
# Write the vessel table, without the index column.
vessels.to_csv(PATH / 'db' / 'for_inspection.csv', index=False)