In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../src')

In [2]:
import pandas as pd
import geopandas as gpd
import shadowfleet as sf
from pathlib import Path
from shapely.geometry import Polygon
import plotly.express as px
import json
from ast import literal_eval

PATH = Path('../data')

## Data import and cleaning

In [3]:
# Import manually extracted vessel presence from terminals and locations of interest.
# Every JSON export in data/oil_flows/visits holds an 'entries' list; each entry
# keeps its presence records under the 'public-global-presence:v3.0' key.
# The terminal label is the first space-separated token of the file name.
dfs = []

# sorted() makes the load order reproducible (glob order is filesystem dependent)
for file in sorted(PATH.joinpath('oil_flows', 'visits').glob('*.json')):
    # Context manager closes the handle; a bare file.open() leaves it open
    with file.open(encoding='utf-8') as fh:
        records = json.load(fh).get('entries') or []
    terminal = file.stem.split(' ')[0]
    for record in records:
        presence = pd.DataFrame(record.get('public-global-presence:v3.0'))
        presence['terminal'] = terminal
        dfs.append(presence)

if not dfs:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        f"No presence records found in {PATH.joinpath('oil_flows', 'visits')}"
    )

df = pd.concat(dfs)

# Filter out possible passerby vessels. Adapt min_hours to your needs
min_hours = 8
df = df.query('hours >= @min_hours')

# Clean: parse dates, drop the raw timestamps, keep only rows with a usable MMSI,
# cast MMSI to int *before* sorting so the sort is numeric, then renumber the rows.
df = (
    df
    .assign(date=lambda d: pd.to_datetime(d['date']))
    .drop(columns=['entryTimestamp', 'exitTimestamp'])
    .loc[lambda d: d['mmsi'].notna() & (d['mmsi'] != '')]
    .astype({'mmsi': int})
    .sort_values(['mmsi', 'date'])
    .reset_index(drop=True)
)

# Calculate the difference in days between the current date and the previous date for each 'mmsi'
# (rows are already in date order within each mmsi; the first row of a vessel gets NaN)
df['days_diff'] = df.groupby('mmsi')['date'].diff().dt.days

In [None]:
# Get vessel and owner information from GFW - uncomment to run.
# The call below is wrapped in a triple-quoted string, so the cell only evaluates
# a string literal and nothing is executed; remove the quotes to run it.
# NOTE(review): the [2598:] slice skips the first 2598 unique MMSIs - presumably
# to resume an earlier, interrupted download; confirm before re-running.
'''
vessels, owners = sf.get_vessels(query=df['mmsi'].unique()[2598:], 
                                 filename=PATH.joinpath('oil_flows', 'vessels.json'),
                                 limit=5, 
                                 field='mmsi')
'''

In [None]:
# Get vessel info.
# Both dump files are read line by line; each line is a dict literal whose first
# 'entries' item carries the vessel's 'selfReportedInfo' list. Only the first
# self-reported record of the first entry is kept per line.

rows = []
skipped = 0
for filename in ('vessels.json', 'vessels2.json'):
    with open(PATH.joinpath('oil_flows', filename), 'r') as file:
        for row in file:
            if not row.strip():
                # literal_eval raises SyntaxError on a blank/trailing line
                continue
            entries = literal_eval(row).get('entries')
            reported = entries[0].get('selfReportedInfo') if entries else None
            if not reported:
                # No vessel in this response line; indexing [0] would raise
                skipped += 1
                continue
            rows.append(reported[0])

if skipped:
    print(f'skipped {skipped} line(s) without vessel entries')

# One row per GFW vessel id, only vessels with an ssvid, exposed as integer 'mmsi'
vessels = (
    pd.DataFrame(rows)
    .drop_duplicates('id')
    .dropna(subset=['ssvid'])
    .astype({'ssvid': int})
    .rename(columns={'ssvid': 'mmsi'})
)

In [None]:
# Attach IMO, GFW vessel id, ship name and flag to every presence row.
# Left join on 'mmsi': presence rows without a matching vessel are kept.
df = df.merge(
    vessels[['mmsi', 'imo', 'id', 'shipname', 'flag']],
    how='left',
    on='mmsi',
)
df.shape[0]

In [None]:
# Get port visits from GFW - uncomment to run.
# The loop below is wrapped in a triple-quoted string, so the cell only evaluates
# a string literal and nothing is executed; remove the quotes to run it.
# It calls sf.get_events once per unique vessel id in df, passing the same
# port_visits.json filename on every call.

'''
dfs = []
for id in df.id.unique():
    port_visits = sf.get_events(vessel_id=id, 
                                event_type='port_visits', 
                                filename=PATH.joinpath('oil_flows', 'port_visits.json'),
                                start_date='2022-01-01',
                                end_date='2024-11-01'
                                )
    dfs.append(port_visits)

port_visits = pd.concat(dfs)
'''

In [None]:
# Import port visits.
# Each line of port_visits.json is a dict literal with an 'entries' list of
# port-visit events; every event is flattened into one row describing the
# vessel and the end anchorage of the visit.
# NOTE(review): lines are parsed with ast.literal_eval, which rejects JSON-only
# tokens (null/true/false) - such lines are reported and skipped; confirm the
# file really contains Python literals and not JSON.
rows = []
with open(PATH.joinpath('oil_flows', 'port_visits.json'), 'r') as file:
    for line_no, row in enumerate(file, start=1):
        if not row.strip():
            continue
        try:
            # Parse once per line (the line was previously evaluated twice)
            entries = literal_eval(row).get('entries')
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError for truncated/garbled lines
            print(f'could not parse row {line_no}')
            continue

        for record in entries or []:
            vessel = record.get('vessel') or {}
            port_visit = record.get('port_visit') or {}
            anchorage = port_visit.get('endAnchorage') or {}
            duration = port_visit.get('durationHrs')
            rows.append({'start': record.get('start'),
                         'end': record.get('end'),
                         'visit_id': record.get('id'),
                         'vessel_id': vessel.get('id'),
                         'mmsi': vessel.get('ssvid'),
                         'name': vessel.get('name'),
                         'flag': vessel.get('flag'),
                         'port': anchorage.get('name'),
                         'port_country': anchorage.get('flag'),
                         'lon': anchorage.get('lon'),
                         'lat': anchorage.get('lat'),
                         # round(None) would raise; keep a missing duration missing
                         'duration': round(duration, 1) if duration is not None else None
                         })

In [None]:
# Tabulate the flattened port-visit records and show how many were parsed
pv = pd.DataFrame(data=rows)
pv.shape[0]

In [22]:
# Parse the visit boundaries, make the MMSI an integer and cache the table as CSV
for stamp_col in ('start', 'end'):
    pv[stamp_col] = pd.to_datetime(pv[stamp_col])
pv['mmsi'] = pv['mmsi'].astype(int)
pv.to_csv(PATH / 'oil_flows' / 'port_visits.csv', index=False)

In [None]:
# Preview the first five port visits
pv.head(5)

In [6]:
# Reload the cached port visits; CSV stores timestamps as text, so 'start' and
# 'end' are parsed again and 'mmsi' is cast back to int.
pv = (
    pd.read_csv(PATH / 'oil_flows' / 'port_visits.csv')
    .assign(
        start=lambda t: pd.to_datetime(t['start']),
        end=lambda t: pd.to_datetime(t['end']),
        mmsi=lambda t: t['mmsi'].astype(int),
    )
)

In [None]:
# Merge df and port_visits on 'mmsi': every port visit is paired with every
# presence record (date, terminal) of the same vessel; visits of vessels
# without presence records are kept with a missing date/terminal.
merged_df = pd.merge(pv,
                     df[['date', 'mmsi', 'terminal']],
                     on='mmsi',
                     how='left')

# Calendar-day bounds of each visit as datetime64 values at midnight.
# Comparing the datetime64 'date' column directly against datetime.date objects
# (the result of .dt.date) raises TypeError on pandas 2.x, so the dates are
# converted back to datetime64 before comparing.
# NOTE(review): assumes df['date'] is timezone-naive - confirm.
visit_first_day = pd.to_datetime(merged_df['start'].dt.date)
visit_last_day = pd.to_datetime(merged_df['end'].dt.date)

# Filter rows where the date of df falls between the start and end date of port_visits
# (both bounds inclusive; rows with a missing date never match)
within_visit = (merged_df['date'] >= visit_first_day) & (merged_df['date'] <= visit_last_day)
filtered_df = merged_df[within_visit].copy()

filtered_df.drop_duplicates(inplace=True)
len(filtered_df)

In [25]:
# Port visits that matched no presence record are appended below the matched
# ones; they carry no 'date'/'terminal' values after the concat.
pv_filtered = pv.loc[~pv['visit_id'].isin(filtered_df['visit_id'])].copy()
visits = pd.concat([filtered_df, pv_filtered])

In [26]:
# Order by vessel and presence date, then renumber the rows from zero
visits = visits.sort_values(['mmsi', 'date']).reset_index(drop=True)

In [None]:
# Define the port countries of interest: visits in `country_of_interest` that are
# directly preceded or followed (per vessel, in chronological order) by a visit
# in `country` are collected together with that neighbouring visit.
country_of_interest = ['IND']
country = ['NLD']

# Initialize an empty list to store the rows of interest
rows_of_interest = []

# Group the dataframe by 'mmsi' to process each vessel separately
grouped = visits.groupby('mmsi')

# Iterate through each group
for mmsi, group in grouped:
    # Sort the group by the 'start' column to ensure chronological order
    group = group.sort_values('start')
    port_countries = group['port_country'].tolist()
    last_pos = len(port_countries) - 1

    for i, current_country in enumerate(port_countries):
        if current_country not in country_of_interest:
            continue

        # Previous visit, if any
        if i > 0 and port_countries[i - 1] in country:
            rows_of_interest.append(group.iloc[i])
            rows_of_interest.append(group.iloc[i - 1])

        # Next visit, if any. This must look at 'port_country' like the previous-
        # visit check; reading the 'terminal' column here compared terminal names
        # against country codes and therefore never matched.
        if i < last_pos and port_countries[i + 1] in country:
            rows_of_interest.append(group.iloc[i])
            rows_of_interest.append(group.iloc[i + 1])

# Create a new dataframe from the rows of interest (a visit can be collected
# more than once, hence the de-duplication)
result_df = pd.DataFrame(rows_of_interest).drop_duplicates()

# Display the resulting dataframe
result_df

In [None]:
# Function to count NaN values in each row
def count_nans(row):
    """Return the number of missing values in ``row`` (a pandas Series)."""
    return row.isna().sum()

# De-duplicate on (mmsi, date), keeping the most complete row of each pair.
# The per-row NaN count is computed column-wise (vectorised) instead of calling
# count_nans on every row; the helper stays defined for any cell that uses it.
# A stable sort keeps the existing row order among equally complete rows, so
# keep='first' picks the same row on every run.
# NOTE(review): rows that came from the unmatched port visits have no 'date';
# drop_duplicates treats missing values as equal, so those rows appear to
# collapse to a single row per mmsi - confirm this is intended.
visits = (
    visits
    .assign(nan_count=visits.isna().sum(axis=1))
    .sort_values('nan_count', kind='stable')
    .drop_duplicates(subset=['mmsi', 'date'], keep='first')
    .drop(columns=['nan_count'])
)

# Display the resulting dataframe
visits.head()

In [None]:
# Number of visit rows left after de-duplication
visits.shape[0]

## Search by geometry

In [None]:
# Import polygons of terminals

terminals = gpd.read_file(PATH.joinpath('geo', 'oil_terminals.geojson'))
# Declare the coordinates as WGS84 without reprojecting. set_crs is the supported
# way to do this; allow_override=True keeps the previous "always overwrite"
# behaviour of assigning to .crs when the file already declares a CRS.
terminals = terminals.set_crs('EPSG:4326', allow_override=True)

# Convert LineStrings to Polygons; missing or invalid geometries become None
# (a missing geometry has no .is_valid attribute, hence the explicit None check)
terminals['geometry'] = terminals['geometry'].apply(
    lambda geom: Polygon(geom) if geom is not None and geom.is_valid else None
)

# Drop any rows where the geometry conversion failed
terminals = terminals.dropna(subset=['geometry'])

In [None]:
# Get port visits from GFW: one sf.get_events_by_geometry request per terminal
# polygon, each result tagged with the terminal's name.
# NOTE(review): this calls the API for every terminal on each run; the CSV
# written in a later cell can be loaded instead.

geometries = terminals.geometry.__geo_interface__.get('features')
terminal_list = terminals['name'].to_list()
dfs = []

for feature, terminal in zip(geometries, terminal_list):
    # A dedicated name is used for the per-terminal result so the loop no longer
    # overwrites the notebook-level `df` (the cleaned presence table).
    terminal_events = sf.get_events_by_geometry(
        start_date='2022-01-01',
        end_date='2024-11-01',
        event_type='port_visits',
        geometry=feature.get('geometry'),
        filename=PATH.joinpath('oil_flows', 'port_visits_oil_terminals.json'),
    )
    terminal_events['terminal'] = terminal
    dfs.append(terminal_events)

terminal_visits = pd.concat(dfs)

len(terminal_visits)

In [None]:
# Get all port visits of these vessels: one sf.get_events request per GFW
# vessel id in `vessels`, concatenated into a single table.
# The per-vessel results are collected under their own name so the `visits`
# dataframe built earlier is not replaced by a list, and the loop variable no
# longer shadows the builtin `id`.

vessel_events = [
    sf.get_events(vessel_id=vessel_id,
                  start_date='2022-01-01',
                  end_date='2024-11-01',
                  event_type='port_visits',
                  filename=PATH.joinpath('oil_flows', 'prev_visits.json'))
    for vessel_id in vessels['id'].unique()
]

prev_visits = pd.concat(vessel_events)
len(prev_visits)

In [None]:
# Write the three tables to data/oil_flows as CSV (row index omitted)

exports = {
    'port_visits_oil_terminals.csv': terminal_visits,
    'vessels.csv': vessels,
    'previous_port_visits.csv': prev_visits,
}
for csv_name, frame in exports.items():
    frame.to_csv(PATH / 'oil_flows' / csv_name, index=False)

In [None]:
# Save the terminal geometries as GeoJSON.
# NOTE: this is the same path the terminals were read from, so the input file
# is overwritten with the current contents of `terminals`.
terminals.to_file(PATH.joinpath('geo', 'oil_terminals.geojson'), driver='GeoJSON')

In [None]:
# Load the cached tables back from data/oil_flows

oil_flows_dir = PATH / 'oil_flows'
terminal_visits, prev_visits, vessels = (
    pd.read_csv(oil_flows_dir / csv_name)
    for csv_name in ('port_visits_oil_terminals.csv',
                     'previous_port_visits.csv',
                     'vessels.csv')
)

In [None]:
# Clean up columns: keep the relevant event columns, give them short names,
# drop irrelevant vessel types, attach the IMO number and flag which visits
# were recorded at a terminal of interest.

cols = ['start', 'end', 'id', 'vessel.id', 'vessel.flag', 'vessel.ssvid', 'vessel.name', 'vessel.type', 
        'port_visit.durationHrs', 'position.lat', 'port_visit.startAnchorage.id', 
        'port_visit.intermediateAnchorage.id', 'port_visit.endAnchorage.id', 'position.lon', 'terminal']

terminal_visits = terminal_visits[cols].copy()
# prev_visits has no 'terminal' column yet, hence cols[:-1]
prev_visits = prev_visits[cols[:-1]].copy()

# The start-anchorage key must match the selected column name
# ('port_visit.startAnchorage.id'); a non-matching key leaves it unrenamed.
renamed = {'port_visit.durationHrs': 'duration_hrs',
           'vessel.id': 'vessel_id',
           'port_visit.intermediateAnchorage.id': 'intermediate_anchorage_id',
           'port_visit.startAnchorage.id': 'start_anchorage_id',
           'port_visit.endAnchorage.id': 'end_anchorage_id',
           'position.lat': 'lat',
           'position.lon': 'lon'
          }

# Explicit renames first, then strip the 'vessel.' prefix from what is left
# (vessel.flag -> flag, vessel.ssvid -> ssvid, ...)
cols = [renamed.get(col, col).replace('vessel.', '') for col in cols]

terminal_visits.columns = cols
prev_visits.columns = cols[:-1]

# Drop rows with irrelevant vessel types
irrelevant_types = ['fishing', 'passenger', 'gear', 'seismic_vessel']
terminal_visits = terminal_visits[~terminal_visits['type'].isin(irrelevant_types)].copy()
prev_visits = prev_visits[~prev_visits['type'].isin(irrelevant_types)].copy()

# Add IMO number to vessels.
# The vessel table built earlier in this notebook stores the identifier as
# 'mmsi' (renamed from 'ssvid'), so vessels.csv may carry either column name.
# NOTE(review): a vessel key with several IMO values duplicates visit rows here.
vessel_key = 'ssvid' if 'ssvid' in vessels.columns else 'mmsi'
imo_lookup = (
    vessels.loc[vessels['imo'].notna(), [vessel_key, 'imo']]
    .rename(columns={vessel_key: 'ssvid'})
)
prev_visits = pd.merge(prev_visits, imo_lookup, on='ssvid', how='left')
prev_visits = prev_visits.dropna(subset=['imo'])
prev_visits['imo'] = prev_visits['imo'].astype(int)

# Sort values
prev_visits = prev_visits.sort_values(['ssvid', 'start'])

# Add terminal visit column (missing for visits that are not terminal visits)
prev_visits = pd.merge(prev_visits, 
                        terminal_visits[['id', 'terminal']], 
                        on='id', how='left')

len(prev_visits)


In [None]:
# Terminal visits whose terminal name contains 'Gunvor' (missing names excluded)
terminal_visits[terminal_visits['terminal'].str.contains('Gunvor', na=False)]

In [None]:
# Make sure every visit carries its terminal label, then keep one row per visit id.
# Merging 'terminal' into a frame that already has that column would produce
# 'terminal_x'/'terminal_y' and remove 'terminal', which later cells rely on,
# so the merge only runs when the column is still missing. This also makes the
# cell safe to re-run.
if 'terminal' not in prev_visits.columns:
    prev_visits = pd.merge(prev_visits, terminal_visits[['terminal', 'id']], on='id', how='left')
prev_visits = prev_visits.drop_duplicates(subset='id')

In [None]:
# Parse the visit start timestamps and order the visits per vessel in time
prev_visits = (
    prev_visits
    .assign(start=pd.to_datetime(prev_visits['start']))
    .sort_values(['ssvid', 'start'])
)

In [None]:
# Preview the first five rows of the per-vessel visit history
prev_visits.head(5)

In [None]:
import pandas as pd

# For every visit at a terminal of interest, collect the visit itself together
# with the same vessel's directly preceding and following port visit.
# Rows are appended once per terminal visit, so a row can appear more than once.
rows_of_interest = []

# One group per vessel ('ssvid')
grouped = prev_visits.groupby('ssvid')

for ssvid, group in grouped:
    # Chronological order within the vessel
    group = group.sort_values('start')
    n_visits = len(group)

    # Positions (not labels) of the visits that have a terminal label
    terminal_positions = [
        pos for pos, at_terminal in enumerate(group['terminal'].notna()) if at_terminal
    ]

    for pos in terminal_positions:
        if pos > 0:
            rows_of_interest.append(group.iloc[pos - 1])   # visit before
        rows_of_interest.append(group.iloc[pos])           # terminal visit
        if pos < n_visits - 1:
            rows_of_interest.append(group.iloc[pos + 1])   # visit after

# Assemble the collected rows into a dataframe
port_visits_before_after = pd.DataFrame(rows_of_interest)

# Display the new dataframe
port_visits_before_after

In [None]:
# Visits labelled with the 'Eurotank' terminal
prev_visits[prev_visits['terminal'] == "Eurotank"]

In [None]:
# Before/terminal/after visit rows of the vessel with ssvid 202509374
port_visits_before_after[port_visits_before_after['ssvid'] == 202509374]