In [1]:
import pandas as pd
from pathlib import Path
import geopandas as gpd
from zipfile import ZipFile

# Root of the project data: the `data` directory inside the parent of the current working directory.
PATH = Path.cwd().parent / 'data'

# Goal

Create dataframe of a timeline with the following columns:
0. Date
1. IMO
2. Event, which could be:
    - name change
    - flag change
    - owner/manager change
    - port visit
    - loitering event
    - ais gap
    - sts area
    - inspection
3. Description: changed name from .. to .., changed flag from .. to .., changed owner from .. to .., visited port .., turned AIS off, turned AIS on, visited STS area, left STS area, inspected at .., found x deficiencies and was/was not held in detention
4. Type: flag_change, name_change, owner_change, ssvid_change, port_arrival, port_departure, loitering_start, loitering_stop, ais_off_switching, ais_on_switching, inspection. 
5. latitude
6. longitude

From 2022 onwards


### Name changes

In [None]:
def _describe_name_change(row):
    """Return the timeline text for one name-history row (needs `previous_name` and `vessel_name`)."""
    return f'Vessel name changed from {row.previous_name.upper()} to {row.vessel_name}'


# Name history, newest record first, reduced to the columns the timeline needs.
names = (
    pd.read_csv(PATH.joinpath('processed', 'owners_names.csv'))
    [['start_date', 'imo', 'vessel_name']]
    .rename(columns={'start_date': 'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

# With newest-first ordering, the next row of the same vessel is the name it carried before.
# The predecessor is looked up BEFORE the 2022 cut-off so that pre-2022 names are still available.
names['previous_name'] = names.groupby('imo')['vessel_name'].shift(-1).fillna('UNKNOWN')
names = names.query('date >= "2022-01-01"').copy()

names['description'] = names.apply(_describe_name_change, axis=1)
names['type'] = 'vessel_name_change'
names = names.drop(columns=['previous_name', 'vessel_name'])
names.head()

### Flag changes

In [None]:
def _describe_flag_change(row):
    """Return the timeline text for one flag-history row (needs `previous_flag` and `flag`)."""
    return f'Vessel flag changed from {row.previous_flag.upper()} to {row.flag}'


# Flag history, newest record first.
flag_history = (
    pd.read_csv(PATH.joinpath('processed', 'owners_flags.csv'))
    .assign(start_date=lambda frame: pd.to_datetime(frame['start_date']))
    .sort_values(by='start_date', ascending=False)
    .reset_index(drop=True)
)

# Harmonise the flag labels, then pair every record with the flag the same vessel flew before it.
flag_history['flag'] = flag_history['flag'].str.replace('Not Known', 'UNKNOWN').str.upper()
flag_history['previous_flag'] = flag_history.groupby('imo')['flag'].shift(-1).fillna('UNKNOWN')
flag_history['description'] = flag_history.apply(_describe_flag_change, axis=1)

# Timeline shape: date, imo, description, type — restricted to 2022 onwards.
flags = (
    flag_history[['start_date', 'imo', 'description']]
    .rename(columns={'start_date': 'date'})
    .query('date >= "2022-01-01"')
    .assign(type='vessel_flag_change')
)
flags.head()

### Ownership changes

In [None]:
def _describe_owner_change(row):
    """Return the timeline text for one company-history row (needs `role`, `previous_owner`, `company`)."""
    return f'{row.role.upper()} changed from {row.previous_owner.upper()} to {row.company}'


# Company history, newest record first.
owner_history = (
    pd.read_csv(PATH.joinpath('processed', 'owners_companies.csv'))
    .assign(start_date=lambda frame: pd.to_datetime(frame['start_date']))
    .sort_values(by='start_date', ascending=False)
    .reset_index(drop=True)
)

# The predecessor is searched per vessel AND per role, so each role has its own chain of companies.
owner_history['previous_owner'] = (
    owner_history.groupby(['imo', 'role'])['company'].shift(-1).fillna('UNKNOWN')
)
owner_history['description'] = owner_history.apply(_describe_owner_change, axis=1)

# Timeline shape: date, imo, description, type — restricted to 2022 onwards.
owners = (
    owner_history[['start_date', 'imo', 'description']]
    .rename(columns={'start_date': 'date'})
    .query('date >= "2022-01-01"')
    .assign(type='vessel_owner_change')
)
owners.head()

### Inspections

In [None]:
def _describe_inspection(row):
    """Return the timeline text for one inspection record."""
    return f'Vessel inspected in {row.port.upper()} in {row.authority.upper()} and {row.inspection_type} found {row.number_of_deficiencies} deficiencies. Detention={row.detention}'


# Inspection records, newest first.
inspection_log = (
    pd.read_csv(PATH.joinpath('processed', 'owners_inspections.csv'))
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

# Missing place information becomes the literal 'UNKNOWN' so that `.upper()` in the description never sees NaN.
place_columns = ['authority', 'port']
inspection_log[place_columns] = inspection_log[place_columns].fillna('UNKNOWN')
inspection_log['description'] = inspection_log.apply(_describe_inspection, axis=1)

# Timeline shape: date, imo, description, type — restricted to 2022 onwards.
inspections = (
    inspection_log[['date', 'imo', 'description']]
    .query('date >= "2022-01-01"')
    .assign(type='vessel_inspection')
)
inspections.head()

In [None]:
# Stack the four registry-derived tables (names, flags, owners, inspections) in chronological order.
registry_frames = [names, flags, owners, inspections]
events = pd.concat(registry_frames, ignore_index=True)
events = events.sort_values('date').reset_index(drop=True)
events.head()

In [49]:
# Vessel track positions; the cells below use its `imo`, `timestamp`, `lat` and `lon` columns.
location_data = pd.read_parquet(PATH / 'processed' / 'tracks.parquet')

In [50]:
#### Add location data to dataframes

def get_middle_timestamp(group):
    """Pick the chronologically middle row of ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows that share a key; must contain a ``timestamp`` column.

    Returns
    -------
    pandas.Series
        The row at position ``len(group) // 2`` after sorting by ``timestamp``
        (the later of the two central rows when the length is even). The
        Series is named after that position.
    """
    chronological = group.sort_values(by='timestamp').reset_index(drop=True)
    return chronological.loc[len(chronological) // 2]

def process_location_data(tracks, imo):
    """Return one representative track row per calendar day for a single vessel.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Track points with at least ``imo`` and ``timestamp`` columns.
    imo : scalar
        Identifier of the vessel to keep; compared with ``tracks['imo']`` by equality.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(imo, date)``; each row is the chronologically middle
        track point of that day, as chosen by ``get_middle_timestamp``.
    """
    # A boolean mask instead of an f-string `query`: the value is never parsed as an
    # expression, so identifiers that are not plain numbers (e.g. strings) work too.
    vessel_tracks = tracks[tracks['imo'] == imo].copy()
    vessel_tracks['date'] = pd.to_datetime(vessel_tracks['timestamp']).dt.date
    middle_timestamps = vessel_tracks.groupby(['imo', 'date']).apply(get_middle_timestamp, include_groups=False)

    return middle_timestamps

# Build the daily-position lookup: one row per (imo, date).
# NOTE(review): only IMOs present in `names` are processed, while the merge further down joins
# against `owners` — confirm vessels without a name change are meant to get no position.
unique_imos = names['imo'].unique()
daily_positions = [process_location_data(location_data, imo) for imo in unique_imos]
location_date_processed = pd.concat(daily_positions).reset_index()
location_date_processed['date'] = pd.to_datetime(location_date_processed['date'])



In [54]:
# Reduce event timestamps to plain calendar dates (datetime.date objects).
events['date'] = pd.to_datetime(events['date']).dt.date

In [56]:
# Same calendar-date representation for the position lookup, so the join keys compare equal.
location_date_processed = location_date_processed.assign(
    date=pd.to_datetime(location_date_processed['date']).dt.date
)

In [70]:
# Owner events are joined on `date` next, so they need calendar dates as well.
owners['date'] = pd.to_datetime(owners['date']).dt.date

In [None]:
# Attach the daily representative position to every owner event (left join keeps events without a match).
# NOTE(review): this joins `owners` only, not the combined `events` table — confirm that is intended.
position_columns = ['timestamp', 'imo', 'lat', 'lon', 'date']
merged_df = owners.merge(
    location_date_processed[position_columns],
    on=['imo', 'date'],
    how='left',
)
len(merged_df)

In [None]:
# Preview the first rows of the owner events joined with positions.
merged_df.head()

In [None]:
# Spot check: all track points of one vessel on one day.
# `.dt.date` yields datetime.date objects; comparing those with the string '2024-09-23' is
# always False, so the day is compared as a datetime.date instead.
spot_check_day = pd.Timestamp('2024-09-23').date()
location_data[(location_data.imo == 9327372) & (location_data.timestamp.dt.date == spot_check_day)]

In [60]:
# The exact timestamp of the matched track point is not needed once lat/lon are attached.
# `errors='ignore'` keeps the cell re-runnable: a second execution no longer raises KeyError.
merged_df = merged_df.drop(columns='timestamp', errors='ignore')

In [None]:
# How many events of each type ended up without a position after the join.
merged_df.loc[merged_df['lat'].isna(), 'type'].value_counts()

### Port visits

In [None]:
def _port_events(visits, time_column, event_type):
    """Turn one side (start or end) of the port-visit table into timeline rows."""
    return (
        visits[['imo', time_column, 'lat', 'lon', 'port_visit_startAnchorage_id']]
        .rename(columns={time_column: 'date', 'port_visit_startAnchorage_id': 'port'})
        .assign(type=event_type)
    )


def _describe_port_event(row):
    """Return the timeline text for an arrival or departure row."""
    if row['type'] == 'port_arrival':
        return f'Vessel arrived to {row.port.upper()}'
    return f'Vessel departed from {row.port.upper()}'


# Every port visit yields two events: an arrival at `start` and a departure at `end`.
# Both use the start-anchorage id as the port label.
port_visits = pd.read_parquet(PATH.joinpath('processed', 'ports.parquet'))
ports_start = _port_events(port_visits, 'start', 'port_arrival')
ports_end = _port_events(port_visits, 'end', 'port_departure')

ports = pd.concat([ports_start, ports_end]).reset_index(drop=True)
ports = ports.query('date >= "2022-01-01"').copy()
ports['date'] = pd.to_datetime(ports['date']).dt.date
ports['description'] = ports.apply(_describe_port_event, axis=1)
ports = ports.drop(columns=['port'])
ports.head()

### Loitering

In [None]:
def _loitering_events(episodes, time_column, event_type):
    """Turn one side (start or end) of the loitering table into timeline rows."""
    return (
        episodes[['imo', time_column, 'lat', 'lon']]
        .rename(columns={time_column: 'date'})
        .assign(type=event_type)
    )


def _describe_loitering_event(row):
    """Return the timeline text for a loitering start or end row."""
    if row['type'] == 'loitering_start':
        return 'Vessel started loitering'
    return 'Vessel stopped loitering'


# Every loitering episode yields a start event and an end event; lat/lon are kept.
loitering_episodes = pd.read_parquet(PATH.joinpath('processed', 'loitering.parquet'))
loitering_start = _loitering_events(loitering_episodes, 'start', 'loitering_start')
loitering_end = _loitering_events(loitering_episodes, 'end', 'loitering_end')

loitering = pd.concat([loitering_start, loitering_end]).reset_index(drop=True)
loitering = loitering.query('date >= "2022-01-01"').copy()
loitering['date'] = pd.to_datetime(loitering['date']).dt.date
loitering['description'] = loitering.apply(_describe_loitering_event, axis=1)

# The start/end distinction now lives only in `description`: `type` is collapsed to one label.
# NOTE(review): the Goal section lists loitering_start / loitering_stop as types — confirm the collapse is intended.
loitering['type'] = 'loitering'
loitering.sample(4)

### AIS gaps

In [None]:
# Event type -> timeline text. The start of a gap is where the vessel stops broadcasting AIS,
# and the end of a gap is where it starts broadcasting again.
AIS_DESCRIPTIONS = {
    'possible_ais_off_switching': 'Vessel stopped broadcasting AIS',
    'possible_ais_on_switching': 'Vessel started broadcasting AIS',
}

# Every AIS gap yields two events: switching off at `start` and switching on at `end`.
ais = pd.read_parquet(PATH.joinpath('processed', 'ais.parquet'))
ais_start = ais[['imo', 'start', 'lat', 'lon']].copy()
ais_end = ais[['imo', 'end', 'lat', 'lon']].copy()
ais_start.rename(columns={'start': 'date'}, inplace=True)
ais_end.rename(columns={'end': 'date'}, inplace=True)
ais_start['type'] = 'possible_ais_off_switching'
ais_end['type'] = 'possible_ais_on_switching'
ais = pd.concat([ais_start, ais_end]).reset_index(drop=True)
ais.query('date >= "2022-01-01"', inplace=True)
ais['date'] = pd.to_datetime(ais.date).dt.date

# The previous check compared `type` with 'ais_start', a value that is never assigned, so every
# row (including the switch-on events) was described as "stopped broadcasting".
# Mapping the real type labels also works on an empty frame.
ais['description'] = ais['type'].map(AIS_DESCRIPTIONS)
ais.head()

### Ship-to-ship transfers

In [44]:
# Load the candidate STS track points and attach the STS location each point lies within.
sts = pd.read_parquet(PATH.joinpath('processed', 'sts_tracks.parquet'))
sts = gpd.GeoDataFrame(sts, geometry=gpd.points_from_xy(sts.lon, sts.lat), crs='EPSG:4326')
sts_locations = gpd.read_file(PATH.joinpath('geo', 'sts_locations.geojson'), crs='EPSG:4326')
sts = gpd.sjoin(sts, sts_locations, how='left', predicate='within')
sts.drop(['index_right', 'geometry'], axis=1, inplace=True)
sts.rename(columns={'Name': 'sts_area'}, inplace=True)
# Upper-case, space-free names: the same normalisation is applied to the name history below.
sts.name = sts.name.str.upper().str.replace(' ', '').str.strip()

# A new visit ("group") starts when consecutive points of one vessel in one area are > 7 days apart.
# NOTE(review): `group` is a running counter over the whole frame, and start/end rows are later
# picked per `group` alone — confirm a group can never span more than one vessel.
sts['time_diff_days'] = sts.groupby(['name', 'sts_area']).timestamp.diff().dt.total_seconds() / (60*60*24)
sts['group'] = (sts.time_diff_days > 7).cumsum()
sts['group'] = sts.groupby(['name', 'sts_area', 'group']).group.ffill().fillna(0).astype(int)
sts.sort_values(by=['group', 'timestamp'], inplace=True)

# Duration of each visit in days (last point minus first point), merged back onto every row.
time_range = sts.groupby(['group', 'name']).agg(min_timestamp=('timestamp', 'min'), max_timestamp=('timestamp', 'max'))
time_range = (time_range['max_timestamp'] - time_range['min_timestamp']).dt.total_seconds() / (60*60*24)
sts = pd.merge(sts, time_range.reset_index(), on=['group', 'name'], how='left')
# The merged duration column arrives unnamed as the last column; give it a proper name.
sts.rename(columns={sts.columns[-1]: 'time_range'}, inplace=True)


# First and last point of every visit become the start and end events.
sts_start = sts.drop_duplicates(subset='group', keep='first').copy()
sts_end = sts.drop_duplicates(subset='group', keep='last').copy()
sts_start['type'] = 'possible_sts_start'
sts_end['type'] = 'possible_sts_end'

sts = pd.concat([sts_start, sts_end]).reset_index(drop=True)

# Keep visits from 2022 onwards that lasted longer than one day.
sts.sort_values(by=['group', 'timestamp'], inplace=True)
sts.query('timestamp >= "2022-01-01" & time_range > 1', inplace=True)

sts = sts[['timestamp', 'lat', 'lon', 'sts_area', 'name', 'type', 'group']].copy()
sts.rename(columns={'timestamp': 'date'}, inplace=True)

sts.date = pd.to_datetime(sts.date).dt.date
# Compare against the type label that is actually assigned above ('possible_sts_start');
# the former check for 'sts_start' never matched, so every event was described as "exited".
sts['description'] = sts.apply(
    lambda row: f'Vessel {row["name"]} entered {row.sts_area.upper()}'
    if row['type'] == 'possible_sts_start'
    else f'Vessel {row["name"]} exited {row.sts_area.upper()}',
    axis=1,
)

# Drop same-day duplicates, then keep only visits that still have both a start and an end row.
sts.sort_values(by=['group', 'date'], inplace=True)
sts.drop_duplicates(subset=['date', 'name', 'sts_area', 'type'], inplace=True)
sts = sts[sts.groupby('group').group.transform('count') > 1].reset_index(drop=True)

# Resolve vessel names to IMO numbers through the name history.
n = pd.read_csv(PATH.joinpath('processed', 'owners_names.csv'))
n.vessel_name = n.vessel_name.str.upper().str.replace(' ', '').str.strip()
# One row per (name, imo) pair: a pair that occurs several times in the history would
# otherwise duplicate every STS event of that vessel in the merge.
name_to_imo = n[['vessel_name', 'imo']].drop_duplicates()

sts = pd.merge(sts,
                name_to_imo,
                left_on='name',
                right_on='vessel_name',
                how='left')

# Events whose vessel name is not in the name history cannot be placed on an IMO timeline.
sts.dropna(subset='vessel_name', inplace=True)

sts = sts[['date', 'imo', 'description', 'type']].copy()

# `date` already holds calendar dates from the conversion above; only `imo` needs casting
# (the left merge may have produced floats).
sts.imo = sts.imo.astype(int)

### Bring it all together

In [45]:
# Make sure the registry events carry plain calendar dates like the other event tables.
events['date'] = pd.to_datetime(events['date']).dt.date

In [62]:
# Final timeline: registry events plus port, loitering, AIS and STS events, oldest first.
# `events` and `sts` carry no lat/lon columns, so those values are NaN for their rows.
timeline_parts = [events, ports, loitering, ais, sts]
timeline = pd.concat(timeline_parts, ignore_index=True)
timeline = timeline.sort_values('date').reset_index(drop=True)

In [None]:
# Preview the earliest rows of the combined timeline.
timeline.head()

In [82]:
# Persist the timeline (without the positional index) to the `db` folder of the data directory.
timeline_file = PATH / 'db' / 'timeline.csv'
timeline.to_csv(timeline_file, index=False)

In [None]:
# Column dtypes and non-null counts of the exported timeline.
timeline.info()

In [None]:
# Final look at the first rows of the exported timeline.
timeline.head()