# Data Cleaning

AIS data

In [3]:
import pandas as pd
import numpy as np
import os
import re

All the AIS files are the same time period: 01JUL2024 - 01JUL2025
AIS data is from Global Fishing Watch and utilizes AIS data and their algorithm to identify 'apparent fishing efforts'. This is aimed to be just fishing vessel movement and activity information.

Data is broken down by EEZ and High Seas Pockets (for areas outside the EEZs) and concentrated in the highy traversed parts of the WCPFC's Convention Area.

In [96]:
directory_path = '../data/AIS_data'

main_df = pd.DataFrame(columns=['flag', 'vessel_name', 'mmsi', 'imo'])


for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    print(file_path)
    if file_path != "../data/AIS_data/.DS_Store":
        df = pd.read_csv(file_path)

        df_filter = df[['Flag', 'Vessel Name', 'MMSI', 'IMO', 'Apparent Fishing Hours']].copy()

        area_name = file_path[17:][:-4]
        print(area_name)

        df_filter.columns = ['flag', 'vessel_name', 'mmsi', 'imo',str(area_name+'_fishing_hours')]

        df_filter['mmsi'] = df_filter['mmsi'].apply(lambda x: str(int(x)) if pd.notna(x) else x)
        df_filter['imo'] = df_filter['imo'].apply(lambda x: str(int(x)) if pd.notna(x) else x)

        df_filter['vessel_name'] = df_filter['vessel_name'].astype(str).str.upper()
        df_filter['vessel_name'] = df_filter['vessel_name'].str.replace("F/V", "", regex=False).str.replace("FV ", "", regex=False).str.replace("\"", "", regex=False).str.replace(".", "", regex=False).str.replace('"', '', regex=False).str.strip()

        df_filter = df_filter.drop_duplicates()

        main_df = pd.merge(main_df, df_filter, on = ['flag', 'vessel_name', 'mmsi', 'imo'], how = 'outer')

# Replace blanks and string 'nan' with actual NaN
main_df = main_df.replace(['', 'nan'], pd.NA)
# resolve duplicate vessels
# group by the first four columns and compute the mean for the rest
grouped_df = main_df.groupby(['flag', 'vessel_name', 'mmsi', 'imo'], dropna=False).mean(numeric_only=True).reset_index()

# check columns
print(grouped_df.columns)

# write to a csv
grouped_df.to_csv("../data/all_aid_data.csv", index=False)

../data/AIS_data/rmi_eez.csv
rmi_eez
../data/AIS_data/tonga_eez.csv
tonga_eez
../data/AIS_data/.DS_Store
../data/AIS_data/nauru_eez.csv
nauru_eez
../data/AIS_data/vanuatu_eez.csv
vanuatu_eez
../data/AIS_data/hsp2.csv
hsp2
../data/AIS_data/hsp3.csv
hsp3
../data/AIS_data/hsp1.csv
hsp1
../data/AIS_data/usa2_eez.csv
usa2_eez
../data/AIS_data/hsp4.csv
hsp4
../data/AIS_data/hsp5.csv
hsp5
../data/AIS_data/usa_eez.csv
usa_eez
../data/AIS_data/hsp7.csv
hsp7
../data/AIS_data/hsp6.csv
hsp6
../data/AIS_data/palau_eez.csv
palau_eez
../data/AIS_data/fsm_eez.csv
fsm_eez
../data/AIS_data/kirbati2_eez.csv
kirbati2_eez
../data/AIS_data/png_eez.csv
png_eez
../data/AIS_data/fiji_eez.csv
fiji_eez
../data/AIS_data/tuvalu_eez.csv
tuvalu_eez
../data/AIS_data/kiribati1_eez.csv
kiribati1_eez
../data/AIS_data/solomon_islands_eez.csv
solomon_islands_eez
Index(['flag', 'vessel_name', 'mmsi', 'imo', 'rmi_eez_fishing_hours',
       'tonga_eez_fishing_hours', 'nauru_eez_fishing_hours',
       'vanuatu_eez_fishing_hou