# Georgraphic Data Cleaning

We need to get a baseline of the locations for the fishing areas and then the locations of the companies that own the fishing vessels



In [1]:
import contextily as cx
import matplotlib.pyplot as plt
from shapely.geometry import Point
import networkx as nx
import numpy as np
import geopandas as gpd
import os
import pandas as pd

## Fishing Locations

First we will get the Centroids of the EEZs. 

The EEZ shapefiles are from: Maritime Boundaries Geodatabase: Maritime Boundaries and Exclusive Economic Zones (200NM), version 12

In [None]:
input_shapefile = "geo_files/World_EEZ_v12_20231025_0_360/eez_v12_0_360.gpkg"
output_centroids = "geo_files/EEZ_centroids.gpkg"

# read World EEZs
eezs = gpd.read_file(input_shapefile)

In [None]:
# filter for the EEZs we have data for
west_pac = ['United States Exclusive Economic Zone (Guam)', 'United States Exclusive Economic Zone (Northern Mariana Islands)',
            'Kiribati Exclusive Economic Zone (Phoenix Group)', 'Vanuatuan Exclusive Economic Zone', 'Solomon Islands Exclusive Economic Zone',
            'Fijian Exclusive Economic Zone', 'Kiribati Exclusive Economic Zone (Gilbert Islands)', 'Micronesian Exclusive Economic Zone', 
            'Papua New Guinean Exclusive Economic Zone', 'Nauruan Exclusive Economic Zone','Palauan Exclusive Economic Zone',
             'Marshallese Exclusive Economic Zone', 'Tongan Exclusive Economic Zone', 'Tuvaluan Exclusive Economic Zone' ]
eezs = eezs[eezs['GEONAME'].isin(west_pac)]

print(eezs.head(15))

     MRGID                                            GEONAME  MRGID_TER1  \
8     8448                     Tongan Exclusive Economic Zone      8674.0   
20    8450   Kiribati Exclusive Economic Zone (Phoenix Group)      8658.0   
37    8313                  Vanuatuan Exclusive Economic Zone      2202.0   
47    8314            Solomon Islands Exclusive Economic Zone      8593.0   
192  48957       United States Exclusive Economic Zone (Guam)      8599.0   
193   8318                Marshallese Exclusive Economic Zone      2226.0   
194  48980  United States Exclusive Economic Zone (Norther...      8598.0   
195   8315                    Palauan Exclusive Economic Zone      8594.0   
243   8326                   Tuvaluan Exclusive Economic Zone      2210.0   
246   8325                     Fijian Exclusive Economic Zone      2108.0   
249   8488  Kiribati Exclusive Economic Zone (Gilbert Isla...     17597.0   
253   8316                Micronesian Exclusive Economic Zone      8595.0   

In [49]:
# find centroids and write to a file
eez_centroids = eezs[['GEONAME','geometry']].copy()
eez_centroids['geometry'] = eez_centroids['geometry'].centroid

eez_centroids.to_file(output_centroids)
print(eez_centroids.head(20))

                                               GEONAME  \
8                       Tongan Exclusive Economic Zone   
20    Kiribati Exclusive Economic Zone (Phoenix Group)   
37                   Vanuatuan Exclusive Economic Zone   
47             Solomon Islands Exclusive Economic Zone   
192       United States Exclusive Economic Zone (Guam)   
193                Marshallese Exclusive Economic Zone   
194  United States Exclusive Economic Zone (Norther...   
195                    Palauan Exclusive Economic Zone   
243                   Tuvaluan Exclusive Economic Zone   
246                     Fijian Exclusive Economic Zone   
249  Kiribati Exclusive Economic Zone (Gilbert Isla...   
253                Micronesian Exclusive Economic Zone   
256          Papua New Guinean Exclusive Economic Zone   
258                    Nauruan Exclusive Economic Zone   

                        geometry  
8    POINT (185.23414 -20.22175)  
20    POINT (187.54603 -3.73181)  
37   POINT (168.55743 -1


  eez_centroids['geometry'] = eez_centroids['geometry'].centroid


Now we will add in the geometries from the High Seas Pockets

In [85]:
# Check the Coordinate Reference Systems
file = 'geo_files/HSP_geos/hps3_geometry.geojson'
test = gpd.read_file(file)

print(test.crs)

print(eez_centroids.crs)

EPSG:4326
EPSG:4326


In [72]:
# folder of geojsons
hsp_path = 'geo_files/HSP_geos/'

# init file to hold geometries
hsp_centroids = gpd.GeoDataFrame()

# loop through the files to combine to one dataframe
for file in os.listdir(hsp_path):
    # update name
    hsp = gpd.read_file(hsp_path+file)
    hsp_num = str(file)[3:4]
    hsp_name = "High Seas Pocket "+hsp_num
    hsp.columns = ['GEONAME','geometry']
    hsp['GEONAME'] = hsp_name
    # find the centroids
    hsp['geometry'] = hsp['geometry'].centroid
    # add to data frame
    hsp_centroids = pd.concat([hsp_centroids, hsp], ignore_index=True)

print(hsp_centroids.head(10))


              GEONAME                     geometry
0  High Seas Pocket 6  POINT (-165.39579 -4.88687)
1  High Seas Pocket 2    POINT (142.87993 3.05822)
2  High Seas Pocket 3   POINT (155.95434 16.04416)
3  High Seas Pocket 7   POINT (-164.5541 -4.16105)
4  High Seas Pocket 5  POINT (173.01948 -15.37524)
5  High Seas Pocket 4   POINT (165.92438 -3.38587)
6  High Seas Pocket 1   POINT (133.26613 16.03818)



  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid

  hsp['geometry'] = hsp['geometry'].centroid


In [74]:
# combine the two files together and save again

all_centroids = pd.concat([hsp_centroids, eez_centroids], ignore_index=True)

all_centroids.to_file(output_centroids)

print(all_centroids.head(20))

                                              GEONAME  \
0                                  High Seas Pocket 6   
1                                  High Seas Pocket 2   
2                                  High Seas Pocket 3   
3                                  High Seas Pocket 7   
4                                  High Seas Pocket 5   
5                                  High Seas Pocket 4   
6                                  High Seas Pocket 1   
7                      Tongan Exclusive Economic Zone   
8    Kiribati Exclusive Economic Zone (Phoenix Group)   
9                   Vanuatuan Exclusive Economic Zone   
10            Solomon Islands Exclusive Economic Zone   
11       United States Exclusive Economic Zone (Guam)   
12                Marshallese Exclusive Economic Zone   
13  United States Exclusive Economic Zone (Norther...   
14                    Palauan Exclusive Economic Zone   
15                   Tuvaluan Exclusive Economic Zone   
16                     Fijian E

## Locations of Vessel Owners and Home Ports

The addresses for the owners are not uniform at all, some are just cities, some are PO boxes, and others are regular addresses. The addresses are also spread across many countries, which do not have uniform ways to write addresses. To fix this and to get the latitude and longitude of these locations, I will use OpenAI's API to get the info.

Note: I used ChatGPT to trouble shoot my queries before using the API and also used it to help set up the API key funcitons and parse the response.

In [4]:
from openai import OpenAI
import json
import os
import re

# Load API key from JSON
def load_api_key(file_path):
    with open(file_path, 'r') as f:
        config = json.load(f)
    return config.get("openai_api_key")

# Load the OpenAI API key
api_key_file = 'api-keys.json'
openai_api_key = load_api_key(api_key_file)

# Instantiate the client
client = OpenAI(api_key=openai_api_key)

Create functions to call and parse the OpenAI API.

In [5]:
# Process job descriptions with GPT
def owner_location(owner_address):
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant helping to find the location of these companies."},
            {
                "role": "user",
                "content": "Can you look at this address and give me a bulleted list of its detailed location."+
                "Please provide the response as a bulleted list with the following structure (do not include the headers of the bullets):"+
                "-- [Cleaned Address] -- [Providence/State] -- [Country(three letter abbreviation)]"+ 
                "-- [Latitude (XXX.XXXXX format] --  [Longitude (XX.XXXXX format]"+
                "Here is the address:"+owner_address
            }
        ]
    )

    return completion.choices[0].message.content

def parse_location_response(response: str) -> dict:
    # Default structure
    result = {
        "Cleaned Address": None,
        "Province/State": None,
        "Country": None,
        "Latitude": None,
        "Longitude": None
    }

    # Clean and split the response into lines
    lines = [line.strip().lstrip('-').strip() for line in response.strip().split('\n') if line.strip()]

    for line in lines:
        line_lower = line.lower()

        # Handle labeled latitude/longitude, sometimes the API will label lat and long and sometimes not
        # This part of the function handles both
        if "latitude" in line_lower:
            try:
                result["Latitude"] = float(re.search(r"[-+]?\d*\.\d+|\d+", line).group())
            except:
                pass
        elif "longitude" in line_lower:
            try:
                result["Longitude"] = float(re.search(r"[-+]?\d*\.\d+|\d+", line).group())
            except:
                pass
        # Fallback to ordered structure for unlabeled values
        elif result["Cleaned Address"] is None:
            result["Cleaned Address"] = line
        elif result["Province/State"] is None:
            result["Province/State"] = line
        elif result["Country"] is None:
            result["Country"] = line
        elif result["Latitude"] is None:
            try:
                result["Latitude"] = float(line)
            except:
                pass
        elif result["Longitude"] is None:
            try:
                result["Longitude"] = float(line)
            except:
                pass

    return result

First I will test out the functions to make sure they are working before sending the whole list of addresses to it.

In [44]:
# load aid and RFV merged file
rfv = pd.read_csv('../data/ais_rfv_long.csv')

# rfv = rfv[:5]

In [88]:
orig_address = rfv['owner_address'][564]
print(orig_address)

response = owner_location(orig_address)
print(response)
# parse the response
address_info = parse_location_response(response)

# add all the informaiton together
new_row_df = pd.DataFrame({'original_address': [orig_address], 'new_address': [address_info['Cleaned Address']], 
                            'owner_providence_state':[address_info['Province/State']], 'owner_country':[address_info['Country']],
                            'owner_latitude':[address_info['Latitude']],'owner_longitude':[address_info['Longitude']]})

print(new_row_df)

9F SHANDONG HIGH-SPEED BUILDING,NO.29 MIAOLING ROAD,QINGDAO,CHINA
- 9F Shandong High-Speed Building, No.29 Miaoling Road  
- Qingdao  
- CHN  
- 36.06732  
- 120.38702  
                                    original_address  \
0  9F SHANDONG HIGH-SPEED BUILDING,NO.29 MIAOLING...   

                                         new_address owner_providence_state  \
0  9F Shandong High-Speed Building, No.29 Miaolin...                Qingdao   

  owner_country  owner_latitude  owner_longitude  
0           CHN        36.06732        120.38702  


Now I will do it on just the unique addresses and then combine with the other dataset later.

In [92]:
# find just the unique addresses
u_addresses = rfv['owner_address'].unique()
# print(len(u_addresses))
# There are 616 unique addresses

# # for testing a smaller group
# u_addresses_sub = u_addresses[305:310]
# # print(u_addresses_sub)

# init a df to store the location results
company_locs = pd.DataFrame()

for i in range(0,len(u_addresses)):
    # the old address
    orig_address = u_addresses[i]
    # use the OpenAI API to find the location info
    response = owner_location(orig_address)
    # parse the response
    address_info = parse_location_response(response)

    # add all the informaiton together
    new_row_df = pd.DataFrame({'original_address': [orig_address], 'new_address': [address_info['Cleaned Address']], 
                               'owner_providence_state':[address_info['Province/State']], 'owner_country':[address_info['Country']],
                                'owner_latitude':[address_info['Latitude']],'owner_longitude':[address_info['Longitude']]})
    
    # add to the dataframe
    company_locs = pd.concat([company_locs,new_row_df], ignore_index=True)
    # save to csv in case something goes wrong while the API is running
    company_locs.to_csv('../data/company_locations_unique.csv', index=False)

# print the dataframe to check 
print(len(company_locs))
print(company_locs.head())

  company_locs = pd.concat([company_locs,new_row_df], ignore_index=True)


616
                                    original_address  \
0  Room503,5F,Building 1, No.1 Xingye Road, Ganla...   
1  ROOM 1606 DIAMOND PLAZA,415 JIANGAN ROAD,JIANG...   
2  Building 3, 87 Yuan 'an xinjie, Haizhu Distric...   
3  Room 1401,No.644 Tongfu Road East, Guangzhou, ...   
4  Room 1401, No.644 Tongfu Road East, Guangzhou,...   

                                         new_address owner_providence_state  \
0  Room 503, 5F, Building 1, No. 1 Xingye Road, G...               Zhejiang   
1  ROOM 1606 DIAMOND PLAZA, 415 JIANGAN ROAD, JIA...                   None   
2  Building 3, 87 Yuan 'an xinjie, Haizhu Distric...     Guangdong Province   
3                Room 1401, No. 644 Tongfu Road East   Guangzhou, Guangdong   
4                 Room 1401, No.644 Tongfu Road East   Guangzhou, Guangdong   

  owner_country  owner_latitude  owner_longitude  
0           CHN       29.983330       122.113330  
1          None             NaN              NaN  
2           CHN       23.104274

### Home Port Locations

Now I want to find all the exact locations of the home ports.

In [None]:
# load aid and RFV merged file
rfv = pd.read_csv('../data/ais_rfv_long.csv')

# find just the unique homeports
u_ports = rfv['home_port'].unique()
# print(len(u_ports))
# there are 82 unique ports

82


Using the OpenAI API again, I will try the original location function out for the port.

In [9]:
orig_port = rfv['home_port'][564]
print(orig_port)

response = owner_location(orig_port)
print(response)
# parse the response
port_info = parse_location_response(response)

# add all the informaiton together
new_row_df = pd.DataFrame({'original_address': [orig_port], 'new_address': [port_info['Cleaned Address']], 
                            'owner_providence_state':[port_info['Province/State']], 'owner_country':[port_info['Country']],
                            'owner_latitude':[port_info['Latitude']],'owner_longitude':[port_info['Longitude']]})

print(new_row_df)

('Qingdao', 'CHN')
- Qingdao  
- Shandong  
- CHN  
- 36.099999  
- 120.371111
     original_address new_address owner_providence_state owner_country  \
0  ('Qingdao', 'CHN')     Qingdao               Shandong           CHN   

   owner_latitude  owner_longitude  
0       36.099999       120.371111  


In [15]:
# # # for testing a smaller group
# u_port_sub = u_ports[30:35]
# # print(u_port_sub)

# init a df to store the location results
port_locs = pd.DataFrame()

for i in range(0,len(u_ports)):
    # the port name
    orig_port = u_ports[i]

    # call the API again
    response = owner_location(orig_port)
    # parse the response
    port_info = parse_location_response(response)

    # add all the informaiton together
    new_row_df = pd.DataFrame({'original_home_port': [orig_port], 'clean_home_port': [port_info['Cleaned Address']], 
                                'home_port_providence_state':[port_info['Province/State']], 'home_port_country':[port_info['Country']],
                                'home_port_latitude':[port_info['Latitude']],'home_port_longitude':[port_info['Longitude']]})
    
    # add to the dataframe
    port_locs = pd.concat([port_locs,new_row_df], ignore_index=True)
    # save to csv in case something goes wrong while the API is running
    port_locs.to_csv('../data/port_locations_unique.csv', index=False)

# print the dataframe to check 
print(len(port_locs))
print(port_locs.head())

82
     original_home_port clean_home_port      home_port_providence_state  \
0   ('Zhoushan', 'CHN')        Zhoushan                        Zhejiang   
1     ('Ningbo', 'CHN')   Ningbo, China               Zhejiang Province   
2  ('Guangzhou', 'CHN')       Guangzhou              Guangdong Province   
3     ('Shidao', 'CHN')          Shidao  Providence/State: 不明 (Unknown)   
4   ('Shanghai', 'CHN')        Shanghai                        Shanghai   

                          home_port_country  home_port_latitude  \
0                                       CHN            29.99000   
1                                       CHN            29.86820   
2                                       CHN            23.12911   
3  Country (three letter abbreviation): CHN            25.18109   
4                                       CHN            31.23040   

   home_port_longitude  
0            122.20000  
1            121.54940  
2            113.26436  
3            119.43783  
4            121.4

## Geo Data Merging

Now we will merge the locations with our other data

In [3]:
# load aid and RFV merged file
rfv = pd.read_csv('../data/ais_rfv_long.csv')

# load the two new geo coded files
ports = pd.read_csv("../data/port_locations_unique.csv")
company_locs = pd.read_csv("../data/company_locations_unique.csv")

In [4]:
print(rfv.head())

  flag     vessel_name         mmsi        imo  fishing_hours   fishing_area  \
0  CHN    DONG YU 1521  412421095.0        NaN          22.86           hsp7   
1  CHN    DONG YU 1521  412421095.0        NaN         291.10           hsp6   
2  CHN    DONG YU 1521  412421095.0        NaN        1421.71   kirbati2_eez   
3  CHN    DONG YU 1521  412421095.0        NaN         741.84  kiribati1_eez   
4  CHN  FENG XIANG 818  412439604.0  8996176.0          12.19      nauru_eez   

                                     owner  \
0  Zhejiang Xingpeng Ocean Fishery CO.,LTD   
1  Zhejiang Xingpeng Ocean Fishery CO.,LTD   
2  Zhejiang Xingpeng Ocean Fishery CO.,LTD   
3  Zhejiang Xingpeng Ocean Fishery CO.,LTD   
4    Ningbo Yongfa Ocean Fisheries CO.,LTD   

                                       owner_address   vessel_type  \
0  Room503,5F,Building 1, No.1 Xingye Road, Ganla...     Longliner   
1  Room503,5F,Building 1, No.1 Xingye Road, Ganla...     Longliner   
2  Room503,5F,Building 1, No.1 X

In [7]:
print(ports.head())

     original_home_port clean_home_port      home_port_providence_state  \
0   ('Zhoushan', 'CHN')        Zhoushan                        Zhejiang   
1     ('Ningbo', 'CHN')   Ningbo, China               Zhejiang Province   
2  ('Guangzhou', 'CHN')       Guangzhou              Guangdong Province   
3     ('Shidao', 'CHN')          Shidao  Providence/State: 不明 (Unknown)   
4   ('Shanghai', 'CHN')        Shanghai                        Shanghai   

                          home_port_country  home_port_latitude  \
0                                       CHN            29.99000   
1                                       CHN            29.86820   
2                                       CHN            23.12911   
3  Country (three letter abbreviation): CHN            25.18109   
4                                       CHN            31.23040   

   home_port_longitude  
0            122.20000  
1            121.54940  
2            113.26436  
3            119.43783  
4            121.4737

In [12]:
# merge the data frames
merged = pd.merge(rfv, company_locs, left_on="owner_address", right_on = "original_address")
merged = pd.merge(merged, ports, left_on="home_port", right_on = "original_home_port")

# drop some redundent columns
merged = merged[['flag', 'vessel_name', 'fishing_hours', 'fishing_area',
       'owner', 'vessel_type', 'new_address', 'owner_providence_state',
       'owner_country', 'owner_latitude', 'owner_longitude',
       'clean_home_port', 'home_port_providence_state',
       'home_port_country', 'home_port_latitude', 'home_port_longitude']]

# rename some columns
merged.columns = ['flag', 'vessel_name', 'fishing_hours', 'fishing_area',
       'owner', 'vessel_type', 'owner_address', 'owner_providence_state',
       'owner_country', 'owner_latitude', 'owner_longitude',
       'home_port_name', 'home_port_providence_state',
       'home_port_country', 'home_port_latitude', 'home_port_longitude']

print(merged.head())

# save to a file
merged.to_csv("../data/geo_ais_rfv_all.csv", index=False)

  flag     vessel_name  fishing_hours   fishing_area  \
0  CHN    DONG YU 1521          22.86           hsp7   
1  CHN    DONG YU 1521         291.10           hsp6   
2  CHN    DONG YU 1521        1421.71   kirbati2_eez   
3  CHN    DONG YU 1521         741.84  kiribati1_eez   
4  CHN  FENG XIANG 818          12.19      nauru_eez   

                                     owner   vessel_type  \
0  Zhejiang Xingpeng Ocean Fishery CO.,LTD     Longliner   
1  Zhejiang Xingpeng Ocean Fishery CO.,LTD     Longliner   
2  Zhejiang Xingpeng Ocean Fishery CO.,LTD     Longliner   
3  Zhejiang Xingpeng Ocean Fishery CO.,LTD     Longliner   
4    Ningbo Yongfa Ocean Fisheries CO.,LTD  Purse seiner   

                                       owner_address owner_providence_state  \
0  Room 503, 5F, Building 1, No. 1 Xingye Road, G...               Zhejiang   
1  Room 503, 5F, Building 1, No. 1 Xingye Road, G...               Zhejiang   
2  Room 503, 5F, Building 1, No. 1 Xingye Road, G...             