In [1]:
import matplotlib.pyplot as plt
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Polygon, LineString, Point
from collections import Counter

In [2]:
#ref 1
#Benthic habitat shapefiles, read in the order: St Thomas/St John, Puerto Rico, St Croix
shapefile_paths = (
    '../data/raw/E95-St_Thomas_John_benthic Map_Shapefile/stsj_fin.shp',
    '../data/raw/E95-Puerto_Rico_benthic habitat Map_Shapefile',
    '../data/raw/E95-St_Croix_benthic habitat Map_Shapefile',
)
df_stj, df_pr, df_stc = (gpd.read_file(path) for path in shapefile_paths)

In [3]:
#(rows, columns) of each regional frame, one per line
for frame in (df_stj, df_pr, df_stc):
    print(frame.shape)

(1301, 11)
(5288, 12)
(1184, 11)


In [4]:
#Column index of each regional frame, to compare their schemas
for frame in (df_stj, df_pr, df_stc):
    print(frame.columns)

Index(['POLYGONID', 'AREA', 'ID', 'DESCRIPTOR', 'ZONE', 'HABITAT', 'TYPE',
       'MODIFIER1', 'MODIFIER2', 'MODIFIER3', 'geometry'],
      dtype='object')
Index(['POLYGONID', 'AREA', 'ID', 'DESCRIPTOR', 'ZONE', 'HABITAT', 'TYPE',
       'MODIFIER1', 'MODIFIER2', 'MODIFIER3', 'REPRESENTA', 'geometry'],
      dtype='object')
Index(['POLYGONID', 'AREA', 'ID', 'DESCRIPTOR', 'ZONE', 'HABITAT', 'TYPE',
       'MODIFIER1', 'MODIFIER2', 'MODIFIER3', 'geometry'],
      dtype='object')


In [5]:
#Drop extra pr column
#Show its distinct values first so the reader can confirm nothing useful is lost.
#The guard plus a non-inplace drop keeps this cell safe to re-run: once the
#column is gone the cell is a no-op instead of raising KeyError.
if 'REPRESENTA' in df_pr.columns:
    print(df_pr['REPRESENTA'].unique())
    df_pr = df_pr.drop(columns = 'REPRESENTA')

[None]


In [6]:
#add polygon id as feature
#Each frame gets a 'polyid' column: its region prefix followed by the POLYGONID value
for template, frame in (('PR_{}', df_pr), ('STJ_{}', df_stj), ('STC_{}', df_stc)):
    frame['polyid'] = [template.format(pid) for pid in frame['POLYGONID']]

In [7]:
#PR and USVI dataframe concat
#Rows are stacked in the order St Thomas/St John, Puerto Rico, St Croix;
#each frame keeps its own row labels (ignore_index = False is the default).
frames = [df_stj, df_pr, df_stc]
df_combined = pd.concat(frames, axis = 0, ignore_index = False)
df_combined.shape

(7773, 12)

In [8]:
#save file with geodata
#Remove the id/modifier columns before writing. The drop is non-inplace and
#uses errors = 'ignore' so that re-running this cell (when the columns are
#already gone) does not raise KeyError.
df_combined = df_combined.drop(
    columns = ['ID', 'POLYGONID', 'MODIFIER1', 'MODIFIER2', 'MODIFIER3'],
    errors = 'ignore',
)
df_combined.to_file("../data/processed/with_land_df.shp")

In [18]:
#Rows whose DESCRIPTOR is "Unknown" carry little information, so they are dropped
df = df_combined.copy()
#Preview the rows that are both Unclassified and Unknown before removing anything
is_unknown = df['DESCRIPTOR'] == 'Unknown'
is_unclassified = df['ZONE'] == 'Unclassified'
print(df[is_unclassified & is_unknown].head())
#Keep everything whose DESCRIPTOR is not "Unknown"
t_df = df[df['DESCRIPTOR'] != 'Unknown']
n_dropped = len(df) - len(t_df)
print('Number of unknown DESCRIPTOR dropped: {}' .format(n_dropped))

              AREA DESCRIPTOR          ZONE             HABITAT     TYPE  \
1234  1.272646e+06    Unknown  Unclassified  Other Delineations  Unknown   
1295  3.287270e+08    Unknown  Unclassified  Other Delineations  Unknown   
858   3.066965e+05    Unknown  Unclassified  Other Delineations  Unknown   
893   1.775990e+06    Unknown  Unclassified  Other Delineations  Unknown   
919   5.356256e+05    Unknown  Unclassified  Other Delineations  Unknown   

                                               geometry    polyid  
1234  POLYGON ((-64.91207 18.26981, -64.91196 18.270...  STJ_1235  
1295  MULTIPOLYGON (((-64.95861 18.32100, -64.95882 ...  STJ_1296  
858   POLYGON ((-65.36747 18.10804, -65.36750 18.108...    PR_859  
893   POLYGON ((-65.41597 18.10542, -65.41596 18.105...    PR_894  
919   POLYGON ((-65.46880 18.08202, -65.46880 18.081...    PR_920  
Number of unknown DESCRIPTOR dropped: 107


In [24]:
#unclassified ZONE still have substantial info
#List the distinct DESCRIPTOR / HABITAT / TYPE values among the remaining Unclassified rows
unclassified = t_df[t_df['ZONE'] == 'Unclassified']
for col in ('DESCRIPTOR', 'HABITAT', 'TYPE'):
    print(unclassified[col].unique())

['Land' 'Seagrass/Patchy/30-50%' 'Mangrove' 'Seagrass/Patchy/70-90%'
 'Reef/Colonized Bedrock' 'Reef/Patch Reef (Aggregated)'
 'Seagrass/Patchy/10-30%' 'Mud']
['Other Delineations' 'Submerged Vegetation'
 'Coral Reef and Colonized Hardbottom' 'Unconsolidated Sediments']
['Land' 'Seagrass' 'Mangrove' 'Colonized Bedrock'
 'Patch Reef (Aggregated)' 'Mud']


In [25]:
#Index the rows by polyid; drop = False keeps polyid available as a regular column too
df = t_df.set_index('polyid', drop = False)
df.head()

Unnamed: 0_level_0,AREA,DESCRIPTOR,ZONE,HABITAT,TYPE,geometry,polyid
polyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
STJ_1,6445.33,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,"POLYGON ((-64.80970 18.36644, -64.80955 18.366...",STJ_1
STJ_2,47409.3,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,"POLYGON ((-64.80417 18.35974, -64.80419 18.359...",STJ_2
STJ_3,44504.3,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,"POLYGON ((-64.81332 18.35635, -64.81342 18.356...",STJ_3
STJ_4,5698.1,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,"POLYGON ((-64.81626 18.34365, -64.81617 18.344...",STJ_4
STJ_5,24547.68,Reef/Linear Reef,Forereef,Coral Reef and Colonized Hardbottom,Linear Reef,"POLYGON ((-64.79880 18.35579, -64.79875 18.355...",STJ_5


In [26]:
#Drop land for analysis
not_land = df['DESCRIPTOR'] != 'Land'
df = df[not_land].copy()
print(df.shape)

(7384, 7)


In [27]:
#ref 2 lionfish reported data
cols = ['Latitude', 'Longitude', 'Accuracy', 'Drainage Name', 'Year', 'Month', 'Day']
min_lon, max_lon, min_lat, max_lat = -68.0, -64.4, 17.55, 18.6 #StThomas, StJohn, PR
lf_df = pd.read_csv('../data/raw/NAS-Data-Download.csv')[cols]
#Keep only reports strictly inside the bounding box (the edges are excluded)
in_lon_range = (lf_df['Longitude'] > min_lon) & (lf_df['Longitude'] < max_lon)
in_lat_range = (lf_df['Latitude'] > min_lat) & (lf_df['Latitude'] < max_lat)
lf_df = lf_df[in_lon_range & in_lat_range].reset_index(drop = True)
lf_df.tail()

Unnamed: 0,Latitude,Longitude,Accuracy,Drainage Name,Year,Month,Day
452,18.49583,-64.41333,Accurate,Eastern Caribbean,2015,2.0,7.0
453,18.48016,-64.41737,Approximate,Eastern Caribbean,2013,7.0,21.0
454,18.48491,-64.41457,Accurate,Eastern Caribbean,2015,4.0,27.0
455,18.43258,-64.44406,Approximate,Eastern Caribbean,2016,3.0,29.0
456,18.43252,-64.4451,Approximate,Eastern Caribbean,2016,1.0,2.0


In [28]:
#make lat/lon combo feature
#NOTE: despite the column name, each tuple is ordered (Longitude, Latitude),
#i.e. the (x, y) order that shapely's Point() takes.
#zip over the two columns replaces iterrows() with positional i[1][1] lookups:
#integer-position indexing on a label-indexed Series is deprecated in pandas,
#and zip yields plain Python floats, so the tuples serialise cleanly to CSV.
lat_lon = list(zip(lf_df['Longitude'], lf_df['Latitude']))
lf_df['lat_lon'] = lat_lon

In [30]:
#find polygons with LF, NOTE: exceptions lie outside benthic mapping (may be useable with some work)
polygonids = []   #polyid of the single polygon containing each matched point
exceptions = []   #points contained by no polygon in df
ambiguous = []    #points contained by more than one polygon (not counted)

def check_point(point):
    """Classify one lionfish sighting by the benthic polygon(s) containing it.

    Parameters
    ----------
    point : tuple
        Coordinate pair passed straight to ``shapely.geometry.Point``; the
        ``lat_lon`` column stores these as (Longitude, Latitude).

    Side effects
    ------------
    Appends to one of the module-level lists:
    ``polygonids`` when exactly one polygon of ``df`` contains the point,
    ``exceptions`` when none does, ``ambiguous`` when several do.
    Returns None.
    """
    t_point = Point(point)
    #Vectorised containment test; positions (not labels) of the matching rows
    hits = np.flatnonzero(df.geometry.contains(t_point).to_numpy())
    if hits.size == 1:
        polygonids.append(df['polyid'].iloc[hits[0]])
    elif hits.size == 0:
        exceptions.append(point)
    else:
        #Previously these were dropped silently; keep them inspectable
        ambiguous.append(point)

for point in lf_df['lat_lon']:
    check_point(point)

In [31]:
#Sanity check: 311 sightings are expected to match exactly one polygon
n_matched = len(polygonids)
print(n_matched)
assert n_matched == 311

311


In [32]:
#add LF count to polygonid
#counts maps polyid -> number of sightings matched to that polygon.
#The column is built in one pass over the index (df is indexed by polyid), so
#it is a true integer column (np.zeros_like on the string ZONE column gave an
#object-dtype one) and polygons with no sightings get 0. Unlike .loc
#assignment, a count whose key is missing from the index cannot add a row.
counts = Counter(polygonids)
df['num_lf'] = [counts.get(str(pid), 0) for pid in df.index]

In [33]:
#Preview the first five polygons with the new num_lf column
df.head(n = 5)

Unnamed: 0_level_0,AREA,DESCRIPTOR,ZONE,HABITAT,TYPE,geometry,polyid,num_lf
polyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
STJ_1,6445.33,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,"POLYGON ((-64.80970 18.36644, -64.80955 18.366...",STJ_1,0
STJ_2,47409.3,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,"POLYGON ((-64.80417 18.35974, -64.80419 18.359...",STJ_2,0
STJ_3,44504.3,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,"POLYGON ((-64.81332 18.35635, -64.81342 18.356...",STJ_3,0
STJ_4,5698.1,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,"POLYGON ((-64.81626 18.34365, -64.81617 18.344...",STJ_4,0
STJ_5,24547.68,Reef/Linear Reef,Forereef,Coral Reef and Colonized Hardbottom,Linear Reef,"POLYGON ((-64.79880 18.35579, -64.79875 18.355...",STJ_5,0


In [34]:
#Geometry is not needed in the tabular output written later.
#Non-inplace drop with errors = 'ignore' so re-running the cell is a no-op
#rather than a KeyError once the column has been removed.
df = df.drop(columns = 'geometry', errors = 'ignore')
df.head()

Unnamed: 0_level_0,AREA,DESCRIPTOR,ZONE,HABITAT,TYPE,polyid,num_lf
polyid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
STJ_1,6445.33,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,STJ_1,0
STJ_2,47409.3,Sand,Bank/Shelf,Unconsolidated Sediments,Sand,STJ_2,0
STJ_3,44504.3,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,STJ_3,0
STJ_4,5698.1,Reef/Scattered Coral-Rock,Bank/Shelf,Coral Reef and Colonized Hardbottom,Scattered Coral/Rock in Unconsolidated,STJ_4,0
STJ_5,24547.68,Reef/Linear Reef,Forereef,Coral Reef and Colonized Hardbottom,Linear Reef,STJ_5,0


In [35]:
#Preview the first five filtered lionfish reports, including the lat_lon tuples
lf_df.head(n = 5)

Unnamed: 0,Latitude,Longitude,Accuracy,Drainage Name,Year,Month,Day,lat_lon
0,18.43382,-64.44499,Accurate,Eastern Caribbean,2022,2.0,27.0,"(-64.44499, 18.43382)"
1,18.051071,-67.908004,Accurate,Greater Antilles,2010,1.0,10.0,"(-67.908004, 18.051071)"
2,18.1121,-67.934,Accurate,Greater Antilles,2009,4.0,5.0,"(-67.934, 18.1121)"
3,17.9389,-67.0231,Approximate,Greater Antilles,2009,5.0,15.0,"(-67.0231, 17.9389)"
4,17.9658,-66.7616,Accurate,Greater Antilles,2010,6.0,6.0,"(-66.7616, 17.9658)"


In [21]:
#Write both processed tables; the index is omitted (df still has polyid as a column)
df.to_csv(path_or_buf = '../data/processed/combined_df.csv', index = False)
lf_df.to_csv(path_or_buf = '../data/processed/lion_fish.csv', index = False)