# Data Wrangling with Shapefile Data

<i>The data used in this example: </i>

1) A shapefile that I created using a fishnet in ArcGIS

2) A Sea Surface Temperature csv data file gathered from: https://coastwatch.pfeg.noaa.gov/erddap/griddap/jplG1SST.html for 2012 with a bounding box of 32.94N, 35.40N, -120.88W, -117.20W

In [16]:
# Import packages
import pandas as pd # to read in csv file
import datetime as dt # to understand datetime formats
import shapefile # to read in shapefiles
from shapely.geometry import Point # to iterate over features
from shapely.geometry import shape # to iterate over features

In [22]:
# Import data (update these two paths to wherever you stored the local files)
grid_path = 'D:/Documents/SpringBoard/capstone-1/datasets/gridded-shapefile'
sst_path = 'D:/Documents/SpringBoard/capstone-1/datasets/SST_2012.csv'

# Open the gridded shapefile
grid = shapefile.Reader(grid_path)

# Read the SST csv, skipping the first row (because it's just an extra header).
# The resulting column names are: UTC, degrees_north, degrees_east, degree_C
# where degrees_north = latitude and degrees_east = longitude
sst = pd.read_csv(sst_path, skiprows=1)

In [23]:
# Some housekeeping... for SST
# Parse the UTC column into timezone-aware (UTC) datetimes in a single step;
# values that cannot be parsed as dates are coerced to NaT.
# utc=True works whether or not the raw strings already carry a timezone
# marker, whereas parsing first and calling tz_localize afterwards raises a
# TypeError on data that is already tz-aware. The deprecated
# infer_datetime_format flag is no longer passed.
sst['UTC'] = pd.to_datetime(sst['UTC'], errors='coerce', utc=True)

# Only keep the sst data for the first (earliest) timestamp.
# We really just need a dataframe with the lat/lon coordinates of the SST data
# paired with the shapefile ID, so that the shapefile ID can be merged into the
# other SST datasets.
sst = sst[sst['UTC'] == sst['UTC'].min()]

# Drop rows containing any missing value and renumber the rows from 0
sst = sst.dropna().reset_index(drop=True)

# Add a new column that will house the shapefile zone ID ('NA' = not assigned yet)
sst['Zone'] = 'NA'

In [25]:
# For the purpose of this example, let's just take the first 100 rows of sst for this day.
# .loc slicing is label-based and inclusive, so 0:99 selects the rows labelled 0 through 99.
# Take an explicit copy so that writing into this subset later modifies an
# independent dataframe rather than a slice of the full one (this avoids
# pandas' SettingWithCopyWarning).
sst = sst.loc[0:99, :].copy()

In [None]:
# Some housekeeping... for the grid
all_shapes = grid.shapes() # get all the polygons
all_records = grid.records() # get all the records

# And just so we get an idea of what we're looking at, print the first shape and
# its record. Reuse the lists loaded above instead of calling grid.shapes() /
# grid.records() a second time just to look at one element.
print(all_shapes[0])
print(all_records[0]) # I'm pretty sure this corresponds to [ID, shape , ] -- TODO confirm against the shapefile's fields

<shapefile.Shape object at 0x000001DD449F9E88>


In [None]:
# Run the loop: tag every SST row with the ID of the grid cell that contains it.

# Build each grid cell's polygon once, paired with the first field of its record
# (used as the zone ID), instead of rebuilding the polygon for every SST point.
# Every cell is included, starting from the first one (index 0).
zones = [(record[0], shape(boundary)) for boundary, record in zip(all_shapes, all_records)]

for row in sst.index: # for the rows in sst
    # make a point from this row's coordinates as (x, y) = (longitude, latitude)
    point = Point(sst.at[row, 'degrees_east'], sst.at[row, 'degrees_north'])
    for zone_id, polygon in zones: # for each of the squares in the shapefile
        # NOTE: `within` is a strict test, so a point lying exactly on a cell's
        # edge matches no cell and keeps the existing value in Zone.
        if point.within(polygon):
            sst.at[row, 'Zone'] = zone_id # store the zone ID by column name, not by position
            break # a zone was found for this point, so move on to the next row

In [None]:
print(sst) # print out the sst dataframe, including the Zone column that holds the shapefile zone ID