# Data Wrangling - The Beginning
This sheet should be run first, in order to get an idea of the geospatial range of acoustic telemetry data that we have, so that we know the spatial extent of environmental data that we need to grab from the web.

In [36]:
# Load in the packages
import pandas as pd
import numpy as np
import datetime as dt
import shapely
import geopandas as gpd
from geopandas.tools import sjoin
from shapely.geometry import Point
from shapely import speedups
speedups.enable() # makes geospatial merges quicker

In [37]:
# Let's start with the receiver logs - when and where a receiver was put into the water
receiver_log = pd.read_csv('D:/Documents/SpringBoard/capstone-1/datasets/Receiver_Deployment_Log.csv', parse_dates=True)
receiver_log = receiver_log[['Station', 'Lat', 'Lng', 'In_PSTPDT', 'Out_PSTPDT', 'Receiver SN']] 
    # we only care about these columns

receiver_log.Lat = pd.to_numeric(receiver_log.Lat, errors='coerce') # make sure the latitude is numeric
receiver_log.Lng = pd.to_numeric(receiver_log.Lng, errors='coerce') # make sure the longitude is numeric

print(receiver_log[['Lat','Lng']].describe()) # describe the data so that we know where to grab environmental data

# ok so we need our data to basically be between 32.9 and 35.4 N and -120.9 and -118.2 W
# I will use those limits (sometimes rounded to ints) to download the relevant environmental data.

              Lat         Lng
count  617.000000  617.000000
mean    33.824355 -118.407236
std      0.378652    0.692174
min     32.954270 -120.874690
25%     33.513000 -118.675670
50%     33.749320 -118.393980
75%     34.014000 -118.014210
max     35.396930 -117.271300


In [38]:
# Now we want to make sure the datetimes are aware and convert them to UTC, because all other
# datasets are in UTC time
receiver_log['In_PSTPDT'] = pd.to_datetime(receiver_log['In_PSTPDT'], infer_datetime_format=True, errors='coerce')
receiver_log['Out_PSTPDT'] = pd.to_datetime(receiver_log['Out_PSTPDT'], infer_datetime_format=True, errors='coerce')

receiver_log['In_UTC'] = receiver_log['In_PSTPDT'].dt.tz_localize('America/Los_Angeles').dt.tz_convert('UTC')
receiver_log['Out_UTC'] = receiver_log['Out_PSTPDT'].dt.tz_localize('America/Los_Angeles').dt.tz_convert('UTC')

receiver_log = receiver_log.drop(['In_PSTPDT', 'Out_PSTPDT'], axis=1) # no longer care for the PDT/PST times
receiver_log = receiver_log.dropna(subset=['In_UTC','Out_UTC']) 
    # remove NAs (receivers that are still deployed or that were lost)

In [None]:
# Next step: See in which zones these data fall...

In [39]:
# Import data (update to where you stored local files)
grid = gpd.read_file('D:/Documents/SpringBoard/capstone-1/datasets/trunc/gridded-shapefile.shp') 
    # read in the gridded shapefile

# And only keep the unique receiver locations. 
just_locations = receiver_log[['Lng','Lat']].drop_duplicates().reset_index(drop=True)

In [40]:
# Turn the SST data into a geometric dataset
geometry = [Point(xy) for xy in zip(just_locations['Lng'], just_locations['Lat'])]

# Coordinate reference system : WGS84
crs = {'init': 'epsg:4326'}

# Create a Geographic data frame 
just_locations_geom = gpd.GeoDataFrame(just_locations, crs=crs, geometry=geometry)

# Make sure the grid is the same crs
grid = gpd.GeoDataFrame(grid, crs=crs)
grid = grid.loc[:,['OBJECTID', 'geometry']] # only keep the object ID values and the geometry values

In [41]:
just_locations_polygons = sjoin(just_locations_geom, grid, how='left') 
    # join the gps data with the polygon data to get the object ID value

receiver_log = receiver_log.merge(just_locations_polygons[['Lng','Lat','OBJECTID']])
    # merge the data so that we know which receivers are in which zone

In [42]:
receiver_log.columns = ['Station', 'Lat', 'Lng', 'Receiver SN', 'In_UTC', 'Out_UTC', 'Zone']
    # make the col names match

In [43]:
# Clean up the receiver ID codes so that they are strings and not numbers
receiver_log['Receiver SN'] = receiver_log['Receiver SN'].astype(str) # convert to string
receiver_log['Receiver SN'] = receiver_log['Receiver SN'].str.split('.', n=1, expand=True)[0] # get rid of the silly .0

In [58]:
# Now that we're done cleaning our receiver deployment log, we can start calculating
# how many receivers are in the water (in each zone) each day. 

time_sequence = pd.date_range(receiver_log['In_UTC'].dt.date.min(), receiver_log['Out_UTC'].dt.date.max(), tz='UTC')
    # We start by building a list of dates that we want data for (basically every day since the first
    # receiver went in the water, to the last day a receiver was pulled out of the water)

receivers_per_zone_per_day = pd.DataFrame([]) # we want to build an empty dataframe to add to

for date in time_sequence: # for each day ...
    for zone in receiver_log['Zone'].drop_duplicates(): # for each unique zone
        receivers_in = receiver_log[(date > receiver_log['In_UTC']) &  
                                    # get a list of the receivers that were deployed before that day
                                    (date < receiver_log['Out_UTC']) & 
                                    # and that were pulled out of the water after that day
                                    (receiver_log['Zone'] == zone)]
                                    # and that were in the appropriate zone
        no_receivers = len(receivers_in['Station'].drop_duplicates())
            # find the number of receivers by counting how many station names are in the new series
        new_df = pd.DataFrame({'Date': date, 'Receiver_D': no_receivers, 'Zone': zone}, index=[0]) 
            # make a pd dataframe that saves the date, receiver density, and the zone ID
        receivers_per_zone_per_day = receivers_per_zone_per_day.append(new_df, ignore_index=True)
            # append these data to the new pd dataframe
        #print(date, no_receivers, zone)
            
# And save this to a csv for later...
receivers_per_zone_per_day.to_csv('D:/Documents/SpringBoard/capstone-1/datasets/final_files/receiver_density_2.csv', index=False)

## Now we have our receiver density figured out.
## Next, we want to make sure our shark detection data only includes data when we <i>know</i> receivers were in the water.

In [59]:
# Let's read in the shark detection data
shark_detections = pd.read_csv('D:/Documents/SpringBoard/capstone-1/datasets/AllYOYDetections_filtered.csv', parse_dates=True)
shark_detections = shark_detections[['Date_Time', 'Receiver', 'Transmitter', 'Station.Name', 'Latitude', 'Longitude']]
    # only keep these columns

In [60]:
# We need to make these data datetime objects and aware
shark_detections['Date_Time'] = pd.to_datetime(shark_detections['Date_Time'], infer_datetime_format=True, errors='coerce')
shark_detections['Date_Time'] = shark_detections['Date_Time'].dt.tz_localize('UTC')

In [61]:
# We also need to make sure that the Receiver column matches the Receiver SN column 
# in the receiver deployment log dataset, so we need to remove the 'VR2W-' part and make it a category
shark_detections['Receiver'] = shark_detections['Receiver'].str.split('-', n=1, expand=True)[1]
shark_detections['Receiver'] = shark_detections['Receiver'].astype('category')

In [62]:
shark_detections.head()

Unnamed: 0,Date_Time,Receiver,Transmitter,Station.Name,Latitude,Longitude
0,2013-11-20 13:08:21+00:00,112300,A69-1303-46859,JWS_Scammon_3,27.69878,-114.1474
1,2013-12-10 03:34:27+00:00,112305,A69-1303-46859,JWS_Scammon_3,27.69878,-114.1474
2,2013-12-31 00:49:30+00:00,112305,A69-1601-30030,JWS_Scammon_3,27.69878,-114.1474
3,2014-02-15 03:19:33+00:00,112305,A69-1303-46859,JWS_Scammon_3,27.69878,-114.1474
4,2014-03-25 15:33:56+00:00,112305,A69-1601-30030,JWS_Scammon_3,27.69878,-114.1474


In [70]:
# I need to go through the receiver data and only keep the data that are present when a shark lab receiver is in the water. 
good_detection_data = pd.DataFrame([]) # make an empty dataframe

for row in range(len(receiver_log)): # and for each row in receiver log
    receiver_id = receiver_log.loc[row, 'Receiver SN'] # get the receiver id value
    det_in_range = shark_detections[(shark_detections.Date_Time > receiver_log['In_UTC'][row]) &
                                    # keep shark detection data that were collected after the receiver went in
                                    (shark_detections.Date_Time < receiver_log['Out_UTC'][row]) & 
                                    # and before the receiver came out of the water
                                    (shark_detections.Receiver == receiver_id)]
                                    # and only use detections from the relevant receiver.
    if det_in_range.shape[0] > 0: # if there are sharks present...
        det_in_range.loc[:,'Lat'] = receiver_log.loc[row,'Lat'] # fill in the receiver latitude from the log
            # (sometimes the detection data do not have updated location data associated with them)
        det_in_range.loc[:,'Lng'] = receiver_log.loc[row,'Lng'] # fill in the receiver longitude from the log
        det_in_range = det_in_range.drop(['Latitude', 'Longitude'], axis=1)
            # drop the old latitude and longitude
        good_detection_data = good_detection_data.append(det_in_range, ignore_index=True)
            # and add these data to the new list

In [71]:
# Just a little clean up
good_detection_data = good_detection_data.drop_duplicates() # remove any duplicates that may have occurred
good_detection_data = good_detection_data.merge(receiver_log[['Lat', 'Lng', 'Zone']]) 
    # we want to keep the zone information for these data that we calculated for the receiver log
good_detection_data['Date'] = good_detection_data['Date_Time'].dt.date # and we want to make sure 
    # there is a Date column, because detection data are in YYYY-mm-dd HH:MM:SS and that's too fine-scale
    # for this analysis.

In [74]:
good_detection_data.tail()

Unnamed: 0,Date_Time,Receiver,Transmitter,Station.Name,Lat,Lng,Zone,Date
60409,2018-04-23 04:25:38+00:00,130677,A69-1602-3235,JWS_Zuma_Beach,34.014,-118.8235,1296043,2018-04-23
60410,2018-04-23 04:25:38+00:00,130677,A69-1602-3235,JWS_Zuma_Beach,34.014,-118.8235,1296043,2018-04-23
60411,2018-04-23 04:25:38+00:00,130677,A69-1602-3235,JWS_Zuma_Beach,34.014,-118.8235,1296043,2018-04-23
60412,2018-04-23 04:25:38+00:00,130677,A69-1602-3235,JWS_Zuma_Beach,34.014,-118.8235,1296043,2018-04-23
60413,2017-09-19 23:43:29+00:00,127959,A69-1602-1351,SW Mooring Buoy,34.40287,-119.67183,1198628,2017-09-19


In [72]:
# Finally, save these data for later, when we combine it with the environmental datasets.
good_detection_data.to_csv('D:/Documents/SpringBoard/capstone-1/datasets/edited_files/shark_detection_data2.csv')

Next, move onto the Data Wrangling Shapefiles notebook.