In [1]:
from astral import LocationInfo
from astral.sun import sun
import datetime as dt
from pathlib import Path  
import pytz
from pytz import timezone
import matplotlib.pyplot as plt
from multiprocessing import Pool
import numpy as np
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import re
import seaborn as sns
import swifter
from swifter import set_defaults
import sys

In [2]:
set_defaults(
    npartitions=None,
    dask_threshold=1,
    scheduler="processes",
    progress_bar=True,
    progress_bar_desc=None,
    allow_dask_on_strings=True,
    force_parallel=False,
)

# Standardization


In [3]:
def get_standardized_datetime(row, originaltzstring):
    """
    Given a row representing one time series 
    point from a corrected white shark archival tag 
    file and a str of the timezone of the timestamps, 
    returns a datetime object for the event in 
    Pacific/Honolulu time.
    """
    originaltz = pytz.timezone(originaltzstring)
    originaldt = dt.datetime(row["Year"], row["Month"], row["Day"], row["Hour"], row["Min"], row["Sec"], 0, originaltz)
    return originaldt.astimezone(pytz.timezone("Pacific/Honolulu"))
    

def get_time_of_day(hour):
    """
    Given an int between 0 and 23 representing the
    hour of the day, returns the time of day 
    corresponding to that hour in Hawaii e.g.
    "Dawn," "Day," "Dusk," or "Night."
    """
    sunrise = 6
    sunset = 18

    if (hour in range(0, sunrise - 1)) or (hour in range(sunset + 1, 24)):
        return 'Night'
    elif hour in range(sunrise - 1, sunrise + 1):
        return 'Dawn'
    elif hour in range(sunrise + 1, sunset - 1):
        return 'Day'
    elif hour in range(sunset - 1, sunset + 1):
        return 'Dusk'
    else:
        return np.NaN


def get_plot_data(filename): 
    """
    Given a filename to corrected archival White Shark tag
    data, returns a pandas dataframe containing data for
    plotting
    """
    df = pd.read_csv(filename)
    
    # Get timezone of date/time
    dateColName = df.columns[0]
    originaltz = ''
    if dateColName == "Date(UTC-8)":
        originaltz = 'Etc/GMT+8' # why? no one knows
    elif dateColName == "Date(EST)":
        originaltz = 'UTC' # ehy? EST was mistake
    elif dateColName == "Date":
        originaltz = 'UTC'
    else:
        raise ValueError("Cannot processes timezone of Date column" + dateColName)
    
    # Build standard datetime
    df["Datetime (UTC-10)"] = df.swifter.apply(lambda row: get_standardized_datetime(row, originaltz), axis=1)
    
    # add hour column
    df["Hour (UTC-10)"] = df["Datetime (UTC-10)"].swifter.apply(lambda x: x.hour)
    
    # add time of day column
    df["Time of Day"] = df["Hour (UTC-10)"].swifter.apply(get_time_of_day)
    
    return df


def get_filepaths_in_dir(dir_path):
    """
    Given a directory path, return a list of files in the given directory.
    >>> get_files_in_dir('./test')
    ['test1.txt', 'test.txt']
    """
    only_files = [join(dir_path, f) for f in listdir(dir_path) if isfile(join(dir_path, f))]
    return only_files


def filter_csvs(filepaths):
    """
    Given a list of filepaths, returns a list containing only the csv filepaths 
    in the given list.
    """
    csv_files = [f for f in filepaths if '.csv' in f]
    return csv_files


def get_shark_ID(filepath):
    """
    Given string filepath, returns 7 digit shark ID in filepath
    name.
    """
    # pattern matches any sequence of 7 digits
    pattern = '\\d{7}'

    sharkIDMatch = re.search(pattern, filepath)
    assert sharkIDMatch, 'Could not find 7 digit ID in filepath: {}'.format(filepath)
    
    sharkID = sharkIDMatch.group(0)
    return sharkID + '00'

In [4]:
# from manuel digging:
# sample rate on 190000400 is once every 2 min
# 190400900 is once every min
# 190502800 is once every min
# 190600200 is once every min
# 190601200 is oncer every 15 sec
# 190900200 is once every 10 sec
# 191909200 is once every 57 m 36 s (effectively 1/hour)
# So reampling everything to once per house will be good

In [16]:
# all Hawaii white shark archival files
files = filter_csvs(get_filepaths_in_dir('./HawaiiData'))

# metadata for Hawaii white sharks with Id, tag number, 
# ptt, tagging date, length, sex, and first/last date
# that shark was in Hawaiian lees
meta_df = pd.read_csv('./ws_hawaiionly_ssm_archivals_dateranges_2022apr18.csv')

# build a master dataframe containing these columns for all Hawaii white sharks
columns = ["Id", "Datetime (UTC-10)", "Hour (UTC-10)", "Time of Day", "Depth(m)", "External Temp (c)", "Sex", "Shark Length (cm)"] # add all the other stuff just incase
dfs = []

for file in sorted(files):
    df = get_plot_data(file)
    shark_id = get_shark_ID(file)
    df['Id'] = shark_id
    sex = meta_df[meta_df['eventid'] == int(shark_id) ]['sex'].iloc[0]
    df['Sex'] = sex
    df['External Temp (c)'] = df['ExtTemp(C)']
    df['Shark Length (cm)'] = meta_df[meta_df['eventid'] == int(shark_id) ]['length'].iloc[0]
    df = df[columns]

    # uncomment for re-sampling
    # # standardize sample frequency in master dataframe
    # if shark_id == '190000400':
    #     df = df.iloc[::30, :]
    # elif (shark_id == '190400900') or (shark_id == '190502800') or (shark_id == '190600200'):
    #     df = df.iloc[::60, :]
    # elif shark_id == '190601200':
    #     df = df.iloc[::240, :]
    # elif shark_id == '190900200':
    #     df = df.iloc[::360, :]

    dfs.append(df)

combined = pd.concat(dfs, ignore_index=True)

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/74310 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/74310 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/116640 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/116640 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/87840 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/87840 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/125280 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/125280 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/673920 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/673920 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/734400 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

# Calculating Time of Day Using Astral

In [None]:
"""Provides a `DateTimeRange` class, which is used for managing ranges of datetimes."""
import datetime

class DateTimeRange(object):
    """Represents a range of datetimes, with a start and (optionally) an end.
       
       Basically implements most of the methods on a standard sequence data type to provide
       some lovely syntactic sugar. Specifically, you can iterate on this, index it, slice it,
       use the in operator, reverse it, and use it in a boolean context to see if there is any
       time in between the start and end."""
    DEFAULT_STEP = datetime.timedelta(seconds=1)
    
    def __init__(self, start, end=None, step=DEFAULT_STEP, *args, **kwargs):
        self.start = start
        self.end = end
        self.step = step
        return super(DateTimeRange, self).__init__(*args, **kwargs)
    
    def __contains__(self, item):
        """Returns whether or not the passed datetime is within the range. Does not take into
           account the stride length from `self.step` -- if you need that use dateutil's rrule
           instead."""
        if self.end is None:
            # The range never ends, so we just need to check `item` is beyond the start
            return (self.start <= item)
        else:
            return (self.start <= item <= self.end)
    
    def __iter__(self):
        """Returns a generator which will yield datetime objects within the range, incrementing
           with `self.step` as its stride length on each iteration."""
        value = self.start
        while (value in self):
            yield value
            value += self.step
    
    def __reversed__(self):
        """Reverse iterator yielding the datetime objects within the range in reverse. Similarly
           to the forward-iterator, decrements (rather than increments) by `self.step` each time.
           
           This can only be called if an end is defined."""
        assert self.end is not None, 'Reverse iteration is not supported without an end'
        
        value = self.end
        while (value in self):
            yield value
            value -= self.step
    
    def __nonzero__(self):
        """Returns whether the date range covers a length of time (i.e. the end value is beyond
           the start). If no end is defined, always returns True as the range continues forever."""
        return ((not self.end) or (self.end > self.start))
    
    def __get_slice(self, start, stop, step=None):
        """Internal method for slicing the date range. Use the standard slicing syntax as the
           external interface."""
        indices = (xrange(start, stop, step) if step is not None else xrange(start, stop))
        result = []
        
        for index in indices:
            try:
                result.append(self[index])
            except IndexError:
                pass
        return result
    
    def __getitem__(self, key):
        """Returns the n'th datetime from the range, using `self.step` to determine the
           increment. Does not calculate every datetime up until the index, but rather
           multiplies the step value by the index to achieve the same result more efficiently.
           
           Negative indexing is only supported if an end is defined. Also supports slicing -- the
           same rule regarding negative indexing still applying."""
        if isinstance(key, tuple):
            # Multiple indices
            return [self[i] for i in key]
        elif isinstance(key, slice):
            # Slicing
            return self.__get_slice(start=key.start, stop=key.stop, step=key.step)
        else:
            # Regular indexing
            if key < 0:
                # Reverse-indexing
                assert self.end is not None, 'Negative indexing is not supported without an end'
                value = (self.end - (self.step * key))
            else:
                # Forward-indexing
                value = (self.start + (self.step * key))
            
            # Check that the value is in the range; return it if it is, raise IndexError if not
            if value in self:
                return value
            else:
                raise IndexError('index out of range')

Pandas Apply:   0%|          | 0/116640 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/116640 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# test inclusivity of class
one_hr = dt.timedelta(hours=1)
sunrise = dt.datetime(2020, 3, 21, 6, 0, 0).astimezone(pytz.timezone("Pacific/Honolulu"))
sunset = dt.datetime(2020, 3, 21, 18, 0, 0).astimezone(pytz.timezone("Pacific/Honolulu"))
ran = DateTimeRange(sunrise + one_hr, sunset - one_hr)
(sunset - one_hr) in ran # TRUE if DateTimeRange is end inclusive, FALSE else

Pandas Apply:   0%|          | 0/87840 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/87840 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

In [8]:
def get_time_of_day_astral(date_time, lat, lon):
    """
    Given a datetime object in Hawaii time (UTC-10), 
    an int representing latitude, and an int representing 
    longitude, returns returns local apparent the time 
    of day corresponding to that hour in Hawaii e.g.
    "Dawn," "Day," "Dusk," or "Night."
    """
    # date_time = date_time.astimezone(pytz.timezone("Pacific/Honolulu")) # run this line if input date_time os not in Hawaii time
    if (date_time.date is None) or (date_time.year is None) or (date_time.month is None) or (date_time.day is None):
        raise ValueError("Cannot processes date with value: ", date)

    date_time_std = dt.datetime(date_time.year, date_time.month, date_time.day, date_time.hour, date_time.minute, date_time.second, 0, pytz.UTC)
    date_time_std = date_time_std.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))

    location = LocationInfo("Honolulu", "Hawaii", "Pacific/Honolulu", lat, lon)
    s = sun(location.observer, date=datetime.date(date_time.year, date_time.month, date_time.day), tzinfo=location.timezone) 

    # time of day
    sunrise = s["sunrise"]
    sunrise_std = dt.datetime(sunrise.year, sunrise.month, sunrise.day, sunrise.hour, sunrise.minute, sunrise.second, 0, pytz.UTC)
    sunrise_std = sunrise_std.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))

    sunset = s["sunset"]
    sunset_std = dt.datetime(sunset.year, sunset.month, sunset.day, sunset.hour, sunset.minute, sunset.second, 0, pytz.UTC)
    sunset_std = sunset_std.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))

    one_hr = dt.timedelta(hours=1)
    one_s = dt.timedelta(seconds=1)

    midnight = dt.datetime(date_time.year, date_time.month, date_time.day, 0, 0, 0, 0, pytz.UTC)
    midnight = midnight.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))
    
    pre_midnight = dt.datetime(date_time.year, date_time.month, date_time.day, 23, 59, 59, 0, pytz.UTC)
    pre_midnight = pre_midnight.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))
    
    # DateTimeRange is inclusive of start and stop inputs
    if (date_time_std in DateTimeRange(midnight, sunrise_std - one_hr)) or (date_time_std in DateTimeRange(sunset_std + one_hr, pre_midnight)):
        return 'Night'
    elif date_time_std in DateTimeRange(sunrise_std - one_hr + one_s, sunrise_std + one_hr):
        return 'Dawn'
    elif date_time_std in DateTimeRange(sunrise_std + one_hr + one_s, sunset_std - one_hr):
        return "Day"
    elif date_time_std in DateTimeRange(sunset_std - one_hr + one_s, sunset_std + one_hr - one_s):
        return 'Dusk'
    else:
        return np.NaN

Pandas Apply:   0%|          | 0/125280 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/125280 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/673920 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/734400 [00:00<?, ?it/s]

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2017 [00:00<?, ?it/s]

True

In [9]:
avg_lat = pd.read_csv('./ws_hawaiionly_ssm_archivals_2022apr12.csv')["latitude"].mean()
avg_lon = pd.read_csv('./ws_hawaiionly_ssm_archivals_2022apr12.csv')["longitude"].mean()
location = LocationInfo("Honolulu", "Hawaii", "Pacific/Honolulu", avg_lat, avg_lon)
s = sun(location.observer, dt.datetime.now(), tzinfo=location.timezone) 

pre_midnight = dt.datetime(dt.datetime.now().year, dt.datetime.now().month, dt.datetime.now().day, 23, 59, 59, 0, pytz.UTC)
pre_midnight = pre_midnight.replace(tzinfo=pytz.timezone("Pacific/Honolulu"))

s["sunrise"], pre_midnight

(datetime.datetime(2022, 7, 14, 6, 4, 52, 722561, tzinfo=<DstTzInfo 'Pacific/Honolulu' HST-1 day, 14:00:00 STD>),
 datetime.datetime(2022, 7, 14, 23, 59, 59, tzinfo=<DstTzInfo 'Pacific/Honolulu' LMT-1 day, 13:29:00 STD>))

In [10]:
# add a "Time of Day (Astral)" column" - time of day ("Dusk," "Dawn," etc.) calculated by the Astral API

# can parallelize with multiprocessing later https://stackoverflow.com/questions/45545110/make-pandas-dataframe-apply-use-all-cores 
avg_lat = pd.read_csv('./ws_hawaiionly_ssm_archivals_2022apr12.csv')["latitude"].mean()
avg_lon = pd.read_csv('./ws_hawaiionly_ssm_archivals_2022apr12.csv')["longitude"].mean()
combined["Time of Day (Astral)"] = combined.swifter.apply(lambda x: get_time_of_day_astral(x["Datetime (UTC-10)"], avg_lat, avg_lon), axis=1)

Dask Apply:   0%|          | 0/32 [00:00<?, ?it/s]

In [11]:
combined.info()
combined

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14838 entries, 0 to 14837
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype                           
---  ------                --------------  -----                           
 0   Id                    14838 non-null  object                          
 1   Datetime (UTC-10)     14838 non-null  datetime64[ns, Pacific/Honolulu]
 2   Hour (UTC-10)         14838 non-null  int64                           
 3   Time of Day           14838 non-null  object                          
 4   Depth(m)              14838 non-null  float64                         
 5   External Temp (c)     14838 non-null  float64                         
 6   Sex                   14838 non-null  object                          
 7   Shark Length (cm)     14838 non-null  float64                         
 8   Time of Day (Astral)  14838 non-null  object                          
dtypes: datetime64[ns, Pacific/Honolulu](1), float64(3),

Unnamed: 0,Id,Datetime (UTC-10),Hour (UTC-10),Time of Day,Depth(m),External Temp (c),Sex,Shark Length (cm),Time of Day (Astral)
0,190000400,2001-01-02 18:59:50-10:00,18,Dusk,38.4,24.600002,M,457.0,Dusk
1,190000400,2001-01-02 19:59:50-10:00,19,Night,154.4,20.400002,M,457.0,Night
2,190000400,2001-01-02 20:59:50-10:00,20,Night,124.4,21.600000,M,457.0,Night
3,190000400,2001-01-02 21:59:50-10:00,21,Night,28.4,24.600002,M,457.0,Night
4,190000400,2001-01-02 22:59:50-10:00,22,Night,126.4,21.150002,M,457.0,Night
...,...,...,...,...,...,...,...,...,...
14833,191901200,2020-04-16 18:47:59-10:00,18,Dusk,303.3,11.500000,M,366.0,Dusk
14834,191901200,2020-04-16 19:45:35-10:00,19,Night,146.3,19.880000,M,366.0,Dusk
14835,191901200,2020-04-16 20:43:11-10:00,20,Night,163.3,19.120000,M,366.0,Night
14836,191901200,2020-04-16 21:40:47-10:00,21,Night,147.3,20.640000,M,366.0,Night


# Compare Orginal Time of Day with Time of Day Computed Using Astral

In [12]:
combined["Time of Day"].value_counts()

Day      6237
Night    6146
Dawn     1228
Dusk     1227
Name: Time of Day, dtype: int64

In [13]:
combined["Time of Day (Astral)"].value_counts()

Day      6366
Night    6001
Dusk     1242
Dawn     1229
Name: Time of Day (Astral), dtype: int64

In [14]:
# check for NaN
combined[combined['Time of Day'].isna()]
combined[combined['Time of Day (Astral)'].isna()]

Unnamed: 0,Id,Datetime (UTC-10),Hour (UTC-10),Time of Day,Depth(m),External Temp (c),Sex,Shark Length (cm),Time of Day (Astral)


In [15]:
combined.to_csv('./master.csv')