# Let's grab a pseudo-random subset of all the data that equally samples data with sharks and data without sharks. 

In [1]:
# Import relevant packages
import pandas as pd
import datetime as dt
import glob
import random

In [12]:
# Collect every yearly '*all.csv' file that needs processing.
# glob.glob returns matches in arbitrary (OS-dependent) order, so the list is
# sorted to give a stable processing order and a stable row order in the output.
files = sorted(glob.glob('D:/Documents/SpringBoard/capstone-1/datasets/final_files/combined/filled/*all.csv'))

### Jackknifing Method 1

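This method keeps every shark detection and pairs it with a random sample of "no shark" records drawn from zones whose zone ID is within 10 of a zone that had a detection. It does not check whether any receivers were actually deployed in those zones.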

In [20]:
# Method 1: for every yearly file, keep each unique shark detection and draw an
# equal number of 'NoSharks' rows, preferring rows from zones close to a zone
# that had a detection. The result is left in `all_samples` (a DataFrame).
ZONE_WINDOW = 10  # zone IDs within +/- this many of a shark zone count as "nearby"
SAMPLE_SEED = 42  # fixed seed so re-running the notebook draws the same rows
seed_source = random.Random(SAMPLE_SEED)  # hands out one reproducible seed per draw

sample_parts = []  # per-file pieces; concatenated once after the loop

for file in sorted(files):  # sorted so the seeded draws hit the same files in the same order
    dat = pd.read_csv(file,  # load in the file
                      dtype={'Zone':'int64', 'Transmitter':'category',
                             'animal_weight':'float64','animal_length_total':'float64',
                             'gender':'category', 'location':'category', 'year':'category',
                             'Receiver_D':'float64', 'Lat':'float64', 'Lng':'float64',
                             'DepthGradient':'float64', 'TempC':'float64', 'Sal':'float64',
                             'ChlA':'float64', 'MoonPhase':'category'},
                      parse_dates=['Date'])
    if dat.empty:  # nothing to read a year from, and nothing to sample
        print('Skipping empty file: {}'.format(file))
        continue

    # The year of the first row is taken as the file's year; rows dated on or
    # after 1 January of the following year are dropped (the downloads were
    # overlapped by about a day so that no values were missed).
    next_year = int(dat['Date'].dt.year.iloc[0]) + 1  # positional lookup, not label 0
    dat = dat[dat['Date'] < pd.Timestamp(year=next_year, month=1, day=1)].copy()
        # .copy() so the column assignment below writes to an independent frame
        # instead of a slice (avoids SettingWithCopyWarning)

    # Housekeeping: a missing Transmitter means no shark was present, so give
    # those rows an explicit 'NoSharks' category.
    dat['Transmitter'] = dat['Transmitter'].cat.add_categories(['NoSharks']).fillna('NoSharks')

    # Separate the data into rows with sharks and rows without sharks.
    has_shark = dat['Transmitter'] != 'NoSharks'
    shark_dat = dat[has_shark].drop_duplicates()  # de-duplicate BEFORE counting so the
        # number of NoSharks rows drawn matches the number of shark rows kept
    no_sharks = dat[~has_shark]

    if shark_dat.empty:  # no detections in this file: nothing to balance against
        continue
    sample_parts.append(shark_dat)

    # Grab 'no shark' rows from the same or nearby zones as the shark rows, so
    # that far-offshore zones that never have receivers are not compared with
    # nearshore zones that do.
    nearby_zones = {zone + offset
                    for zone in shark_dat['Zone'].unique()
                    for offset in range(-ZONE_WINDOW, ZONE_WINDOW + 1)}
    is_nearby = no_sharks['Zone'].isin(list(nearby_zones))
    to_sample = no_sharks[is_nearby]   # preferred pool: nearby zones
    left_out = no_sharks[~is_nearby]   # backup pool: every other zone

    n_wanted = len(shark_dat)
    if len(to_sample) >= n_wanted:  # enough NoSharks rows nearby
        sample_parts.append(
            to_sample.sample(n=n_wanted, random_state=seed_source.randrange(2**32)))
    else:  # not enough NoSharks rows nearby
        sample_parts.append(to_sample)  # keep all of the nearby rows
        shortfall = n_wanted - len(to_sample)
        n_backup = min(shortfall, len(left_out))  # never ask for more rows than exist
        sample_parts.append(
            left_out.sample(n=n_backup, random_state=seed_source.randrange(2**32)))
        print(file)  # report which files needed the backup pool (in case those
            # data should be taken out later)
        if n_backup < shortfall:
            print('  only {} NoSharks rows available for {} shark rows'.format(
                len(to_sample) + n_backup, n_wanted))

all_samples = pd.concat(sample_parts)  # finally, make this into one big DataFrame

In [21]:
# Write the Method 1 sample to disk; index=False leaves the row index out of the file.
all_samples.to_csv(
    path_or_buf='D:/Documents/SpringBoard/capstone-1/datasets/final_files/combined/filled/jackknifed.csv',
    index=False,
)

### Jackknifing Method 2

This method keeps every record with at least one receiver deployed (`Receiver_D > 0`), whether or not a shark was detected. Files that contain no shark detections at all are skipped entirely.

In [None]:
# Method 2: for every yearly file, keep all rows where Receiver_D > 0, both with
# and without sharks. The result is left in `all_samples` (a DataFrame).
sample_parts = []  # per-file pieces; concatenated once after the loop

for file in sorted(files):  # sorted so the output row order is stable between runs
    dat = pd.read_csv(file,  # load in the file
                      dtype={'Zone':'int64', 'Transmitter':'category',
                             'animal_weight':'float64','animal_length_total':'float64',
                             'gender':'category', 'location':'category', 'year':'category',
                             'Receiver_D':'float64', 'Lat':'float64', 'Lng':'float64',
                             'DepthGradient':'float64', 'TempC':'float64', 'Sal':'float64',
                             'ChlA':'float64', 'MoonPhase':'category'},
                      parse_dates=['Date'])
    if dat.empty:  # nothing to read a year from, and nothing to keep
        print('Skipping empty file: {}'.format(file))
        continue

    # The year of the first row is taken as the file's year; rows dated on or
    # after 1 January of the following year are dropped (the downloads were
    # overlapped by about a day so that no values were missed).
    next_year = int(dat['Date'].dt.year.iloc[0]) + 1  # positional lookup, not label 0
    dat = dat[dat['Date'] < pd.Timestamp(year=next_year, month=1, day=1)].copy()
        # .copy() so the column assignment below writes to an independent frame
        # instead of a slice (avoids SettingWithCopyWarning)

    # Housekeeping: a missing Transmitter means no shark was present, so give
    # those rows an explicit 'NoSharks' category.
    dat['Transmitter'] = dat['Transmitter'].cat.add_categories(['NoSharks']).fillna('NoSharks')

    # Separate the receiver-covered rows into rows with and without sharks.
    has_receiver = dat['Receiver_D'] > 0.0
    has_shark = dat['Transmitter'] != 'NoSharks'
    shark_dat = dat[has_shark & has_receiver]    # with sharks
    no_sharks = dat[~has_shark & has_receiver]   # without sharks

    # NOTE(review): files with no shark rows contribute nothing, not even their
    # receiver-covered NoSharks rows. Confirm this is intended.
    if len(shark_dat) > 0:
        sample_parts.append(shark_dat)  # add all receiver-covered shark rows
        sample_parts.append(no_sharks)  # add all receiver-covered non-shark rows

all_samples = pd.concat(sample_parts)  # make this into one big DataFrame

In [None]:
# Write the Method 2 sample to disk; index=False leaves the row index out of the file.
all_samples.to_csv(
    path_or_buf='D:/Documents/SpringBoard/capstone-1/datasets/final_files/combined/filled/jackknifed_new.csv',
    index=False,
)