# Regions
Import regions data from [CamHD_motion_metadata](https://github.com/CamHD-Analysis/CamHD_motion_metadata) into a Pandas dataframe. The region_list_generated.txt file was generated using the following Bash command from inside the CamHD_motion_metadata directory:
```bash
find $PWD -type f -name "*optical_flow_regions.json" | grep --color=never -i region | sort > ~/camhd_floc_model/data_camhd/region_list_generated.txt
```

#### Start a Dask cluster
We do this first because it can take a while for the Kubernetes cluster to scale up to accommodate workers.

In [None]:
from dask_kubernetes import KubeCluster
# Ask for 20 workers now; the notebook text notes that scale-up can take a
# while, so the request is issued before any other work is set up.
cluster = KubeCluster(n_workers=20)
# Last expression of the cell, so the notebook displays the cluster object.
cluster

## Loop through JSON files and create lists with info for the desired scene

In [None]:
import json

## Build dataframe from lists

In [None]:
# Parallel lists: entry k of every list describes the same static region, so
# all six must always be appended to together.
url = []
filename = []
scene_tag = []
deployment = []
start_frame = []
end_frame = []

# region_list_generated.txt holds one path to an *optical_flow_regions.json
# file per line.
with open('/home/jovyan/camhd_floc_model/data_camhd/region_list_generated.txt') as f:
    for line in f:
        with open(line.strip()) as region_file:
            data = json.load(region_file)
        for region in data['regions']:
            if region['type'] != 'static':
                continue
            # Work out every field BEFORE touching the lists.  Appending as we
            # go would leave the lists with different lengths whenever a later
            # field is missing or malformed, which breaks the DataFrame build.
            try:
                tag = region['sceneTag']
                if '_p2_z0' not in tag:
                    continue
                movie_url = data['movie']['URL']
                full_url = 'https://rawdata.oceanobservatories.org/files' + movie_url
                movie_name = movie_url.split('/')[-1]
                # Second character of the first '_'-separated token of the tag
                # (single digit only).
                deployment_number = int(tag.split('_')[0][1])
                first_frame = int(region['startFrame'])
                last_frame = int(region['endFrame'])
            except (KeyError, IndexError, TypeError, ValueError, AttributeError):
                # Best effort: skip regions with missing or malformed metadata.
                continue
            url.append(full_url)
            filename.append(movie_name)
            scene_tag.append(tag)
            deployment.append(deployment_number)
            start_frame.append(first_frame)
            end_frame.append(last_frame)

In [None]:
import pandas as pd

In [None]:
# One row per selected region; the six lists must have equal length.
scene_windows = pd.DataFrame({'filename': filename, 'url': url, 'scene_tag': scene_tag, 'deployment': deployment, 'start_frame': start_frame, 'end_frame': end_frame})
# Show the last rows as a quick sanity check.
scene_windows.tail()

Here is a cell that starts by defining a proper buffer for our start and end frames. The idea is that the buffer will prevent us from pulling frames that may include the end or beginning of the CamHD's movement.
We go on to create a dataframe of relevant information from a list of movies that fit the specifications made in the cell above.
From here we apply a function to the rows of the dataframe that takes the defined frame_interval for the range given by the start_frame to end_frame.
We then add the list of frames as a column 'frame_list' to our data.
We then go through our dataframe and filter it to display only the rows from the given deployment (5) and pipe this into a new dataframe to work from.

#### Create list of frames to process for each scene window
 ##### A large frame interval >80 will speed up the processing
 ##### A small frame interval <25 will provide a high resolution

In [None]:
frame_interval = 69  # step between sampled frames inside a scene window
window_buffer = 30   # frames dropped at both ends of every scene window

# Pair the two columns positionally with zip.  This avoids label-based lookups
# with a positional counter, and the comprehension keeps its loop variables
# local, so the global `start_frame` list used to build scene_windows is not
# overwritten when this cell runs.
frame_lists = [
    list(range(first + window_buffer, last - window_buffer, frame_interval))
    for first, last in zip(scene_windows.start_frame, scene_windows.end_frame)
]
scene_windows['frame_list'] = frame_lists
scene_windows.head()

#### Wrappers to deal with potential get_moov_atom and get_frame timeouts

In [None]:
import pycamhd as camhd
import numpy as np

In [None]:
def get_moov_atom_timeout(filename):
    """Fetch the moov atom for a movie, returning False if the fetch fails.

    Parameters
    ----------
    filename : str
        Movie location handed straight to ``camhd.get_moov_atom``.

    Returns
    -------
    The value returned by ``camhd.get_moov_atom``, or ``False`` when that
    call raises (for example on a timeout), so callers can test the result
    for truthiness.
    """
    try:
        return camhd.get_moov_atom(filename)
    except Exception:
        # Best effort: any ordinary failure becomes False.  KeyboardInterrupt
        # and SystemExit are deliberately not caught so a run can be stopped.
        return False

In [None]:
def get_frame_timeout(filename, frame_number, pix_fmt, moov_atom):
    """Fetch one frame, falling back to an all-zero image on any failure.

    Parameters
    ----------
    filename : str
        Movie location handed to ``camhd.get_frame``.
    frame_number : int
        Index of the frame to fetch.
    pix_fmt : str
        Pixel format handed to ``camhd.get_frame`` (the notebook uses
        ``'gray16le'``).
    moov_atom
        Result of ``get_moov_atom_timeout``; a falsy value means the moov
        atom could not be fetched, so no frame request is attempted.

    Returns
    -------
    The frame returned by ``camhd.get_frame``, or a ``(1080, 1920)``
    ``uint16`` array of zeros when the moov atom is missing or the frame
    request raises.
    """
    if moov_atom:
        try:
            return camhd.get_frame(filename, frame_number, pix_fmt, moov_atom)
        except Exception:
            # Fall through to the placeholder below.  KeyboardInterrupt and
            # SystemExit are not swallowed.
            pass
    # Placeholder with a fixed shape and dtype so downstream code always
    # receives an array.
    return np.zeros((1080, 1920), dtype=np.uint16)

#### Set up a Dask array of delayed images
In this notebook we don't actually use this Dask array of delayed objects for the analysis. Below we refactor into using a list of delayed functions which I think works better here. Or at least it is easier.

In [None]:
from dask import delayed
import dask.array as dsa

In [None]:
# Build one lazy (1080, 1920) uint16 array per sampled frame of deployment 5.
# Nothing is downloaded here; every call is wrapped in dask.delayed.
delayed_frame_list = []
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    # NOTE: this rebinds the global `filename` (previously the list of file
    # names used to build scene_windows) to a single URL string.
    filename = row.url
    # One moov-atom task per movie, shared by all of that movie's frame tasks.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(filename)
    for frame_number in row.frame_list:
        delayed_frame = delayed(get_frame_timeout)(filename, frame_number, 'gray16le', delayed_moov_atom)
        # Declare shape and dtype up front so the array can be described
        # without computing the frame.
        delayed_frame_list.append(dsa.from_delayed(delayed_frame, (1080, 1920), np.uint16))
# Stack the per-frame arrays along a new leading axis.
delayed_frame_array = dsa.stack(delayed_frame_list)
delayed_frame_array

A dask array is in many ways like a numpy array, except in this case it holds a set of instructions for how to acquire each chunk of the array, which makes it easy to farm this array out to workers in the cloud using the [distributed](http://distributed.readthedocs.io/en/latest/#) scheduler.

#### To deal with variations in lighting and high-frequency noise, we filter each subimage using a Butterworth bandpass filter.

In [None]:
# Materialise only the first frame; it is the example image for the cells below.
frame = delayed_frame_array[0].compute()
frame.shape

In [None]:
def butterworth(d1, d2, n):
    """Return a 1024x1024 Butterworth band-pass mask.

    The mask is the product of a high-pass term with cut-off ``d1`` and a
    low-pass term with cut-off ``d2``.  Both cut-offs are radial distances,
    in samples, from the middle of the array; ``n`` is the filter order (the
    radial ratio is raised to ``2*n``).  The sample coordinates run from
    -511.5 to 511.5, so no sample sits at radius zero.
    """
    offsets = np.arange(1024) - 511.5
    col, row = np.meshgrid(offsets, offsets)
    radius = np.sqrt(col**2 + row**2)
    exponent = 2*n
    high_pass = 1 - (1./(1 + (radius/d1)**exponent))
    low_pass = 1/(1 + (radius/d2)**exponent)
    return high_pass*low_pass

In [None]:
d1 = 20 # low cut wavenumber
d2 = 400 # high cut wavenumber
n = 4 # filter order passed to butterworth
# Example mask for inspection only; frame_filter builds its own mask per call.
bff = butterworth(d1, d2, n)
# Optional preview of the mask (needs matplotlib.pyplot imported as plt):
# plt.rc('figure', figsize=(6, 6))
#imgplot = plt.imshow(bff, cmap='gray')

#### Setup filtering and thresholding functions

In [None]:
def frame_filter(frame, d1, d2, n):
    """Band-pass filter the top-left 1024x1024 corner of a frame.

    ``frame`` may be a 2-D image or a 3-D array whose first axis has length
    one.  The crop is transformed with a 2-D FFT, multiplied by the mask from
    ``butterworth(d1, d2, n)`` in the shifted spectrum, and transformed back.
    The result is the complex output of ``np.fft.ifft2``.
    """
    has_singleton_axis = frame.ndim == 3 and frame.shape[0] == 1
    if has_singleton_axis:
        patch = np.squeeze(frame[0, 0:1024, 0:1024])
    else:
        patch = frame[0:1024, 0:1024]
    mask = butterworth(d1, d2, n)
    # Shift so the spectrum lines up with the centred mask, then undo the
    # shift before inverting.
    shifted_spectrum = np.fft.fftshift(np.fft.fft2(patch))
    filtered_spectrum = np.fft.ifftshift(shifted_spectrum*mask)
    return np.fft.ifft2(filtered_spectrum)

In [None]:
threshold = 4000  # this is an arbitrary threshold that seems to work
def frame_thresh(frame, d1, d2, n, threshold):
    """Return a boolean image marking where the filtered frame exceeds a level.

    The frame is passed through ``frame_filter(frame, d1, d2, n)``; a pixel is
    True where the magnitude of the (complex) filtered value is strictly
    greater than ``threshold``.
    """
    magnitude = np.absolute(frame_filter(frame, d1, d2, n))
    return np.array(magnitude > threshold)

#### Show example of a thresholded subimage

In [None]:
# Threshold the example frame; the mask is computed here but not plotted.
I_thresh = frame_thresh(frame, d1, d2, n, threshold)

In [None]:
from skimage.measure import label

#### Setup labeling stats function

In [None]:
# this function takes a thresholded binary image as input
def frame_label_stats(I_thresh):
    """Return the pixel count of each labelled region in a binary image.

    Parameters
    ----------
    I_thresh : array
        Thresholded binary image, passed unchanged to ``label``.

    Returns
    -------
    list
        One integer per label 1..max, in label order, giving the number of
        pixels carrying that label.  Empty when no label above 0 exists.
    """
    I_labeled = label(I_thresh)
    # bincount tallies every label in one pass over the image, rather than
    # rescanning the whole image once per label.  Index 0 is dropped because
    # the original loop also started counting at label 1.
    return list(np.bincount(I_labeled.ravel())[1:])

#### Assemble a list of Dask delayed functions using our labeling function
Here we abandon the delayed Dask array because I am still not sure of what it buys us. It might be worth returning to, but we will use a simple list of nested delayed objects for now.

In [None]:
# One delayed task chain per sampled frame of deployment 5:
# moov atom -> frame -> threshold -> per-label pixel counts.
delayed_label_stats = []
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    # NOTE: rebinds the global `filename` to this row's URL.
    filename = row.url
    # Shared by every frame task of this movie.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(filename)
    for frame_number in row.frame_list:
        delayed_frame = delayed(get_frame_timeout)(filename, frame_number, 'gray16le', delayed_moov_atom)
        delayed_frame_thresh = delayed(frame_thresh)(delayed_frame, d1, d2, n, threshold)
        delayed_label_stats.append(delayed(frame_label_stats)(delayed_frame_thresh))
# Show the first task as a sanity check; nothing has been computed yet.
delayed_label_stats[0]

In [None]:
# Number of per-frame tasks that will be computed.
len(delayed_label_stats)

#### Attach the distributed client to the cluster
Make sure all of the workers are ready to go before attaching and running compute.

In [None]:
from dask.distributed import Client
# Attach a distributed client to the cluster created at the top of the notebook.
client = Client(cluster)
client

#### Calculate label stats for each frame

In [None]:
from dask import compute

In [None]:
%%time
# Evaluate every delayed task; label_stats holds one result per entry of
# delayed_label_stats (the cells below rely on that one-to-one order).
label_stats = compute(*delayed_label_stats)

In [None]:
import datetime, math
import matplotlib.dates as dates

In [None]:
# Per-frame metadata, built with the same nested iteration (rows of
# deployment 5, then each row's frame_list) that produced delayed_label_stats,
# so entry k here lines up with label_stats[k].
url = []
frame_datenum = []
frame_datetime = []
frame_numbers = []
dbcamhd = pd.read_json('/home/jovyan/floc_gsa_2019/dbcamhd.json', orient="records", lines=True)
for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
    if not row.frame_list:
        # No frames sampled from this movie, so no timestamp is needed.
        continue
    # The movie's start timestamp is the same for all of its frames; look it
    # up once per movie instead of rescanning dbcamhd for every frame.
    # NOTE(review): this matches dbcamhd.filename against the full URL -
    # confirm that is what the column contains.
    movie_epoch_seconds = dbcamhd['timestamp'][dbcamhd.filename == row.url].iloc[0]
    for frame_number in row.frame_list:
        url.append(row.url)
        # 29.97 is presumably the frame rate in frames per second.
        frame_epoch_seconds = movie_epoch_seconds + frame_number/29.97
        # NOTE(review): fromtimestamp() without tz gives naive LOCAL time, so
        # results depend on the machine's timezone - confirm it runs in UTC.
        frame_datetime.append(datetime.datetime.fromtimestamp(frame_epoch_seconds))
        frame_datenum.append(dates.date2num(frame_datetime[-1]))
        frame_numbers.append(frame_number)

#### Count floc particles for each image

In [None]:
# Number of labelled regions found in each frame.
nflocs = [len(i) for i in label_stats]

In [None]:
# Should equal the number of per-frame tasks.
len(nflocs)

In [None]:
# Total number of pixels across all labelled regions in each frame.
total_floc = [sum(i) for i in label_stats]

In [None]:
# Should equal the number of per-frame tasks.
len(total_floc)

In [None]:
# One row per sampled frame. Every list must have the same length and follow
# the same frame order as delayed_label_stats.
regions_results = pd.DataFrame({'url': url, 'frame_number': frame_numbers, 'timestamp': frame_datenum,
                        'datetime': frame_datetime, 'nflocs': nflocs, 'label_stats': label_stats, 'total_floc' : total_floc,})
regions_results.head()

## Save this data as a pickle file:

In [None]:
import pickle

In [None]:
# Relative path: the file is written to the current working directory.
with open('results_for_dep_5_03.pickle', 'wb') as f:
    pickle.dump(regions_results, f)