# START HERE:

In [None]:
# Create a Dask cluster on Kubernetes that starts with zero workers.
# Leaving `cluster` as the last expression displays it in the notebook.
from dask_kubernetes import KubeCluster
cluster = KubeCluster(n_workers=0)
cluster

#### Get a list of CamHD files to process

In [None]:
import pandas as pd

In [None]:
# Load the CamHD file database; lines=True reads one JSON record per line.
# NOTE(review): absolute, user-specific path — it must exist on the machine running this notebook.
dbcamhd = pd.read_json('/home/jovyan/rte-camhd/tim/dbcamhd.json', orient="records", lines=True)
dbcamhd.tail()

In [None]:
# Size of the whole database: one row per file, frame_count summed over all rows.
print("Total files: %i" % len(dbcamhd))
print("Total frames: %i" % dbcamhd.frame_count.sum())

In [None]:
# Select deployment-5 files whose frame count lies strictly between 5000 and 30000,
# then order the file names alphabetically and show the first one.
is_selected = (
    (dbcamhd.deployment == 5)
    & (dbcamhd.frame_count > 5000)
    & (dbcamhd.frame_count < 30000)
)
filenames = sorted(dbcamhd.filename[is_selected])
filenames[0]

In [None]:
len(filenames)

#### Define the frame numbers from each file to process

In [None]:
# Frame indices read from every selected file. The largest (5000) is below the
# frame_count > 5000 cutoff used when the file list was built.
frame_numbers = [1000, 2000, 3841, 3933, 4052, 4382, 5000]

These frame numbers correspond to times in the camera system.

### Create Wrapper

In [None]:
import pycamhd as camhd
import numpy as np

In [None]:
def get_moov_atom_timeout(filename):
    """Fetch the moov atom for ``filename``, returning ``False`` if the fetch fails.

    Thin best-effort wrapper around ``camhd.get_moov_atom`` so that one
    unreadable file does not abort a whole batch of tasks.

    Parameters
    ----------
    filename
        Passed through unchanged to ``camhd.get_moov_atom``.

    Returns
    -------
    The value returned by ``camhd.get_moov_atom``, or ``False`` when that call
    raised an ``Exception``.
    """
    try:
        return camhd.get_moov_atom(filename)
    except Exception:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still propagate and the work can be cancelled.
        return False

In [None]:
def get_frame_timeout(filename, frame_number, pix_fmt, moov_atom):
    """Read one frame, falling back to an all-zero frame when it cannot be read.

    Parameters
    ----------
    filename, frame_number, pix_fmt
        Passed through unchanged to ``camhd.get_frame``.
    moov_atom
        Result of ``get_moov_atom_timeout``. A falsy value (such as the
        ``False`` that wrapper returns on failure) skips the read entirely.

    Returns
    -------
    The value returned by ``camhd.get_frame``, or a ``(1080, 1920)`` array of
    ``uint16`` zeros when ``moov_atom`` is falsy or the read raised an
    ``Exception``.
    """
    if moov_atom:
        try:
            return camhd.get_frame(filename, frame_number, pix_fmt, moov_atom)
        except Exception:
            # Best-effort read: fall through to the placeholder frame below.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            pass
    # Placeholder so downstream code always receives an array of the same shape and dtype.
    return np.zeros((1080, 1920), dtype=np.uint16)

#### Set up a delayed Dask array of images

In [None]:
import pycamhd as camhd
import numpy as np
from dask import delayed
import dask.array as dsa

In [None]:
# Build one lazy Dask array per (file, frame number) pair; nothing is read yet.
delayed_frame_list = []
for filename in filenames:
    # One delayed moov-atom fetch per file, shared by all of that file's frame tasks.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(filename)         
    for frame_number in frame_numbers:
        delayed_frame = delayed(get_frame_timeout)(filename, frame_number, 'gray16le', delayed_moov_atom)
        # from_delayed needs the shape and dtype up front: 1080x1920 uint16 per frame.
        delayed_frame_list.append(dsa.from_delayed(delayed_frame, (1080, 1920), np.uint16))
# Stack along a new leading axis: one entry per frame, files in the outer order.
delayed_frame_array = dsa.stack(delayed_frame_list)
delayed_frame_array


In [None]:
len(delayed_frame_array)

#### Show one of the images

In [None]:
# Evaluate only the first entry of the lazy array and check its shape.
frame = delayed_frame_array[0].compute()
frame.shape

In [None]:
# Display the example frame in grayscale with a white outline of a 1024x1024 region.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
import matplotlib.patches as patches
plt.rc('figure', figsize=(11, 11))
fig, ax = plt.subplots()
im1 = ax.imshow(frame)
im1.set_cmap('gray')
# Ticks every 270 px vertically and 480 px horizontally (quarters of 1080 and 1920).
plt.yticks(np.arange(0,1081,270))
plt.xticks(np.arange(0,1921,480))
# NOTE(review): frame_filter crops rows/columns 0:1024, while this outline starts at
# (10, 10) — presumably offset so the border is visible; confirm.
rect = patches.Rectangle((10,10),1024,1024,linewidth=1.5,edgecolor='w',facecolor='none')
ax.add_patch(rect)
plt.show();

#### Show the filter that will be used to filter images in the frequency domain
To deal with variations in lighting and high-frequency noise, we filter each subimage using a Butterworth bandpass filter.

In [None]:
def butterworth(d1, d2, n, size=1024):
    """Build a centered, square Butterworth band-pass mask for a shifted 2-D spectrum.

    The mask is the product of a high-pass term with cutoff ``d1`` and a
    low-pass term with cutoff ``d2``, both evaluated on the radial distance
    from the center of the grid.

    Parameters
    ----------
    d1 : float
        Low-cut radius, in grid samples from the center.
    d2 : float
        High-cut radius, in grid samples from the center.
    n : int or float
        Filter order; the radial ratio is raised to the power ``2*n``.
    size : int, optional
        Side length of the square mask. Defaults to 1024, the previously
        hard-coded value, so existing three-argument calls are unchanged.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(size, size)`` with values between 0 and 1.
    """
    # Half-sample offsets keep the grid symmetric about the center and never
    # place a sample exactly at radius zero when size is even.
    x = np.arange(size) - (size - 1) / 2.0
    xx, yy = np.meshgrid(x, x)
    d = np.sqrt(xx**2 + yy**2)
    high_pass = 1 - (1./(1 + (d/d1)**(2*n)))
    low_pass = 1/(1 + (d/d2)**(2*n))
    return high_pass*low_pass

In [None]:
# Filter parameters, reused by the later filtering cells, and a picture of the mask.
d1 = 20 # low cut wavenumber
d2 = 400 # high cut wavenumber
n = 4 # filter order; butterworth raises the radial ratio to the power 2*n
bff = butterworth(d1, d2, n)
plt.rc('figure', figsize=(6, 6))
imgplot = plt.imshow(bff, cmap='gray')

#### Define the floc proxy function
The floc proxy is simply the number of pixels in each filtered subimage that have a value greater than 4000.

In [None]:
def frame_filter(frame, d1, d2, n):
    """Band-pass filter the top-left 1024x1024 subimage of a frame in the frequency domain.

    Accepts either a 2-D frame or a 3-D stack holding exactly one frame (first
    axis of length 1). The subimage is transformed with a 2-D FFT, shifted so
    the zero frequency is centered, multiplied by ``butterworth(d1, d2, n)``,
    shifted back and inverse-transformed.

    Returns the complex-valued filtered subimage.
    """
    bandpass = butterworth(d1, d2, n)

    is_single_frame_stack = frame.ndim == 3 and frame.shape[0] == 1
    if is_single_frame_stack:
        subimage = np.squeeze(frame[0, 0:1024, 0:1024])
    else:
        subimage = frame[0:1024, 0:1024]

    centered_spectrum = np.fft.fftshift(np.fft.fft2(subimage))
    filtered_spectrum = centered_spectrum * bandpass  # apply the Butterworth mask
    return np.fft.ifft2(np.fft.ifftshift(filtered_spectrum))

In [None]:
threshold = 4000  # this is an arbitrary threshold that seems to work


def frame_thresh(frame, d1, d2, n, threshold):
    """Return a boolean mask of filtered pixels whose magnitude exceeds ``threshold``.

    The frame is first passed through ``frame_filter(frame, d1, d2, n)``; the
    mask is True where the absolute value of that result is strictly greater
    than ``threshold``.
    """
    magnitude = np.absolute(frame_filter(frame, d1, d2, n))
    return np.array(magnitude > threshold)

In [None]:
I_thresh = frame_thresh(frame, d1, d2, n, threshold)

In [None]:
def calc_floc_proxy(frame, d1, d2, n, threshold=4000):
    """Count the filtered pixels whose magnitude exceeds ``threshold``.

    Restored from commented-out code because the ``dsa.map_blocks`` cell later
    in the notebook calls this function by name.

    Parameters
    ----------
    frame
        Frame (or single-frame block) passed to ``frame_filter``.
    d1, d2, n
        Butterworth filter parameters, forwarded to ``frame_filter``.
    threshold : number, optional
        Magnitude cutoff. Defaults to 4000, the value that was hard-coded here.

    Returns
    -------
    numpy.ndarray
        One-element array holding the count, so there is one value per block.
    """
    I_filt = frame_filter(frame, d1, d2, n)
    return np.array([(np.absolute(I_filt) > threshold).sum()])

#### Show example for one frame

In [None]:
# I_filt = frame_filter(frame, d1, d2, n)

In [None]:
plt.rc('figure', figsize=(6, 6))
imgplot = plt.imshow(I_thresh, cmap='gray')
plt.title('floc_proxy value = %i' % I_thresh.sum());

#### Show an example of image labeling using a subimage

In [None]:
from skimage.measure import label

In [None]:
# Cut a 101x101 pixel window (rows 735-835, columns 595-695) out of the thresholded image.
I_thresh_sub = I_thresh[735:836, 595:696] # arbitrary area with a few floc particles
plt.rc('figure', figsize=(6, 6))
imgplot = plt.imshow(I_thresh_sub, cmap='gray')
plt.title('Subimage of Thresholded Image to Label');

In [None]:
import matplotlib as mpl
# Five discrete colors: white for the background (label 0) plus one color for each of four regions.
# NOTE(review): presumably chosen because this subimage has exactly four regions; a
# different region count would change which color each label gets — confirm.
cmap = mpl.colors.ListedColormap(['white', 'blue', 'red', 'green', 'black'])
# connectivity=2 on a 2-D image also joins pixels that touch only diagonally.
I_labeled = label(I_thresh_sub, connectivity=2)
plt.rc('figure', figsize=(6, 6))
imgplot = plt.imshow(I_labeled, cmap=cmap)
plt.title('Labeled Image');

This plot shows us that skimage.measure.label returns a NumPy array with the same dimensions as the input, with each area labeled with a different integer. The background is all zero, blue=1, red=2, green=3, black=4. This gives us a very easy way to count the number of labeled areas and to determine the size of each.

In [None]:
print('Pixels per Labeled Area\nblue: %i\nred: %i\ngreen: %i\nblack: %i' % 
      ((I_labeled==1).sum(), (I_labeled==2).sum(), 
       (I_labeled==3).sum(),(I_labeled==4).sum()))

In [None]:
# this function takes a thresholded binary image as input
def frame_label_stats(I_thresh):
    """Return the pixel count of every connected region in a binary image.

    Parameters
    ----------
    I_thresh : array
        Thresholded (binary) image; passed to ``label`` with its default settings.

    Returns
    -------
    list
        One integer per labeled region, ordered by label number (1, 2, ...).
        Empty when the image contains no regions.
    """
    I_labeled = label(I_thresh)
    # bincount tallies every label in a single pass over the image instead of
    # scanning the whole image once per label. Index 0 is the background, so drop it.
    return list(np.bincount(I_labeled.ravel())[1:])

In [None]:
# Build the full lazy pipeline for every (file, frame number) pair:
# read frame -> filter and threshold -> label regions and measure their sizes.
delayed_label_stats = []
for filename in filenames:
    # One delayed moov-atom fetch per file, shared by all of that file's frame tasks.
    delayed_moov_atom = delayed(get_moov_atom_timeout)(filename)
    for frame_number in frame_numbers:
        delayed_frame = delayed(get_frame_timeout)(filename, frame_number, 'gray16le', delayed_moov_atom)
        delayed_frame_thresh = delayed(frame_thresh)(delayed_frame, d1, d2, n, threshold)
        delayed_label_stats.append(delayed(frame_label_stats)(delayed_frame_thresh))
# Show the first delayed task; nothing has been computed yet.
delayed_label_stats[0]

Attach the distributed client to the cluster

In [None]:
# Connect a distributed client to the cluster created at the top of the notebook.
from dask.distributed import Client
client = Client(cluster)
client

In [None]:
from dask import compute

In [None]:
%%time
# Run every delayed task; compute(*tasks) returns a tuple with one result per task,
# here one list of region sizes per frame.
label_stats = compute(*delayed_label_stats)

Count floc particles for each image

In [None]:
# Number of labeled regions (floc particles) found in each frame.
# This must be live code: the next cell evaluates max(nflocs).
nflocs = [len(i) for i in label_stats]

In [None]:
max(nflocs)

#### Get a timestamp for each frame

In [None]:
import datetime, math
import matplotlib.dates as dates

In [None]:
# Per-frame metadata, appended in the same order as the delayed tasks were built
# (outer loop over files, inner loop over frame numbers) so entries line up with label_stats.
url = []  # not filled in by this cell
frame_datenum = []  # Matplotlib date number of each frame
frame_datetime = []  # datetime of each frame
nflocs = [len(i) for i in label_stats]  # number of labeled regions per frame
frame_numbered = []  # frame number of each entry within its source file

dbcamhd = pd.read_json('/home/jovyan/rte-camhd/tim/dbcamhd.json', orient="records", lines=True)
for filename in filenames:
    # Look up the file's timestamp once per file rather than once per frame.
    # NOTE(review): assumes 'timestamp' holds epoch seconds as plain numbers — confirm.
    file_epoch_seconds = dbcamhd['timestamp'][dbcamhd.filename == filename].iloc[0]
    for frame_number in frame_numbers:
        # 29.97 is presumably the video frame rate in frames per second — confirm.
        frame_epoch_seconds = file_epoch_seconds + frame_number/29.97
        # NOTE(review): fromtimestamp() converts to the kernel's local time zone — confirm
        # that this is what is wanted.
        frame_datetime.append(datetime.datetime.fromtimestamp(frame_epoch_seconds))
        frame_datenum.append(dates.date2num(frame_datetime[-1]))
        frame_numbered.append(frame_number)
    
#         timestamp = timestamp + frame_number/29.97
#         dt = datetime.datetime.fromtimestamp(timestamp)
#         frame_timestamp.append(dates.date2num(dt))


In [None]:
np.shape(nflocs)

In [None]:
# url = []
# frame_timestamp = []
# frame_datetime = []
# nfloc = []
# frame_numbers = []

# dbcamhd = pd.read_json('/home/jovyan/rte-camhd/tim/dbcamhd.json', orient="records", lines=True)
# for i, row in scene_windows[scene_windows.deployment == 5].iterrows():
#     for frame_number in row.frame_list:
#         url.append(row.url)
#         frame_epoch_seconds = dbcamhd['timestamp'][dbcamhd.filename == row.url].iloc[0] + frame_number/29.97
#         frame_datetime.append(datetime.datetime.fromtimestamp(frame_epoch_seconds))
#         frame_datenum.append(dates.date2num(frame_datetime[-1]))
#         frame_numbers.append(frame_number)

In [None]:
nflocs = [len(i) for i in label_stats]

In [None]:
# One row per processed frame; all five inputs must have the same length.
# 'timestamp' holds Matplotlib date numbers and 'datetime' the matching datetime objects.
frames_results = pd.DataFrame({'frame_number': frame_numbered, 'timestamp': frame_datenum,
                        'datetime': frame_datetime, 'nflocs': nflocs, 'label_stats': label_stats})
#results.head()

In [None]:
frames_results

## Set index as datetime

In [None]:
# In-place: 'datetime' stops being a column, so re-running this cell without
# rebuilding frames_results raises KeyError.
frames_results.set_index('datetime', inplace =True)

In [None]:
# FIXME: `test` is not defined anywhere in the visible notebook, so the statement below
# raised NameError on a fresh kernel. Load the second DataFrame (the one with a 'Time'
# column) first, then restore:
#     test.set_index('Time', inplace=True)
# TODO: merge frames_results with that datetime-indexed DataFrame and convert the result
# to the form that is needed.

In [None]:
# The file list is filtered to deployment 5, so the output is labeled Dep5 (was Dep4).
frames_results.to_csv('frames_resultsfor_Dep5.csv')

In [None]:
# timestamps_floc_for_Dep5_floc_proxy

In [None]:
# timestamps_floc_for_Dep5_floc_proxy = pd.DataFrame(
#     {'timestamps': frame_timestamp, 'floc_volume': nflocs})

In [None]:
# timestamps_floc_for_Dep5_floc_proxy.to_json('timestamps_floc_for_Dep5_floc_proxy.json', orient="records", lines=True)

Plot a two-dimensional multivariate histogram of the results

In [None]:
# Hexbin density of floc-particle counts over time.
plt.rc('font', size=11)
fig, ax = plt.subplots()
fig.set_size_inches(18, 6)
fig.frameon = False
# x must be the per-frame Matplotlib date numbers (frame_datenum), not the datetime module.
# The gridsize/mincnt/cmap line is live again because it carries the closing parenthesis.
# NOTE(review): how vmin/vmax are interpreted with bins='log' may depend on the
# Matplotlib version — confirm the color scale looks as intended.
hb1 = ax.hexbin(frame_datenum, nflocs, vmin=0, vmax=1.75, bins='log', linewidths=0.1,
                gridsize=(80,100), mincnt=1, cmap=plt.cm.BuPu)
fig.colorbar(hb1)
ax.set_ylim([0, 1000])
# frame_timestamp is only created after the "STOP HERE" marker; use frame_datenum here.
ax.set_xlim([frame_datenum[0], frame_datenum[-1]])
ax.yaxis.grid(True)
ax.xaxis.grid(True)
months = dates.MonthLocator()  # every month
monthsFmt = dates.DateFormatter('%b %Y')
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(monthsFmt)
plt.ylabel('Number of Floc Particles');
plt.savefig('Number_of_Floc_Particles_for_Dep5_floc_proxy.png')

# STOP HERE

#### Assemble a new Dask array including our computation using map_blocks

In [None]:
# Apply calc_floc_proxy to every block of the lazy frame array; calc_floc_proxy must be
# defined before this cell runs. drop_axis=[1,2] declares that the two image axes
# disappear, leaving one int64 ('i8') value per frame.
floc_proxy = dsa.map_blocks(calc_floc_proxy, delayed_frame_array, d1, d2, n, dtype='i8', drop_axis=[1,2])
floc_proxy

#### Compute the floc_proxy (subset)

In [None]:
%%time
results = floc_proxy[0:40].compute()

#### Calculate all the results

In [None]:
# Raw input size: frames x 1080 x 1920 pixels x 2 bytes per pixel (uint16), converted to GiB.
print('Number of images: %i' % len(floc_proxy))
print('Size of dataset (GB): %i' % round(len(floc_proxy)*1080*1920*2/1024/1024/1024))

In [None]:
%%time
results = floc_proxy.compute()

#### Get a timestamp for each frame

In [None]:
import datetime, math
import matplotlib.dates as dates
# Matplotlib date number for every (file, frame number) pair, in task order.
frame_timestamp = []
for filename in filenames:
    file_epoch_seconds = dbcamhd['timestamp'][dbcamhd.filename == filename].iloc[0]
    for frame_number in frame_numbers:
        # Offset each frame from the start of its file. The previous code kept adding to a
        # running total, so later frames of a file were pushed too far forward in time.
        timestamp = file_epoch_seconds + frame_number/29.97
        dt = datetime.datetime.fromtimestamp(timestamp)
        frame_timestamp.append(dates.date2num(dt))

#### Plot a two-dimensional multivariate histogram of the results

In [None]:
# Hexbin density of floc proxy values over time, with month ticks on the x axis.
plt.rc('font', size=11)
fig, ax = plt.subplots()
fig.set_size_inches(14, 6)
fig.frameon = False
# bins='log' colors hexagons on a logarithmic count scale; mincnt=1 hides empty hexagons.
hb1 = ax.hexbin(frame_timestamp, results, vmin=0, vmax=1, bins='log', linewidths=0.25,
  gridsize=(225,4500), mincnt=1, cmap=plt.cm.BuPu)
fig.colorbar(hb1)
ax.set_ylim([0, 8000])
# Span the x axis from the first to the last entry of frame_timestamp.
ax.set_xlim([frame_timestamp[0],frame_timestamp[-1]])
ax.yaxis.grid(True)
ax.xaxis.grid(True)
months = dates.MonthLocator()  # every month
monthsFmt = dates.DateFormatter('%b %Y')
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(monthsFmt)
plt.ylabel('Floc Proxy Value');

Starting in mid-June a large "floc event" occurs where the floc proxy values increase on average by about a factor of ten. The cause of this floc event is being investigated.

In [None]:
print(results)

### References

 - [Pangeo](http://pangeo-data.org/)
 - [PyCamHD](https://github.com/tjcrone/pycamhd)
 - [CamHD Raw Data Archive](https://rawdata.oceanobservatories.org/files/RS03ASHS/PN03B/06-CAMHDA301)
 - [AGU Abstract](https://agu.confex.com/agu/fm16/meetingapp.cgi/Paper/192670)
 - [AGU Poster](https://drive.google.com/open?id=0B-dWW4GM434obGpTM0FZME10Nkk)
 - [Dask](http://dask.pydata.org/en/latest/)