# dbCamHD Update

This notebook updates the dbcamhd.json metadata database.

#### Load the current database

In [None]:
import pandas as pd
# Read the line-delimited JSON records and order them by timestamp so the
# most recent entry ends up in the last row.
dbcamhd = (
    pd.read_json(
        '/home/jovyan/camhd_floc_model/data_camhd/dbcamhd.json',
        orient="records",
        lines=True,
    )
    .sort_values(by=['timestamp'])
)
dbcamhd.tail()

#### Get the last date in the database

In [None]:
from datetime import date
# The last row (highest timestamp after sorting) supplies the resume date.
# Path components 7, 8 and 9 of its filename are read as year, month and day.
latest_parts = dbcamhd.filename.iloc[-1].split('/')
year = int(latest_parts[7])
month = int(latest_parts[8])
day = int(latest_parts[9])
start_date = date(year, month, day)
start_date

#### List every date from the last database date through today, then use Dask delayed and pycamhd to generate a file list

In [None]:
import pycamhd as camhd
from dask import delayed, compute

In [None]:
# Every calendar day from the resume date through today, as datetime objects.
date_list = pd.date_range(start=start_date, end=date.today()).to_pydatetime()

In [None]:
# One delayed pycamhd listing per day; each call covers exactly that day
# because the same value is used for both the start and the end date.
delayed_file_tuples = [
    delayed(camhd.get_file_list)(start_date=single_day, end_date=single_day)
    for single_day in date_list
]

In [None]:
from dask_kubernetes import KubeCluster
# Create a dask_kubernetes cluster with six workers; the bare name on the
# last line makes the notebook display the cluster object.
cluster = KubeCluster(n_workers=6)
cluster

In [None]:
from dask.distributed import Client
# Attach a distributed client to the cluster created above and display it.
client = Client(cluster)
client

In [None]:
%%time
# Evaluate all per-day listings at once; the cell magic above reports how
# long the computation took.
file_tuples = compute(*delayed_file_tuples)

In [None]:
# Flatten the per-day results into two parallel lists. Element 0 of each
# result holds the file names and element 1 the matching sizes; days whose
# name list is empty are skipped.
file_list = []
file_sizes = []
for listing in file_tuples:
    names = listing[0]
    if not names:
        continue
    file_list.extend(names)
    file_sizes.extend(listing[1])

In [None]:
# Build the frame of newly discovered files and remove exact
# (filename, filesize) duplicates. drop=True renumbers the rows without
# keeping the old labels as an extra 'index' column, which would otherwise
# be concatenated into the database and written to the JSON file.
dbcamhd_new = pd.DataFrame(
    {'filename': file_list,
     'filesize': file_sizes,
    }).drop_duplicates().reset_index(drop=True)
dbcamhd_new.tail()

#### Use Dask to get additional information about the files

In [None]:
def get_file_info(filename):
    """Collect the timestamp and frame count of a single file via pycamhd.

    Parameters
    ----------
    filename
        File identifier handed unchanged to the pycamhd helpers (in this
        notebook, an entry of the ``filename`` column).

    Returns
    -------
    list
        ``[timestamp, frame_count, moov]``. ``moov`` is ``True`` when all
        three pycamhd calls succeeded. If any of them raises an ordinary
        exception the result is ``[0, 0, False]``.

    Notes
    -----
    Only ``Exception`` subclasses are turned into the fallback value, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a running
    computation can be interrupted.
    """
    try:
        moov_atom = camhd.get_moov_atom(filename)
        timestamp = camhd.get_timestamp(filename, moov_atom)
        frame_count = camhd.get_frame_count(filename, moov_atom)
    except Exception:
        # Best effort: a file whose metadata cannot be read is still
        # recorded, flagged with moov=False and zeroed fields.
        return [0, 0, False]
    return [timestamp, frame_count, True]

In [None]:
# Queue one delayed metadata lookup per file, in row order.
delayed_file_info = [
    delayed(get_file_info)(filename)
    for filename in dbcamhd_new['filename']
]

In [None]:
%%time
# Run every queued metadata lookup; each result is the
# [timestamp, frame_count, moov] list returned by get_file_info.
file_info = compute(*delayed_file_info)

In [None]:
# Split the [timestamp, frame_count, moov] triples into three parallel lists.
timestamp = [stamp for stamp, _, _ in file_info]
frame_count = [frames for _, frames, _ in file_info]
moov = [has_moov for _, _, has_moov in file_info]

#### Add these to the new dataframe

In [None]:
# Attach each metadata list as a column aligned on the frame's own index.
for column, values in (
    ('moov', moov),
    ('timestamp', timestamp),
    ('frame_count', frame_count),
):
    dbcamhd_new[column] = pd.Series(values, index=dbcamhd_new.index)
dbcamhd_new.tail()

#### Add deployment numbers to the new dataframe

See the [asset management](https://github.com/ooi-integration/asset-management/blob/master/deployment/RS03ASHS_Deploy.csv) page for deployment information.

In [None]:
# Interpret the timestamps as seconds and label each row with a deployment
# number based on fixed cut-over moments.
# NOTE(review): rows carrying the fallback timestamp 0 (metadata could not
# be read) convert to 1970 and therefore receive deployment 2. The table
# also stops at deployment 5, so later deployments need new entries.
capture_time = pd.to_datetime(dbcamhd_new['timestamp'], unit='s')
dbcamhd_new['deployment'] = dbcamhd_new['timestamp'] * 0
dbcamhd_new.loc[capture_time < '2016-07-26 21:18:00', 'deployment'] = 2
deployment_starts = (
    ('2016-07-26 21:18:00', 3),
    ('2017-08-14 06:00:00', 4),
    ('2018-07-04 00:00:00', 5),
)
# Applied in chronological order so a later deployment overrides an earlier one.
for first_moment, number in deployment_starts:
    dbcamhd_new.loc[capture_time >= first_moment, 'deployment'] = number

#### Concatenate and deduplicate

In [None]:
# Append the new rows after the existing ones, then keep the first row seen
# for each filename, so an entry already in the database wins over a
# freshly fetched duplicate.
dbcamhd = (
    pd.concat([dbcamhd, dbcamhd_new], ignore_index=True, sort=True)
    .drop_duplicates(subset=['filename'], keep='first')
)

In [None]:
# Show the last rows of the merged database as a quick visual check.
dbcamhd.tail()

#### Save dataframe to JSON file

In [None]:
# Write the merged database as line-delimited JSON records.
# NOTE(review): this is a relative path in the current working directory,
# whereas the database was loaded from an absolute path. Confirm they refer
# to the same file.
dbcamhd.to_json(path_or_buf='dbcamhd.json', orient="records", lines=True)

### References

https://github.com/tjcrone/pycamhd<br>
https://rawdata.oceanobservatories.org/files/RS03ASHS/PN03B/06-CAMHDA301/<br>
https://pandas.pydata.org/