In [1]:
from dask import delayed
from dask.diagnostics import ProgressBar
from dask.utils import SerializableLock
from datetime import datetime, timedelta
from floater.generators import FloatSet
import bcolz
import dask.array as dsa
import numpy as np
import os, re
import pandas as pd
import xarray as xr

In [2]:
# Path of the bcolz trajectory store to convert. bcolz2netcdf opens it with
# bcolz.open(rootdir=fname) and parses the 'traj_<day0>-<day1>' part of the
# file name to obtain the range of days it covers.
fname = '/home/cz2397/data/rclv-eddies/traj_0000-0090.bcolz'

In [3]:
# Regular grid of float seed positions: x spans 0..360 and y spans -80..70
# with a spacing of 0.03125 (= 1/32) in both directions.
# NOTE(review): presumably degrees of longitude/latitude -- confirm.
fs = FloatSet(
    xlim=(0, 360),
    ylim=(-80, 70),
    dx=0.03125,
    dy=0.03125,
)

# Product of the grid dimensions reported by the FloatSet; passed to
# bcolz2netcdf as the number of floats.
Npart = fs.Nx * fs.Ny

In [4]:
def bcolz2arrays(bc, time, npart_range, fields, lock=None, dtype='f4'):
    """Extract one time slice of float data from a table as a dense array.

    Parameters
    ----------
    bc : table-like
        Object that can be indexed with a query string (``bc[query]``) and
        whose result can be turned into a ``pandas.DataFrame`` containing an
        ``npart`` column plus every column named in ``fields``.
    time : number
        Value matched exactly against the ``time`` column.
    npart_range : (int, int)
        Half-open range ``[npart_min, npart_max)`` of ``npart`` values that
        make up the rows of the result.
    fields : list of str
        Columns to extract, in output column order.
    lock : lock-like, optional
        Object with ``acquire()``/``release()``. When given it is held only
        while ``bc`` is being read.
    dtype : str or numpy dtype, optional
        dtype the returned array is cast to.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(npart_max - npart_min, len(fields))``. Rows whose
        ``npart`` value is absent from the query result hold NaN (introduced
        by the reindex), so ``dtype`` should be a floating-point type.

    Raises
    ------
    ValueError
        If the query result contains duplicate ``npart`` values
        (``verify_integrity=True``).
    """
    npart_min, npart_max = npart_range
    npart = np.arange(npart_min, npart_max)

    def _literal(value):
        # Render a number exactly. '%g' keeps only 6 significant digits, so a
        # bound such as 55296001 would become 5.5296e+07 and silently drop
        # the last particle from the selection.
        number = float(value)
        if number.is_integer():
            return str(int(value))
        return repr(number)

    query = "(time==%s) & (npart>=%s) & (npart<%s)" % (
        _literal(time), _literal(npart_min), _literal(npart_max))

    if lock is not None:
        lock.acquire()
    try:
        df = pd.DataFrame(bc[query])
    finally:
        # Always hand the lock back, even when the read fails; otherwise
        # every other task waiting on the same lock would block forever.
        if lock is not None:
            lock.release()

    df = df.set_index('npart', drop=True, verify_integrity=True)
    # Align on the full requested range so the output shape is fixed and
    # missing particles show up as NaN rows.
    df = df.reindex(npart)
    # Honour the requested dtype so the result matches what the caller
    # declares for this chunk.
    return np.asarray(df[fields].values, dtype=dtype)

In [5]:
def bcolz2netcdf(fname, num_floats, fields=['x', 'y', 'vort'], delta_t=86400, date0=datetime(1993,1,1)):
    """Convert a bcolz trajectory store into a netCDF file next to it.

    One lazily evaluated chunk is built per day (via ``bcolz2arrays``), the
    chunks are stacked along a ``time`` dimension, and the result is written
    to ``<directory of fname>/<basename of fname>.nc`` with a progress bar.

    Parameters
    ----------
    fname : str
        Path of the bcolz store. Its base name must contain
        ``traj_<day0>-<day1>``; the two integers give the range of days.
    num_floats : int
        Number of floats; ``npart`` values ``1 .. num_floats`` are exported.
    fields : list of str, optional
        Columns to export; each becomes a ``(time, npart)`` variable.
    delta_t : number, optional
        Multiplier that turns a day index into the value looked up in the
        store's ``time`` column.
        NOTE(review): presumably seconds per day -- confirm.
    date0 : datetime.datetime, optional
        Date corresponding to day 0 of the file-name numbering.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the file name does not contain ``traj_<day0>-<day1>`` or the day
        range it describes is empty.
    """
    # Normalise first so that a trailing slash on the store directory does
    # not produce an empty base name.
    path = os.path.normpath(fname)
    datadir = os.path.dirname(path)
    basename = os.path.splitext(os.path.basename(path))[0]

    match = re.search(r'traj_(\d+)-(\d+)', basename)
    if match is None:
        raise ValueError(
            "cannot parse day range from %r; expected 'traj_<day0>-<day1>'"
            % basename)
    day0, day1 = (int(day) for day in match.groups())
    if day1 <= day0:
        raise ValueError(
            "empty day range %d-%d in %r" % (day0, day1, basename))

    # Work on a private list: pandas treats a tuple as a single column key,
    # and the shared default list must never be mutated.
    fields = list(fields)

    days = np.arange(day1 - day0)
    times = delta_t * days
    # `days` counts from the first day of this file, so the time axis has to
    # be referenced to date0 shifted by day0, not to date0 itself.
    refdate = date0 + timedelta(days=day0)

    bc = bcolz.open(rootdir=fname, mode='r')
    npart = np.arange(1, num_floats + 1)
    npart_range = (1, num_floats + 1)
    # Shared by every chunk so that reads from the store are serialised.
    lock = SerializableLock()
    dtype = 'f4'

    chunk_shape = (num_floats, len(fields))
    data = [dsa.from_delayed(
                delayed(bcolz2arrays)(
                    bc, time, npart_range, fields, lock=lock, dtype=dtype),
                chunk_shape, dtype)
            for time in times]
    stacked_data = dsa.stack(data)

    data_variables = {field: (('time', 'npart'), stacked_data[..., n])
                      for n, field in enumerate(fields)}
    ds = xr.Dataset(data_variables, {'npart': npart, 'time': days})
    ds.time.attrs['units'] = 'days since %s' % refdate.strftime('%Y-%m-%d')

    outfile = os.path.join(datadir, basename + '.nc')
    with ProgressBar():
        ds.to_netcdf(outfile, engine='netcdf4')

In [None]:
# Run the conversion for every float on the seed grid. The netCDF file is
# written next to the bcolz store; this is a long-running cell (the recorded
# output shows 63% completed after about 57 minutes).
bcolz2netcdf(fname, num_floats=Npart)

[#########################               ] | 63% Completed | 57min 31.0s