# Bootstrapping

In [1]:
from dask.distributed import Client,LocalCluster
from dask_jobqueue import PBSCluster

In [2]:
# One node on Gadi has 48 cores - try and use up a full node before going to multiple nodes (jobs)

walltime = '01:00:00'
cores = 6
memory = str(4 * cores) + 'GB'

cluster = PBSCluster(walltime=str(walltime), cores=cores, memory=str(memory), processes=cores,
                     job_extra_directives=['-q normal',
                                           '-P w42',
                                           '-l ncpus='+str(cores),
                                           '-l mem='+str(memory),
                                           '-l storage=gdata/w42+gdata/rt52'],
                     local_directory='$TMPDIR',
                     job_directives_skip=["select"])

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43507 instead


In [3]:
cluster.scale(jobs=1)
client = Client(cluster)

In [4]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: /proxy/43507/status,

0,1
Dashboard: /proxy/43507/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.6.121.2:45595,Workers: 0
Dashboard: /proxy/43507/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
import xarray as xr
import numpy as np

In [6]:
obs_array = np.random.normal(size=(150, 200))

In [7]:
obs = xr.DataArray(
    obs_array,
    dims=["lat", "lon"],
    coords={"lat": range(150), "lon": range(200)}
)

In [8]:
samples_array = np.random.normal(size=(150, 200, 1000))

In [9]:
samples2 = xr.DataArray(
    samples_array,
    dims=["lat", "lon", "iteration"],
    coords={"lat": range(150), "lon": range(200), "iteration": range(1000)}
)

In [10]:
def get_quantile(obs, bootstrap):
    """
    Returns the quantile of obs in bootstrap
    
    obs: float or int
    bootstrap: array
    """
    return np.searchsorted(np.sort(bootstrap), obs) / len(bootstrap)

In [16]:
obs_stack = obs.stack(point=("lat", "lon")).groupby('point')
samples_stack = samples2.stack(point=("lat", "lon")).groupby('point')

In [18]:
samples_stack

DataArrayGroupBy, grouped over 'point'
30000 groups with labels (0,, 0), (0,, 1), ..., (149,, 199).

In [19]:
%%time
quantiles = xr.apply_ufunc(
        get_quantile,
        obs_stack,
        samples_stack,
        input_core_dims=[[], ['iteration']],
        output_core_dims=[[]],
        dask='allowed'
    )

CPU times: user 2min 5s, sys: 7.46 s, total: 2min 12s
Wall time: 2min 14s


In [23]:
quantiles.unstack("point")

### Or with vectorize following https://forum.access-hive.org.au/t/lazy-sorting-with-dask-bootstrapping-problem/866/2 

In [None]:
# no need for groupby()
obs_stack = obs.stack(point=("lat", "lon"))
samples_stack = samples2.stack(point=("lat", "lon"))

In [17]:
%%time
quantiles = xr.apply_ufunc(
        get_quantile,
        obs_stack,
        samples_stack,
        input_core_dims=[[], ['iteration']],
        output_core_dims=[[]],
        vectorize=True,
        dask='parallelized'
    )

CPU times: user 2min 5s, sys: 7.19 s, total: 2min 13s
Wall time: 2min 11s


### Now with neat quantiles function from above post:

In [26]:
%%time
(samples2 < obs).mean("iteration")

CPU times: user 138 ms, sys: 288 ms, total: 426 ms
Wall time: 451 ms


In [9]:
obs.to_dataset(name="obs").to_netcdf("/g/data/w42/dr6273/tmp/obs.nc", mode="w")
samples.to_dataset(name="samples").to_netcdf("/g/data/w42/dr6273/tmp/samples.nc", mode="w")

### Now with my data...

In [90]:
obs = xr.open_mfdataset("/g/data/w42/dr6273/tmp/obs.nc")["obs"]#.compute()
samples = xr.open_mfdataset("/g/data/w42/dr6273/tmp/samples.nc")["samples"]#.compute()

In [92]:
samples

Unnamed: 0,Array,Chunk
Bytes,11.72 MiB,11.72 MiB
Shape,"(63, 141, 173)","(63, 141, 173)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 11.72 MiB 11.72 MiB Shape (63, 141, 173) (63, 141, 173) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",173  141  63,

Unnamed: 0,Array,Chunk
Bytes,11.72 MiB,11.72 MiB
Shape,"(63, 141, 173)","(63, 141, 173)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [6]:
from xbootstrap import block_bootstrap

In [119]:
import math
from collections import OrderedDict
from itertools import chain, islice, cycle

import numpy as np
import xarray as xr


def _get_blocked_random_indices(
    shape, block_axis, block_size, prev_block_sizes, circular
):
    """
    Return indices to randomly sample an axis of an array in consecutive
    (cyclic) blocks
    """

    def _random_blocks(length, block, circular):
        """
        Indices to randomly sample blocks in a along an axis of a specified
        length
        """
        if block == length:
            return list(range(length))
        else:
            repeats = math.ceil(length / block)
            if circular:
                indices = list(
                    chain.from_iterable(
                        islice(cycle(range(length)), s, s + block)
                        for s in np.random.randint(0, length, repeats)
                    )
                )
            else:
                indices = list(
                    chain.from_iterable(
                        islice(range(length), s, s + block)
                        for s in np.random.randint(0, length - block + 1, repeats)
                    )
                )
            return indices[:length]

    # Don't randomize within an outer block
    if len(prev_block_sizes) > 0:
        orig_shape = shape.copy()
        for i, b in enumerate(prev_block_sizes[::-1]):
            prev_ax = block_axis - (i + 1)
            shape[prev_ax] = math.ceil(shape[prev_ax] / b)

    if block_size == 1:
        indices = np.random.randint(
            0,
            shape[block_axis],
            shape,
        )
    else:
        non_block_shapes = [s for i, s in enumerate(shape) if i != block_axis]
        indices = np.moveaxis(
            np.stack(
                [
                    _random_blocks(shape[block_axis], block_size, circular)
                    for _ in range(np.prod(non_block_shapes))
                ],
                axis=-1,
            ).reshape([shape[block_axis]] + non_block_shapes),
            0,
            block_axis,
        )

    if len(prev_block_sizes) > 0:
        for i, b in enumerate(prev_block_sizes[::-1]):
            prev_ax = block_axis - (i + 1)
            indices = np.repeat(indices, b, axis=prev_ax).take(
                range(orig_shape[prev_ax]), axis=prev_ax
            )
        return indices
    else:
        return indices


def _n_nested_blocked_random_indices(sizes, n_iteration, circular):
    """
    Returns indices to randomly resample blocks of an array (with replacement)
    in a nested manner many times. Here, "nested" resampling means to randomly
    resample the first dimension, then for each randomly sampled element along
    that dimension, randomly resample the second dimension, then for each
    randomly sampled element along that dimension, randomly resample the third
    dimension etc.

    Parameters
    ----------
    sizes : OrderedDict
        Dictionary with {names: (sizes, blocks)} of the dimensions to resample
    n_iteration : int
        The number of times to repeat the random resampling
    circular : bool
        Whether or not to do circular resampling
    """

    shape = [s[0] for s in sizes.values()]
    indices = OrderedDict()
    prev_blocks = []
    for ax, (key, (_, block)) in enumerate(sizes.items()):
        indices[key] = _get_blocked_random_indices(
            shape[: ax + 1] + [n_iteration], ax, block, prev_blocks, circular
        )
        prev_blocks.append(block)
    return indices


def _expand_n_nested_random_indices(indices):
    """
    Expand the dimensions of the nested input arrays so that they can be
    broadcast and return a tuple that can be directly indexed

    Parameters
    ----------
    indices : list of numpy arrays
        List of numpy arrays of sequentially increasing dimension as output by
        the function `_n_nested_blocked_random_indices`. The last axis on all
        inputs is assumed to correspond to the iteration axis
    """
    broadcast_ndim = indices[-1].ndim
    broadcast_indices = []
    for i, ind in enumerate(indices):
        expand_axes = list(range(i + 1, broadcast_ndim - 1))
        broadcast_indices.append(np.expand_dims(ind, axis=expand_axes))
    return (..., *tuple(broadcast_indices))


def _block_bootstrap(*objects, blocks, n_iteration, exclude_dims=None, circular=True):
    """
    Repeatedly circularly bootstrap the provided arrays across the specified
    dimension(s) and stack the new arrays along a new "iteration"
    dimension. The boostrapping is done in a nested manner. I.e. bootstrap
    the first provided dimension, then for each bootstrapped sample along
    that dimenion, bootstrap the second provided dimension, then for each
    bootstrapped sample along that dimenion etc.

    Note, this function expands out the iteration dimension inside a
    universal function. However, this can generate very large chunks (it
    multiplies chunk size by the number of iterations) and it falls over for
    large numbers of iterations for reasons I don't understand. It is thus
    best to apply this function in blocks using `block_bootstrap`

    Parameters
    ----------
    objects : xarray DataArray(s) or Dataset(s)
        The data to bootstrap. Multiple datasets can be passed to be
        bootstrapped in the same way. Where multiple datasets are passed, all
        datasets need not contain all bootstrapped dimensions. However, because
        of the bootstrapping is applied in a nested manner, the dimensions in
        all input objects must also be nested. E.g., for `blocks.keys=['d1',
        'd2','d3']` an object with dimensions 'd1' and 'd2' is valid but an
        object with only dimension 'd2' is not. All datasets are boostrapped
        according to the same random samples along available dimensions.
    blocks : dict
        Dictionary of the dimension(s) to bootstrap and the block sizes to use
        along each dimension: `{dim: blocksize}`. Nesting is carried out according
        to the order of this dictionary.
    n_iteration : int
        The number of times to repeat the bootstrapping.
    exclude_dims : list of list
        List of the same length as the number of objects giving a list of
        dimensions specifed in `blocks` to exclude from each object. Default is
        to assume that no dimensions are excluded and all `objects` are
        bootstrapped across all (available) dimensions `blocks`.
    circular : boolean, optional
        Whether or not to do circular block bootstrapping

    References
    ----------
    Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.
    """

    def _bootstrap(*arrays, indices):
        """Bootstrap the array(s) using the provided indices"""
        bootstrapped = [array[ind] for array, ind in zip(arrays, indices)]
        if len(bootstrapped) == 1:
            return bootstrapped[0]
        else:
            return tuple(bootstrapped)

    objects = list(objects)

    # Rename exclude_dims so they are not bootstrapped
    if exclude_dims is None:
        exclude_dims = [[] for _ in range(len(objects))]
    msg = (
        "exclude_dims should be a list of the same length as the number of "
        "objects containing lists of dimensions to exclude for each object"
    )
    assert isinstance(exclude_dims, list), msg
    assert len(exclude_dims) == len(objects), msg
    assert all(isinstance(x, list) for x in exclude_dims), msg
    renames = []
    for i, (obj, exclude) in enumerate(zip(objects, exclude_dims)):
        objects[i] = obj.rename(
            {d: f"dim{ii}" for ii, d in enumerate(exclude)},
        )
        renames.append({f"dim{ii}": d for ii, d in enumerate(exclude)})

    dim = list(blocks.keys())
    if isinstance(dim, str):
        dim = [dim]

    # Check that boostrapped dimensions are the same size on all objects
    for d in blocks.keys():
        dim_sizes = [o.sizes[d] for o in objects if d in o.dims]
        assert all(
            s == dim_sizes[0] for s in dim_sizes
        ), f"Block dimension {d} is not the same size on all input objects"

    # Get the sizes of the bootstrap dimensions
    sizes = None
    for obj in objects:
        try:
            sizes = OrderedDict(
                {d: (obj.sizes[d], b) for d, b in blocks.items()},
            )
            break
        except KeyError:
            pass
    if sizes is None:
        raise ValueError(
            "At least one input object must contain all dimensions in blocks.keys()",
        )

    # Generate the random indices first so that we can be sure that each
    # dask chunk uses the same indices. Note, I tried using random.seed()
    # to achieve this but it was flaky. These are the indices to bootstrap
    # all objects.
    nested_indices = _n_nested_blocked_random_indices(sizes, n_iteration, circular)

    # Need to expand the indices for broadcasting for each object separately
    # as each object may have different dimensions
    indices = []
    input_core_dims = []
    for obj in objects:
        available_dims = [d for d in dim if d in obj.dims]
        indices_to_expand = [nested_indices[key] for key in available_dims]

        # Check that dimensions are nested
        ndims = [i.ndim for i in indices_to_expand]
        # Start at 2 due to iteration dim
        if ndims != list(range(2, len(ndims) + 2)):
            raise ValueError("The dimensions of all inputs must be nested")

        indices.append(_expand_n_nested_random_indices(indices_to_expand))
        input_core_dims.append(available_dims)

    # Loop over objects because they may have non-matching dimensions and
    # we don't want to broadcast them as this will unnecessarily increase
    # chunk size for dask arrays
    result = []
    for obj, ind, core_dims in zip(objects, indices, input_core_dims):
        if isinstance(obj, xr.Dataset):
            # Assume all variables have the same dtype
            output_dtype = obj[list(obj.data_vars)[0]].dtype
        else:
            output_dtype = obj.dtype

        result.append(
            xr.apply_ufunc(
                _bootstrap,
                obj,
                kwargs=dict(
                    indices=[ind],
                ),
                input_core_dims=[core_dims],
                output_core_dims=[core_dims + ["iteration"]],
                dask="parallelized",
                dask_gufunc_kwargs=dict(
                    output_sizes={"iteration": n_iteration},
                ),
                output_dtypes=[output_dtype],
            )
        )

    # Rename excluded dimensions
    return tuple(res.rename(rename) for res, rename in zip(result, renames))


def block_bootstrap(*objects, blocks, n_iteration, exclude_dims=None, circular=True):
    """
    Repeatedly circularly bootstrap the provided arrays across the specified
    dimension(s) and stack the new arrays along a new "iteration"
    dimension. The boostrapping is done in a nested manner. I.e. bootstrap
    the first provided dimension, then for each bootstrapped sample along
    that dimenion, bootstrap the second provided dimension, then for each
    bootstrapped sample along that dimenion etc.

    Parameters
    ----------
    objects : xarray DataArray(s) or Dataset(s)
        The data to bootstrap. Multiple datasets can be passed to be
        bootstrapped in the same way. Where multiple datasets are passed, all
        datasets need not contain all bootstrapped dimensions. However, because
        of the bootstrapping is applied in a nested manner, the dimensions in
        all input objects must also be nested. E.g., for `blocks.keys=['d1',
        'd2','d3']` an object with dimensions 'd1' and 'd2' is valid but an
        object with only dimension 'd2' is not. All datasets are boostrapped
        according to the same random samples along available dimensions.
    blocks : dict
        Dictionary of the dimension(s) to bootstrap and the block sizes to use
        along each dimension: `{dim: blocksize}`. Nesting is carried out according
        to the order of this dictionary.
    n_iteration : int
        The number of times to repeat the bootstrapping.
    exclude_dims : list of list
        List of the same length as the number of objects giving a list of
        dimensions specifed in `blocks` to exclude from each object. Default is
        to assume that no dimensions are excluded and all `objects` are
        bootstrapped across all (available) dimensions `blocks`.
    circular : boolean, optional
        Whether or not to do circular block bootstrapping

    References
    ----------
    Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.
    """
    # The fastest way to perform the iterations is to expand out the
    # iteration dimension inside the universal function (see
    # _iterative_bootstrap). However, this can generate very large chunks (it
    # multiplies chunk size by the number of iterations) and it falls over
    # for large numbers of iterations for reasons I don't understand. Thus
    # here we loop over blocks of iterations to generate the total number
    # of iterations.

    def _max_chunk_size_MB(ds):
        """
        Get the max chunk size in a dataset
        """

        def size_of_chunk(chunks, itemsize):
            """
            Returns size of chunk in MB given dictionary of chunk sizes
            """
            N = 1
            for value in chunks:
                if not isinstance(value, int):
                    value = max(value)
                N = N * value
            return itemsize * N / 1024**2

        if isinstance(ds, xr.DataArray):
            ds = ds.to_dataset(name="ds")

        chunks = []
        for var in ds.data_vars:
            da = ds[var]
            chunk = da.chunks
            itemsize = da.data.itemsize
            if chunk is None:
                # numpy array
                chunks.append((da.data.size * itemsize) / 1024**2)
            else:
                chunks.append(size_of_chunk(chunk, itemsize))
        return max(chunks)

    # Choose iteration blocks to limit chunk size on dask arrays
    if objects[
        0
    ].chunks:  # TO DO: this is not a very good check that input is dask array
        MAX_CHUNK_SIZE_MB = 200
        ds_max_chunk_size_MB = max(
            [_max_chunk_size_MB(obj) for obj in objects],
        )
        blocksize = int(MAX_CHUNK_SIZE_MB / ds_max_chunk_size_MB)
        if blocksize > n_iteration:
            blocksize = n_iteration
        if blocksize < 1:
            blocksize = 1
    else:
        blocksize = n_iteration

    print(blocksize)
        
    bootstraps = []
    for _ in range(blocksize, n_iteration + 1, blocksize):
        bootstraps.append(
            _block_bootstrap(
                *objects,
                blocks=blocks,
                n_iteration=blocksize,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    leftover = n_iteration % blocksize
    if leftover:
        bootstraps.append(
            _block_bootstrap(
                *objects,
                blocks=blocks,
                n_iteration=leftover,
                exclude_dims=exclude_dims,
                circular=circular,
            )
        )

    if len(objects) == 1:
        print(bootstraps[0])
        return xr.concat(
            *bootstraps,
            dim="iteration",
            coords="minimal",
            compat="override",
        )
    else:
        return tuple(
            [
                xr.concat(
                    b,
                    dim="iteration",
                    coords="minimal",
                    compat="override",
                )
                for b in zip(*bootstraps)
            ]
        )
    
    # bootstraps = tuple(
    #         [
    #             xr.concat(
    #                 b,
    #                 dim="iteration",
    #                 coords="minimal",
    #                 compat="override",
    #             )
    #             for b in zip(*bootstraps)
    #         ]
    #     )
    # if len(objects) == 1:
    #     return bootstraps[0]
    # else:
    #     return bootstraps

In [7]:
# obs = xr.open_mfdataset("/g/data/w42/dr6273/tmp/obs.nc")["obs"]#.compute()
# samples = xr.open_mfdataset("/g/data/w42/dr6273/tmp/samples.nc")["samples"]#.compute()

In [97]:
import numpy as np
import xarray as xr
from xbootstrap import block_bootstrap

obs_array = np.random.normal(size=(150, 200))

obs = xr.DataArray(
    obs_array,
    dims=["lat", "lon"],
    coords={"lat": range(150), "lon": range(200)}
)

samples_array = np.random.normal(size=(150, 200, 65))

samples = xr.DataArray(
    samples_array,
    dims=["lat", "lon", "time"],
    coords={"lat": range(150), "lon": range(200), "time": range(65)}
)

obs = obs.chunk()
samples = samples.chunk()

bootstraps = block_bootstrap(
    samples,
    blocks={'time': 1},
    n_iteration=100,
    exclude_dims=None,
    circular=True
)

In [120]:
%%time
bootstraps = block_bootstrap(
    samples,
    blocks={'time': 1},
    n_iteration=100,
    exclude_dims=None,
    circular=True
)
# bootstraps = bootstraps.assign_coords({"iteration": range(100)})

13
(<xarray.DataArray (lat: 150, lon: 200, time: 65, iteration: 13)>
dask.array<transpose, shape=(150, 200, 65, 13), dtype=float64, chunksize=(150, 200, 65, 13), chunktype=numpy.ndarray>
Coordinates:
  * lat      (lat) int64 0 1 2 3 4 5 6 7 8 ... 142 143 144 145 146 147 148 149
  * lon      (lon) int64 0 1 2 3 4 5 6 7 8 ... 192 193 194 195 196 197 198 199
  * time     (time) int64 0 1 2 3 4 5 6 7 8 9 ... 55 56 57 58 59 60 61 62 63 64
Dimensions without coordinates: iteration,)


TypeError: concat() got multiple values for argument 'dim'

In [116]:
bootstraps

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,193.41 MiB
Shape,"(150, 200, 65, 100)","(150, 200, 65, 13)"
Dask graph,8 chunks in 26 graph layers,8 chunks in 26 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.45 GiB 193.41 MiB Shape (150, 200, 65, 100) (150, 200, 65, 13) Dask graph 8 chunks in 26 graph layers Data type float64 numpy.ndarray",150  1  100  65  200,

Unnamed: 0,Array,Chunk
Bytes,1.45 GiB,193.41 MiB
Shape,"(150, 200, 65, 100)","(150, 200, 65, 13)"
Dask graph,8 chunks in 26 graph layers,8 chunks in 26 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [231]:
bs1 = bootstraps.isel(time=range(10)).mean("time")
bs2 = bootstraps.isel(time=range(10,20)).mean("time")

In [232]:
diffs = bs1 - bs2
diffs

Unnamed: 0,Array,Chunk
Bytes,186.10 MiB,195.31 kiB
Shape,"(141, 173, 1000)","(5, 5, 1000)"
Dask graph,1015 chunks in 12 graph layers,1015 chunks in 12 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 186.10 MiB 195.31 kiB Shape (141, 173, 1000) (5, 5, 1000) Dask graph 1015 chunks in 12 graph layers Data type float64 numpy.ndarray",1000  173  141,

Unnamed: 0,Array,Chunk
Bytes,186.10 MiB,195.31 kiB
Shape,"(141, 173, 1000)","(5, 5, 1000)"
Dask graph,1015 chunks in 12 graph layers,1015 chunks in 12 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [222]:
# diffs.to_dataset(name='ds').to_netcdf("/g/data/w42/dr6273/tmp/bootstraps.nc", mode="w")

In [236]:
obs = xr.open_mfdataset("/g/data/w42/dr6273/tmp/obs.nc")["obs"]
samples = xr.open_mfdataset("/g/data/w42/dr6273/tmp/bootstraps.nc")["ds"]

In [237]:
# diffs = diffs.persist()

In [238]:
obs_stack = obs.stack(point=("lat", "lon")).groupby('point')
samples_stack = diffs.stack(point=("lat", "lon")).groupby('point')

In [239]:
obs_stack

DataArrayGroupBy, grouped over 'point'
24393 groups with labels (-10.0,, 112.0), ..., (-45.0,, ....

In [240]:
samples_stack

DataArrayGroupBy, grouped over 'point'
24393 groups with labels (-10.0,, 112.0), ..., (-45.0,, ....

In [241]:
def get_quantile(a, v):
    """
    Returns the quantile of a in v
    
    a: float or int
    v: array
    """
    return np.searchsorted(np.sort(v), a) / len(v)

In [260]:
a = samples.isel(lat=0, lon=0).chunk()

In [263]:
a.dims

('iteration',)

In [273]:
a.values[838]

1.1

In [268]:
a.compute().argsort(axis=0)

In [242]:
%%time
quantiles = xr.apply_ufunc(
        get_quantile,
        obs_stack,
        samples_stack,
        input_core_dims=[[], ['iteration']],
        output_core_dims=[[]],
        dask='allowed'
    )



ValueError: Input array a must be one dimensional

In [9]:
obs.to_dataset(name="obs").to_netcdf("/g/data/w42/dr6273/tmp/obs.nc", mode="w")
samples.to_dataset(name="samples").to_netcdf("/g/data/w42/dr6273/tmp/samples.nc", mode="w")

In [10]:
obs = xr.open_mfdataset("/g/data/w42/dr6273/tmp/obs.nc")["obs"]
samples = xr.open_mfdataset("/g/data/w42/dr6273/tmp/samples.nc")["samples"]

In [12]:
obs_stack = obs.stack(point=("lat", "lon")).groupby('point')
samples_stack = samples.stack(point=("lat", "lon")).groupby('point')

In [13]:
obs_stack

DataArrayGroupBy, grouped over 'point'
30000 groups with labels (0,, 0), (0,, 1), ..., (149,, 199).

In [11]:
# %%time
# quantiles = xr.apply_ufunc(
#         get_quantile,
#         obs_stack,
#         samples_stack,
#         input_core_dims=[[], ['iteration']],
#         output_core_dims=[[]],
#         dask='allowed'
#     )

In [26]:
from xbootstrap import block_bootstrap

In [47]:
da = xr.DataArray(
    np.random.normal(size=(300, 300, 10)),
    dims=["x", "y", "t"],
    coords={"x": range(300), "y": range(300), "t": range(10)}
)

In [48]:
%%time
bootstraps = block_bootstrap(
    da.chunk(),
    blocks={'t': 1},
    n_iteration=2,
    exclude_dims=None,
    circular=True
)

CPU times: user 43.8 ms, sys: 194 µs, total: 44 ms
Wall time: 39.4 ms


In [52]:
da2 = xr.DataArray(
    np.random.normal(size=(300, 300, 1000)),
    dims=["x", "y", "t"],
    coords={"x": range(300), "y": range(300), "t": range(1000)}
)

In [69]:
%%time
bootstraps = block_bootstrap(
    da2.chunk({"x": 10}),
    blocks={'t': 1},
    n_iteration=2,
    exclude_dims=None,
    circular=True
)

CPU times: user 2.17 s, sys: 25.8 ms, total: 2.2 s
Wall time: 1.98 s


In [75]:
da2.chunk({"x": 1})

Unnamed: 0,Array,Chunk
Bytes,686.65 MiB,2.29 MiB
Shape,"(300, 300, 1000)","(1, 300, 1000)"
Dask graph,300 chunks in 1 graph layer,300 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 686.65 MiB 2.29 MiB Shape (300, 300, 1000) (1, 300, 1000) Dask graph 300 chunks in 1 graph layer Data type float64 numpy.ndarray",1000  300  300,

Unnamed: 0,Array,Chunk
Bytes,686.65 MiB,2.29 MiB
Shape,"(300, 300, 1000)","(1, 300, 1000)"
Dask graph,300 chunks in 1 graph layer,300 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [33]:
samples2.chunk()

Unnamed: 0,Array,Chunk
Bytes,2.24 GiB,2.24 GiB
Shape,"(150, 200, 10000)","(150, 200, 10000)"
Dask graph,1 chunks in 1 graph layer,1 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 2.24 GiB 2.24 GiB Shape (150, 200, 10000) (150, 200, 10000) Dask graph 1 chunks in 1 graph layer Data type float64 numpy.ndarray",10000  200  150,

Unnamed: 0,Array,Chunk
Bytes,2.24 GiB,2.24 GiB
Shape,"(150, 200, 10000)","(150, 200, 10000)"
Dask graph,1 chunks in 1 graph layer,1 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
