# Estimate time to rechunk an existing dataset, varying resolutions

Estimate time to rechunk as a variable of size of dataset and size of desired chunk.

In [6]:
%load_ext autoreload
%autoreload
import numpy as np
import s3fs
import xarray as xr
import sys; sys.path.append('..')
from profiler.main import Timer
import eodc_hub_role
import zarr_helpers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
!pip install rio-tiler==4.1.11 loguru

Collecting rio-tiler==4.1.11
  Using cached rio_tiler-4.1.11-py3-none-any.whl (208 kB)
Collecting color-operations (from rio-tiler==4.1.11)
  Using cached color_operations-0.1.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (131 kB)
Collecting numexpr (from rio-tiler==4.1.11)
  Using cached numexpr-2.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (383 kB)
Installing collected packages: numexpr, color-operations, rio-tiler
Successfully installed color-operations-0.1.1 numexpr-2.8.5 rio-tiler-4.1.11


## Setup 2: Setup data access

Fetch data from the fake data directory.

In [2]:
credentials = eodc_hub_role.fetch_and_set_credentials()
bucket = 'nasa-eodc-data-store'
s3_fs = s3fs.S3FileSystem(
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'], 
    anon=False
)

## Test 1: Data with single chunk, varied chunk size

Hypothesis: Larger chunk sizes mean slower tile times, at all zoom levels.

### Step 1: Generate dataset specs

In [7]:
# Fake data directory
data_path = 'fake_data/single_chunk'
directories = s3_fs.ls(f'{bucket}/{data_path}')

datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)

In [24]:
target_size = 2
# not variable: bytes per mb
onemb = 1024 # bytes per mb
# number of data values per chunk
data_values_per_chunk = (target_size * onemb * onemb)/8 # 8 bytes for each data value
# since there are half as many latitudes as longitudes, calculate the y dimension to be half the x dimension
ydim = round(np.sqrt(data_values_per_chunk/2))
xdim = 2*ydim
target_chunks = (1, ydim, xdim)
max_mem = "1GB"
target_chunks

(1, 362, 724)

In [17]:
store_directory = f'{bucket}/fake_data/single_chunk_rechunked'
# for each datastore, rechunk and restore
for key, datastore in datastore_specs.items():
    # Open existing Zarr dataset
    with Timer() as t:
        ds = xr.open_zarr(datastore['source'])

        # Perform rechunking
        ds_rechunked = ds.copy()
        ds_rechunked.chunk({'time': target_chunks[0], 'lat': target_chunks[1], 'lon': target_chunks[2]})

        # Save rechunked dataset to a new Zarr store
        path = f"{store_directory}/store_lat{ds['shape']['lat']}_lon{ds['shape']['lon']}.zarr"
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        print(f"Writing to {path}")
        ds_rechunked.to_zarr(store, mode='w')
    datastore_specs[key]['time to rechunk'] = round(t.elapsed * 1000, 2) 

Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr


In [19]:
datastore_specs

{'single_chunk/store_lat1024_lon2048.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr',
  'collection_name': 'single_chunk/store_lat1024_lon2048.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1024, 'lon': 2048},
  'lat_resolution': 0.17595307917888564,
  'lon_resolution': 0.17586712261846604,
  'chunk_size_mb': 16.0,
  'chunks': {'time': 1, 'lat': 1024, 'lon': 2048},
  'dataarray_size': 16.0,
  'dtype': dtype('float64'),
  'number_coord_chunks': 3,
  'compression': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
  'number_of_chunks': 1,
  'time to rechunk': 3131.36},
 'single_chunk/store_lat1448_lon2896.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr',
  'collection_name': 'single_chunk/store_lat1448_lon2896.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1448, 'lon': 2896},
  'lat_resolution': 0.1243953006219765,
  'lon_resolution': 0.12435233160621761,
  'ch

In [25]:
from rechunker import rechunk
import zarr

# for each datastore, rechunk and restore
for key, datastore in datastore_specs.items():
    # Open existing Zarr dataset
    with Timer() as t:
        source_store = datastore['source'].replace('s3://', '')
        store = s3fs.S3Map(root=source_store, s3=s3_fs, check=False)
        group = zarr.open_consolidated(store, mode="r")
        source_array = group["data"]
        # Perform rechunking
        store_temp = "temp.zarr"
        target_store = f"{bucket}/fake_data/rechunker_rechunked/store_lat{datastore['shape']['lat']}_lon{datastore['shape']['lon']}.zarr"
        store_target = s3fs.S3Map(root=target_store, s3=s3_fs, check=False)
        r = rechunk(source_array, target_chunks, max_mem, store_target, temp_store=store_temp)
        result = r.execute()
    datastore_specs[key]['time to rechunk'] = round(t.elapsed * 1000, 2)

In [26]:
datastore_specs

{'single_chunk/store_lat1024_lon2048.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr',
  'collection_name': 'single_chunk/store_lat1024_lon2048.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1024, 'lon': 2048},
  'lat_resolution': 0.17595307917888564,
  'lon_resolution': 0.17586712261846604,
  'chunk_size_mb': 16.0,
  'chunks': {'time': 1, 'lat': 1024, 'lon': 2048},
  'dataarray_size': 16.0,
  'dtype': dtype('float64'),
  'number_coord_chunks': 3,
  'compression': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
  'number_of_chunks': 1,
  'time to rechunk': 917.19},
 'single_chunk/store_lat1448_lon2896.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr',
  'collection_name': 'single_chunk/store_lat1448_lon2896.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1448, 'lon': 2896},
  'lat_resolution': 0.1243953006219765,
  'lon_resolution': 0.12435233160621761,
  'chu