# Estimate time to rechunk an existing dataset, varying resolutions

Estimate the time to rechunk as a function of the size of the dataset and the size of the desired chunks.

In [1]:
import numpy as np
import s3fs
import xarray as xr
import sys; sys.path.append('..')
from profiler.main import Timer
import eodc_hub_role
import zarr_helpers

In [2]:
%%capture
!pip uninstall apache-beam -y
!pip install 'apache-beam[interactive, dataframe]==2.48.0' git+https://github.com/carbonplan/cmip6-downscaling.git git+https://github.com/pangeo-forge/pangeo-forge-recipes.git@beam-refactor

## Setup 2: Set up data access

Fetch data from the fake data directory.

In [3]:
# Bucket that holds the benchmark datasets.
bucket = 'nasa-eodc-data-store'

# The project helper returns a mapping with the access key id, secret access
# key and session token used below.
credentials = eodc_hub_role.fetch_and_set_credentials()

# Authenticated (non-anonymous) S3 filesystem built from those credentials.
s3_fs = s3fs.S3FileSystem(
    anon=False,
    key=credentials['AccessKeyId'],
    secret=credentials['SecretAccessKey'],
    token=credentials['SessionToken'],
)

## Test 1: Data with single chunk, varied chunk size

Hypothesis: Larger chunk sizes mean slower tile times, at all zoom levels.

### Step 1: Generate dataset specs

In [4]:
# Prefix (inside the bucket) of the fake single-chunk data stores
data_path = 'fake_data/single_chunk'
# List everything found under that prefix
directories = s3_fs.ls('/'.join((bucket, data_path)))

# Let the project helper turn the listing into per-store specs
datastore_specs = zarr_helpers.get_dataset_specs_from_directory(directories)


In [15]:
# Desired size of a single chunk, in MB
target_size = 4
# Unit factor (constant): onemb * onemb is the number of bytes in one MB
onemb = 1024
# Number of data values that fit in one chunk, at 8 bytes per data value
data_values_per_chunk = target_size * onemb ** 2 / 8
# There are half as many latitudes as longitudes, so a chunk holds
# ydim * (2 * ydim) values; solve that for ydim and double it for xdim.
ydim = round(np.sqrt(data_values_per_chunk / 2))
xdim = ydim * 2
target_chunks = dict(time=1, lat=ydim, lon=xdim)
target_chunks

{'time': 1, 'lat': 512, 'lon': 1024}

In [19]:
store_directory = f'{bucket}/fake_data/single_chunk_rechunked'
# For each source datastore: open it, rechunk it to `target_chunks`, write the
# result to S3, and record the elapsed time (in milliseconds, as the original
# `* 1000` scaling implies) under the 'time to rechunk' key of its spec.
for key, datastore in datastore_specs.items():
    with Timer() as t:
        # Open existing Zarr dataset
        ds = xr.open_zarr(datastore['source'])

        # Perform rechunking. `Dataset.chunk` returns a new dataset instead of
        # modifying the one it is called on, so the result must be kept;
        # discarding it would write the data back with its source chunking.
        ds_rechunked = ds.chunk(target_chunks)

        # Variables opened from Zarr carry the source store's chunk layout in
        # their encoding. Drop it so `to_zarr` writes the new dask chunks
        # rather than trying to honour the old on-disk layout.
        for variable in ds_rechunked.variables.values():
            variable.encoding.pop('chunks', None)

        # Save rechunked dataset to a new Zarr store.
        # NOTE(review): the path depends only on the target chunk shape, so
        # every iteration overwrites the same store (mode='w') - confirm that
        # keeping only the last rechunked dataset is intended.
        path = f'{store_directory}/store_lat{ydim}_lon{xdim}.zarr'
        store = s3fs.S3Map(root=path, s3=s3_fs, check=False)
        print(f"Writing to {path}")
        ds_rechunked.to_zarr(store, mode='w')
    # Timer is read after the block exits so the measurement covers the
    # open, rechunk and write steps together.
    datastore_specs[key]['time to rechunk'] = round(t.elapsed * 1000, 2)

{'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr', 'collection_name': 'store_lat1024_lon2048.zarr', 'variable': 'data', 'shape': {'time': 1, 'lat': 1024, 'lon': 2048}, 'lat_resolution': 0.17595307917888564, 'lon_resolution': 0.17586712261846604, 'chunk_size_mb': 16.0, 'chunks': {'time': 1, 'lat': 1024, 'lon': 2048}, 'dtype': dtype('float64'), 'number_coord_chunks': 3, 'compression': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0), 'number_of_chunks': 1}
Writing to nasa-eodc-data-store/fake_data/single_chunk_rechunked/store_lat512_lon1024.zarr
{'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr', 'collection_name': 'store_lat1448_lon2896.zarr', 'variable': 'data', 'shape': {'time': 1, 'lat': 1448, 'lon': 2896}, 'lat_resolution': 0.1243953006219765, 'lon_resolution': 0.12435233160621761, 'chunk_size_mb': 31.9931640625, 'chunks': {'time': 1, 'lat': 1448, 'lon': 2896}, 'dtype': dtype('float64'), 'number_c

In [21]:
datastore_specs

{'store_lat1024_lon2048.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1024_lon2048.zarr',
  'collection_name': 'store_lat1024_lon2048.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1024, 'lon': 2048},
  'lat_resolution': 0.17595307917888564,
  'lon_resolution': 0.17586712261846604,
  'chunk_size_mb': 16.0,
  'chunks': {'time': 1, 'lat': 1024, 'lon': 2048},
  'dtype': dtype('float64'),
  'number_coord_chunks': 3,
  'compression': Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0),
  'number_of_chunks': 1,
  'time to rechunk': 3486.02},
 'store_lat1448_lon2896.zarr': {'source': 's3://nasa-eodc-data-store/fake_data/single_chunk/store_lat1448_lon2896.zarr',
  'collection_name': 'store_lat1448_lon2896.zarr',
  'variable': 'data',
  'shape': {'time': 1, 'lat': 1448, 'lon': 2896},
  'lat_resolution': 0.1243953006219765,
  'lon_resolution': 0.12435233160621761,
  'chunk_size_mb': 31.9931640625,
  'chunks': {'time': 1, 'lat': 1448, 'lon': 2896}