In [3]:
import os
from typing import Any, Dict

import s3fs
import xarray

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [4]:
def xarray_open_dataset(src_path: str, **open_kwargs: Any) -> xarray.Dataset:
    """Open a file on S3 with ``xarray.open_dataset`` through ``s3fs``.

    Args:
        src_path: Path handed to ``s3fs.S3FileSystem().open`` (the notebook
            passes ``s3://bucket/key.nc`` URLs).
        **open_kwargs: Optional keyword arguments forwarded to
            ``xarray.open_dataset``. They take precedence over the defaults
            set below (``decode_coords="all"``, ``decode_times=False``,
            ``chunks=None``, ``engine="h5netcdf"``).

    Returns:
        The dataset returned by ``xarray.open_dataset``.
    """
    xr_open_args: Dict[str, Any] = {
        "decode_coords": "all",
        "decode_times": False,
        "chunks": None,
        "engine": "h5netcdf",
        # Caller-supplied options win over the defaults above.
        **open_kwargs,
    }
    fs = s3fs.S3FileSystem()
    # The handle is intentionally not closed here: callers read the data after
    # this function returns (e.g. via ``.load()``), so it must stay open.
    file_handler = fs.open(src_path)
    return xarray.open_dataset(file_handler, **xr_open_args)

In [5]:
%%time
# Open the ACCESS-CM2 ssp126 file from the scratch bucket; later cells refer to it as
# the "original" dataset. %%time reports how long the open itself takes.
src_path = 's3://nasa-veda-scratch/cmip6-staging/prXin/ACCESS-CM2/prXin-ACCESS-CM2-ssp126.nc'
original_ds = xarray_open_dataset(src_path)



CPU times: user 988 ms, sys: 409 ms, total: 1.4 s
Wall time: 2.68 s


In [6]:
%%time
# Open the compressed copy stored under rechunking/ in the staging data store.
# NOTE(review): presumably the large-chunk variant, judging by the `_lrg` suffix — confirm.
# src_path is reassigned here, so it no longer points at the original file.
src_path = 's3://veda-data-store-staging/rechunking/prXin-ACCESS-CM2-ssp126_compressed.nc'
ds_rechunked_lrg = xarray_open_dataset(src_path)

CPU times: user 1.67 s, sys: 318 ms, total: 1.98 s
Wall time: 15.2 s


In [9]:
%%time
# Read all of original_ds into memory; %%time reports the cost of the full read.
# NOTE(review): per the xarray docs, Dataset.load() loads in place and returns the same
# object, so loaded1 and original_ds should be the same dataset — confirm this is intended.
loaded1 = original_ds.load()

CPU times: user 35.5 s, sys: 10.5 s, total: 46 s
Wall time: 4min 31s


In [10]:
%%time
# Read all of ds_rechunked_lrg into memory, timed for comparison with the
# load of original_ds.
loaded2 = ds_rechunked_lrg.load()

CPU times: user 11.3 s, sys: 1.67 s, total: 12.9 s
Wall time: 16.2 s


In [11]:
# Chunk sizes per dimension: 1 along time, 600 along lat, 1440 along lon.
ds_rechunked = original_ds.chunk(
    chunks=dict(time=1, lat=600, lon=1440),
)

# Show one variable of the rechunked dataset as the cell output.
display_variable = 'prXin_1'
ds_rechunked[display_variable]

Unnamed: 0,Array,Chunk
Bytes,566.89 MiB,6.59 MiB
Shape,"(86, 600, 1440)","(1, 600, 1440)"
Dask graph,86 chunks in 1 graph layer,86 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 566.89 MiB 6.59 MiB Shape (86, 600, 1440) (1, 600, 1440) Dask graph 86 chunks in 1 graph layer Data type float64 numpy.ndarray",1440  600  86,

Unnamed: 0,Array,Chunk
Bytes,566.89 MiB,6.59 MiB
Shape,"(86, 600, 1440)","(1, 600, 1440)"
Dask graph,86 chunks in 1 graph layer,86 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [13]:
%%time
target_file_name = "prXin-ACCESS-CM2-ssp126_small_chunks_compressed.nc"
comp = dict(zlib=True, complevel=5)
encoding = {var: comp for var in original_ds.data_vars}
loaded1.to_netcdf(target_file_name, encoding=encoding)

CPU times: user 12.3 s, sys: 63.5 ms, total: 12.3 s
Wall time: 12.4 s


In [14]:
# File size in bytes divided by 1,000,000, i.e. decimal megabytes.
print(f"Compressed netCDF size {os.path.getsize(target_file_name)/1000000} MB")

Compressed netCDF size 23.582471 MB


In [15]:
# Upload the local file named by target_file_name to the scratch bucket under
# cmip6-staging/. fs.put(...) is the last expression, so its return value is
# what the notebook displays as this cell's output.
fs = s3fs.S3FileSystem()
new_target = f"s3://nasa-veda-scratch/cmip6-staging/{target_file_name}"
fs.put(target_file_name, new_target)

[None]

In [16]:
%%time
# Re-open the uploaded file from S3 with the same helper used for the other
# datasets, so the open time is comparable.
ds_rechunked_sm = xarray_open_dataset(new_target)

CPU times: user 1.52 s, sys: 298 ms, total: 1.82 s
Wall time: 12.4 s


In [17]:
%%time
# Read the uploaded file fully into memory, timed for comparison with the other loads.
# The result is not assigned; as the last expression it is displayed by the notebook.
ds_rechunked_sm.load()

CPU times: user 12 s, sys: 1.8 s, total: 13.8 s
Wall time: 18 s
