In [1]:
import os
from typing import Any, Dict

import s3fs
import xarray

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
def xarray_open_dataset(src_path: str, **open_kwargs: Any) -> xarray.Dataset:
    """Open a netCDF file on S3 as an ``xarray.Dataset``.

    Args:
        src_path: Location passed to ``s3fs.S3FileSystem.open``; the notebook
            uses ``s3://bucket/key`` URLs.
        **open_kwargs: Optional keyword arguments forwarded to
            ``xarray.open_dataset``. They take precedence over the defaults
            set below, so a call without extras behaves exactly as before.

    Returns:
        The dataset returned by ``xarray.open_dataset``.

    Note:
        The S3 file object is not closed here.
        NOTE(review): presumably the returned dataset keeps reading from it
        after this function returns (callers invoke ``.load()`` afterwards)
        -- confirm before wrapping the open call in a ``with`` block.
    """
    # Notebook-wide defaults; any caller-supplied keyword overrides them.
    xr_open_args: Dict[str, Any] = {
        "decode_coords": "all",
        "decode_times": False,
        "engine": "h5netcdf",
        **open_kwargs,
    }
    fs = s3fs.S3FileSystem()
    file_handler = fs.open(src_path)
    return xarray.open_dataset(file_handler, **xr_open_args)

In [3]:
%%time
src_path = 's3://nasa-veda-scratch/cmip6-staging/prXin/ACCESS-CM2/prXin-ACCESS-CM2-ssp126.nc'
original_ds = xarray_open_dataset(src_path)



CPU times: user 976 ms, sys: 184 ms, total: 1.16 s
Wall time: 1.77 s


In [5]:
%%time
# Read the original dataset into memory; the wall time reported by %%time is
# kept for comparison with the other load cells in this notebook.
# NOTE(review): xarray documents Dataset.load() as loading in place and
# returning the same object, so `loaded1` presumably aliases `original_ds` and
# re-running this cell would time an already-loaded dataset -- confirm.
loaded1 = original_ds.load()

CPU times: user 31.9 s, sys: 10.1 s, total: 42 s
Wall time: 4min 20s


In [4]:
%%time
src_path = 's3://veda-data-store-staging/rechunking/prXin-ACCESS-CM2-ssp126_compressed.nc'
ds_rechunked_lrg = xarray_open_dataset(src_path)

CPU times: user 1.61 s, sys: 311 ms, total: 1.92 s
Wall time: 15 s


In [6]:
%%time
# Same load measurement, this time for the dataset opened as `ds_rechunked_lrg`.
loaded2 = ds_rechunked_lrg.load()

CPU times: user 11.8 s, sys: 1.8 s, total: 13.6 s
Wall time: 16.9 s


In [7]:
# Re-chunk to one time step per chunk (600 x 1440 in lat/lon), then show the
# resulting array for a single variable as the cell output.
display_variable = "prXin_1"
ds_rechunked = original_ds.chunk(chunks=dict(time=1, lat=600, lon=1440))
ds_rechunked[display_variable]

Unnamed: 0,Array,Chunk
Bytes,566.89 MiB,6.59 MiB
Shape,"(86, 600, 1440)","(1, 600, 1440)"
Dask graph,86 chunks in 1 graph layer,86 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 566.89 MiB 6.59 MiB Shape (86, 600, 1440) (1, 600, 1440) Dask graph 86 chunks in 1 graph layer Data type float64 numpy.ndarray",1440  600  86,

Unnamed: 0,Array,Chunk
Bytes,566.89 MiB,6.59 MiB
Shape,"(86, 600, 1440)","(1, 600, 1440)"
Dask graph,86 chunks in 1 graph layer,86 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [8]:
%%time
target_file_name = "prXin-ACCESS-CM2-ssp126_small_chunks_compressed.nc"
comp = dict(zlib=True, complevel=5)
encoding = {var: comp for var in original_ds.data_vars}
loaded1.to_netcdf(target_file_name, encoding=encoding)

CPU times: user 13.2 s, sys: 378 ms, total: 13.6 s
Wall time: 13.8 s


In [9]:
# Report the size of the local compressed file in decimal megabytes.
print(f"Compressed netCDF size {os.stat(target_file_name).st_size / 1_000_000} MB")

Compressed netCDF size 23.582471 MB


In [10]:
# Upload the local compressed file to the scratch bucket under the same name.
new_target = f"s3://nasa-veda-scratch/cmip6-staging/{target_file_name}"
fs = s3fs.S3FileSystem()
fs.put(target_file_name, new_target)

[None]

In [11]:
%%time
# Open the uploaded copy at `new_target` from S3 and time the open call.
ds_rechunked_sm = xarray_open_dataset(new_target)

CPU times: user 1.41 s, sys: 288 ms, total: 1.7 s
Wall time: 15.2 s


In [12]:
%%time
# Load the uploaded copy into memory so its wall time can be compared with the
# earlier load cells.
# NOTE(review): unlike the earlier load cells, the result is not bound to a name.
ds_rechunked_sm.load()

CPU times: user 13.5 s, sys: 2.23 s, total: 15.8 s
Wall time: 22.3 s
