In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so on-disk edits to a package are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import gtsa

from pathlib import Path
import shutil
import psutil
import pandas as pd

In [3]:
# Use one worker per logical core, minus one (presumably to keep a core free
# for the notebook kernel itself).
# psutil.cpu_count() returns None when the core count cannot be determined,
# and a single-core machine would otherwise request zero workers, so fall back
# to a count of one and always start at least one worker.
workers = max(1, (psutil.cpu_count(logical=True) or 1) - 1)

# The printed dashboard URL below confirms the port the cluster is served on.
client = gtsa.io.dask_start_cluster(workers,
                                    port=':8787')


Dask dashboard at: http://127.0.0.1:8787/status
Workers: 9
Threads per worker: 1 



### Download test data

In [4]:
# Left disabled so that re-running the notebook does not fetch the data again.
# Uncomment and run this cell once to download the test DEMs into ../test_data.
# NOTE(review): overwrite = True presumably replaces an existing download — confirm.
# gtsa.dataquery.download_hi_res_test_data(site = 'south-cascade',
#                                          output_directory = '../test_data',
#                                          include_refdem = True,
#                                          overwrite = True,
#                                         )  

In [5]:
# Shell escape: list the GeoTIFF DEMs present in the test data directory.
! ls ../test_data/south-cascade_1m_dems/*.tif

../test_data/south-cascade_1m_dems/WV_south-cascade_20151014_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19670921_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19700929_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19740810_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19771003_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19790820_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19791006_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19840814_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19860905_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19870821_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19900905_1m_dem.tif
../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19910909_1m_dem.tif
../test_data/south-cascade_1m_dems/h

### Parse file paths and time stamps

In [6]:
# Directory containing the input DEM GeoTIFFs; the NetCDF and zarr outputs
# created later in this notebook are written to subdirectories of it.
data_dir = '../test_data/south-cascade_1m_dems/'

In [7]:
# Gather every GeoTIFF in the data directory as a POSIX-style path string.
dems = [tif.as_posix() for tif in sorted(Path(data_dir).glob('*.tif'))]

# The pattern is an underscore, eight arbitrary characters, then an underscore.
# Each returned match is trimmed by one character at both ends, which
# presumably drops the two delimiting underscores and leaves YYYYMMDD.
date_strings = [
    match[1:-1]
    for match in gtsa.io.parse_timestamps(dems, date_string_pattern='_........_')
]

# Sort the (date string, path) pairs together so both sequences stay aligned.
# YYYYMMDD strings sort lexicographically in chronological order.
# Unpacking the transposed pairs leaves date_strings and dems as tuples.
date_strings, dems = zip(*sorted(zip(date_strings, dems)))

# Convert each date string individually so the result is a plain list of Timestamps.
date_times = [pd.to_datetime(stamp, format="%Y%m%d") for stamp in date_strings]

In [8]:
# Display the sorted DEM paths next to their parsed timestamps to check the pairing.
dems, date_times

(('../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19670921_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19700929_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19740810_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19771003_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19790820_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19791006_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19840814_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19860905_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19870821_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19900905_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19910909_1m_dem.tif',
  '../test_data/south-cascade_1m_dems/hsfm_NAGAP_south-cascade_19

In [9]:
# dems was sorted by date string, so the last entry is the most recent DEM.
# It is used as the reference DEM when the stack is built.
ref_dem = dems[-1]
ref_dem

'../test_data/south-cascade_1m_dems/WV_south-cascade_20151014_1m_dem.tif'

### Reproject to reference DEM grid
- Creates a reprojected NetCDF file for each DEM
- Loads all NetCDF files lazily

In [10]:
# Build the time-indexed stack from the DEMs, passing ref_dem as the reference.
# With save_to_nc enabled, the .nc files are written to <data_dir>/nc_files.
ds = gtsa.io.xr_stack_geotifs(
    dems,
    date_times,
    ref_dem,
    resampling="bilinear",
    save_to_nc=True,
    nc_out_dir=(Path(data_dir) / 'nc_files').as_posix(),
    overwrite=False,
)

Saved .nc files in ../test_data/south-cascade_1m_dems/nc_files


### Examine current chunk shape
- Each time-stamped DEM is a single chunk

In [17]:
# Show the stacked 'band1' variable to examine its current chunk layout.
ds['band1']

In [16]:
# Select the first time step to look at the chunking of a single DEM.
ds['band1'].sel(time = ds.time.values[0])

### Rechunk along time dimension
- Creates a temporary zarr file for efficient rechunking
- Saves a zarr file chunked along the full time dimension to disk
- Significantly improves Dask worker occupancy and reduces processing time for computations along the time dimension

In [13]:
# Write the stack to <data_dir>/stack as a zarr store rechunked along time.
# overwrite=False is passed through unchanged.
ds_zarr = gtsa.io.create_zarr_stack(
    ds,
    output_directory=(Path(data_dir) / 'stack').as_posix(),
    overwrite=False,
)

Creating temporary zarr stack
/
 ├── band1 (19, 21418, 22940) float32
 ├── time (19,) int64
 ├── x (22940,) float64
 └── y (21418,) float64
Name               : /band1
Type               : zarr.core.Array
Data type          : float32
Shape              : (19, 21418, 22940)
Chunk shape        : (1, 21418, 22940)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)
Store type         : zarr.storage.DirectoryStore
No. bytes          : 37340997920 (34.8G)
No. bytes stored   : 1595321965 (1.5G)
Storage ratio      : 23.4
Chunks initialized : 19/19

Rechunking temporary zarr stack and saving as
../test_data/south-cascade_1m_dems/stack/stack.zarr

Rechunked zarr file info
/
 ├── band1 (19, 21418, 22940) float32
 ├── time (19,) int64
 ├── x (22940,) float64
 └── y (21418,) float64
Name               : /band1
Type               : zarr.core.Array
Data type          : float32
Shape              : (19, 21418, 22940)
Chunk 

In [18]:
# Show the rechunked 'band1' variable to compare its chunk layout with the original stack.
ds_zarr['band1']

In [19]:
# Select the first time step of the rechunked stack to see how a single DEM is now chunked.
ds_zarr['band1'].sel(time = ds_zarr.time.values[0])