In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import gtsa

from pathlib import Path
import shutil
import psutil
import pandas as pd
import hvplot.xarray

# Raster stacking

Stacks single-band rasters and rechunks the stack along the time dimension (on disk) for memory-efficient data retrieval.

#### Prerequisites
- Download DEM data with `00_download_dem_data.py` or `00_download_dem_data.ipynb`

## Start dask cluster
- For parallel read/write

In [None]:
# Use all logical cores but one, and never request fewer than one worker:
# psutil.cpu_count() returns None when the count cannot be determined, and on a
# single-core machine the subtraction would otherwise yield 0 workers.
logical_cores = psutil.cpu_count(logical=True) or 1
workers = max(1, logical_cores - 1)

# NOTE(review): the keyword is spelled `ip_addres` here; it is kept unchanged
# because the signature of gtsa.io.dask_start_cluster is not visible from this
# notebook -- confirm the spelling against the installed gtsa version.
client = gtsa.io.dask_start_cluster(workers,
                                    ip_addres='http://sunhado.ce.washington.edu', # replace with your machine's address when working remotely
                                    port=':8787', # if this port is occupied, a different one is assigned automatically
                                   )

## Get DEM file paths and time stamps

In [None]:
# Directory containing the input DEM GeoTIFFs (relative path).
# To switch datasets, comment out the active assignment and enable the other.
data_dir = "../../data/dems/south-cascade/"  # small test dataset
# data_dir = "../../data/dems/mount-baker"  # large dataset

In [None]:
# Collect every '*.tif' file directly inside data_dir as a POSIX-style path string.
dems = [x.as_posix() for x in sorted(Path(data_dir).glob('*.tif'))]
if not dems:
    # Without this guard the zip(*sorted(...)) unpacking below fails with an
    # opaque "not enough values to unpack" ValueError when no files are found.
    raise FileNotFoundError(
        f"No '*.tif' files found in {data_dir!r}. "
        "Download the DEM data first (see 00_download_dem_data)."
    )

# The pattern is '_' + eight arbitrary characters + '_'; [1:-1] drops the first
# and last character of each returned string (presumably the two underscores,
# leaving a YYYYMMDD date -- confirm against gtsa.io.parse_timestamps).
date_strings = [x[1:-1] for x in gtsa.io.parse_timestamps(dems,date_string_pattern='_........_')]

# Sort both sequences together by date string so they stay aligned.
# After this line date_strings and dems are tuples, not lists.
date_strings, dems = list(zip(*sorted(zip(date_strings, dems)))) # ensure chronological sorting

# Parse each date string into a pandas Timestamp.
date_times = [pd.to_datetime(x, format="%Y%m%d") for x in date_strings]

In [None]:
# Pick the DEM with the latest date stamp as the reference: dems is sorted
# chronologically, so the last entry is the most recent one.
ref_dem = dems[-1] # last entry after chronological sorting
# Display the chosen reference path.
ref_dem

## Reproject to reference DEM grid
- Creates a reprojected NetCDF file for each DEM
- Loads all NetCDF files lazily

In [None]:
# Stack the DEMs against the reference DEM using bilinear resampling.
# NetCDF output is requested (save_to_nc=True) under <data_dir>/nc_files.
# NOTE(review): overwrite=False presumably reuses NetCDF files from an earlier
# run -- confirm against the gtsa.io.xr_stack_geotifs documentation.
ds = gtsa.io.xr_stack_geotifs(
    dems,
    date_times,
    ref_dem,
    resampling="bilinear",
    save_to_nc=True,
    nc_out_dir=Path(data_dir, "nc_files").as_posix(),
    overwrite=False,
)

## Examine current chunk shape
- Each time-stamped DEM is a single chunk

In [None]:
# Display the 'band1' variable of the stacked dataset to inspect its chunk layout.
ds['band1']

In [None]:
# Select the entry at the first time coordinate to inspect a single layer of the stack.
ds['band1'].sel(time = ds.time.values[0])

## Rechunk along time dimension
- Creates temporary zarr file for efficient rechunking
- Saves a zarr file chunked along full time dimension to disk
- Significantly improves dask worker occupation and processing time for computations along the time dimension

In [None]:
# Build the Zarr stack for variable 'band1', using the file name 'stack.zarr'
# and the output directory <data_dir>/stack.
# NOTE(review): cleanup=True presumably removes the temporary rechunking store
# and overwrite=False presumably reuses an existing stack -- confirm against
# the gtsa.io.create_zarr_stack documentation.
ds_zarr = gtsa.io.create_zarr_stack(
    ds,
    output_directory=Path(data_dir, "stack").as_posix(),
    variable_name="band1",
    zarr_stack_file_name="stack.zarr",
    overwrite=False,
    cleanup=True,
)

In [None]:
# Display 'band1' from the Zarr-backed stack to inspect its chunk layout after rechunking.
ds_zarr['band1']

In [None]:
# Select the entry at the first time coordinate of the Zarr-backed stack.
ds_zarr['band1'].sel(time = ds_zarr.time.values[0])