# <u>NSW DCCEEW training &ndash; Demo 7: loading data from S3 storage</u>

 - <b>Author</b>: Eric.Lehmann@csiro.au &amp; Kesav.Unnithan@csiro.au
 - <b>Release date / version</b>: Aug. 2024, v1.0 &ndash; updated from notebook by Kesav U.
 - <b>Dev. platform</b>: CSIRO ADIAS/ADS (hub.adias.aquawatchaus.space)
 - <b>Server profile</b>: EASI Open Data Cube No ML &ndash; Version 2023.10.2 
 - <b>Server resources</b>: 32 CPU &ndash; 64GB RAM
 - <b>Python kernel</b>: `Python 3 (ipykernel)`
 - <b>Dask</b>: no cluster


# Overview

Demonstrates how to access pre-computed data saved to an S3 project bucket on EASI / ADIAS. 

This dataset represents the output from a deep-learning model used to infer water quality parameters using Landsat data.

In [None]:
import sys
import xarray as xr
from matplotlib.colors import LogNorm
import matplotlib.pyplot as plt

import numpy as np

# from: https://github.com/csiro-easi/easi-notebooks.git
sys.path.append('/home/jovyan/git_hub_notebooks/scripts/')
import notebook_utils   # for xarray_object_size(), localcluster_dashboard()
# from app_utils import display_map

In [None]:
### User parameters
# S3 prefix under which the pre-computed stores are saved; it is globbed with "/*" when loading.
# Plain string literal: there are no placeholders to interpolate, so no f-prefix is needed.
s3_access_path = 's3://adias-prod-dc-data-projects/nsw-outflows/DL/Clarence'   # path to S3 bucket

In [None]:
%%time

### Load entire time series
bb_wq_ds = xr.open_mfdataset(f"{s3_access_path}/*", engine='zarr', chunks={})
bb_wq_ds

In [None]:
# Report the size of the full dataset using the EASI notebook helper
notebook_utils.xarray_object_size(bb_wq_ds)

In [None]:
# Show the labels stored along 'op' (used further down as the WQ parameter names)
bb_wq_ds['op'].values

In [None]:
%%time

### Extract desired WQ parameters, e.g. here TSS and DOC
# Also subsample the data spatially to reduce computation / memory requirements
bb_wq_ds_wq = bb_wq_ds.output_data[::6, ::6, 0:2, :].compute()
display( notebook_utils.xarray_object_size( bb_wq_ds_wq ) )
bb_wq_ds_wq

In [None]:
### Convert to Xarray Dataset with WQ parameters as data variables
### Convert to Xarray Dataset with WQ parameters as data variables
# Each label along 'op' becomes its own data variable. The now-scalar 'op' coordinate is
# removed with drop_vars(), the supported replacement for the deprecated DataArray.drop().
bb_wq_ds1 = xr.Dataset({
    var: bb_wq_ds_wq.sel(op=var).drop_vars('op')
    for var in bb_wq_ds_wq.op.values
})
bb_wq_ds1

In [None]:
### Remove empty time slices
### Discard time steps that contain no valid data at all
bb_wq_ds1 = bb_wq_ds1.dropna(dim='time', how='all')
bb_wq_ds1

In [None]:
### Outlier removal: keep only values inside the per-pixel 10th-90th percentile band
# The percentiles are taken along 'time', so each pixel gets its own lower / upper bound.
percentile_10_1 = bb_wq_ds1.quantile(0.1, dim='time')
percentile_90_1 = bb_wq_ds1.quantile(0.9, dim='time')

# Values outside the band are masked out (set to NaN) by where()
not_too_low = bb_wq_ds1 >= percentile_10_1
not_too_high = bb_wq_ds1 <= percentile_90_1
bb_wq_ds1 = bb_wq_ds1.where(not_too_low & not_too_high)

# Temporal median of the filtered series
median_ds = bb_wq_ds1.median(dim='time')

In [None]:
# Map of the temporal median TSS, drawn on a logarithmic colour scale
median_ds['tss'].plot.imshow(figsize=(9, 6), norm=LogNorm(), cmap='turbo')
ax = plt.gca()
ax.set_title('Median TSS (subsampled)')
ax.set_aspect('equal', 'box')

In [None]:
### Plot a handful of time slices spread across the series
n_times = bb_wq_ds1.sizes['time']
plt_ind = np.linspace(2, n_times, 6, dtype='int') - 1   # some selected time slices to display
# For short series the raw indices can repeat or run past the end: clip them to the valid
# range and drop duplicates (facet plotting needs distinct 'time' values per panel).
plt_ind = np.unique(np.clip(plt_ind, 0, n_times - 1))

# Select by dimension name so the plot does not rely on 'time' being the third axis
tss_slices = bb_wq_ds1.tss.isel(time=plt_ind)
pp = tss_slices.plot(col='time', col_wrap=3, figsize=(14,8), norm=LogNorm(), cmap='turbo', cbar_kwargs={'label': 'TSS'})

# Use an equal aspect ratio in every panel
for ax in pp.axs.flat:
    ax.set_aspect('equal', 'box')

In [None]:
### End notebook