In [1]:
# standard library
import os

# pip/conda installed
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rtree
import xarray as xr
from dask.distributed import Client

In [2]:
from utils.hls import HLSBand
from utils.hls import HLSCatalog
from utils.hls import HLSTileLookup
from utils.hls import fia_csv_to_data_catalog_input
from utils.hls import scene_to_urls

## Set up necessary utility functions/classes

In [3]:
# Build the tile lookup once; it is handed to HLSCatalog.from_point_pandas
# as `tile_lookup` when the point catalog is created.
lookup = HLSTileLookup()

Reading tile extents...
Read tile extents for 56686 tiles


In [4]:
# Bands requested for every scene. QA is kept in the list because the quality
# mask is derived from it later; the commented-out entries are not loaded.
bands = [
    HLSBand.COASTAL_AEROSOL,
    HLSBand.BLUE,
    HLSBand.GREEN,
    HLSBand.RED,
    HLSBand.NIR_NARROW,
#     HLSBand.SWIR1,
#     HLSBand.SWIR2,
#     HLSBand.CIRRUS,
    HLSBand.QA,
]

In [9]:
# based on https://github.com/scottyhq/cog-benchmarking/blob/master/notebooks/landsat8-cog-ndvi-mod.ipynb
def create_multiband_dataset(row, bands, chunks):
    '''Load every requested band of one scene into a single xarray dataset.

    Args:
        row: mapping-like record providing 'scene' and 'sensor' entries
            (for example a row yielded by ``DataFrame.iterrows``).
        bands: band identifiers; each one becomes the name of a data variable
            in the result and is paired, in order, with one URL returned by
            ``scene_to_urls``.
        chunks: chunk specification forwarded to ``xr.open_rasterio``.

    Returns:
        The per-band datasets combined with ``xr.merge``.
    '''
    urls = scene_to_urls(row['scene'], row['sensor'], bands)
    band_datasets = []
    for band, url in zip(bands, urls):
        # Named `band_array` rather than `da` so the module-level
        # `dask.array as da` import is not shadowed inside this function.
        band_array = xr.open_rasterio(url, chunks=chunks)
        # Presumably each file holds a single band, so squeeze() removes the
        # length-1 'band' dimension and leaves a scalar 'band' coordinate.
        # `drop_vars` is the non-deprecated spelling of `drop(labels=...)` and
        # matches the `drop_vars` calls used elsewhere in this notebook.
        band_array = band_array.squeeze().drop_vars('band')
        band_datasets.append(band_array.to_dataset(name=band))
    return xr.merge(band_datasets)

def create_timeseries_multiband_dataset(df, bands, chunks):
    '''For a single HLS tile create a multi-date, multi-band xarray dataset.

    Args:
        df: DataFrame with one row per acquisition. Each row is passed to
            ``create_multiband_dataset`` and must also provide a 'dt' value
            used as that acquisition's position on the 'time' dimension.
        bands: band identifiers forwarded to ``create_multiband_dataset``.
        chunks: chunk specification forwarded to ``create_multiband_dataset``.

    Returns:
        The per-acquisition datasets concatenated along a new 'time'
        dimension.

    Raises:
        ValueError: if no acquisition could be loaded.
    '''
    datasets = []
    loaded_positions = []
    for position, (_, row) in enumerate(df.iterrows()):
        try:
            ds = create_multiband_dataset(row, bands, chunks)
        except Exception as e:
            # Best effort: report the failure and continue with the next row.
            print('ERROR loading, skipping acquistion!')
            print(e)
            continue
        datasets.append(ds)
        loaded_positions.append(position)
    if not datasets:
        raise ValueError('no acquisitions could be loaded; nothing to concatenate')
    # Build the time index only from rows that actually loaded. Using every
    # row of df['dt'] would make the index longer than `datasets` whenever an
    # acquisition was skipped above.
    times = df['dt'].iloc[loaded_positions].tolist()
    DS = xr.concat(datasets, dim=pd.DatetimeIndex(times, name='time'))
    print('Dataset size (Gb): ', DS.nbytes/1e9)
    return DS

In [5]:
# Build the scene catalog from the input derived from ./fia_10.csv, restricted
# to the bands selected above and resolved through the tile lookup.
point_catalog = HLSCatalog.from_point_pandas(
    df=fia_csv_to_data_catalog_input('./fia_10.csv'),
    bands=bands,
    tile_lookup=lookup,
)

In [6]:
# Display the catalog's `xr_ds` attribute (grouped by 'INDEX' further down).
point_catalog.xr_ds

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.77 µs


In [10]:
def get_mask(qa_band):
    """Build a good-quality mask from an HLS QA band data array.

    The result is True where quality is good and False elsewhere, so it can be
    used directly as:
        ds.where(mask)

    Example:
        qa_mask = get_mask(dataset[HLSBand.QA])
        ds = dataset.drop_vars(HLSBand.QA)
        masked = ds.where(qa_mask)
    """
    # Single-bit flags; a pixel is bad as soon as any one of them is set.
    cirrus = 0b1
    cloud = 0b10
    adjacent_cloud = 0b100
    cloud_shadow = 0b1000
    any_flag = cirrus | cloud | adjacent_cloud | cloud_shadow
    # Two-bit field; a pixel is bad only when both bits are set.
    high_aerosol = 0b11000000

    def is_bad_quality(qa):
        has_flag = (qa & any_flag) > 0
        aerosol_is_high = (qa & high_aerosol) == high_aerosol
        return has_flag | aerosol_is_high

    bad = is_bad_quality(qa_band)
    # Invert: False where quality is bad, True everywhere else.
    return xr.where(bad, False, True)

In [11]:
# Connect to an already-running Dask scheduler. The address used to be fixed
# to one local port, which only works while that particular scheduler is up;
# it can now be overridden through the DASK_SCHEDULER_ADDRESS environment
# variable, and falls back to the original address when the variable is unset.
client = Client(os.environ.get("DASK_SCHEDULER_ADDRESS", "tcp://127.0.0.1:38859"))
client

0,1
Client  Scheduler: tcp://127.0.0.1:38859  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 16  Memory: 67.35 GB


In [12]:
# One job per 'INDEX' group: (index value, DataFrame built from that group).
# Each DataFrame is presumably the list of scenes for that index.
grps = list(point_catalog.xr_ds.groupby('INDEX'))
jobs = [(idx, ds.to_dataframe()) for idx, ds in grps]

In [13]:
%%time
# HLS data on Azure isn't tiled so we want to read the entire data once (each tile is 3660x3660)...
x_chunk = 3660
y_chunk = 3660
chunks = {'band': 1, 'x': x_chunk, 'y': y_chunk}
training_ds = create_timeseries_multiband_dataset(jobs[0][1], point_catalog.xr_ds.attrs['bands'], chunks)

Dataset size (Gb):  24.902480312
CPU times: user 45.1 s, sys: 2.55 s, total: 47.6 s
Wall time: 2min 41s


In [20]:
# Derive the quality mask from the QA band when it was loaded, then drop QA
# from training_ds so it is not carried through the later steps.
# NOTE(review): this cell is not idempotent. After the first run QA is no
# longer in training_ds, so running the cell again resets qa_mask to None.
qa_mask = None
if HLSBand.QA in training_ds.data_vars:
    qa_mask = get_mask(training_ds[HLSBand.QA])
    training_ds = training_ds.drop_vars(HLSBand.QA)

In [21]:
# Apply the quality mask. qa_mask is None when no QA band was available; in
# that case keep the data unmasked instead of calling where(None).
masked = training_ds.where(qa_mask) if qa_mask is not None else training_ds

In [22]:
# Rename vars from Enum to string for saving. Only variables still keyed by an
# HLSBand member are renamed, so re-running this cell after the rename is a
# no-op instead of failing on string names that have no `.name` attribute.
masked = masked.rename(
    {var: var.name for var in masked.data_vars if isinstance(var, HLSBand)}
)

In [23]:
# Group acquisitions by the month of their 'time' coordinate and take the
# median within each group.
median = masked.groupby('time.month').median()

In [24]:
# Display the monthly-median dataset (shape and chunking per variable).
median

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.29 GB 6.70 MB Shape (12, 3660, 3660) (1, 915, 915) Count 5598 Tasks 432 Chunks Type float64 numpy.ndarray",3660  3660  12,

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.29 GB 6.70 MB Shape (12, 3660, 3660) (1, 915, 915) Count 5598 Tasks 432 Chunks Type float64 numpy.ndarray",3660  3660  12,

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.29 GB 6.70 MB Shape (12, 3660, 3660) (1, 915, 915) Count 5598 Tasks 432 Chunks Type float64 numpy.ndarray",3660  3660  12,

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.29 GB 6.70 MB Shape (12, 3660, 3660) (1, 915, 915) Count 5598 Tasks 432 Chunks Type float64 numpy.ndarray",3660  3660  12,

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.29 GB 6.70 MB Shape (12, 3660, 3660) (1, 915, 915) Count 5598 Tasks 432 Chunks Type float64 numpy.ndarray",3660  3660  12,

Unnamed: 0,Array,Chunk
Bytes,1.29 GB,6.70 MB
Shape,"(12, 3660, 3660)","(1, 915, 915)"
Count,5598 Tasks,432 Chunks
Type,float64,numpy.ndarray


In [25]:
# groupby + median changes the chunk size; restore one full-extent chunk per
# month. Reuses x_chunk / y_chunk from the loading cell so the tile size is
# defined in a single place instead of being repeated as a literal.
median = median.chunk({'month': 1, 'y': y_chunk, 'x': x_chunk})

In [None]:
%%time

median.to_zarr("monthly_median_5.zarr")