In [27]:
# pip/conda installed
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rtree
import xarray as xr
from dask.distributed import Client

In [3]:
from utils.hls import HLSBand
from utils.hls import HLSCatalog
from utils.hls import HLSTileLookup
from utils.hls import fia_csv_to_data_catalog_input
from utils.hls import scene_to_urls

## Setup necessary utility functions/classes

In [4]:
lookup = HLSTileLookup()

Reading tile extents...
Read tile extents for 56686 tiles


In [5]:
point_catalog = HLSCatalog.from_point_pandas(df=fia_csv_to_data_catalog_input('../fia_10.csv'), bands=[HLSBand.NIR_NARROW, HLSBand.QA], tile_lookup=lookup)

In [6]:
point_catalog.xr_ds

In [7]:
%%time
bbox = [-124.98046874999999, 24.367113562651262, -66.70898437499999, 49.49667452747045]
years = [2019]
bands=[HLSBand.NIR_NARROW, HLSBand.QA]
# bbox_catalog = HLSCatalog.from_bbox(bbox, years, landsat_bands, sentinel_bands, lookup)

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 11.2 µs


In [8]:
def create_multiband_dataset(row, bands, chunks):
    '''A function to load multiple bands into an xarray dataset '''
    
    # Each image is a dataset containing both band4 and band5
    datasets = []
    for band, url in zip(bands, scene_to_urls(row['scene'], row['sensor'], bands)):
        da = xr.open_rasterio(url, chunks=chunks)
        da = da.squeeze().drop(labels='band')
        ds = da.to_dataset(name=band)
        datasets.append(ds)
    return xr.merge(datasets)

def create_timeseries_multiband_dataset(df, bands, chunks):
    '''For a single HLS tile create a multi-date, multi-band xarray dataset'''
    datasets = []
    for i,row in df.iterrows():
        try:
            # print('loading...', row['dt'])
            ds = create_multiband_dataset(row, bands, chunks)
            datasets.append(ds)
        except Exception as e:
            print('ERROR loading, skipping acquistion!')
            print(e)
    DS = xr.concat(datasets, dim=pd.DatetimeIndex(df['dt'].tolist(), name='time'))
    print('Dataset size (Gb): ', DS.nbytes/1e9)
    return DS

In [9]:
client = Client("tcp://127.0.0.1:38853")

In [10]:
x_chunk = 366*2
y_chunk = 366*2
chunks = {'band': 1, 'x': x_chunk, 'y': y_chunk}
grps = list(point_catalog.xr_ds.groupby('INDEX'))
jobs = []
for g in grps:
    idx, ds = g
    df = ds.to_dataframe()
    jobs.append((g[0], df))
training_ds = create_timeseries_multiband_dataset(jobs[0][1], point_catalog.xr_ds.attrs['bands'], chunks)

Dataset size (Gb):  6.791629112


In [29]:
training_ds = training_ds.assign_coords({"time": training_ds['time.month']})

AttributeError: 'IndexVariable' object has no attribute 'month'

In [30]:
training_da = training_ds.data_vars[HLSBand.NIR_NARROW].data

In [34]:
med = da.median(training_da, axis=0)
med.compute()

array([[ 3384.,  3552.,  3671., ..., -1000., -1000., -1000.],
       [ 3650.,  3575.,  3521., ..., -1000., -1000., -1000.],
       [ 3514.,  3476.,  3360., ..., -1000., -1000., -1000.],
       ...,
       [   44.,    36.,    40., ...,   -18.,   -18.,   -17.],
       [   43.,    42.,    45., ...,   -21.,   -18.,   -20.],
       [   42.,    39.,    45., ...,   -22.,   -18.,   -19.]])

In [35]:
med

Unnamed: 0,Array,Chunk
Bytes,107.16 MB,1.07 MB
Shape,"(3660, 3660)","(366, 366)"
Count,17519 Tasks,100 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 107.16 MB 1.07 MB Shape (3660, 3660) (366, 366) Count 17519 Tasks 100 Chunks Type float64 numpy.ndarray",3660  3660,

Unnamed: 0,Array,Chunk
Bytes,107.16 MB,1.07 MB
Shape,"(3660, 3660)","(366, 366)"
Count,17519 Tasks,100 Chunks
Type,float64,numpy.ndarray


In [None]:
x_chunk = 366*2
y_chunk = 366*2
chunks = {'band': 1, 'x': x_chunk, 'y': y_chunk}
entries = point_catalog.xr_ds.to_dataframe().to_dict('records')
arrays = []
for entry in entries:
    for band, url in zip(point_catalog.xr_ds.attrs['bands'], scene_to_urls(entry['scene'], point_catalog.xr_ds.attrs['landsat_bands'])):
        da = xr.open_rasterio(url, chunks={'band': 1, 'x': x_chunk, 'y': y_chunk})
        da = da.squeeze().drop(labels='band')
        ds = da.to_dataset(name=band)
        r = r.expand_dims({'datetime': [entry['dt']], 'index': [entry['INDEX']], 'band': band})
        arrays.append(r)
    break
        
        

In [None]:
arrays[2]

In [17]:
idx, month, urls = jobs[0]
url = urls[0]
x_chunk = 366*2
y_chunk = 366*2
r = xr.open_rasterio(url, chunks={'band': 1, 'x': x_chunk, 'y': y_chunk}) # get underlying dask array because xarray doesn't support median w/ dask



In [None]:
%%time

def job_to_median(job):
    xr
    x_chunk = 366*2
    y_chunk = 366*2
    da_lst = [
        xr.open_rasterio(url, chunks={'band': 1, 'x': x_chunk, 'y': y_chunk}).data # get underlying dask array because xarray doesn't support median w/ dask
        for url in urls
    ]
    year_array = da.concatenate(da_lst, axis=0)
    median_array = da.median(year_array, axis=0)
    median_array.compute()
    return median_array

for idx, month, urls in jobs[:12]:
    med = urls_to_median(urls)
    print(f"Completed median for {idx}, {month}")

Completed median for 2, 1
Completed median for 2, 2
Completed median for 2, 3
Completed median for 2, 4
Completed median for 2, 5
Completed median for 2, 6
Completed median for 2, 7
Completed median for 2, 8
Completed median for 2, 9
Completed median for 2, 10
