In [1]:
import os
import time

# pip/conda installed
import dask.array as da
import fsspec
import pandas as pd
import xarray as xr
from dask.distributed import as_completed
from dask.distributed import Client
from dask_gateway import GatewayCluster

from utils import get_logger
from utils.dask import create_cluster
from utils.dask import upload_source
from utils.hls.catalog import HLSBand
from utils.hls.catalog import HLSCatalog
from utils.hls.catalog import scene_to_urls
from utils.hls.compute import process_catalog
from utils.hls.compute import calculate_job_median

In [2]:
# Notebook-wide logger; 'hls-ri' is the name that appears in each log record below.
logger = get_logger('hls-ri')

In [3]:
# Azure storage account key used by the cells below.
# Either paste the key into `azure_key_override`, or export AZURE_ACCOUNT_KEY
# before starting the kernel and leave the override empty.
azure_key_override = ""
# Only write to the environment when a key was pasted in, or when nothing is
# set yet: the empty placeholder must not wipe out a key that was already
# exported. When nothing is set, the variable is still created (as "") so that
# later os.environ[...] lookups keep working as before.
if azure_key_override or 'AZURE_ACCOUNT_KEY' not in os.environ:
    os.environ['AZURE_ACCOUNT_KEY'] = azure_key_override

In [4]:
# Chunk sizes handed to process_catalog: one band at a time, spanning a whole
# tile (each tile is 3660x3660), so the entire tile is read in one go.
chunks = dict(band=1, x=3660, y=3660)

In [5]:
# Load the FIA CSV from the 'usfs' Azure storage account, authenticating with
# the account key taken from the environment.
df = pd.read_csv(
    'az://fia/fia_no_pltcn.csv',
    storage_options={
        'account_name': "usfs",
        'account_key': os.environ['AZURE_ACCOUNT_KEY'],
    },
)

In [6]:
# Keep only rows with STATECD 44 (presumably Rhode Island, given the `ri_`
# naming - confirm) and rename the columns to the lat/lon/year names used
# when the catalog is built from this frame.
ri_df = (
    df.loc[df['STATECD'] == 44]
    .rename(columns={'LAT': 'lat', 'LON': 'lon', 'INVYR': 'year'})
)

In [8]:
# Bands to catalog, by HLSBand number. Per the dataset attributes printed
# further down these are BLUE, GREEN, RED, NIR_NARROW, SWIR1, SWIR2 and QA.
my_bands = [HLSBand(band_number) for band_number in (2, 3, 4, 5, 6, 7, 11)]
# Build the scene catalog from the point rows in ri_df.
ri_catalog = HLSCatalog.from_point_pandas(ri_df, bands=my_bands)

Reading tile extents...
Read tile extents for 56686 tiles


In [9]:
# Start the cluster and connect a client to it.
# NOTE(review): the units of the memory settings are not visible here
# (presumably GB) - confirm against utils.dask.create_cluster.
num_workers = 8
cluster = create_cluster(
    workers=num_workers, worker_threads=1, worker_memory=4,
    scheduler_threads=1, scheduler_memory=8,
)
client = cluster.get_client()
cluster  # last expression: renders the cluster widget as the cell output

VBox(children=(HTML(value='<h2>GatewayCluster</h2>'), HBox(children=(HTML(value='\n<div>\n<style scoped>\n    …

In [10]:
# All workers must be running before the source code is uploaded to them,
# so wait for `num_workers` workers first.
logger.info("Waiting for cluster workers to start")
client.wait_for_workers(num_workers)
logger.info("Uploading code to workers")
# Hand the local ./utils path and the client to the upload helper from utils.dask.
upload_source('./utils', client)

2021-01-14 01:37:45,881 [INFO] hls-ri - Waiting for cluster workers to start
2021-01-14 01:37:45,885 [INFO] hls-ri - Uploading code to workers


In [11]:
# Settings passed to process_catalog in the processing loop below.
account_name = "usfs"
storage_container = "fia/hls-testing/ri_best"  # base prefix; a per-year suffix is appended in the loop
account_key = os.environ["AZURE_ACCOUNT_KEY"]

# Grouping keys: `catalog_groupby` is passed as process_catalog's catalog_groupby,
# `job_groupby` as its job_groupby.
catalog_groupby = "tile"
job_groupby = "time.month"

In [12]:
# Keep only catalog entries whose year is 1998 or later (dropping the rest),
# then split what remains into one group per year.
yr_catalogs = (
    ri_catalog.xr_ds
    .where(ri_catalog.xr_ds['year'] >= 1998, drop=True)
    .groupby('year')
)

In [13]:
# Process each year's sub-catalog in turn, writing results under a per-year
# storage prefix.
for yr, ca in yr_catalogs:
    # `year` is a float64 variable in the catalog dataset, so the group key
    # arrives as e.g. 2013.0. Cast it to int so the log line and the storage
    # prefix read "2013" instead of "2013.0".
    # NOTE(review): results written by earlier runs live under ".../<year>.0".
    year = int(yr)
    logger.info(f"Starting process for {year}")
    # Print a summary of this year's dataset (dimensions, variables, attributes).
    ca.info()
    storage_prefix = f"{storage_container}/{year}"
    process_catalog(
        catalog=ca,
        catalog_groupby=catalog_groupby,
        job_fn=calculate_job_median,
        job_groupby=job_groupby,
        chunks=chunks,
        account_name=account_name,
        storage_container=storage_prefix,
        account_key=account_key,
        client=client,
        concurrency=4,
        logger=logger,
    )

2021-01-14 01:37:53,771 [INFO] hls-ri - Starting process for 2013.0
xarray.Dataset {
dimensions:
	index = 1456 ;

variables:
	float64 INDEX(index) ;
	float64 year(index) ;
	float64 STATECD(index) ;
	float64 lat(index) ;
	float64 lon(index) ;
	object tile(index) ;
	object scene(index) ;
	object sensor(index) ;
	datetime64[ns] dt(index) ;
	int64 index(index) ;

// global attributes:
	:bands = [<HLSBand.BLUE: 2>, <HLSBand.GREEN: 3>, <HLSBand.RED: 4>, <HLSBand.NIR_NARROW: 5>, <HLSBand.SWIR1: 6>, <HLSBand.SWIR2: 7>, <HLSBand.QA: 11>] ;
}2021-01-14 01:37:53,776 [INFO] hls-ri - Submitting job 18TYL
2021-01-14 01:37:53,888 [INFO] hls-ri - Submitting job 18TYM
2021-01-14 01:37:54,034 [INFO] hls-ri - Submitting job 19TBG
2021-01-14 01:37:56,159 [INFO] hls-ri - Submitting job 19TCG
2021-01-14 01:53:16,368 [INFO] hls-ri - Completed job 18TYL
2021-01-14 01:59:25,136 [INFO] hls-ri - Completed job 18TYM
2021-01-14 02:00:17,439 [INFO] hls-ri - Completed job 19TCG
2021-01-14 02:09:19,612 [ERROR] hls-ri

KeyboardInterrupt: 

In [None]:
# Shut the cluster down once processing has finished (or was interrupted).
cluster.shutdown()