In [1]:
import sys
# Put the project directory first on the import path so the `utils` package
# imported below can be resolved.
# NOTE(review): this is a hard-coded absolute path; it presumably only exists
# on the original author's environment — confirm before running elsewhere.
sys.path.insert(0, '/home/jovyan/data-ingestion/')
from utils.create_logger import create_logger

In [2]:
import dask
from pystac_client import Client as pystacClient
import json
import planetary_computer as pc
from utils.create_logger import create_logger
from utils.dask import create_cluster
# Module-level logger; it is the default value of the `logger` parameter of
# search_catalogue. NOTE(review): presumably writes to ../logs/test.log —
# confirm against utils.create_logger.
logger = create_logger(file='../logs/test.log', name='test_logger')
# Counters meant for a summary upon completion; this dict is the default value
# of the `metrics` parameter of search_catalogue.
metrics = dict(
    search_errors=0,
    search_completes=0,
)

# Open the Planetary Computer STAC API once at module level; search_catalogue
# reads this `catalog` global when it runs a search.
catalog = pystacClient.open("https://planetarycomputer.microsoft.com/api/stac/v1")

@dask.delayed
def search_catalogue(area_of_interest,
                     time_of_interest,
                     logger=logger,
                     metrics=metrics,
                     bad_sites=None):
    """Search the ``sentinel-2-l2a`` collection for an AOI and TOI.

    Wrapped in ``dask.delayed``, so calling it builds a lazy task instead of
    running the search immediately. The search goes through the module-level
    ``catalog``.

    Args:
        area_of_interest: forwarded as ``intersects`` to ``catalog.search``.
        time_of_interest: forwarded as ``datetime`` to ``catalog.search``.
        logger: kept in the signature; not used by the body.
        metrics: kept in the signature; not used by the body.
        bad_sites: kept in the signature; not used by the body.

    Returns:
        list of everything yielded by ``get_items()`` on the search result.

    Note:
        Nothing is caught or logged here: an exception raised by the search
        propagates to whoever computes the task.
    """
    search_params = dict(
        collections=["sentinel-2-l2a"],
        intersects=area_of_interest,
        datetime=time_of_interest,
        # keep only items whose eo:cloud_cover is below 100
        query={"eo:cloud_cover": {"lt": 100}},
    )
    matches = catalog.search(**search_params)
    return list(matches.get_items())
        
def get_asset_href(item, asset='visual'):
    """Return the signed href of one asset of ``item``.

    Args:
        item: object exposing an ``assets`` mapping whose values have an
            ``href`` attribute.
        asset: key to look up in ``item.assets``; defaults to ``'visual'``.

    Returns:
        Whatever ``pc.sign`` returns for the asset's href.

    Raises:
        KeyError: if ``asset`` is not a key of a dict-like ``item.assets``.
    """
    return pc.sign(item.assets[asset].href)

def collect_items(gdf):
    """Build one catalogue search per row of ``gdf``.

    Args:
        gdf: DataFrame with an ``AOI`` column and a ``DATERANGE`` column.
            Any index is accepted; rows are processed in their stored order.

    Returns:
        list with one ``search_catalogue(AOI, DATERANGE)`` result per row.
        Because ``search_catalogue`` is ``dask.delayed``, these are lazy
        tasks, not search results. Empty when ``gdf`` has no rows.
    """
    if len(gdf) == 0:
        return []
    # Walk the two columns positionally. The previous version fed the index
    # *labels* from iterrows() into .iloc, which is only correct for a default
    # 0..n-1 index; a filtered or re-indexed frame would select the wrong rows
    # or raise IndexError.
    return [
        search_catalogue(area_of_interest, time_of_interest)
        for area_of_interest, time_of_interest in zip(gdf['AOI'], gdf['DATERANGE'])
    ]

@dask.delayed
def collect_hrefs(items, *bands):
    """Collect signed asset hrefs for every item list and every band.

    Wrapped in ``dask.delayed``, so calling it builds a lazy task.

    Args:
        items: iterable of item lists (one list per search).
        *bands: asset keys passed to ``get_asset_href`` for each item.

    Returns:
        Flat list with one entry per (item list, band) pair, item lists in
        the outer order and bands in the inner order. Each entry is a list of
        ``(band, href)`` tuples, one per item in that item list.
    """
    return [
        [(band, get_asset_href(item, band)) for item in itemlist]
        for itemlist in items
        for band in bands
    ]



In [3]:
import pandas as pd
import os
import ast
# Read the credential file; every non-blank line must look like `KEY = value`
# (with a space on each side of the `=`).
with open('../credentials') as credentials:
    f = credentials.readlines()  # raw lines, one per entry

# Export each entry into os.environ so it can be looked up by name below.
for var in f:
    if not var.strip():
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file) instead of failing on tuple unpacking.
        continue
    # Split on the first ' = ' only, so a value that itself contains ' = '
    # is kept whole rather than raising "too many values to unpack".
    key, value = var.split(' = ', 1)
    os.environ[key] = value.rstrip('\n')

# Keyword arguments handed to pandas for access to the az:// storage account.
storage_options = {
    'account_name': os.environ['ACCOUNT_NAME'],
    'account_key': os.environ['BLOB_STORAGE_KEY'],
}
def get_data():
    """Load the FIA CSV from blob storage and parse its ``AOI`` column.

    Reads ``az://sentinel/fia/fia_fuzzed_gdf.csv`` with the module-level
    ``storage_options``, then replaces each ``AOI`` cell (a Python-literal
    string) with the object ``ast.literal_eval`` produces from it.

    Returns:
        The loaded DataFrame with the parsed ``AOI`` column.
    """
    frame = pd.read_csv('az://sentinel/fia/fia_fuzzed_gdf.csv', storage_options=storage_options)
    # literal_eval only accepts Python literals, so nothing in the cell is executed.
    frame.AOI = list(map(ast.literal_eval, frame.AOI))
    return frame

In [4]:
# Reminder: this cell does not create the logs directory; make it first if it
# doesn't exist yet.
# Start a 4-worker cluster, connect a client to it, then load the input table.
cluster = create_cluster(workers=4, worker_threads=1, worker_memory=6)
client = cluster.get_client()
gdf = get_data()

In [5]:
# Display the cluster's dashboard URL.
cluster.dashboard_link

'https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.541553d89c7e4fc9b1ced0755c4d86e7/status'

In [6]:
# Asset keys for which collect_hrefs will request a signed href per item.
bands = ['AOT', 'B02', 'B03', 'B04', 'B08', 'visual', 'WVP']

# Test on a sample: first 8 rows, re-indexed from 0.
gdf_sample = gdf.head(8).reset_index()
# Both calls below only build lazy dask tasks; nothing is searched or signed
# until the result is computed.
sample_items = collect_items(gdf_sample)
sample_hrefs = collect_hrefs(sample_items, *bands)
# Disabled alternative way to trigger the computation:
#hrefs_out = dask.compute(*sample_hrefs)

# Disabled: record summary of search errors and completes.
#logger.info(f"Metrics: {json.dumps(metrics)}")

In [None]:
# Run the delayed graph built above (catalogue searches plus href collection)
# and keep the resulting nested list of (band, href) tuples.
test = sample_hrefs.compute()

In [None]:
# Shut down the Dask client once finished.
# NOTE(review): presumably this also stops the cluster it is connected to — confirm.
client.shutdown()