In [11]:
import sys
sys.path.insert(0, '/home/jovyan/data-ingestion/')
from utils.create_logger import create_logger

In [12]:
import dask
from pystac_client import Client as pystacClient
import json
import planetary_computer as pc
from utils.create_logger import create_logger
from utils.dask import create_cluster
# Shared module-level objects used by the catalogue-search helpers below.

# Project logger; `file` is presumably the log file path, so ../logs is
# expected to exist already (TODO confirm against utils.create_logger).
logger = create_logger(name='test_logger', file='../logs/test.log')

# Counters intended for an end-of-run summary of the catalogue searches.
metrics = {
    'search_errors': 0,
    'search_completes': 0,
}

# STAC client pointed at the Planetary Computer API root.
catalog = pystacClient.open("https://planetarycomputer.microsoft.com/api/stac/v1")

@dask.delayed
def search_catalogue(area_of_interest,
                     time_of_interest,
                     logger=logger,
                     metrics=metrics,
                     bad_sites=None):
    """Search the ``sentinel-2-l2a`` collection for one area/time window.

    The function is wrapped in ``dask.delayed``: calling it only builds a
    lazy task, and the STAC query runs when that task is computed.

    Parameters
    ----------
    area_of_interest
        Forwarded to ``catalog.search`` as ``intersects`` (presumably a
        GeoJSON-like geometry -- confirm against the caller).
    time_of_interest
        Forwarded to ``catalog.search`` as ``datetime``.
    logger
        Object with an ``exception`` method (assumed to be a standard
        ``logging.Logger`` -- TODO confirm what ``create_logger`` returns).
        Pass ``None`` to skip logging.
    metrics
        Dict holding the ``search_errors`` / ``search_completes`` counters,
        or ``None`` to skip counting.
        NOTE(review): when the task runs on a remote Dask worker the
        increment presumably lands on the worker's copy of the dict, not
        the notebook's -- verify before relying on the totals.
    bad_sites
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        The items yielded by the search, or ``[]`` if the search failed.
    """
    try:
        search = catalog.search(
            collections=["sentinel-2-l2a"],
            intersects=area_of_interest,
            datetime=time_of_interest,
            query={"eo:cloud_cover": {"lt": 100}},
        )
        items = list(search.get_items())
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit and discarded the real cause.
        if metrics is not None:
            metrics['search_errors'] = metrics.get('search_errors', 0) + 1
        if logger is not None:
            # Records the traceback together with the offending AOI/TOI.
            logger.exception('catalogue search failed for AOI=%s, TOI=%s',
                             area_of_interest, time_of_interest)
        print('cannot connect to collect catalog!')
        return []

    if metrics is not None:
        metrics['search_completes'] = metrics.get('search_completes', 0) + 1
    return items
    
        
def get_asset_href(item, asset='visual'):
    """Return the unsigned href of one asset of ``item``.

    ``item`` must expose an ``assets`` mapping whose values carry an
    ``href`` attribute; ``asset`` is the key to look up (default
    ``'visual'``). A missing key raises ``KeyError``.
    """
    # Signing (pc.sign) is deliberately deferred; the raw href is returned.
    return item.assets[asset].href

def collect_items(gdf):
    """Build one lazy catalogue search per row of ``gdf``.

    Parameters
    ----------
    gdf
        DataFrame with ``UID``, ``AOI`` and ``DATERANGE`` columns.

    Returns
    -------
    list of dict
        One ``{'uid': ..., 'catalog': ...}`` entry per row, in row order,
        where ``catalog`` is the value returned by ``search_catalogue``
        for that row's AOI and date range.
    """
    # Walk the column values directly. The earlier version passed the index
    # *labels* from iterrows() to the positional ``.iloc``, which is only
    # correct when the index is exactly 0..n-1; a filtered or sliced frame
    # picked the wrong rows or raised IndexError.
    return [
        {'uid': uid,
         'catalog': search_catalogue(area_of_interest, time_of_interest)}
        for uid, area_of_interest, time_of_interest
        in zip(gdf['UID'], gdf['AOI'], gdf['DATERANGE'])
    ]

@dask.delayed
def collect_hrefs(samples, *bands):
    """Write one CSV of asset hrefs per sample to blob storage.

    Parameters
    ----------
    samples
        Iterable of dicts with a ``'uid'`` key and a ``'catalog'`` key
        holding that sample's items. Because this function is wrapped in
        ``dask.delayed``, lazy values nested in ``samples`` are presumably
        resolved before the body runs -- confirm against the Dask docs.
    *bands
        Asset keys to export; each becomes a CSV column.

    Returns
    -------
    int
        Always ``0``. The useful output is the set of files written to
        ``az://sentinel/hrefs/{uid}.csv`` (one row per item, indexed by
        ``scene_name``). Nothing is accumulated in memory across samples.
    """
    band_names = list(bands)

    for sample in samples:
        uid = sample['uid']
        catalog_items = sample['catalog']

        # Nothing found for this sample: report it and move on.
        if len(catalog_items) == 0:
            print(f'{uid} is empty')
            continue

        # One single-row frame per item, labelled with the item id.
        per_item_frames = [
            pd.DataFrame(
                {band: get_asset_href(item, band) for band in band_names},
                index=pd.Index([item.id], name='scene_name'),
            )
            for item in catalog_items
        ]

        if per_item_frames:
            hrefs = pd.concat(per_item_frames)
            hrefs.to_csv(f'az://sentinel/hrefs/{uid}.csv',
                         storage_options=storage_options)

    return 0



In [13]:
import pandas as pd
import os
import ast
# Load credentials from ../credentials into the process environment.
# Expected format: one `KEY = value` pair per line, with a space on each
# side of the `=`.
with open('../credentials') as credentials:
    f = credentials.readlines()  # raw lines of the credentials file

for var in f:
    if not var.strip():
        # Blank lines used to crash the tuple-unpacking below.
        continue
    # maxsplit=1 keeps a value intact even if it contains ' = ' itself.
    key, value = var.split(' = ', 1)
    # Strip the line ending, including '\r' from CRLF-encoded files.
    os.environ[key] = value.rstrip('\r\n')

# Account name/key bundled in the form passed as `storage_options` to the
# az:// reads and writes in this notebook.
storage_options = {
    'account_name': os.environ['ACCOUNT_NAME'],
    'account_key': os.environ['BLOB_STORAGE_KEY'],
}
def get_data():
    """Load the sample table from blob storage and decode its AOI column.

    Reads ``az://sentinel/fia/fia_fuzzed_gdf.csv`` using the module-level
    ``storage_options``. Each ``AOI`` cell arrives as text and is parsed
    with ``ast.literal_eval`` into the Python object it spells out.

    Returns
    -------
    pandas.DataFrame
        The table with ``AOI`` replaced by the parsed objects.
    """
    frame = pd.read_csv(
        'az://sentinel/fia/fia_fuzzed_gdf.csv',
        storage_options=storage_options,
    )
    frame.AOI = list(map(ast.literal_eval, frame.AOI))
    return frame

In [14]:
# Reminder from the original author: make the logs directory first if it
# doesn't exist yet.
# Start a cluster through the project helper (1 worker, 4 threads;
# worker_memory=32 in whatever unit create_cluster expects) and connect to it.
cluster = create_cluster(
    workers=1,
    worker_threads=4,
    worker_memory=32,
)
client = cluster.get_client()

# Last expression of the cell: displays the dashboard URL.
cluster.dashboard_link

'https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.4cf01b76b5de4f41b08bddb75bd5717e/status'

In [15]:
# Load the sample table; get_data() also parses the AOI column.
gdf = get_data()

In [None]:
# Disabled one-off step, kept inside a string literal so it never executes.
# It derives a UUID-formatted UID for every CN by seeding random.Random with
# the CN (so the same CN always maps to the same UID), then merges the
# CN -> UID table back into gdf. The final write-back is itself commented out.
'''
#alias the CN to UID
import random
import uuid
import numpy as np
uid = []
cn = []
for i in range(len(gdf)):
    rnd = random.Random()
    rnd.seed(gdf.iloc[i].CN)
    u_uid = uuid.UUID(int=rnd.getrandbits(128), version=4)
    uid.append(str(u_uid))
    cn.append(gdf.iloc[i].CN)
uid = np.array([uid]).T
cn = np.array([cn]).T

linked_df = pd.DataFrame(cn, columns = ['CN'])
linked_df['UID'] = uid
gdf = pd.merge(gdf, linked_df, on='CN')

#gdf.to_csv('az://sentinel/fia/fia_fuzzed_gdf.csv', index=False, storage_options=storage_options)
'''

In [19]:
# Asset keys exported for every scene.
bands = [
    'AOT',
    'B02', 'B03', 'B04', 'B08',
    'visual',
    'WVP',
]

# Trial run restricted to the first 3500 rows of the table.
gdf_sample = gdf.head(3500)
sample_items_list = collect_items(gdf_sample)

# Build the single delayed export task and execute it; the cell displays
# the task's return value (0) when it finishes.
collect_hrefs(sample_items_list, *bands).compute()
#hrefs_out = dask.compute(*sample_hrefs)

# record summary of search errors and completes
#logger.info(f"Metrics: {json.dumps(metrics)}")

0

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


In [17]:
# Restart the cluster's workers through the client to clear their state.
client.restart()

0,1
Connection method: Cluster object,Cluster type: GatewayCluster
Dashboard: https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.4cf01b76b5de4f41b08bddb75bd5717e/status,


In [18]:
import fsspec
# Open the Azure blob filesystem with the same account credentials used for
# the CSV reads and writes.
fs = fsspec.filesystem(
    'az',
    account_name=storage_options['account_name'],
    account_key=storage_options['account_key'],
)

# Number of entries currently under sentinel/hrefs (one CSV per exported sample).
len(fs.ls('sentinel/hrefs'))

3017

In [None]:
# Shut down the connected scheduler and workers once the work is finished.
client.shutdown()