In [1]:
import geopandas as gpd
from satio_pc.grid import get_blocks_gdf
from dask import delayed

# from elogs import Elogs, ElogsTask

# Read the storage connection string from a file kept outside the repository.
# Surrounding whitespace (e.g. the trailing newline most editors append) is
# stripped so it cannot end up inside the last key=value pair of the string.
with open('../../../connstr') as f:
    connect_str = f.read().strip()

# with open('../../../tap') as f:
#     aws_access_key_id, aws_secret_access_key = f.read().split('\n')


# Sentinel-2 tile ids to process.
tiles = ['29TPJ', '29TQJ', '30TUP', '30TVP', '29TPH',
         '29TQH', '30TUN', '30TVN', '29TPG']

# Area of interest, reprojected to EPSG:4326.
aoi_fn = '../../../habitat_mapping_aoi.gpkg'
aoi = gpd.read_file(aoi_fn)
aoi = aoi.to_crs(epsg=4326)

# Blocks of the selected tiles, spatially joined against the AOI geometries
# (sjoin is called with its default arguments).
blocks_gdf = gpd.sjoin(get_blocks_gdf(tiles), aoi[['geometry']])


In [2]:
# Per-sensor preprocessing options, keyed by sensor name.
settings = {
    "l2a": {
        "max_cloud_cover": 90,
        "composite": {
            "freq": 10,
            "window": 20,
        },
        "mask": {
            "erode_r": 3,
            "dilate_r": 13,
            "max_invalid_ratio": 1,
        },
    },
    "gamma0": {
        "composite": {
            "freq": 10,
            "window": 10,
        },
    },
}

In [3]:
def preprocess_l2a(ds_dict,
                   clouds_mask,
                   start_date,
                   end_date,
                   composite_freq=10,
                   composite_window=20,
                   composite_mode='median',
                   reflectance=True):
    """Mask, composite, interpolate and merge the 10 m and 20 m L2A series.

    Parameters
    ----------
    ds_dict : mapping
        Indexed with the keys ``10`` and ``20`` to obtain the 10 m and 20 m
        inputs. Presumably xarray DataArrays exposing the satio_pc ``ewc``
        accessor -- TODO confirm.
    clouds_mask
        20 m mask applied to the 20 m series; a copy rescaled with
        ``scale=2, order=0`` is applied to the 10 m series.
    start_date, end_date
        Forwarded to ``ewc.composite`` as ``start`` / ``end``.
    composite_freq, composite_window
        Forwarded to ``ewc.composite`` as ``freq`` / ``window``.
    composite_mode
        Accepted for interface compatibility; this implementation does not
        use it.
    reflectance : bool
        When true, the merged series is cast to float32 and divided by 10000.

    Returns
    -------
    The 10 m series concatenated along ``band`` with the 20 m series rescaled
    by a factor 2, carrying the attrs of the 10 m input, after a final
    ``ewc.persist()``.
    """
    # numpy is imported here because no cell of the notebook imports it; the
    # default ``reflectance=True`` path would otherwise fail with a NameError.
    # ``xr`` and ``logger`` are looked up in the notebook globals at call time
    # (they are imported by a later cell).
    import numpy as np

    ds10_block = ds_dict[10].chunk((-1, -1, 512, 512))
    ds20_block = ds_dict[20].chunk((-1, -1, 256, 256))
    scl20_block = clouds_mask.chunk((-1, -1, 256, 256))

    # download
    logger.info("Loading block data")
    ds10_block = ds10_block.persist()
    ds20_block = ds20_block.persist()
    scl20_block = scl20_block.persist()
    # order=0 is used for the mask, order=1 for the reflectance data below.
    scl10_block = scl20_block.ewc.rescale(scale=2,
                                          order=0).persist()

    # 10m: mask clouds, composite, interpolate
    ds10_block_masked = ds10_block.ewc.mask(scl10_block)

    logger.info("Compositing 10m block data")
    ds10_block_comp = ds10_block_masked.ewc.composite(
        freq=composite_freq,
        window=composite_window,
        start=start_date,
        end=end_date).persist()

    logger.info("Interpolating 10m block data")
    ds10_block_interp = ds10_block_comp.ewc.interpolate().persist()

    # 20m: mask clouds, composite, interpolate
    ds20_block_masked = ds20_block.ewc.mask(scl20_block)

    logger.info("Compositing 20m block data")
    ds20_block_comp = ds20_block_masked.ewc.composite(
        freq=composite_freq,
        window=composite_window,
        start=start_date,
        end=end_date).persist()

    logger.info("Interpolating 20m block data")
    ds20_block_interp = ds20_block_comp.ewc.interpolate().persist()

    logger.info("Merging 10m and 20m series")
    # bring the cleaned 20m data onto the 10m grid and stack the bands
    ds20_block_interp_10m = ds20_block_interp.ewc.rescale(scale=2,
                                                          order=1,
                                                          nodata_value=0)
    dsm10 = xr.concat([ds10_block_interp,
                       ds20_block_interp_10m],
                      dim='band')

    if reflectance:
        dsm10 = dsm10.astype(np.float32) / 10000

    dsm10.attrs = ds10_block.attrs

    # NOTE(review): every other step calls ``.persist()`` on the array itself;
    # this one goes through the ``ewc`` accessor -- confirm the accessor
    # provides ``persist``.
    dsm10 = dsm10.ewc.persist()

    return dsm10

In [4]:
import xarray as xr
import dask.array as da
from pyproj.crs import CRS
from loguru import logger
import tempfile
from satio_pc.sentinel2 import load_l2a
from satio_pc.preprocessing.clouds import preprocess_scl
from satio_pc._habitat import RSI_META_S2_HABITAT
from satio_pc.grid import get_blocks_gdf, tile_to_epsg

# Single block used for interactive development: 29TPJ_073_2022
year = 2022
tile = '29TPJ'
block_id = 73

# Acquisition window: 1 January of `year` up to 1 January of the next year.
start_date = f'{year}-01-01'
end_date = f'{year + 1}-01-01'
max_cloud_cover = settings['l2a']['max_cloud_cover']

# First row of the tile's block table whose block_id matches.
blocks = get_blocks_gdf([tile])
block = blocks.loc[blocks.block_id == block_id].iloc[0]

s2_dict = load_l2a(block.bounds, block.epsg, block.tile,
                   start_date, end_date,
                   max_cloud_cover=max_cloud_cover)

# preprocess s2
# tmpdir = tempfile.TemporaryDirectory(prefix='ewc_tmp-', dir=self.block_folder)

# mask preparation: the mask options of `settings` are passed as keywords
mask_settings = settings['l2a']['mask']
scl = preprocess_scl(s2_dict['scl'], **mask_settings)

scl20_mask, scl20_aux = scl.mask, scl.aux



In [None]:
# Run the L2A preprocessing with the compositing options from `settings`.
s2 = preprocess_l2a(
    s2_dict,
    scl20_mask,
    start_date,
    end_date,
    composite_freq=settings['l2a']['composite']['freq'],
    composite_window=settings['l2a']['composite']['window'],
)

In [None]:
s2

In [None]:
# Build the Sentinel-2 feature stack for the selected block and save it as a
# GeoTIFF.
from pathlib import Path

s2_indices = list(RSI_META_S2_HABITAT.keys())

# compute indices
s2_vi = s2.ewc.indices(s2_indices,
                       rsi_meta=RSI_META_S2_HABITAT)

# percentiles sensors and vis
q = [10, 25, 50, 75, 90]
ps = [s.ewc.percentile(q, name_prefix='s2') for s in (s2, s2_vi)]

# fix time to same timestamp (only 1) to avoid concat issues
# (different compositing settings for s2 and s1)
for p in ps:
    p['time'] = ps[0].time

# ndvi 12 timestamps
ndvi_ts = s2_vi.sel(band=['ndvi'])
ndvi_ts = ndvi_ts.ewc.composite(freq=30,
                                window=30,
                                start=start_date,
                                end=end_date)

# Swap the first two axes so the composite time steps become bands of the
# single-timestamp layout used by the percentile arrays.
# NOTE(review): the band labels assume the 30-day composite yields exactly
# 12 time steps -- confirm for the chosen start/end dates.
ndvi_ts = xr.DataArray(da.transpose(ndvi_ts.data, (1, 0, 2, 3)),
                       dims=ps[0].dims,
                       coords={'time': ps[0].time,
                               'band': [f's2-ndvi-ts{i}'
                                        for i in range(1, 13)],
                               'y': ps[0].y,
                               'x': ps[0].x},
                       attrs=ps[0].attrs)

# scl aux 10m
scl10_aux = scl20_aux.ewc.rescale(scale=2, order=1)
scl10_aux['time'] = ps[0].time

final = xr.concat(ps + [ndvi_ts, scl10_aux], dim='band')
final.name = 'satio-features-s2'

logger.info("Computing features stack")
final = final.persist()
final = final.squeeze()

epsg = tile_to_epsg(tile)
crs = CRS.from_epsg(epsg)
final = final.rio.write_crs(crs)
final_ds = final.to_dataset('band')

# This cell was pasted from a method body: it read `self.block_folder` and
# ended with a top-level `return`, which is a SyntaxError in a notebook cell.
# The stack is written to the working directory instead ('.' is also the
# value this notebook assigns to `output_folder` for the cluster runs).
output_folder = Path('.')
fn = output_folder / f'{final.name}_{tile}_{block.block_id:03d}_{year}.tif'
logger.info(f"Saving features stack to {fn}")
final_ds.rio.to_raster(fn,
                       windowed=False,
                       tiled=True,
                       compress='deflate',
                       predictor=3,
                       zlevel=4)

# Last expression of the cell: display the path of the written file.
fn

In [3]:
from satio_pc.utils.azure import AzureBlobReader

az = AzureBlobReader(connect_str, 'habitat')

In [4]:
for blob in az.container_client.list_blobs():
    print(blob.name)

features/2021/s2/29TPJ/satio-features-s2_29TPJ_071_2021.tif
features/2022/s2/29TPJ/satio-features-s2_29TPJ_071_2022.tif
logs/done/2021/s2/done_29TPJ_071_2021.log
logs/done/2022/s2/done_29TPJ_071_2022.log
logs/error/2022/s2/error_29TPJ_073_2022.log
logs/proc/2021/s2/proc_29TPJ_071_2021.log
logs/proc/2022/s2/proc_29TPJ_071_2022.log
logs/proc/2022/s2/proc_29TPJ_073_2022.log


In [10]:
import io

def read_txt(self, blob_name):
    """Download a blob and return its content decoded as UTF-8 text.

    The function is written with a leading ``self`` parameter and is called
    in this notebook as ``read_txt(az, blob_name)``.

    Parameters
    ----------
    self
        Any object exposing a ``container_client`` attribute with a
        ``get_blob_client(name)`` method.
    blob_name : str
        Name of the blob inside the container.

    Returns
    -------
    str
        The blob content decoded with UTF-8.
    """
    blob_client = self.container_client.get_blob_client(blob_name)
    # readall() hands back the whole payload as bytes, so the intermediate
    # BytesIO buffer and the older download_to_stream() call are not needed.
    return blob_client.download_blob().readall().decode("utf-8")

In [11]:
# Fetch the error log of block 29TPJ_073 for 2021 as text.
# NOTE(review): the container listing printed earlier in this notebook shows
# logs/error/2022/s2/error_29TPJ_073_2022.log but no 2021 error log for this
# block -- confirm the year in this path is the intended one.
blob_name = 'logs/error/2021/s2/error_29TPJ_073_2021.log'
txt = read_txt(az, blob_name)

# Cluster setup

In [18]:
# from dask_gateway import Gateway
# gateway = Gateway()

# clusters = gateway.list_clusters()

# gateway.stop_cluster(clusters[1].name)

In [3]:
from dask.distributed import PipInstall, Client
import dask_gateway

cluster = dask_gateway.GatewayCluster()
client = cluster.get_client()

In [4]:
# Fixed-size cluster; the adaptive alternative is kept for reference:
# cluster.adapt(minimum=5, maximum=100)
cluster.scale(20)

# Earlier attempt, kept for reference (different repository URL form):
# plugin = PipInstall(packages=["git+https://github.com/dzanaga/satio-pc/tree/main/satio_pc"], pip_options=["--upgrade"])
# client.register_worker_plugin(plugin)

# Register a PipInstall worker plugin listing the two project packages.
satio_pc_url = "git+https://github.com/dzanaga/satio-pc.git@main"
elogs_url = "http://s3-eu-central-1.amazonaws.com/vito-worldcover-public/wheels/elogs-0.1.5-py3-none-any.whl"
plugin = PipInstall(
    packages=[satio_pc_url, elogs_url],
    pip_options=["--upgrade"],
)
client.register_worker_plugin(plugin)

print(client.dashboard_link)

https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway/clusters/prod.5f189b1d4b0346499202a61caf3c4655/status


In [21]:
# client.upload_file('s2feats.py')

In [3]:
# !pip uninstall elogs -y

In [4]:
!pip install "http://s3-eu-central-1.amazonaws.com/vito-worldcover-public/wheels/elogs-0.1.5-py3-none-any.whl"

In [13]:
scheduler_info.keys()

dict_keys(['type', 'id', 'address', 'services', 'started', 'workers'])

# L2A Features

In [5]:
import dask
import subprocess
from dataclasses import dataclass

# def process(block):
#     cmd = f'ewc l2a {block.tile} {block.block_id} {year} -r habitat -c -k "{connect_str}"'
#     p = subprocess.run(cmd.split())
#     return p.returncode

def process(args):
    """Run the Sentinel-2 feature extraction for one block and year.

    ``args`` must expose the attributes ``tile``, ``block_id``, ``year``,
    ``output``, ``connstr``, ``container``, ``cleanup`` and ``terminate``;
    they are forwarded to ``S2BlockExtractor``. Returns ``None``.
    """
    # The import is kept local to the function body.
    from satio_pc.extraction import S2BlockExtractor

    block_key = (args.tile, args.block_id, args.year)
    extractor_options = dict(
        output_folder=args.output,
        connection_str=args.connstr,
        container_name=args.container,
        cleanup=args.cleanup,
        terminate_if_failed=args.terminate,
    )

    extractor = S2BlockExtractor(*block_key, **extractor_options)
    extractor.extract()
    
    
# def process(args):
#     import satio_pc.cli
#     script = satio_pc.cli.__file__
    
#     cmd = (f'python {script} l2a '
#            f'{args.tile} {args.block_id} {args.year} '
#            f'-r {args.container} -k "{args.connstr}" ')
#     if args.cleanup:
#         cmd += '-c '
    
#     p = subprocess.run(cmd.split())
#     return p.returncode

In [49]:
# workers = client.scheduler_info()['workers']

In [50]:
# workers_names = list(workers.keys())

In [None]:
tasks = [client.submit(process, arg, workers=worker) for arg, worker in zip(args[:20], workers)]

In [6]:
@dataclass
class Args:
    """Argument bundle consumed by ``process`` for one extraction task."""
    # Every field is forwarded by ``process`` to ``S2BlockExtractor``.
    tile: str        # first positional argument
    block_id: int    # second positional argument
    year: int        # third positional argument
    output: str      # passed as ``output_folder``
    connstr: str     # passed as ``connection_str``
    container: str   # passed as ``container_name``
    cleanup: bool    # passed as ``cleanup``
    terminate: bool  # passed as ``terminate_if_failed``

In [7]:
# Options shared by every extraction task.
output_folder = '.'
container_name = 'habitat'
cleanup = True
terminate_if_failed = False

# One Args per (block, year) pair; the years vary fastest within each block.
blocks = list(blocks_gdf.itertuples())
args = [
    Args(tile=block.tile,
         block_id=block.block_id,
         year=year,
         output=output_folder,
         connstr=connect_str,
         container=container_name,
         cleanup=cleanup,
         terminate=terminate_if_failed)
    for block in blocks
    for year in (2021, 2022)
]


In [12]:
cluster.shutdown()

  self.scheduler_comm.close_rpc()


In [10]:
futures = client.map(process, args[:3])
# results = client.gather(futures)

In [12]:
futures2 = client.map(process, args[3:6])

In [18]:
futures

[<Future: finished, type: NoneType, key: process-c4ebd2a393efcce448edb9d02871be74>,
 <Future: finished, type: NoneType, key: process-8a5956550ed70d4969df4ec869cfd795>,
 <Future: pending, key: process-01df6f2c8669d8c03be42d2df75214be>]

In [19]:
futures2

[<Future: pending, key: process-cf4444238dcbe39a4baa97a5e0bd860c>,
 <Future: pending, key: process-c7fcbd1721aff3b9ba6995b819b66f39>,
 <Future: finished, type: NoneType, key: process-af1fe9a286b33b15f80059d24de7a365>]

In [8]:
process(args[5])

2023-04-07 11:29:41.823 | INFO     | satio_pc.sentinel2:preprocess_l2a:287 - Loading block data
2023-04-07 11:30:08.832 | ERROR    | satio_pc.extraction:_extract_s2_wrapper:133 - Features extraction failed: [Errno 13] Permission denied: b'/home/jovyan/PlanetaryComputerExamples/personal/satio-pc/notebooks/dev/ewc_29TPJ_073/ewc_tmp-w0o1giqf/ewc_tmp-7ycu5yby/satio-w0dhpum4.nc'
Traceback (most recent call last):

  File "/srv/conda/envs/notebook/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
           │         │     └ {'__name__': '__main__', '__doc__': 'Entry point for launching an IPython kernel.\n\nThis is separate from the ipykernel pack...
           │         └ <code object <module> at 0x7f60e7a41f20, file "/srv/conda/envs/notebook/lib/python3.10/site-packages/ipykernel_launcher.py", ...
           └ <function _run_code at 0x7f60e7879750>
  File "/srv/conda/envs/notebook/lib/python3.10/runpy.py", line 86, in _run_code
    e

In [17]:
workers = client.scheduler_info()['workers']

In [18]:
len(workers)

20

In [72]:
client.who_has()

Key,Copies,Workers
compute_matrix_product-c30dff958c63a8d6f3e20380781540e7,0,
compute_matrix_product-ade763fb9ecea0884662ba3cedfb173a,1,tls://10.244.130.116:46795
compute_matrix_product-f33a596a66d0b14803c6fd98421440e9,1,tls://10.244.155.58:42335
process-8d83bff502b68de073dd66147381ac56,0,
process-3d4a1ecba50dafd3c8377d468e4e7778,0,


In [80]:
task = client.submit(process, args[7], workers=workers_names[8])

In [82]:
task

In [19]:
tasks = [client.submit(process, arg, workers=worker) for arg, worker in zip(args[:20], workers)]

In [27]:
tasks[0].result()

KeyboardInterrupt: 

In [25]:
for arg, worker in zip(args[:20], workers):
    print(worker)

tls://10.244.13.38:39297
tls://10.244.15.39:34601
tls://10.244.16.239:37387
tls://10.244.20.44:34639
tls://10.244.229.51:42869
tls://10.244.229.52:44791
tls://10.244.229.53:40061
tls://10.244.230.34:44193
tls://10.244.244.54:38007
tls://10.244.25.56:33245
tls://10.244.252.44:35099
tls://10.244.252.45:36595
tls://10.244.254.41:43253
tls://10.244.254.42:42639
tls://10.244.26.142:36129
tls://10.244.3.241:38777
tls://10.244.3.242:34619
tls://10.244.42.240:41591
tls://10.244.5.62:38081
tls://10.244.6.73:36909


In [23]:
dask.compute(tasks[0])

(<Future: pending, key: process-e2c88b0b7d4843e3702f515691e1be20>,)

In [None]:
print('done')

In [10]:
len(results)

597

In [None]:
for blob in az.container_client.list_blobs():
    print(blob.name)

In [11]:
cluster.shutdown()

  self.scheduler_comm.close_rpc()


In [22]:
# import os
# os.environ['AWS_DEFAULT_REGION'] = 'eu-central-1'

# app_id = (f'habitat_s2')
# elogs = Elogs(app_id,
#               aws_access_key_id,
#               aws_secret_access_key,
#               logs_bucket='vito-worldcover',
#               overwrite_table=True)

In [10]:
import dask

# Per-block identifier used as the task id: "<tile>_<block_id>".
blocks_gdf['cid'] = blocks_gdf.tile + '_' + blocks_gdf.block_id.astype(str)

sensor = 'l2a'
container_name = 'habitattest'

# NOTE(review): `ElogsTask`, `elogs` and `extract_s2_features` are not defined
# by any active code in the visible part of this notebook (the elogs import
# and setup are commented out) -- confirm they are available before running.
tasks = [ElogsTask(block.cid, block, year)
         for block in blocks_gdf.iloc[:3].itertuples()
         for year in (2017, 2018, 2019, 2020, 2021, 2022)]

with elogs.start(tasks) as filtered_tasks:
    # NOTE(review): `filtered_tasks` is bound but never used; the full `tasks`
    # list is what gets submitted -- confirm this is intended.
    futures = client.map(extract_s2_features, tasks,
                         resources={'processes': 4})
    # client.gather waits for the futures and returns their results.
    # dask.compute hands distributed futures back unchanged (see the
    # `dask.compute(tasks[0])` output in this notebook), so it never
    # collected anything here.
    results = client.gather(futures)

In [4]:
blocks = list(blocks_gdf.iloc[:3].itertuples())

In [None]:
# gather takes the collection of futures as ONE argument; star-unpacking
# would spread the futures over its other parameters.
results = client.gather(futures)

In [10]:
results[0]

In [10]:
client.compute(results[0])

In [11]:
client.has_what().items()

dict_items([('tls://10.244.125.10:40821', ('process-6c1b0516-3cac-4b16-922b-48901fe1c7e8',)), ('tls://10.244.125.7:41765', ()), ('tls://10.244.125.8:40949', ('process-75a5eb70-3a61-499e-82dd-28399d86bb62',)), ('tls://10.244.125.9:34319', ('process-b3836ba7-1b85-43db-b226-63e736aeb5d9',)), ('tls://10.244.66.98:33573', ('process-d09fa4c6-6201-4925-9df7-b394dc5792e2',)), ('tls://10.244.70.89:33237', ())])

In [10]:
# dask.compute returns distributed futures unchanged (see the
# `dask.compute(tasks[0])` output in this notebook); client.gather waits for
# them and returns their results.
results = client.gather(futures)

In [19]:
cluster.shutdown()

  self.scheduler_comm.close_rpc()


In [9]:
from azure.storage.blob import BlobServiceClient

# The connection string is taken from `connect_str`, which the first cell of
# this notebook reads from a file.


# Container whose blobs are deleted below
container_name = "habitattest"

# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(connect_str)

# Get a reference to the container
container_client = blob_service_client.get_container_client(container_name)

# List blobs in the container
blobs = container_client.list_blobs()

# WARNING: destructive -- every blob returned by the listing is deleted, with
# its name printed first. Double-check `container_name` before running.
for blob in blobs:
    print(f"Deleting blob: {blob.name}")
    container_client.delete_blob(blob.name)

Deleting blob: ewc_29TPJ_071/satio-features-s2_29TPJ_071.tif
Deleting blob: ewc_29TPJ_072/satio-features-s2_29TPJ_072_2021.tif
Deleting blob: ewc_29TPJ_073/PROC_29TPJ_073.log
Deleting blob: ewc_29TPJ_073/satio-features-s2_29TPJ_073_2021.tif
Deleting blob: ewc_29TPJ_074/PROC_29TPJ_074.log
Deleting blob: ewc_29TPJ_074/satio-features-s2_29TPJ_074_2021.tif
Deleting blob: satio-features-s2_29TPJ_071.tif
Deleting blob: satio-s2-features_29TQH_004.tif


# GAMMA0

In [None]:
# S1 features
# NOTE(review): `load_gamma0`, `preprocess_gamma0`, `np` and `tmpdir` are not
# defined by any active code in the visible part of this notebook (the only
# `tmpdir` assignment is commented out) -- confirm they are provided before
# running this cell. `block`, `start_date`, `end_date`, `settings`, `q`, `xr`
# and `CRS` come from the L2A cells above.
s1 = load_gamma0(block.bounds,
                 block.epsg,
                 start_date,
                 end_date)

# preprocess s1
s1, obs_gamma0 = preprocess_gamma0(s1,
                                   start_date,
                                   end_date,
                                   composite_freq=settings['gamma0']['composite']['freq'], 
                                   composite_window=settings['gamma0']['composite']['window'],
                                   tmpdir=tmpdir.name)

s1_indices = ["vh_vv", "rvi"]

s1_vi = s1.ewc.indices(s1_indices)

# percentiles of the sensor bands and of the indices, at the levels in `q`
ps = [s.ewc.percentile(q, name_prefix='s1') for s in (s1, s1_vi)]

# fix time to same timestamp (only 1) to avoid concat issues (different compositing settings for s2 and s1)
for p in ps:
    p['time'] = ps[0].time

# Squeeze the observation array, prepend two length-1 axes and wrap it in a
# copy of the first percentile band so it can be concatenated along 'band'.
# NOTE(review): assumes the squeezed array matches the (y, x) shape of `ps[0]`
# -- TODO confirm.
obs_gamma0 = np.expand_dims(np.squeeze(obs_gamma0), (0, 1))
obs_gamma0 = ps[0].isel(band=0).copy(data=obs_gamma0)
obs_gamma0['band'] = ['obs_gamma0']

final = xr.concat(ps + [obs_gamma0], dim='band')
final.name = 'satio-features-s1'

# the stack is cached through the `ewc` accessor with '.' and a chunk tuple
final = final.ewc.cache('.', (1, 1, 512, 512))
final = final.squeeze()

crs = CRS.from_epsg(block.epsg)
final = final.rio.write_crs(crs)
final_ds = final.to_dataset('band')

# Unlike the S2 file name, this one carries no year suffix.
fn = f'{final.name}_{block.tile}_{block.block_id:03d}.tif'
final_ds.rio.to_raster(fn,
                       windowed=False,
                       tiled=True,
                       compress='deflate',
                       predictor=3,
                       zlevel=4)