In [1]:
# Jupyter notebook related
%reload_ext autoreload
%autoreload 2

In [2]:
import geopandas as gpd
from satio_pc.grid import get_blocks_gdf
from dask import delayed

# from elogs import Elogs, ElogsTask

# The storage connection string lives in a local file outside the repository so
# the secret is never hard-coded in the notebook.
with open('../../../connstr_vegteam') as f:
    # strip() removes the trailing newline most editors append to the file;
    # left in place it would become part of the secret that is later passed to
    # the blob reader and interpolated into command lines.
    connect_str = f.read().strip()
container_name = 'sen4ldn'

# Test tiles for SEN4LDN

# Uganda: 36NUG
# Portugal: 29TNE
# Colombia: 18NUJ

In [12]:
from satio_pc.utils.azure import AzureBlobReader

# Reader bound to the configured container, then the listing of everything
# stored under the 'logs/done' prefix.
azure = AzureBlobReader(connect_str, container_name)
done_logs = azure.list_files('logs/done')


In [13]:
# For every done-log name keep the text after the last 'done_' and before the
# first following '.'; these ids are compared further down against
# "<tile>_<block_id:03d>_<year>" strings.
done_ids = {log_name.split('done_')[-1].split('.')[0] for log_name in done_logs}

In [14]:
len(done_ids)

1496

In [None]:
# Download a single error log; the second argument is the last path component
# of the key (presumably the local destination name - TODO confirm against
# AzureBlobReader.download_file).
key = 'logs/error/2022/s2/error_36NUG_000_2022.log'
local_name = key.rsplit('/', 1)[-1]
azure.download_file(key, local_name)

In [None]:
# for fn in azure.list_files():
#     azure.delete_file(fn)

In [None]:
azure.list_files()

In [15]:

# Processing settings handed to the extractor. The cells below read
# settings['l2a']['max_cloud_cover'], ['mask'] (forwarded as keyword arguments
# to preprocess_scl) and ['composite'] (freq / window / mode).
settings = {
    "l2a": {
        "max_cloud_cover": 90,
        "composite": {
            "freq": 10,
            "window": 20,
            "mode": "median",
        },
        "mask": {
            "erode_r": 3,
            "dilate_r": 13,
            "max_invalid_ratio": 1,
        },
        "bands": ['B02', 'B03', 'B04', 'B08', 'B11', 'B12'],
        "indices": ["ndvi"],
        "percentiles": [10, 25, 50, 75, 90],
    },
    "gamma0": {
        "composite": {
            "freq": 10,
            "window": 10,
            "mode": "median",
        },
    },
}


# Test tiles: Colombia (18NUJ), Portugal (29TNE), Uganda (36NUG).
tiles = ['18NUJ', '29TNE', '36NUG']

# Keep only the blocks whose 'area' is exactly 104857600 (= 10240 * 10240),
# i.e. the square ones - the other blocks are known to hit a bug - and order
# the result by block id.
blocks_gdf = get_blocks_gdf(tiles)
is_full_square = blocks_gdf['area'] == 104857600
blocks_gdf = blocks_gdf[is_full_square].sort_values('block_id')


In [16]:
def extract_s2(arg):
    """Run the S2 block extraction for one ``(tile, block_id, year)`` task.

    The three values arrive packed in a single sequence ``arg`` so that the
    function can be mapped over a list of tasks with one argument per call.
    Results are written by the extractor itself (output folder ``/tmp`` plus
    the notebook-level connection string and container); nothing is returned.
    """
    tile, block_id, year = arg

    # Imported inside the function, as in the original - presumably so the
    # import is resolved in whichever process runs the task (TODO confirm).
    from satio_pc.extraction import S2BlockExtractor

    S2BlockExtractor(tile,
                     block_id,
                     year,
                     settings,
                     output_folder='/tmp',
                     connection_str=connect_str,
                     container_name=container_name).extract()

# One (tile, block_id, year) task per block and per year in 2018..2022,
# leaving out every combination whose id is already present in done_ids.
args = [
    (blk.tile, blk.block_id, yr)
    for blk in blocks_gdf.itertuples()
    for yr in range(2018, 2023)
    if f"{blk.tile}_{blk.block_id:03d}_{yr}" not in done_ids
]

len(args)

4

### Run locally with multiprocessing

In [None]:
# import multiprocessing

# for a in args:
#     p = multiprocessing.Process(target=extract_s2,
#                                 args=a)
#     p.start()
#     p.join()

In [17]:
from satio_pc.utils import parallelize

In [9]:
from loguru import logger
logger.remove()

In [18]:
from tqdm.auto import tqdm

In [19]:
_ = parallelize(extract_s2, args, max_workers=2, use_process_pool=True)

  times = pd.to_datetime(
  times = pd.to_datetime(
  times = pd.to_datetime(
  times = pd.to_datetime(
  times = pd.to_datetime(
  times = pd.to_datetime(


TypeError: 'module' object is not callable

# Cluster setup

In [None]:
from dask.distributed import PipInstall, Client
import dask_gateway

# Start a Dask Gateway cluster, attach a client to it, ask for 10 workers and
# print the dashboard URL so progress can be followed in the browser.
cluster = dask_gateway.GatewayCluster()
client = cluster.get_client()
cluster.scale(10)
print(client.dashboard_link)

In [None]:
# Register a PipInstall worker plugin that installs satio-pc from the `main`
# branch of its GitHub repository (with pip's --upgrade option).
satio_pc_url = "git+https://github.com/dzanaga/satio-pc.git@main"
# elogs_url = "http://s3-eu-central-1.amazonaws.com/vito-worldcover-public/wheels/elogs-0.1.5-py3-none-any.whl"
plugin = PipInstall(packages=[satio_pc_url],
                    pip_options=["--upgrade"])
client.register_worker_plugin(plugin)

In [None]:
# check logs: dump every worker's log entries, each worker followed by a blank
# line and a 100-character separator
logs = client.get_worker_logs()
banner = '*' * 100
for worker in logs:
    print(f"Logs for worker {worker}:")
    for log in logs[worker]:
        print(log)
    print()
    print(banner)

In [None]:
import dask
import subprocess

@dask.delayed
def extract_s2_cli(cmd):
    """Run one extraction command line in a subprocess and return its exit code.

    Because of the ``dask.delayed`` decorator, calling this function only
    builds a lazy task; the subprocess runs when the task is computed.

    Parameters
    ----------
    cmd : str
        Complete command line, with shell-style quoting around values
        (e.g. ``-o "/tmp" -k "<connection string>"``).

    Returns
    -------
    int
        The return code of the finished process.
    """
    # Local import keeps the function self-contained wherever the task runs.
    import shlex

    # shlex.split honours the quoting used when the command is built, so
    # `"/tmp"` reaches the program as `/tmp` and a quoted value stays one
    # argument. A plain str.split() would pass the quote characters through
    # literally and break any quoted value containing whitespace.
    p = subprocess.run(shlex.split(cmd))
    return p.returncode

# One `ewc l2a` command line per pending (tile, block_id, year) task.
# NOTE(review): the connection string is embedded in the command line itself,
# so it is visible wherever the command is logged or listed.
cmds = [
    f'ewc l2a -o "/tmp" -k "{connect_str}" -r "{container_name}" -c -t {t} {b} {y}'
    for t, b, y in args
]

# Only the first 20 commands are wrapped as delayed tasks.
tasks = [extract_s2_cli(c) for c in cmds[:20]]

In [None]:
results = dask.compute(*tasks)

In [None]:
results

In [None]:
cluster.shutdown()

In [None]:
# def install():
#     import os
#     os.system("pip install git+https://github.com/dzanaga/satio-pc -y")  # or pip

# client.run(install)  # Run on all workers

In [None]:
import dask

In [None]:
extract_delayed = dask.delayed(extract_s2)

In [None]:
import dask
# Build (but do not run) one delayed extraction per task for the first 20 tasks.
lazy_results = []

for ag in args[:20]:
    # extract_s2 takes ONE argument - the (tile, block_id, year) tuple - and
    # unpacks it itself. Passing *ag would call it with three positional
    # arguments and raise TypeError as soon as the task is computed.
    lazy_result = extract_delayed(ag)
    lazy_results.append(lazy_result)

# futures = dask.persist(*lazy_results)  # trigger computation in the background

In [None]:
f = lazy_results[1].compute()

In [None]:
futures = client.map(extract_s2, args[:6])

In [None]:
f = futures[5]
f

In [None]:
f.result()

In [None]:
@dask.delayed
def test(n):
    """Probe whether satio_pc and its 's2grid' layer are usable where this runs.

    Parameters
    ----------
    n : any
        Unused; present so the function can be mapped over a sequence.

    Returns
    -------
    int
        ``shape[0]`` of the loaded 's2grid' layer, or 0 if importing the
        package or loading the layer fails.
    """
    try:
        import satio_pc
        s2grid = satio_pc.layers.load('s2grid')
        return s2grid.shape[0]
    except Exception:
        # Best-effort probe: any import/load failure is reported as 0.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so the task can still be cancelled.
        return 0

In [None]:
nn = list(range(30))

In [None]:
futures = client.map(test, nn)

In [None]:
f = futures[10]

In [None]:
f.result().compute()

In [None]:
# Leftover from tab completion: a dangling `client.` is a SyntaxError and
# stops "Run All"; display the client object instead.
client

In [None]:
client

In [None]:
# list clusters
from dask_gateway import Gateway
gateway = Gateway()
gateway.list_clusters()

In [None]:
# # plugin = PipInstall(packages=["git+https://github.com/dzanaga/satio-pc/tree/main/satio_pc"], pip_options=["--upgrade"])
# # client.register_worker_plugin(plugin)
# satio_pc_url = "git+https://github.com/dzanaga/satio-pc.git@main"
# elogs_url = "http://s3-eu-central-1.amazonaws.com/vito-worldcover-public/wheels/elogs-0.1.5-py3-none-any.whl"
# plugin = PipInstall(packages=[satio_pc_url,
#                               elogs_url],
#                     pip_options=["--upgrade"])
# client.register_worker_plugin(plugin)

# print(client.dashboard_link)

In [None]:
# plugin = PipInstall(packages=["git+https://github.com/dzanaga/satio-pc/tree/main/satio_pc"], pip_options=["--upgrade"])
# client.register_worker_plugin(plugin)
# Register a PipInstall worker plugin that installs satio-pc from the `main`
# branch, then print the dashboard URL.
satio_pc_url = "git+https://github.com/dzanaga/satio-pc.git@main"
# NOTE(review): elogs_url is assigned but not passed to PipInstall below.
elogs_url = "http://s3-eu-central-1.amazonaws.com/vito-worldcover-public/wheels/elogs-0.1.5-py3-none-any.whl"
plugin = PipInstall(packages=[satio_pc_url],
                    pip_options=["--upgrade"])
client.register_worker_plugin(plugin)

print(client.dashboard_link)

In [None]:
import dask
# Wrap the first 20 tasks lazily; each task tuple is passed to extract_s2 as a
# single argument.
lazy_results = [dask.delayed(extract_s2)(ag) for ag in args[:20]]

futures = dask.persist(*lazy_results)  # trigger computation in the background

In [None]:
futures[0].compute()

In [None]:
# Submit one extraction per worker, restricting each task to a given worker
# through the `workers=` argument of client.submit.
# zip() stops at the shorter input, so at most min(16, number of workers)
# tasks are submitted. Iterating `workers` presumably yields the worker
# addresses (the keys of the scheduler's 'workers' mapping) - TODO confirm.
workers = client.scheduler_info()['workers']
tasks = [client.submit(extract_s2, arg, workers=worker)
         for arg, worker in zip(args[:16], workers)]

In [None]:
tasks[0]

In [None]:
# test first block

block = next(blocks_gdf.iloc[[10]].itertuples())
year = 2021

In [None]:
# Build an extractor for the test block so its internals can be inspected.
# NOTE(review): as written this cell cannot run on a fresh kernel:
#   * S2BlockExtractor is only imported inside extract_s2, not at notebook level;
#   * `bands`, `indices` and `percentiles` are not assigned in any visible cell
#     (the same values exist under settings['l2a']).
# extract_s2 builds the extractor WITHOUT these three keyword arguments, so
# they may belong to an older constructor signature - confirm before fixing.
extractor = S2BlockExtractor(block.tile,
                             block.block_id,
                             year,
                             settings,
                             bands=bands,
                             indices=indices,
                             percentiles=percentiles,
                             output_folder='.',
                             connection_str=connect_str,
                             container_name=container_name)

In [None]:
data, fn, bounds, epsg = extractor._extract_s2()

In [None]:
fn

In [None]:
data.ewc.save_features(fn, bounds, epsg)

# debug extract function

In [None]:
import xarray as xr
from loguru import logger

from satio_pc.sentinel2 import load_l2a, preprocess_l2a
from satio_pc.preprocessing.clouds import preprocess_scl
from satio_pc.grid import get_blocks_gdf

# Step through the extraction by hand. `self` aliases the extractor built
# above so the statements read like method code.
# year = self.year
# tile = self.tile
# block_id = self.block_id
self = extractor
l2a_settings = self._settings['l2a']

start_date = f'{year}-01-01'
end_date = f'{year + 1}-01-01'
max_cloud_cover = l2a_settings['max_cloud_cover']

# blocks = get_blocks_gdf([tile])
# block = blocks[blocks.block_id == block_id].iloc[0]

# load the block's L2A data for the whole year
s2_dict = load_l2a(block.bounds, block.epsg, block.tile,
                   start_date, end_date,
                   bands=self._bands,
                   max_cloud_cover=max_cloud_cover)

# mask preparation: the 'mask' settings are forwarded as keyword arguments
mask_settings = l2a_settings['mask']
scl = preprocess_scl(s2_dict['scl'], **mask_settings)
scl20_mask, scl20_aux = scl.mask, scl.aux

# preprocessing with the compositing options taken from the settings
composite_settings = l2a_settings['composite']
s2 = preprocess_l2a(s2_dict, scl20_mask, start_date, end_date,
                    composite_freq=composite_settings['freq'],
                    composite_window=composite_settings['window'],
                    composite_mode=composite_settings['mode'])

# compute indices
s2_indices = self._indices
s2_vi = s2.ewc.indices(s2_indices)

In [None]:
s2_vi.ewc.show(band='ndvi')

In [None]:
# Path is used for the output filename below but was never imported in this
# notebook.
from pathlib import Path

# percentiles sensors and vis
q = self._percentiles
ps = [s.ewc.percentile(q, name_prefix='s2') for s in (s2, s2_vi)]

# fix time to same timestamp (only 1) to avoid concat issues
# (different compositing settings for s2 and s1)
for p in ps:
    p['time'] = ps[0].time

# scl aux 10m
scl10_aux = scl20_aux.ewc.rescale(scale=2, order=1)
scl10_aux['time'] = ps[0].time

# stack the percentile features and the SCL auxiliary layers along 'band'
final = xr.concat(ps + [scl10_aux], dim='band')
final.name = 'satio-features-s2'

logger.info("Computing features stack")
final = final.persist()
final = final.squeeze()

output_folder = Path(self.block_folder)
# Use block.tile: the bare name `tile` is not assigned until a later cell, so
# referencing it here raises NameError when the notebook is run top to bottom.
fn = output_folder / \
    f'{final.name}_{block.tile}_{block.block_id:03d}_{year}.tif'

In [None]:
# Re-run the preprocessing with the notebook-level settings.
# NOTE(review): this cell depends on names that are not available at this
# point of a top-to-bottom run: `tmpdir` is only created in a later cell and
# `indices` is not assigned in any visible cell - confirm the intended order.
s2 = preprocess_l2a(s2_dict,
                    scl20_mask,
                    start_date,
                    end_date,
                    composite_freq=settings['l2a']['composite']['freq'],
                    composite_window=settings['l2a']['composite'][
                        'window'],
                    tmpdir=tmpdir.name)

s2_indices = indices

# compute indices
s2_vi = s2.ewc.indices(s2_indices)

In [None]:
s2_vi.ewc.plot(band='ndvi', vmin=-1, vmax=1)

In [None]:
import hvplot.xarray  # noqa
import hvplot.pandas  # noqa
import panel as pn  # noqa
import panel.widgets as pnw

# Interactive view of the first band of the preprocessed series: the time
# coordinate is selected with a discrete slider widget, colour limits 0-0.2.
im = s2
band = im.band[0]
im = im.sel(band=band)

im.interactive.sel(time=pnw.DiscreteSlider).hvplot(
    clim=(0, 0.2), colormap='plasma', aspect=1)

**Open questions:** is there a bug in the interpolate function? There are 0s in the S2 time series and NaNs in the NDVI time series.
Is NDVI being saved as mostly NaN?

### debug preprocess l2a

In [None]:
import xarray as xr
import dask.array as da
from pyproj.crs import CRS
from loguru import logger
import tempfile
from satio_pc.sentinel2 import load_l2a, preprocess_l2a
from satio_pc.preprocessing.clouds import preprocess_scl
from satio_pc.grid import get_blocks_gdf, tile_to_epsg


year = 2021
tile = block.tile
block_id = block.block_id

start_date = f'{year}-01-01'
end_date = f'{year + 1}-01-01'
max_cloud_cover = settings['l2a']['max_cloud_cover']

# `bands` was never assigned in this notebook, so the original call raised
# NameError on a fresh kernel; the band list is taken from the settings.
s2_dict = load_l2a(block.bounds,
                   block.epsg,
                   block.tile,
                   start_date,
                   end_date,
                   bands=settings['l2a']['bands'],
                   max_cloud_cover=max_cloud_cover)

# preprocess s2
# Temporary working directory created inside the current directory; it is
# removed when `tmpdir.cleanup()` is called or the object is finalized.
tmpdir = tempfile.TemporaryDirectory(prefix='ewc_tmp-',
                                     dir='.')

# mask preparation: the 'mask' settings are forwarded as keyword arguments
mask_settings = settings['l2a']['mask']
scl = preprocess_scl(s2_dict['scl'],
                     **mask_settings)

scl20_mask = scl.mask
scl20_aux = scl.aux

In [None]:
# s2 = preprocess_l2a(s2_dict,
#                     scl20_mask,
#                     start_date,
#                     end_date,
#                     composite_freq=settings['l2a']['composite']['freq'],
#                     composite_window=settings['l2a']['composite'][
#                         'window'],
#                     tmpdir=tmpdir.name)

# Bind the preprocess_l2a arguments to the names its body uses, so that body
# can be executed statement by statement in the cells below.
ds_dict, clouds_mask = s2_dict, scl20_mask
composite_freq = settings['l2a']['composite']['freq']
composite_window = settings['l2a']['composite']['window']

ds10_block, ds20_block = ds_dict[10], ds_dict[20]
scl20_block = clouds_mask

In [None]:
ds10_block.ewc.rgb(vmax=2000)

In [None]:
from satio_pc.preprocessing.timer import FeaturesTimer

In [None]:
# Debug run of the preprocessing restricted to the 10 m bands B04 and B08:
# load -> cloud mask -> composite -> interpolate, each stage timed with the
# FeaturesTimer objects. The whole 20 m branch is commented out below.
ds10_block = ds_dict[10].sel(band=['B04', 'B08'])
# ds20_block = ds_dict[20]
scl20_block = clouds_mask

timer10 = FeaturesTimer(10, 'l2a')
timer20 = FeaturesTimer(20, 'l2a')

# download
logger.info("Loading block data")
timer10.load.start()
ds10_block = ds10_block.ewc.persist_chunk()
timer10.load.stop()

# the 20 m "load" timer here only covers the cloud mask and its rescaled
# (scale=2) copy used to mask the 10 m bands
timer20.load.start()
# ds20_block = ds20_block.ewc.persist_chunk()
scl20_block = scl20_block.ewc.persist_chunk()
scl10_block = scl20_block.ewc.rescale(scale=2,
                                      order=0)
scl10_block = scl10_block.ewc.persist_chunk()
timer20.load.stop()

# 10m
# mask clouds
timer10.composite.start()
ds10_block_masked = ds10_block.ewc.mask(
    scl10_block).ewc.persist_chunk()

logger.info("Compositing 10m block data")
# composite
ds10_block_comp = ds10_block_masked.ewc.composite(
    freq=composite_freq,
    window=composite_window,
    start=start_date,
    end=end_date).ewc.persist_chunk()
timer10.composite.stop()

logger.info("Interpolating 10m block data")
# interpolation
timer10.interpolate.start()
ds10_block_interp = ds10_block_comp.ewc.interpolate(
).ewc.persist_chunk()
timer10.interpolate.stop()

# 20m
# mask
# timer20.composite.start()
# ds20_block_masked = ds20_block.ewc.mask(
#     scl20_block).ewc.persist_chunk()

# logger.info("Compositing 20m block data")
# # composite
# ds20_block_comp = ds20_block_masked.ewc.composite(
#     freq=composite_freq,
#     window=composite_window,
#     start=start_date,
#     end=end_date).ewc.persist_chunk()
# timer20.composite.stop()

# logger.info("Interpolating 20m block data")
# # interpolation
# timer20.interpolate.start()
# ds20_block_interp = ds20_block_comp.ewc.interpolate(
# ).ewc.persist_chunk()
# timer20.interpolate.stop()

# logger.info("Merging 10m and 20m series")
# # merging to 10m cleaned data
# ds20_block_interp_10m = ds20_block_interp.ewc.rescale(scale=2,
#                                                       order=1,
#                                                       nodata_value=0)
# with the 20 m branch disabled, only the interpolated 10 m series is
# concatenated along 'band'
dsm10 = xr.concat([ds10_block_interp],
                  dim='band')

In [None]:
show(ds10_block_interp)

In [None]:
# np.float32 is used below, but numpy is never imported in this notebook.
import numpy as np

# `reflectance` was never assigned at notebook level (NameError on a fresh
# kernel); True mirrors the default of preprocess_l2a's `reflectance` parameter.
reflectance = True

if reflectance:
    # cast to float32 and divide the stored values by 10000
    dsm10 = dsm10.astype(np.float32) / 10000

# carry over the attributes of the 10 m input block
dsm10.attrs = ds10_block.attrs

# per-stage timings first, then each timer's overall log
for t in timer10, timer20:
    t.load.log()
    t.composite.log()
    t.interpolate.log()

for t in timer10, timer20:
    t.log()

dsm10 = dsm10.ewc.persist_chunk()

In [None]:
def show(ds, band=None, vmin=None, vmax=None,
         colormap='plasma', **kwargs):
    """Return an interactive hvplot view of one band of ``ds`` with a time slider.

    Parameters
    ----------
    ds : object exposing ``band``, ``sel`` and ``interactive``
        Data to display (presumably an xarray DataArray - TODO confirm).
    band : optional
        Band to select; when None the first entry of ``ds.band`` is used.
    vmin, vmax : optional
        Colour limits, passed together as ``clim=(vmin, vmax)``.
    colormap : str
        Colormap name forwarded to hvplot.
    **kwargs
        Extra keyword arguments forwarded to ``hvplot``.
    """
    # Same imports as the original, kept local to the function; only `pnw` is
    # referenced by name below.
    import hvplot.xarray  # noqa
    import hvplot.pandas  # noqa
    import panel as pn  # noqa
    import panel.widgets as pnw

    if band is None:
        band = ds.band[0]
    single_band = ds.sel(band=band)

    # the time coordinate is picked through a discrete slider widget
    sliced_by_time = single_band.interactive.sel(time=pnw.DiscreteSlider)
    return sliced_by_time.hvplot(clim=(vmin, vmax),
                                 colormap=colormap,
                                 aspect=1,
                                 x='x',
                                 y='y',
                                 **kwargs)

In [None]:
# show(ds10_block)

In [None]:
def _mask_composite_interpolate(ds_block, mask_block, timer, resolution,
                                start_date, end_date,
                                composite_freq, composite_window):
    """Mask, composite and interpolate one resolution, timing each stage on ``timer``."""
    # masking is counted as part of the compositing time
    timer.composite.start()
    masked = ds_block.ewc.mask(mask_block).persist()

    logger.info(f"Compositing {resolution}m block data")
    composited = masked.ewc.composite(freq=composite_freq,
                                      window=composite_window,
                                      start=start_date,
                                      end=end_date).persist()
    timer.composite.stop()

    logger.info(f"Interpolating {resolution}m block data")
    timer.interpolate.start()
    interpolated = composited.ewc.interpolate().persist()
    timer.interpolate.stop()

    return interpolated


def preprocess_l2a(ds_dict,
                   clouds_mask,
                   start_date,
                   end_date,
                   composite_freq=10,
                   composite_window=20,
                   composite_mode='median',
                   reflectance=True,
                   tmpdir='.'):
    """Mask, composite and interpolate the 10 m and 20 m bands and merge them.

    NOTE: defining this function in the notebook shadows the ``preprocess_l2a``
    imported from ``satio_pc.sentinel2`` in earlier cells.

    Parameters
    ----------
    ds_dict : mapping
        Must provide the 10 m data under key ``10`` and the 20 m data under
        key ``20``.
    clouds_mask :
        Mask applied as-is to the 20 m data and, after ``rescale(scale=2,
        order=0)``, to the 10 m data.
    start_date, end_date :
        Forwarded to the compositing step as ``start`` / ``end``.
    composite_freq, composite_window :
        Forwarded to the compositing step as ``freq`` / ``window``.
    composite_mode :
        NOTE(review): accepted but never forwarded to the compositing call in
        this implementation - confirm whether it should be.
    reflectance : bool
        When True the merged stack is cast to float32 and divided by 10000.
    tmpdir :
        NOTE(review): accepted but unused in this implementation.

    Returns
    -------
    The merged 10 m stack (10 m bands followed by the rescaled 20 m bands along
    ``band``), carrying the attributes of the 10 m input.
    """
    # numpy is not imported anywhere in this notebook, so the `np.float32`
    # cast below raised NameError; a local import keeps the cell runnable on
    # its own.
    import numpy as np

    ds10_block = ds_dict[10]
    ds20_block = ds_dict[20]
    scl20_block = clouds_mask

    timer10 = FeaturesTimer(10, 'l2a')
    timer20 = FeaturesTimer(20, 'l2a')

    # download
    logger.info("Loading block data")
    timer10.load.start()
    ds10_block = ds10_block.persist()
    timer10.load.stop()

    # the 20 m load time also covers the mask and its rescaled copy
    timer20.load.start()
    ds20_block = ds20_block.persist()
    scl20_block = scl20_block.persist()
    scl10_block = scl20_block.ewc.rescale(scale=2,
                                          order=0)
    scl10_block = scl10_block.persist()
    timer20.load.stop()

    # 10m first, then 20m: same order of operations as before the refactor
    ds10_block_interp = _mask_composite_interpolate(
        ds10_block, scl10_block, timer10, 10,
        start_date, end_date, composite_freq, composite_window)

    ds20_block_interp = _mask_composite_interpolate(
        ds20_block, scl20_block, timer20, 20,
        start_date, end_date, composite_freq, composite_window)

    logger.info("Merging 10m and 20m series")
    # merging to 10m cleaned data
    ds20_block_interp_10m = ds20_block_interp.ewc.rescale(scale=2,
                                                          order=1,
                                                          nodata_value=0)
    dsm10 = xr.concat([ds10_block_interp,
                       ds20_block_interp_10m],
                      dim='band')

    if reflectance:
        dsm10 = dsm10.astype(np.float32) / 10000

    # keep the attributes of the 10 m input on the merged result
    dsm10.attrs = ds10_block.attrs

    # per-stage timings first, then each timer's overall log
    for t in timer10, timer20:
        t.load.log()
        t.composite.log()
        t.interpolate.log()

    for t in timer10, timer20:
        t.log()

    dsm10 = dsm10.ewc.persist_chunk()

    return dsm10