Step 1: Download the data from EOTDL

In [None]:
import os
import zipfile
import eotdl
from eotdl.datasets import download_dataset

# Fetch version 1 of the EuroSAT dataset into ./data.
# NOTE(review): force=True presumably re-downloads even when a local copy exists - confirm in the eotdl docs.
download_dataset("EuroSAT", version=1, path="data", force=True)

# Unpack the downloaded archive into the dataset folder.
extract_dir = "data/EuroSAT"
archive_path = "data/EuroSAT/v1/EuroSAT.zip"

os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(extract_dir)

Step 2: Make a random selection of labelled data for which we want to extract EO data. From the images we can readily extract their geospatial bounding box. We append this information to a dataframe, which will later be used to orchestrate the EO data extraction.

In [6]:
from dataframe_utils import *

# Imported explicitly (and after the star import) so that `random` is guaranteed to be
# the standard-library module rather than whatever the star import happens to expose.
import random

# Constants
# NOTE(review): absolute, machine-specific path - point this at your local EuroSAT extraction.
src_dir = r"C:\Git_projects\eotdl_wip\data\EuroSAT\ds\images\remote_sensing"
num_files = 100 #number of images we wish to process
random_seed = 42  # fixed seed so the same images are selected on every run

start_date = "2020-01-01"
nb_months = 3

distance_m = 320  # Buffer distance in meters
resolution = 20.0  # Grid resolution in meters

tif_files = get_tif_files(src_dir)

# Use a dedicated, seeded generator: the selection is reproducible and the global
# `random` state is left untouched. Raises ValueError if fewer than `num_files` files exist.
rng = random.Random(random_seed)
selected_files = rng.sample(tif_files, num_files)

# Build the per-image dataframe, split it into jobs and derive the job dataframe.
# NOTE(review): the second argument of process_split_jobs (10) is presumably the
# maximum number of features per job - confirm against dataframe_utils.
base_df = generate_geodataframe_pet_utm(selected_files, start_date, nb_months, distance_m, resolution)
split_df = process_split_jobs(base_df, 10)
job_df = create_job_dataframe_s2(split_df)

# TODO; remove: for testing
#job_df = job_df[job_df['feature_count'] > 1][0:3]
#job_df





  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid


  polygons["centroid"] = polygons.geometry.centroid

  s2_grid["geometry"] = s2_grid.geometry.centroid



TypeError: Object of type CRS is not JSON serializable

In [None]:
# Display the job dataframe as the cell output for a quick visual check.
job_df

Unnamed: 0,file_name,geometry,crs,temporal_extent,h3index,tile
0,AnnualCrop_1149,"POLYGON ((577140 4328380, 577140 4327740, 5765...",EPSG:32630,"[2020-01-01, 2020-04-01]",833909fffffffff,30SWJ
1,AnnualCrop_1747,"POLYGON ((575200 4329020, 575200 4328380, 5745...",EPSG:32630,"[2020-01-01, 2020-04-01]",833909fffffffff,30SWJ
2,AnnualCrop_620,"POLYGON ((598120 4325160, 598120 4324520, 5974...",EPSG:32630,"[2020-01-01, 2020-04-01]",833909fffffffff,30SWJ


Step 3: set up a job manager

In [None]:
import openeo
from openeo.extra.job_management import MultiBackendJobManager, CsvJobDatabase

# Authenticate and add the backend
connection = openeo.connect(url="openeo.dataspace.copernicus.eu").authenticate_oidc()

# initialize the job manager
manager = MultiBackendJobManager()
manager.add_backend("cdse", connection=connection, parallel_jobs=2)

# The tracker is a CsvJobDatabase, so the file gets a matching .csv extension
# (it was previously named *.parquet although its content is CSV).
job_tracker = 'job_tracker.csv'
job_db = CsvJobDatabase(path=job_tracker)
if not job_db.exists():
    # Only seed the tracker when no file exists yet; an existing tracker is reused
    # so that re-running this cell does not reset job state.
    # NOTE(review): _normalize_df is a private helper of the job manager - presumably it
    # adds the bookkeeping columns the manager expects; confirm for your openeo version.
    df = manager._normalize_df(job_df)
    job_db.persist(df)

    

Authenticated using refresh token.


In [None]:
import geojson
from s3proxy_utils import upload_geoparquet_file

def start_job(row: pd.Series, connection: openeo.Connection, **kwargs) -> openeo.BatchJob:
    """Create the openEO batch job for a single row of the job database.

    The row's geometry is parsed as GeoJSON, turned into a GeoDataFrame in the
    row's CRS and uploaded through ``upload_geoparquet_file``; the returned URL is
    then used as the spatial filter for a Sentinel-2 L2A cube (bands B04 and B08).

    Args:
        row: Job row; reads ``temporal_extent``, ``geometry`` and ``crs``.
        connection: openEO connection used to build the cube and create the job.
        **kwargs: Accepted and ignored.

    Returns:
        The batch job created from the cube (NetCDF output); it is created here,
        not started.
    """
    extent = row["temporal_extent"]

    # Geometry -> GeoDataFrame -> uploaded GeoParquet file reachable by URL.
    feature_collection = geojson.loads(row.geometry)
    features_gdf = gpd.GeoDataFrame.from_features(feature_collection).set_crs(row.crs)
    parquet_url = upload_geoparquet_file(features_gdf, connection)

    s2_cube = connection.load_collection(
        "SENTINEL2_L2A",
        temporal_extent=extent,
        bands=["B04", "B08"],
    )

    # Restrict the cube to the uploaded features.
    uploaded_features = connection.load_url(parquet_url, format="Parquet")
    s2_cube = s2_cube.filter_spatial(uploaded_features)

    # NOTE(review): sample_by_feature / feature_id_property presumably yield one output
    # file per feature, named after its "file_name" property - confirm with the backend docs.
    return s2_cube.create_job(
        out_format="NetCDF",
        sample_by_feature=True,
        feature_id_property="file_name",
        filename_prefix="eotdl-s2",
    )


fire up the manager

In [None]:
# Hand the start_job callback and the job database to the manager.
# NOTE(review): by its name this runs the job handling in a separate thread - confirm
# how it is meant to be stopped in the openeo job-management docs.
manager.start_job_thread(start_job=start_job, job_db=job_db)




In [None]:
import xarray as xr

# Open one of the downloaded job results for inspection.
# A forward slash is used as the path separator: the previous "\o" was an invalid
# escape sequence in a normal string literal and only resolved on Windows.
test = xr.open_dataset('job_j-24111565cd3049988fccd62b93053401/openEO_HerbaceousVegetation_140.nc')

test


#TODO; improve accuracy of size by staying in UTM

Code for when we move towards a UDP-based job manager

In [None]:
import openeo
from openeo.extra.job_management import (
        create_job_db,
        ProcessBasedJobCreator,
    )

# Create the job database at "jobs.csv" from the job dataframe.
job_db = create_job_db("jobs.csv", job_df)

# Remote definition of the Best Available Pixel (BAP) openEO process,
# used to obtain Sentinel-2 composites.
process_graph_url = "https://raw.githubusercontent.com/ESA-APEx/apex_algorithms/main/openeo_udp/bap_composite.json"

# Job creator bound to that process; no parameter defaults are supplied.
job_starter = ProcessBasedJobCreator(namespace=process_graph_url, parameter_defaults={})


In [None]:
from openeo.extra.job_management import MultiBackendJobManager

# Authenticate against the CDSE openEO backend.
connection = openeo.connect(url="openeo.dataspace.copernicus.eu").authenticate_oidc()

# Create the job manager and register the backend, allowing up to 10 parallel jobs.
job_manager = MultiBackendJobManager()
job_manager.add_backend("cdse", connection=connection, parallel_jobs=10)

# Run the jobs from the job database, creating each one with the process-based job starter.
job_manager.run_jobs(job_db=job_db, start_job=job_starter)


In [None]:
import xarray as xr
import matplotlib.pyplot as plt

# Load the dataset
test = xr.open_dataset('test.nc')

# Select a single time slice of band B04 for plotting.
# NOTE: indexing is zero-based, so time_index = 1 picks the SECOND time step, not the first.
time_index = 1
b04 = test['B04'].isel(t=time_index)

# Create a 2x2 grid of subplots; only the top-left panel is filled in below.
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

# Plot B04 in the top-left panel and attach a colorbar to that panel.
im1 = axs[0, 0].imshow(b04, cmap='viridis', origin='lower')
axs[0, 0].set_title('B04_P10')
fig.colorbar(im1, ax=axs[0, 0])




TODO:


- Scale up (saving data, improve efficiency, ...)

