Step 1: download the data from eotdl

In [1]:
import os
import zipfile
import eotdl
from eotdl.datasets import download_dataset

download_dataset("EuroSAT", version=1, path="data", force=True)

os.makedirs("data/EuroSAT", exist_ok=True)

with zipfile.ZipFile("data/EuroSAT/v1/EuroSAT.zip", 'r') as zip_ref:
    zip_ref.extractall("data/EuroSAT")

ModuleNotFoundError: No module named 'eotdl'

Step 2: Make a random selection of labelled data for which we want to extract EO data, from the images we can readily extract their geospatial bounding box. We append this information into a dataframe which will lateron be used to orchestrate the EO data extraction.

In [1]:
from dataframe_utils import create_job_dataframe_from_tif_files,get_tif_files



# Constants
src_dir = r"C:\Git_projects\eotdl_wip\data\EuroSAT\ds\images\remote_sensing\otherDatasets\sentinel_2\tif"
num_files = 10
start_date = "2020-01-01"
nb_months = 3
distance_m = 320  # Buffer distance in meters
resolution = 20.0  # Grid resolution in meters
epsg_latlon = "EPSG:4326"  # CRS in WGS84


# Call the function to generate the job dataframe
job_df = create_job_dataframe_from_tif_files(
    src_dir=src_dir,
    num_files=num_files,
    start_date=start_date,
    nb_months=nb_months,
    distance_m=distance_m,
    resolution=resolution,
    epsg_latlon=epsg_latlon
)

# Display the resulting DataFrame
job_df.head()



                                           file_name  \
0  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
1  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
2  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
3  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
4  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
5  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
6  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
7  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
8  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   
9  C:\Git_projects\eotdl_wip\data\EuroSAT\ds\imag...   

                                            geometry           temporal_extent  
0  POLYGON ((267640 4139140, 267640 4138500, 2670...  [2020-01-01, 2020-04-01]  
1  POLYGON ((604320 4107540, 604320 4106900, 6036...  [2020-01-01, 2020-04-01]  
2  POLYGON ((612380 5303700, 612380 5303060, 6117...  [2020-01-01, 2020-04-01]  
3  POLYGON ((675240 6111220, 675240 6110580, 6746...  [2020

  return lib.centroid(geometry, **kwargs)


GEOSException: UnsupportedOperationException: getY called on empty Point


In [32]:
from typing import List
import pandas as pd

def create_job_dataframe_s2(split_jobs: List[gpd.GeoDataFrame]) -> pd.DataFrame:
    """Create a DataFrame from the split jobs containing all the necessary information to run the job."""
    columns = ['temporal_extent', 'geometry', 's2_tile', 'h3index', 'geometry']
    rows = []
    
    for job in split_jobs:
        # Calculate median date from temporal_extent and create a buffer around it
        temporal_extent = job.temporal_extent.iloc[0]  # Access the temporal extent column for the first row
        s2_tile = job.tile.iloc[0]  # Access the tile ID
        h3index = job.h3index.iloc[0]  # Access the H3 index
        
        # Append job details to rows
        rows.append(
            pd.Series(
                dict(zip(
                    columns, 
                    [
                        temporal_extent, 
                        job.geometry.to_json(),  # geometry as JSON 
                        s2_tile,             
                        h3index         
                    ]
                ))
            )
        )

    return pd.DataFrame(rows)

# Example usage:
job_df = create_job_dataframe_s2(split_jobs)
job_df

Unnamed: 0,temporal_extent,geometry,s2_tile,h3index
0,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",30TWT,831845fffffffff
1,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",30UWD,831951fffffffff
2,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",31UFS,831fa4fffffffff
3,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",31UFU,831968fffffffff
4,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",32TPP,831ea2fffffffff
5,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",32ULV,831f85fffffffff
6,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",32UMC,831f12fffffffff
7,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",33VVE,8308b6fffffffff
8,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",34TDS,831e19fffffffff
9,"[2020-01-01, 2020-04-01]","{""type"": ""FeatureCollection"", ""features"": [{""i...",35TNH,831eebfffffffff


Step 2: Define the processing pipeline (Sentinel 2)

In [6]:
import openeo
from openeo.extra.job_management import (
        create_job_db,
        ProcessBasedJobCreator,
    )

job_db = create_job_db("jobs.csv", split_jobs)

connection = openeo.connect("openeofed.dataspace.copernicus.eu").authenticate_oidc()
process_graph_url = "https://raw.githubusercontent.com/ESA-APEx/apex_algorithms/main/openeo_udp/bap_composite.json"

#Make use of the Best Available Pixel openeo Process to obtain Sentinel 2 composites
job_starter = ProcessBasedJobCreator(
        namespace=process_graph_url,
        parameter_defaults={},
    )


AttributeError: 'list' object has no attribute 'columns'

In [None]:
from openeo.extra.job_management import MultiBackendJobManager

# Initiate MultiBackendJobManager 
job_manager = MultiBackendJobManager()  
connection = openeo.connect(url="openeofed.dataspace.copernicus.eu").authenticate_oidc()
job_manager.add_backend("cdse", connection=connection, parallel_jobs=10)


job_manager.run_jobs(job_db=job_db, start_job=job_starter)


In [None]:
import rasterio
import numpy as np
import matplotlib.pyplot as plt


# Define the paths to the two images
image_path1 = r"C:\Git_projects\eotdl\job_cdse-j-241107313355441090984aa82cf25583\openEO_2020-01-01Z.tif"
image_path2 = df.iloc[0]['file_name']

# Normalization function
def normalize(array):
    return (array - array.min()) / (array.max() - array.min())

# Function to load and normalize an RGB image
def load_rgb_image(path):
    with rasterio.open(path) as src:

        width, height = src.width, src.height
        print(f"Image width: {width} pixels, Image height: {height} pixels")
        # Read RGB bands (assuming the first three bands are RGB)
        red = src.read(1)
        green = src.read(2)
        blue = src.read(3)

        # Normalize each band
        red_norm = normalize(red)
        green_norm = normalize(green)
        blue_norm = normalize(blue)

        # Stack to create RGB image
        rgb_image = np.dstack((red_norm, green_norm, blue_norm))
    return rgb_image

# Load and normalize both images
rgb_image1 = load_rgb_image(image_path1)
rgb_image2 = load_rgb_image(image_path2)

# Plot the two images side by side
fig, axs = plt.subplots(1, 2, figsize=(15, 10))

# Display first image
axs[0].imshow(rgb_image1)
axs[0].set_title("Image 1: openEO 2020-01-01Z")
axs[0].axis('off')

# Display second image
axs[1].imshow(rgb_image2)
axs[1].set_title("Image 2: Herbaceous Vegetation 1539")
axs[1].axis('off')

plt.show()

TODO:


- Scale up (saving data, improve efficiency, ...)

