# Test Download Code

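This notebook tests the core functionality of the Satellite Data Manager library by calling its methods directly. It sets up the available options (download parameters, sensor selections, band configurations, cropping, masking, etc.) as used in the interactive GUI, builds the burned-area TFRecord dataset, and then runs a train/test split, an Optuna hyperparameter search, and k-fold cross-validation on the resulting data.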

In [None]:
# Import necessary modules
import os
import glob
import json
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sentinelhub import SHConfig
from SatelliteDataManager.core.sdm import SDM
from SatelliteDataManager.analyses.SEDM_Wildfire.burned_area_dataset_builder import BurnedAreaSegmentationDatasetBuilder
from SatelliteDataManager.analyses.SEDM_Wildfire.burned_area_model import build_burned_area_segmentation_model
from SatelliteDataManager.analyses.SEDM_Wildfire.burned_area_optuna import objective as burned_area_objective


# Import splitting and visualization utilities
from SatelliteDataManager.core.ml.data_split import train_test_split, stratified_train_test_split, kfold_split, stratified_kfold_split
from SatelliteDataManager.core.ml.hyperparameter_optimization import run_optuna_study
from SatelliteDataManager.core.ml.result_visualizer import plot_training_history, plot_roc_curve

In [5]:
# Sentinel Hub configuration.
# NOTE(review): the positional argument is presumably a sentinelhub profile name — confirm.
config = SHConfig("peppe")


# Define paths for activation info and data directories
activation_info_path = "FireData/activation_info.json"  # JSON file with activation information
base_data_folder = "./test/raw"           # Directory for raw downloads
base_manipulated_folder = "./test/man"      # Directory for organized data
tfrecord_folder = "./test/tfrecords"         # Directory for saving TFRecord files

# Create the working directories up front. Later cells list the contents of
# tfrecord_folder with os.listdir, which raises FileNotFoundError if the
# folder was never created (e.g. when every activation is skipped).
for _folder in (base_data_folder, base_manipulated_folder, tfrecord_folder):
    os.makedirs(_folder, exist_ok=True)

print("Configuration and directories set up.")

# Initialize the BurnedAreaSegmentationDatasetBuilder with all parameters
builder = BurnedAreaSegmentationDatasetBuilder(
    config=config,
    activation_info_path=activation_info_path,
    base_data_folder=base_data_folder,
    base_manipulated_folder=base_manipulated_folder,
    tfrecord_folder=tfrecord_folder,
    sampleType="FLOAT32",
    download=True
)

print("BurnedAreaSegmentationDatasetBuilder initialized.")

Configuration and directories set up.
BurnedAreaSegmentationDatasetBuilder initialized.


In [6]:
# Per-satellite download parameters, described once per sensor as
# (sensor name, interval in days, mosaicking order).
_sensor_rows = [
    ("Sentinel-2", 12, "leastCC"),
    ("Sentinel-1", 12, "mostRecent"),
    ("Sentinel-3-OLCI", 3, "mostRecent"),
    ("Sentinel-3-SLSTR-Thermal", 3, "mostRecent"),
    ("DEM", 36, "mostRecent"),
]

# The "default" entry and every sensor share the same (width, height) size.
_tile_size = (256, 256)

download_params = {
    "interval_days": {name: days for name, days, _order in _sensor_rows},
    "size": {
        "default": _tile_size,
        **{name: _tile_size for name, _days, _order in _sensor_rows},
    },
    "mosaicking_order": {name: order for name, _days, order in _sensor_rows},
    "resolutions": {"default": None},
}

print("Download parameters defined.")

Download parameters defined.


In [7]:
# Extra settings used when testing the dataset builder

# Sensors to process
sensors = [
    "Sentinel-2",
    "Sentinel-1",
    "Sentinel-3-OLCI",
    "Sentinel-3-SLSTR-Thermal",
    "DEM",
]

# Optional per-sensor band selection
sensor_bands = {}
sensor_bands["Sentinel-2"] = {"bands": ["B02", "B03", "B04", "B8A"]}
sensor_bands["Sentinel-1"] = {"polarizations": ["VV", "VH"]}

# Optional per-sensor download (evalscript) parameters
evalscript_params = {}
evalscript_params["Sentinel-2"] = {"units": "REFLECTANCE", "sampleType": "FLOAT32"}
evalscript_params["Sentinel-1"] = {
    "polarizations": ["VV", "VH"],
    "backCoeff": "GAMMA0_ELLIPSOID",
}

# Base folder for the fire information files
fire_info_base_folder = "./"

print("Additional parameters set.")

Additional parameters set.


In [None]:
# Build the dataset for all activations.
# Base folder for fire information (GeoJSON files).
fire_info_base_folder = "./"  
# For this test, process 2 activations (max_activations=2), with a time window of
# 18 days around the fire date, and apply normalization, masking, and cropping
# (crop_factor=2). The skip list is enabled (use_skip_list=True).
# The optional sensors / sensor_bands / evalscript_params arguments are left
# commented out below, so they are not passed to the builder.
builder.build_dataset_for_all_activations(
    max_activations=2,
    use_skip_list=True,
    time_window=18,
    download_params=download_params,
    apply_normalization=True,
    apply_mask=True,
    crop=True,
    crop_factor=2,
    #sensors=sensors,
    #sensor_bands=sensor_bands,
    #evalscript_params=evalscript_params,
    fire_info_base_folder=fire_info_base_folder
)

print("Dataset building complete.")

INFO:SatelliteDataManager.core.data_download:**STARTING DATA DOWNLOAD**
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-2 images for interval: 2019-04-27 to 2019-05-08 (1/3)


Processing activation: EMSR353-AOI01
Skipping activation EMSR353-AOI01: Fire GeoJSON not found.
Processing activation: EMSR360-AOI01
Interval days dict: {'Sentinel-2': 12, 'Sentinel-1': 12, 'Sentinel-3-OLCI': 3, 'Sentinel-3-SLSTR-Thermal': 3, 'DEM': 36}
Size dict: {'Sentinel-2': (256, 256), 'Sentinel-1': (256, 256), 'Sentinel-3-OLCI': (256, 256), 'Sentinel-3-SLSTR-Thermal': (256, 256), 'DEM': (256, 256)}
Mosaicking order dict: {'Sentinel-2': 'leastCC', 'Sentinel-1': 'mostRecent', 'Sentinel-3-OLCI': 'mostRecent', 'Sentinel-3-SLSTR-Thermal': 'mostRecent', 'DEM': 'mostRecent'}
Date from dict: {'Sentinel-2': datetime.datetime(2019, 4, 27, 0, 0), 'Sentinel-1': datetime.datetime(2019, 4, 27, 0, 0), 'Sentinel-3-OLCI': datetime.datetime(2019, 4, 27, 0, 0), 'Sentinel-3-SLSTR-Thermal': datetime.datetime(2019, 4, 27, 0, 0), 'DEM': datetime.datetime(2019, 4, 27, 0, 0)}
Date to dict: {'Sentinel-2': datetime.datetime(2019, 6, 2, 0, 0), 'Sentinel-1': datetime.datetime(2019, 6, 2, 0, 0), 'Sentinel-3-O

INFO:SatelliteDataManager.core.data_download:Data downloaded and saved.
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-2 images for interval: 2019-05-09 to 2019-05-20 (2/3)
INFO:SatelliteDataManager.core.data_download:Data downloaded and saved.
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-2 images for interval: 2019-05-21 to 2019-06-01 (3/3)
INFO:SatelliteDataManager.core.data_download:Data downloaded and saved.
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-1 images for interval: 2019-04-27 to 2019-05-08 (1/3)
INFO:SatelliteDataManager.core.data_download:Data downloaded and saved.
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-1 images for interval: 2019-05-09 to 2019-05-20 (2/3)
INFO:SatelliteDataManager.core.data_download:Data downloaded and saved.
INFO:SatelliteDataManager.core.data_download:Downloading Sentinel-1 images for interval: 2019-05-21 to 2019-06-01 (3/3)
INFO:SatelliteDataManager.core.data_down

In [None]:
# Visualization test
import os
from SatelliteDataManager.core.sdm import SDM

# SDM instance used for visualization, pointed at the folders defined earlier
sdm = SDM(
    config=config,
    data_folder=base_data_folder,
    manipulated_folder=base_manipulated_folder,
    tfrecord_folder=tfrecord_folder,
)

# Collect every generated TFRecord file, in sorted order
tfrecord_files = [
    os.path.join(tfrecord_folder, name)
    for name in sorted(os.listdir(tfrecord_folder))
    if name.endswith(".tfrecord")
]
print("TFRecord files:", tfrecord_files)

# Pick the first TFRecord file (when there is one) for visualization
if not tfrecord_files:
    print("No TFRecord files found for visualization.")
else:
    file_to_visualize = tfrecord_files[0]
    print(f"Visualizing TFRecord file: {file_to_visualize}")
    # The actual visualization call is currently disabled:
    #sdm.data_visualizer.inspect_and_visualize_custom_tfrecord(file_to_visualize, crop=True, crop_factor=2)

In [None]:
import glob
# Import necessary modules
from datetime import datetime, timedelta
import json
import os

# Import the required classes from the library
from SatelliteDataManager.analyses.SEDM_Wildfire.burned_area_dataset_builder import BurnedAreaSegmentationDatasetBuilder
from SatelliteDataManager.core.sdm import SDM
from sentinelhub import SHConfig
# Initialize Sentinel Hub configuration
config = SHConfig("peppe")

# Folder that holds the generated TFRecord files. It is used both for the SDM
# instance and for file discovery below, so the two cannot point at different places.
tfrecord_folder = "./test/tfrecords"

# Initialize the SDM instance with the appropriate folders
sdm = SDM(
    config=config,
    data_folder="./test/raw",
    manipulated_folder="./test/man",
    tfrecord_folder=tfrecord_folder
)

# Get the list of TFRecord files from the TFRecord folder.
# The original pattern searched "data/tfrecords/", which is not the folder the
# rest of the notebook writes to; sorted() makes the file order deterministic.
tfrecord_files = sorted(glob.glob(os.path.join(tfrecord_folder, "*.tfrecord")))

# Define the list of sensors to include (if None, all default sensors are used)
selected_sensors = ["Sentinel-2", "Sentinel-1"]

# Create the dataset using the new get_dataset method with sensor selection, cropping, augmentation, etc.
dataset = sdm.dataset_preparer.get_dataset(
    tfrecord_files=tfrecord_files,
    batch_size=16,
    augment=True,
    crop=True,
    crop_factor=2,
    min_image_nonzero_percentage=0.2,
    min_label_nonzero_percentage=0.2,
    sensors=selected_sensors
)



In [None]:
import numpy as np

# Print dataset characteristics (only for the selected sensors)
sdm.dataset_preparer.print_dataset_characteristics(dataset, sensors=selected_sensors)

# Sensor whose temporal steps are compared below. The original code read the
# "Sentinel-2" input while every message said "Sentinel-1"; the key is kept
# and the messages now report the sensor that is actually inspected.
sensor_to_check = "Sentinel-2"

# Retrieve one batch from the dataset
for batch_inputs, batch_labels in dataset.take(1):
    # Assumes the shape is (batch_size, n_steps, height, width, channels) — TODO confirm
    sensor_images = batch_inputs[sensor_to_check].numpy()

    # The comparison below needs at least three time steps (axis 1).
    if sensor_images.shape[1] < 3:
        print(f"Not enough temporal steps for {sensor_to_check} data.")
    else:
        for i in range(sensor_images.shape[0]):
            step0 = sensor_images[i, 0]
            step1 = sensor_images[i, 1]
            step2 = sensor_images[i, 2]

            # np.array_equal checks for exact element-wise equality.
            if np.array_equal(step0, step1) and np.array_equal(step1, step2):
                print(f"Example {i}: All three temporal steps for {sensor_to_check} are equal.")
            else:
                print(f"Example {i}: Temporal steps differ for {sensor_to_check}.")


# Visualize one batch of the dataset, showing only the selected sensors.
# NOTE(review): crop_factor=4 here differs from the crop_factor=2 used when the
# dataset was created — confirm this is intended.
sdm.data_visualizer.visualize_batch(dataset, sensors_to_show=selected_sensors, max_examples=1, crop=True, crop_factor=4)

In [None]:
# --------------------------
# STEP 1: SPLIT INTO TRAIN AND TEST (WITH OPTIONAL STRATIFICATION)
# --------------------------
# For demonstration, we simulate a label (e.g., binary indicator) for each TFRecord file.
# In practice, these labels should be extracted from metadata.

# List all TFRecord files generated
tfrecord_files = sorted([os.path.join(tfrecord_folder, f)
                         for f in os.listdir(tfrecord_folder)
                         if f.endswith(".tfrecord")])


file_array = np.array(tfrecord_files)
# Simulate labels: random 0 or 1 for each file. A seeded generator is used so
# that the labels (and therefore a stratified split) are the same on every run.
simulated_labels = np.random.default_rng(42).integers(0, 2, size=len(file_array))

# Set this flag to True to stratify, or False for a simple random split.
stratify_split = False # Set to True for stratified split TO BE FIXED
test_size = 0.25


if stratify_split:
    # Using stratified split based on simulated_labels
    from SatelliteDataManager.core.ml.data_split import stratified_train_test_split
    train_files, test_files, train_labels, test_labels = stratified_train_test_split(file_array, simulated_labels, test_size=test_size, random_state=42)
    print(f"Stratified split: {len(train_files)} train files, {len(test_files)} test files")
else:
    # Simple random split
    from SatelliteDataManager.core.ml.data_split import train_test_split
    train_files, test_files = train_test_split(file_array, test_size=test_size, random_state=42)
    print(f"Random split: {len(train_files)} train files, {len(test_files)} test files")

# Create datasets from the file lists
sdm = SDM(
    config=config,
    data_folder=base_data_folder,
    manipulated_folder=base_manipulated_folder,
    tfrecord_folder=tfrecord_folder
)

# NOTE(review): this cell passes `min_label_percentage`, while the earlier
# dataset cell passes `min_label_nonzero_percentage` / `min_image_nonzero_percentage`
# to the same get_dataset method — confirm which keyword the method accepts.
train_ds = sdm.dataset_preparer.get_dataset(
    tfrecord_files=train_files.tolist(),
    batch_size=8,
    augment=True,
    crop=True,
    crop_factor=2,
    min_label_percentage=0.
)
# The test dataset uses the same settings but without augmentation.
test_ds = sdm.dataset_preparer.get_dataset(
    tfrecord_files=test_files.tolist(),
    batch_size=8,
    augment=False,
    crop=True,
    crop_factor=2,
    min_label_percentage=0.
)
print("Train and test datasets created.")

# --------------------------
# Define input shapes (must match dataset creation)
input_shapes = {
    "Sentinel-2": (3, 128, 128, 17),
    "Sentinel-1": (3, 128, 128, 2),
    "Sentinel-3-OLCI": (12, 128, 128, 21),
    "Sentinel-3-SLSTR-Thermal": (12, 128, 128, 5),
    "DEM": (128, 128, 1)
}


#sdm.dataset_preparer.print_dataset_characteristics(train_ds)



In [None]:
# --------------------------
# STEP 2: HYPERPARAMETER OPTIMIZATION WITH OPTUNA
# --------------------------
def _optuna_objective(trial):
    """Run the burned-area objective for one trial on train_ds/test_ds (5 epochs)."""
    return burned_area_objective(trial, input_shapes, train_ds, test_ds, epochs=5)


# Short study: 5 trials, minimizing the objective value
study = run_optuna_study(
    objective=_optuna_objective,
    n_trials=5,
    study_name="burned_area_optimization",
    direction="minimize",
)
best_params = study.best_params
print("Optuna study complete. Best hyperparameters found:", best_params)

# Hyperparameters forwarded from the study to the model builder, in call order
_MODEL_HPARAM_KEYS = (
    "dropout_rate",
    "l2_reg",
    "s2_filters1",
    "s2_filters2",
    "s1_filters1",
    "s1_filters2",
    "s3olci_filters1",
    "s3olci_filters2",
    "s3slstr_filters1",
    "s3slstr_filters2",
    "dem_filters1",
    "dem_filters2",
)

# Rebuild the model from the best hyperparameters found by the study
best_model = build_burned_area_segmentation_model(
    input_shapes,
    **{key: best_params[key] for key in _MODEL_HPARAM_KEYS},
)
best_model.summary()

# Fit on the full training dataset, validating against the test dataset
history_best = best_model.fit(train_ds, validation_data=test_ds, epochs=10)
print("Best model training complete.")

# Persist the trained model (.keras file) together with its training history
best_model.save("best_burned_area_model.keras")
with open("best_model_history.json", "w") as history_file:
    json.dump(history_best.history, history_file)
print("Best model and training history saved.")



In [None]:
# --------------------------
# STEP 3: K-FOLD CROSS VALIDATION ON TRAINING SET
# --------------------------
# For k-fold, we split the train_files array. We use stratified_kfold_split if stratification is desired.
from SatelliteDataManager.core.ml.data_split import kfold_split, stratified_kfold_split

k = 5  # number of folds
if stratify_split:
    # Use simulated_labels corresponding to train_files for stratification
    train_labels_for_fold = [int(x) for x in train_labels]
    fold_splits = list(stratified_kfold_split(train_files, train_labels_for_fold, k=k, random_state=42))
else:
    fold_splits = list(kfold_split(train_files, k=k, random_state=42))

# Crop factor for the fold datasets. It is kept equal to the crop_factor=2 used
# for train_ds/test_ds, because input_shapes (128x128) "must match dataset
# creation"; the previous value of 4 did not match the shapes the model is built with.
fold_crop_factor = 2

# Hyperparameters forwarded from best_params to the model builder
model_hparam_keys = (
    "dropout_rate",
    "l2_reg",
    "s2_filters1",
    "s2_filters2",
    "s1_filters1",
    "s1_filters2",
    "s3olci_filters1",
    "s3olci_filters2",
    "s3slstr_filters1",
    "s3slstr_filters2",
    "dem_filters1",
    "dem_filters2",
)

fold_histories = []
fold_metrics = []

for fold, (train_idx, val_idx) in enumerate(fold_splits):
    print(f"\nFold {fold+1}/{k}")
    fold_train_files = train_files[train_idx]
    fold_val_files = train_files[val_idx]

    # Create fold datasets (augmentation only on the training part)
    fold_train_ds = sdm.dataset_preparer.get_dataset(
        tfrecord_files=fold_train_files.tolist(),
        batch_size=16,
        augment=True,
        crop=True,
        crop_factor=fold_crop_factor,
        min_label_percentage=0.
    )
    fold_val_ds = sdm.dataset_preparer.get_dataset(
        tfrecord_files=fold_val_files.tolist(),
        batch_size=16,
        augment=False,
        crop=True,
        crop_factor=fold_crop_factor,
        min_label_percentage=0.
    )

    # Build a new model with the best hyperparameters
    fold_model = build_burned_area_segmentation_model(
        input_shapes,
        **{key: best_params[key] for key in model_hparam_keys}
    )

    # Train for a fixed number of epochs (e.g., 5 for demo)
    history_fold = fold_model.fit(fold_train_ds, validation_data=fold_val_ds, epochs=5, verbose=0)
    fold_histories.append(history_fold.history)

    # Evaluate fold model on validation fold
    metrics_fold = fold_model.evaluate(fold_val_ds, verbose=0)
    fold_metrics.append(dict(zip(fold_model.metrics_names, metrics_fold)))
    print(f"Fold {fold+1} metrics:", fold_metrics[-1])

# Optionally, aggregate and print average metrics across folds.
# Guarded so that an empty fold list does not raise IndexError on fold_metrics[0].
avg_metrics = {}
if fold_metrics:
    for metric in fold_metrics[0].keys():
        avg_metrics[metric] = np.mean([m[metric] for m in fold_metrics])
print("\nAverage k-fold metrics:", avg_metrics)

# --------------------------
# FINAL EVALUATION: BEST MODEL ON TRAIN/TEST SPLIT
# --------------------------
# Score the Optuna-selected model on the train and test datasets separately
train_eval = best_model.evaluate(train_ds, verbose=0)
test_eval = best_model.evaluate(test_ds, verbose=0)
print("\nBest Model Evaluation:")
for _heading, _scores in (("Train set metrics:", train_eval), ("Test set metrics:", test_eval)):
    print(_heading, dict(zip(best_model.metrics_names, _scores)))

# Training curves of the final best model
plot_training_history(history_best)

# Gather flattened labels and predictions batch by batch for the ROC curve
all_labels_test, all_preds_test = [], []
for batch_inputs, batch_labels in test_ds:
    preds = best_model.predict(batch_inputs)
    all_labels_test += [batch_labels.numpy().flatten()]
    all_preds_test += [preds.flatten()]
all_labels_test = np.concatenate(all_labels_test)
all_preds_test = np.concatenate(all_preds_test)
plot_roc_curve(all_labels_test, all_preds_test)