In [1]:
!pip install zarr -q
!pip install plotly -q
!pip install seaborn -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.4/205.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m77.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# ==============================================================================
# SSL4EO-S12 DATA DOWNLOAD SCRIPT (ADAPTED FOR KAGGLE)
# ==============================================================================
# Target: up to 19 GB (see LIMIT_GB) of S2RGB data only (3-channel RGB composite).
# Change: Removed Google Drive check. Using Kaggle's /kaggle/working/
#         directory for persistent storage.
# ==============================================================================

import requests
from pathlib import Path
import time

# --- 1. Configuration ---
# Download budget (in GB) and one on/off switch per modality.
LIMIT_GB = 19
MODALITY_S2L1C, MODALITY_S2L2A, MODALITY_S1GRD = False, False, False
MODALITY_S2RGB = True  # The only modality fetched: 3-channel RGB images.

# Budget expressed in bytes, plus the pieces of the on-disk layout.
LIMIT_BYTES = LIMIT_GB * 1024 ** 3
SPLIT = "train"
BASE_DIR = "data/ssl4eo-s12"  # Created underneath /kaggle/working/

# Echo the active settings in a single write.
print("\n".join([
    "Configuration:",
    f"   LIMIT_GB: {LIMIT_GB}",
    f"   MODALITY_S2RGB: {MODALITY_S2RGB}",
    f"   SPLIT: {SPLIT}",
]))


# --- 2. Functions ---

def setup_directories():
    """
    Creates the directory structure for the Kaggle environment.

    The split directory is /kaggle/working/<BASE_DIR>/<SPLIT>, built from the
    module-level BASE_DIR and SPLIT settings. It is created on disk (including
    any missing parents) so callers can rely on it existing.

    Returns:
        pathlib.Path: the split directory under /kaggle/working/.

    Raises:
        OSError: if the directory cannot be created (e.g. not running on Kaggle).
    """
    base_dir = Path(f"/kaggle/working/{BASE_DIR}/{SPLIT}")
    storage_type = "Kaggle working directory (persistent across sessions)"

    print(f"\nStorage Type: {storage_type}")
    print(f"Data Location: {base_dir}")

    # Actually create the tree, as the docstring promises; a no-op if present.
    base_dir.mkdir(parents=True, exist_ok=True)

    return base_dir

def download_modality(modality, base_dir, max_index=499):
    """Downloads the specified modality from the web.

    Files are requested by sequential index (1..max_index) from
    https://datapub.fz-juelich.de/ssl4eo-s12/<SPLIT>/<modality> and stored in
    ``base_dir / modality``. Files already present are skipped and their size
    counts towards the limit. Downloading stops once the accumulated size
    reaches the module-level LIMIT_BYTES.

    Each file is streamed to a temporary ``<name>.part`` file and only renamed
    to its final name after the transfer completed, so an interrupted or failed
    download never leaves a truncated ``*.zarr.zip`` that later runs would
    skip as "already downloaded".

    Args:
        modality: Name of the modality sub-directory on the server (e.g. "S2RGB").
        base_dir: pathlib.Path of the split directory; files go to base_dir/modality.
        max_index: Highest file index to try, inclusive. The default of 499
            matches the previously hard-coded range(1, 500).

    Returns:
        tuple: (number of files present for this modality, their total size in bytes).
    """

    url_base = f"https://datapub.fz-juelich.de/ssl4eo-s12/{SPLIT}/{modality}"
    target_dir = base_dir / modality
    target_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nDownloading modality: {modality}")
    print(f"   Source URL: {url_base}")
    print(f"   Target Dir: {target_dir}")

    downloaded_count = 0
    total_size = 0

    # Account for files left by a previous run before downloading anything.
    existing_files = list(target_dir.glob("*.zarr.zip"))
    if existing_files:
        existing_size = sum(f.stat().st_size for f in existing_files)
        total_size += existing_size
        downloaded_count = len(existing_files)
        print(f"   Found existing files: {downloaded_count} files, size: {existing_size/1e6:.1f} MB")

    # If earlier runs already filled the budget, do not fetch another file.
    if total_size >= LIMIT_BYTES:
        print(f"\nExisting files already reach the download limit of {LIMIT_GB} GB. Nothing to download.")
        indices = range(0)
    else:
        indices = range(1, max_index + 1)

    # Try each index in turn; a missing index (404) is simply skipped.
    for i in indices:
        file_name = f"ssl4eos12_{SPLIT}_seasonal_data_{i:06d}.zarr.zip"
        url = f"{url_base}/{file_name}"
        file_path = target_dir / file_name

        # Skip the file if it already exists.
        if file_path.exists():
            continue

        # Partial data goes here; the "*.zarr.zip" glob above never matches it.
        tmp_path = target_dir / f"{file_name}.part"

        try:
            # flush so the progress line is visible while the transfer runs.
            print(f"   Fetching: {file_name} ...", end='', flush=True)

            # The context manager releases the connection on every path
            # (success, 404, other status, exception).
            with requests.get(url, timeout=30, stream=True) as response:
                status = response.status_code
                if status == 200:
                    # Stream to disk in 1 MiB chunks to keep memory use bounded.
                    with open(tmp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=1024 * 1024):
                            f.write(chunk)
                    # Publish under the final name only after a complete transfer.
                    tmp_path.replace(file_path)

            if status == 200:
                # Update counters.
                file_size = file_path.stat().st_size
                total_size += file_size
                downloaded_count += 1
                total_mb = total_size / 1e6
                print(f" Done! Total downloaded so far: {total_mb:.1f} MB")

                # Check the download limit.
                if total_size >= LIMIT_BYTES:
                    print(f"\nReached download limit of {LIMIT_GB} GB. Stopping.")
                    break

            elif status == 404:
                # Expected when an index does not exist on the server.
                print(" Not found, trying next.")

            else:
                print(f" HTTP Error {status} for {file_name}")

        except Exception as e:
            # Best effort: report the failure and move on to the next index.
            print(f"   Error downloading {file_name}: {e}")

        finally:
            # Remove any partial file (also on KeyboardInterrupt); after a
            # successful rename the temporary path no longer exists.
            if tmp_path.exists():
                tmp_path.unlink()

    final_mb = total_size / 1e6
    print(f"\nFinished downloading for modality: {modality}")
    print(f"   Total files for this modality: {downloaded_count}")
    # Decimal units throughout (MB = 1e6 bytes, GB = 1e9 bytes).
    print(f"   Total size for this modality: {final_mb:.1f} MB ({total_size / 1e9:.2f} GB)")

    return downloaded_count, total_size

def main():
    """Main execution function.

    Prepares the target directory, downloads every modality that this script
    supports and that is enabled in the configuration, then prints a summary.

    Returns:
        tuple: (base_dir, final_gb) where base_dir is the split directory
        returned by setup_directories() and final_gb is the total size of the
        files on disk in GB (1e9 bytes).
    """
    print("\nStarting data acquisition process...")
    start_time = time.time()

    # 1. Prepare directories.
    base_dir = setup_directories()

    total_downloaded_files = 0
    total_size_bytes = 0

    # 2. Download selected modalities.
    if MODALITY_S2RGB:
        count, size = download_modality("S2RGB", base_dir)
        total_downloaded_files += count
        total_size_bytes += size

    # Inform the user why other modalities are being skipped. The messages
    # describe the *disabled* state, so they must fire when the flag is False
    # (the original tested the flag without negation and so never printed them
    # for a disabled modality).
    # NOTE(review): only S2RGB is wired up for download here; setting one of
    # the other flags to True currently has no effect.
    if not MODALITY_S2L1C: print("\n(S2L1C disabled in configuration, skipping)")
    if not MODALITY_S2L2A: print("(S2L2A disabled, using 3-channel RGB composite instead)")
    if not MODALITY_S1GRD: print("(S1GRD disabled, skipping SAR data)")

    # 3. Summarize the results.
    duration_min = (time.time() - start_time) / 60
    final_gb = total_size_bytes / 1e9

    print("\n" + "="*60)
    print("DOWNLOAD COMPLETE!")
    print(f"   Final Data Location: {base_dir}")
    print(f"   Total Files Downloaded: {total_downloaded_files}")
    print(f"   Total Size on Disk: {final_gb:.2f} GB")
    print(f"   Total Duration: {duration_min:.1f} minutes")
    print("\nData is ready for the next step: Creating a PyTorch Dataset.")
    print("="*60)

    return base_dir, final_gb


# --- 3. Run the Script ---
# Entry point: invoke main() only when this code is executed directly.
# Inside a notebook the guard is usually redundant, but it keeps the cell
# safe to reuse as a plain script.
if __name__ == "__main__":
    save_location, dataset_size = main()

    # Expose the S2RGB directory to later cells.
    S2RGB_DATA_PATH = save_location.joinpath("S2RGB")
    print(f"\nPath to S2RGB data for next steps: {S2RGB_DATA_PATH}")

Configuration:
   LIMIT_GB: 19
   MODALITY_S2RGB: True
   SPLIT: train

Starting data acquisition process...

Storage Type: Kaggle working directory (persistent across sessions)
Data Location: /kaggle/working/data/ssl4eo-s12/train

Downloading modality: S2RGB
   Source URL: https://datapub.fz-juelich.de/ssl4eo-s12/train/S2RGB
   Target Dir: /kaggle/working/data/ssl4eo-s12/train/S2RGB
   Fetching: ssl4eos12_train_seasonal_data_000001.zarr.zip ... Done! Total downloaded so far: 43.0 MB
   Fetching: ssl4eos12_train_seasonal_data_000002.zarr.zip ... Done! Total downloaded so far: 85.6 MB
   Fetching: ssl4eos12_train_seasonal_data_000003.zarr.zip ... Done! Total downloaded so far: 126.7 MB
   Fetching: ssl4eos12_train_seasonal_data_000004.zarr.zip ... Done! Total downloaded so far: 168.3 MB
   Fetching: ssl4eos12_train_seasonal_data_000005.zarr.zip ... Done! Total downloaded so far: 209.9 MB
   Fetching: ssl4eos12_train_seasonal_data_000006.zarr.zip ... Done! Total downloaded so far: 252.3 