# Sentinel-2 Data Analytics

This notebook loads `.zarr` Sentinel-2 Level-2A datasets and computes various statistics, correlations, and visualizations.

In [1]:
import os
import glob
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from shapely.geometry import shape
import json

plt.style.use('seaborn-v0_8')

## 1. Parameters

In [2]:
# Directory containing .zarr files
DATA_DIR = "/home/ubuntu/mucilage_pipeline/mucilage-detection/data/adr_test/target"

# Bands to use for analytics
BANDS = ["b02", "b03", "b04", "b08", "b8a", "b11", "b12"]  # Blue, Green, Red, NIR, NIR narrow, SWIR1, SWIR2

## 2. Helper Functions

In [None]:
# Sentinel-2 SCL codes for clouds/snow
CLOUD_CODES = {8, 9, 10, 11}

def open_s2_zarr(zarr_path):
    """Open Sentinel-2 .zarr as a DataTree."""
    return xr.open_datatree(zarr_path, engine="zarr", mask_and_scale=False)

def compute_cloud_fraction(dt, ref_band_name="b04"):
    """Compute cloud fraction from SCL mask."""
    refl = dt.measurements.reflectance.r10m
    # Find reference band
    ref_band = None
    for cand in (ref_band_name, ref_band_name.lower(), "B04", "b04"):
        if cand in refl:
            ref_band = refl[cand]
            break
    if ref_band is None:
        raise ValueError("No reference band found in reflectance.")

    scl = dt.conditions.mask.l2a_classification.r20m["scl"]
    scl_10m = scl.interp_like(ref_band, method="nearest")

    cloud_mask = np.isin(scl_10m.values, list(CLOUD_CODES))
    return cloud_mask.sum() / cloud_mask.size

def compute_band_stats(dt, bands):
    """Compute mean, std, min, max for given bands."""
    stats = {}
    for b in bands:
        if b.lower() in dt.measurements.reflectance:
            band_data = dt.measurements.reflectance[b.lower()].values
        elif b in dt.measurements.reflectance:
            band_data = dt.measurements.reflectance[b].values
        else:
            continue
        stats[f"{b}_mean"] = np.nanmean(band_data)
        stats[f"{b}_std"] = np.nanstd(band_data)
        stats[f"{b}_min"] = np.nanmin(band_data)
        stats[f"{b}_max"] = np.nanmax(band_data)
    return stats

## 3. Process All Scenes

In [12]:
all_stats = []
zarr_files = glob.glob(os.path.join(DATA_DIR, "*.zarr"))

for zf in zarr_files:
    try:
        dt = open_s2_zarr(zf)
        scene_stats = {"file": os.path.basename(zf)}
        scene_stats["cloud_fraction"] = compute_cloud_fraction(dt)
        scene_stats.update(compute_band_stats(dt, BANDS))
        all_stats.append(scene_stats)
    except Exception as e:
        print(f"Error processing {zf}: {e}")

df_stats = pd.DataFrame(all_stats)
df_stats

1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  return xr.open_datatree(zarr_path, engine="zarr", mask_and_scale=False)
1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an error in this case instead of falling back to try reading non-consolidated metadata.
  return xr.open_datatree(zarr_path, engine="zarr", mask_and_scale=False)
1. Consolidating metadata in this existing store with zarr.consolidate_metadata().
2. Explicitly setting consolidated=False, to avoid trying to read consolidate metadata, or
3. Explicitly setting consolidated=True, to raise an e

Unnamed: 0,file,cloud_fraction
0,S2B_MSIL2A_20240721T100559_N0510_R022_T33TUL_2...,0.082829
1,S2B_MSIL2A_20240728T095549_N0511_R122_T32TQQ_2...,0.001443
2,S2B_MSIL2A_20240728T095549_N0511_R122_T32TQR_2...,0.005111
3,S2A_MSIL2A_20240716T100601_N0510_R022_T33TUL_2...,0.00156
4,S2B_MSIL2A_20240721T100559_N0510_R022_T32TQR_2...,0.078004
5,S2B_MSIL2A_20240721T100559_N0510_R022_T33TUK_2...,6.2e-05
6,S2A_MSIL2A_20240716T100601_N0510_R022_T32TQQ_2...,0.00897
7,S2A_MSIL2A_20240726T100601_N0511_R022_T32TQQ_2...,0.003582
8,S2B_MSIL2A_20240718T095549_N0510_R122_T33TUL_2...,0.001123
9,S2B_MSIL2A_20240718T095549_N0510_R122_T32TQQ_2...,0.005368


## 4. Correlation Matrix Between Bands

In [13]:
corr = df_stats[[f"{b}_mean" for b in BANDS]].corr()
plt.figure(figsize=(8,6))
sns.heatmap(corr, annot=True, cmap="coolwarm")
plt.title("Mean Reflectance Correlation Between Bands")
plt.show()

KeyError: "None of [Index(['b02_mean', 'b03_mean', 'b04_mean', 'b08_mean', 'b8a_mean', 'b11_mean',\n       'b12_mean'],\n      dtype='object')] are in the [columns]"

## 5. t-SNE on Band Statistics

In [None]:
X = df_stats[[f"{b}_mean" for b in BANDS]].fillna(0)
X_scaled = StandardScaler().fit_transform(X)

tsne = TSNE(n_components=2, random_state=42, perplexity=5)
tsne_coords = tsne.fit_transform(X_scaled)

df_stats['tsne_x'] = tsne_coords[:,0]
df_stats['tsne_y'] = tsne_coords[:,1]

plt.figure(figsize=(8,6))
sns.scatterplot(data=df_stats, x='tsne_x', y='tsne_y', hue='cloud_fraction', palette='viridis')
plt.title("t-SNE Projection of Scenes")
plt.show()

## 6. Example RGB Composite

In [None]:
example_file = zarr_files[0]
ds = open_s2_zarr(example_file)

rgb = np.dstack([
    ds['b04'].values.astype(np.float32) / 10000.0,
    ds['b03'].values.astype(np.float32) / 10000.0,
    ds['b02'].values.astype(np.float32) / 10000.0
])

# Simple percentile stretching
p2, p98 = np.nanpercentile(rgb, (2, 98))
rgb = np.clip((rgb - p2) / (p98 - p2), 0, 1)

plt.figure(figsize=(10,8))
plt.imshow(rgb)
plt.title("RGB Composite")
plt.axis('off')
plt.show()