In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import box

import os
import numpy as np
import rasterio
from rasterio import features
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import shutil 



#PATHs
glacliers_path = "glamos/glacier_list.csv"
shape_path_2010 = "glamos/2010/SGI_2010.shp"
shape_path_2016 = "glamos/2016/SGI_2016_glaciers.shp"

sgi_2016 = gpd.read_file(shape_path_2016)

# Attributes of the shape file
print(sgi_2016.columns)

Index(['gid', 'pk_glacier', 'sgi-id', 'name', 'rl_0', 'rl_1', 'rl_2', 'rl_3',
       'i_code', 'year_acq', 'year_rel', 'area_km2', 'length_km', 'masl_min',
       'masl_med', 'masl_mean', 'masl_max', 'slope_deg', 'aspect_deg',
       'geometry'],
      dtype='object')


In [2]:
gdf = gpd.read_file(shape_path_2016) 
if gdf.crs is None: 
    gdf.set_crs(epsg=2056, inplace=True) # Projection into WGS84 required by GEE 
gdf_wgs84 = gdf.to_crs(epsg=4326) # Save 
gdf_wgs84.to_file("glamos/SGI_2016_wgs84.shp")

In [3]:
# Filter out 2016 glaciers 
count_before = len(sgi_2016[sgi_2016["year_acq"] <= 2016]) 
sgi = sgi_2016[sgi_2016["year_acq"] >= 2016] 
print(f"Removed {count_before - len(sgi)} glaciers acquired in 2016. ({(count_before - len(sgi)) / count_before * 100:.2f}%)")

Removed 64 glaciers acquired in 2016. (6.39%)


In [None]:
# debugging info

gdf = gpd.read_file("glamos/SGI_2016_wgs84.shp")
print("crs after read:", gdf.crs)

gdf_2056 = gdf.to_crs(2056)
print("invalid geoms:", (~gdf_2056.is_valid).sum())
print("empty geoms:", (gdf_2056.is_empty).sum())
print("geom types:", gdf_2056.geom_type.value_counts().head())
print("example area m2:", gdf_2056.geometry.area.describe())


crs after read: EPSG:4326
invalid geoms: 0
empty geoms: 0
geom types: Polygon         1102
MultiPolygon     298
Name: count, dtype: int64
example area m2: count    1.400000e+03
mean     6.867138e+05
std      3.064828e+06
min      1.009541e+04
25%      3.275194e+04
50%      9.044433e+04
75%      3.126283e+05
max      7.849365e+07
dtype: float64


In [None]:
# debugging info

import ee
ee.Initialize(project="b3testdrive")

img = (ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
       .filterDate("2019-07-01", "2019-07-10")
       .first()
       .select("B4"))

print("Image id:", img.id().getInfo())


Image id: 20180322T105021_20190705T150938_T20TQP


In [None]:
#debugging info

import ee
ee.Initialize(project="b3testdrive")

region = ee.Geometry.Rectangle([7.0, 46.0, 7.01, 46.01])  # tiny region

img = (ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
       .filterDate("2019-07-01", "2019-07-10")
       .median()
       .select(["B2","B3","B4","B8"]))

url = img.getDownloadURL({
    "scale": 10,
    "region": region,
    "crs": "EPSG:4326",
    "format": "GEO_TIFF"
})
print("Download URL created OK (length):", len(url))


Download URL created OK (length): 145


In [7]:
import ee

import geemap

# Initialisation
try:
    ee.Initialize(project="b3testdrive")
except Exception:
    ee.Authenticate()
    ee.Initialize(project="b3testdrive")

# --- Configuration Scientifique ---
TILE_SIZE_PX = 224
RESOLUTION = 10 
TILE_SIZE_M = TILE_SIZE_PX * RESOLUTION  # 2240m
OUTPUT_DIR = "dataset/images_raw_2056"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seuils
MIN_INTERSECTION_M2 = 5000
MIN_YEAR = 2015 

# Charge into EPSG:2056

gdf = gpd.read_file("glamos/SGI_2016_wgs84.shp") 
if gdf.crs.to_epsg() != 2056: 
    gdf = gdf.to_crs(epsg=2056)


def get_sentinel_raw(year):
    """
    Récupère les données brutes (Reflectance).
    Pas de .visualize() ! On veut les données scientifiques.
    """
    start = f'{year}-07-01'
    end = f'{year}-09-30'
    
    collection = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED") \
        .filterDate(start, end) \
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20)) \
        .median() # Median to reduce clouds
    
    # Select bands : Blue, Green, Red, NIR
    
    return collection.select(['B2', 'B3', 'B4', 'B8'])

# Stats
stats = { "tiles_kept": 0, "errors": 0 }

print("len(gdf) =", len(gdf))
print("crs =", gdf.crs)
print("total_bounds =", gdf.total_bounds)  # [minx, miny, maxx, maxy]
print("example year_acq unique (head) =", gdf["year_acq"].head().tolist())



len(gdf) = 1400
crs = EPSG:2056
total_bounds = [2552348.6644433  1078863.87018848 2827545.1764972  1235126.54123004]
example year_acq unique (head) = [2014, 2016, 2016, 2014, 2016]


In [9]:

print(f"Début du traitement en EPSG:2056...")

for index, row in gdf.iterrows():
    g_id = row['pk_glacier']
    year = int(row["year_acq"])
    
    if year < MIN_YEAR: continue 

    # Geometry (LV95)
    geom_shapely = row.geometry
    minx, miny, maxx, maxy = geom_shapely.bounds
    
    # Grid coordinates
    x_coords = np.arange(minx, maxx, TILE_SIZE_M)
    y_coords = np.arange(miny, maxy, TILE_SIZE_M)
    
    local_cnt = 0
    

    
    for x in x_coords:
        for y in y_coords:
            # Tiles
            tile_box = box(x, y, x + TILE_SIZE_M, y + TILE_SIZE_M)
            
            # Filter
            if not tile_box.intersects(geom_shapely): continue
            if tile_box.intersection(geom_shapely).area < MIN_INTERSECTION_M2: continue
            
            filename = os.path.join(OUTPUT_DIR, f"{g_id}_{year}_tile_{local_cnt}.tif")
            if os.path.exists(filename): 
                local_cnt += 1
                continue
            
            # EPSG:2056 -> EPSG:4326 (required in GEE)
            box_gdf = gpd.GeoSeries([tile_box], crs=2056)
            box_wgs84 = box_gdf.to_crs(epsg=4326).iloc[0]
            
            region_ee = ee.Geometry.Rectangle(
                [box_wgs84.bounds[0], box_wgs84.bounds[1], box_wgs84.bounds[2], box_wgs84.bounds[3]],
                proj='EPSG:4326',
                geodesic=False
            )
            
            try:
                # Save
                geemap.download_ee_image(
                    get_sentinel_raw(int(year)),
                    filename=filename,
                    region=region_ee,   # Area of interest
                    scale=10,           # 10m pixels
                    crs='EPSG:2056',    # EPSG:2056
                    dtype='uint16'      
                )
                stats["tiles_kept"] += 1
                local_cnt += 1
                
            except Exception as e:
                print(f"Erreur {g_id}: {e}")
                stats["errors"] += 1

print(f"Tiles generated : {stats['tiles_kept']}")

Début du traitement en EPSG:2056...
Tiles generated : 0


In [None]:
# Configuration

IMAGE_DIR = "dataset/images_raw_2056"
MASK_DIR = "dataset/masks"
SHP_PATH = "glamos/SGI_2016_wgs84.shp"

os.makedirs(MASK_DIR, exist_ok=True)

# Year tolerance
YEAR_TOLERANCE = 1 

# Classes definitions
CLASS_BACKGROUND = 0
CLASS_GLACIER = 1
CLASS_IGNORE = 255

# Project shapefile to EPSG:2056
gdf = gpd.read_file(SHP_PATH)
gdf = gdf.set_crs(epsg=4326, allow_override=True).to_crs(epsg=2056)
if gdf.crs.to_epsg() != 2056:
    gdf = gdf.to_crs(epsg=2056)

gdf["pk_glacier"] = gdf["pk_glacier"].astype(str)

# Spatial index for fast queries
sindex = gdf.sindex

# List images
image_files = [f for f in os.listdir(IMAGE_DIR) if f.endswith('.tif')]
total = len(image_files)
print(f"Generate masks for {total} images")

for filename in tqdm(image_files):
    
    # Parsing du nom : "UUID_ANNEE_tile_X.tif"
    try:
        parts = filename.replace('.tif', '').split('_')
        # format : ID_COMPLEX_YEAR_tile_NUMBER

        target_year = int(parts[-3]) 
    
        target_id = "_".join(parts[:-3])
    except Exception as e:
        print(f"   Filename error: {filename} -> {e}")
        continue
        
    image_path = os.path.join(IMAGE_DIR, filename)
    mask_path = os.path.join(MASK_DIR, filename) 
    
    with rasterio.open(image_path) as src:
        # Spatial info
        out_shape = (src.height, src.width) # Expected (224, 224)
        transform = src.transform 
        bounds = src.bounds 
        
        # Spatial query 
        bbox_geom = box(bounds.left, bounds.bottom, bounds.right, bounds.top)
        
        # Filter by spatial index
        possible_matches_index = list(sindex.query(bbox_geom))
        possible_matches = gdf.iloc[possible_matches_index]
        
        # Filter by intersection
        visible_glaciers = possible_matches[possible_matches.intersects(bbox_geom)].copy()
        
        # Empty mask
        mask = np.zeros(out_shape, dtype=np.uint8)

        if not visible_glaciers.empty:
            # Check acquisition year
            visible_glaciers['year_diff'] = (visible_glaciers['year_acq'] - target_year).abs()
            
            is_target = visible_glaciers['pk_glacier'].astype(str) == str(target_id)
            is_good_year = visible_glaciers['year_diff'] <= YEAR_TOLERANCE
            
            valid_mask = is_target | is_good_year
            
            bad_glaciers = visible_glaciers[~valid_mask]  # check neighbors too old/recent
            good_glaciers = visible_glaciers[valid_mask]  # target year + recent neighbors
            
            # Rasterisation
            # Fisrt : ignored glaciers (255)
        
            if not bad_glaciers.empty:
                shapes_bad = ((geom, CLASS_IGNORE) for geom in bad_glaciers.geometry)
                features.rasterize(
                    shapes=shapes_bad,
                    out=mask,
                    transform=transform,
                    default_value=CLASS_IGNORE,
                    dtype=np.uint8
                )
                
            # Then: valid glaciers (1)
            # If overlap -> keep valid glacier
            if not good_glaciers.empty:
                shapes_good = ((geom, CLASS_GLACIER) for geom in good_glaciers.geometry)
                features.rasterize(
                    shapes=shapes_good,
                    out=mask,
                    transform=transform,
                    default_value=CLASS_GLACIER,
                    dtype=np.uint8
                )
        
        # Save 
        # 1 band: uint8, LZW
        meta = src.meta.copy()
        meta.update({
            "count": 1,
            "dtype": 'uint8',
            "driver": "GTiff",
            "compress": "lzw",
            "nodata": 0 # back = 0
        })
        
        with rasterio.open(mask_path, 'w', **meta) as dst:
            dst.write(mask, 1)

In [None]:
IMAGE_DIR = "dataset/images_raw_2056"
MASK_DIR = "dataset/masks"

# Normalize image (Contrast Stretch)
def stretch_image(img_array):
    """
    Take an image (H, W, C) and apply histogram stretch 2%-98%.
    """
    # keep nodata 0
    lower = np.percentile(img_array, 2)
    upper = np.percentile(img_array, 98)
    
    # Stretch values between 0 and 1
    img_norm = (img_array - lower) / (upper - lower)
    img_norm = np.clip(img_norm, 0, 1)
    return img_norm

# Random choice
files = [f for f in os.listdir(MASK_DIR) if f.endswith('.tif')]
if not files:
    print("No files found in mask directory")
else:
    sample_file = random.choice(files)
    print(f"Visualisation: {sample_file}")

    img_p = os.path.join(IMAGE_DIR, sample_file)
    msk_p = os.path.join(MASK_DIR, sample_file)
    print(f"Image path: {img_p}")

    with rasterio.open(img_p) as src_img, rasterio.open(msk_p) as src_msk:
        # GEE : ['B2', 'B3', 'B4', 'B8']
        # Indices Python (0-based) : 0=Blue, 1=Green, 2=Red, 3=NIR
        
        r = src_img.read(3)
        g = src_img.read(2)
        b = src_img.read(1)
        
        # Stack RGB
        img = np.dstack((r, g, b))
        
        # Convert in float
        img = img.astype(np.float32)
        
        # Apply function
        img_vis = stretch_image(img)
        
        # Read mask
        msk = src_msk.read(1)

    # Visualisation
    plt.figure(figsize=(12, 6))
    
    plt.subplot(1, 2, 1)
    plt.imshow(img_vis)
    plt.title(f"Sentinel-2 (RGB Stretch)\n{sample_file}")
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.imshow(msk, cmap='gray', interpolation='nearest')
    plt.title("Mask (Ground Truth)")
    plt.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:

SRC_IMG_DIR = "dataset/images_raw_2056"
SRC_MASK_DIR = "dataset/masks"

CLEAN_IMG_DIR = "dataset/clean/images"
CLEAN_MASK_DIR = "dataset/clean/masks"

os.makedirs(CLEAN_IMG_DIR, exist_ok=True)
os.makedirs(CLEAN_MASK_DIR, exist_ok=True)

files = [f for f in os.listdir(SRC_IMG_DIR) if f.endswith('.tif')]
corrupted_count = 0
valid_count = 0

# Clean process

for f in tqdm(files):
    src_img_path = os.path.join(SRC_IMG_DIR, f)
    src_mask_path = os.path.join(SRC_MASK_DIR, f)
    
    # Check mask existence
    if not os.path.exists(src_mask_path):
        corrupted_count += 1
        continue

    is_valid = False
    
    # Check contents (not empty)
    try:
        with rasterio.open(src_img_path) as src:
            
            if src.read().max() > 0:
                is_valid = True
    except Exception as e:
        print(f"Error {f}: {e}")
    
    if is_valid:
        shutil.copy2(src_img_path, os.path.join(CLEAN_IMG_DIR, f))
        shutil.copy2(src_mask_path, os.path.join(CLEAN_MASK_DIR, f))
        valid_count += 1
    else:
        corrupted_count += 1


print(f"Rejected images : {corrupted_count}")
print(f"Valid images : {valid_count}")


# A cause des nuages, on perds 834 images

Pour mtn, on peut continuer sans elles


In [None]:
CLEAN_IMG_DIR = "dataset/clean/images"

ORIGINAL_SHP = "glamos/SGI_2016_wgs84.shp"

OUTPUT_SHP = "glamos/SGI_2016_VALID_ONLY.shp"


files = [f for f in os.listdir(CLEAN_IMG_DIR) if f.endswith('.tif')]

if not files:
    raise ValueError("Erreur : Le dossier d'images est vide !")

# IDs extraction - expected format : ID_YEAR_tile_X.tif

valid_ids = set()
for f in tqdm(files):
    
    parts = f.replace('.tif', '').split('_')
    # Rebuild ID
    if len(parts) >= 3:
        g_id = "_".join(parts[:-3])
        valid_ids.add(g_id)

print(f" Unique IDs: {len(valid_ids)} ")

gdf = gpd.read_file(ORIGINAL_SHP)

# Convert ID into string
gdf['pk_glacier'] = gdf['pk_glacier'].astype(str)

# Filter
gdf_valid = gdf[gdf['pk_glacier'].isin(valid_ids)].copy()

print(f" {len(gdf_valid)} polygones kept over {len(gdf)} ")


# Save
gdf_valid.to_file(OUTPUT_SHP)


![alt text](image.png)
test zone

In [None]:
DRY_RUN = False  # <--- METTRE A FALSE POUR ACTIVER LE DÉPLACEMENT RÉEL

TSV_PATH = "test_set_idx.tsv"
SRC_IMG_DIR = "dataset/clean/images"
SRC_MASK_DIR = "dataset/clean/masks"
DEST_IMG_DIR = "dataset/test/images"
DEST_MASK_DIR = "dataset/test/masks"


if not DRY_RUN:
    os.makedirs(DEST_IMG_DIR, exist_ok=True)
    os.makedirs(DEST_MASK_DIR, exist_ok=True)

# Charge target
test_set = pd.read_csv(TSV_PATH, sep="\t")

target_ids = set(test_set["pk_glacier"].astype(str).str.strip().str.replace('"', ''))

files = [f for f in os.listdir(SRC_IMG_DIR) if f.endswith('.tif')]

files_to_move = []
ids_found_in_files = set()

# Match
for filename in tqdm(files, desc="Analyse des fichiers"):
    parts = filename.replace('.tif', '').split('_')
    if len(parts) >= 3:
        file_glacier_id = "_".join(parts[:-3])
        
        # ID found in target?
        if file_glacier_id in target_ids:
            files_to_move.append(filename)
            ids_found_in_files.add(file_glacier_id)

#Compute stats
missing_ids = target_ids - ids_found_in_files
total_tiles = len(files_to_move)


print(f"Results Dry Run simulation: {DRY_RUN})")

print(f"Target : {len(target_ids)}")
print(f"Glaciers found: {len(ids_found_in_files)}")
print(f"Missing glaciers: {len(missing_ids)}")
print(f"Total tiles: {total_tiles}")


if len(missing_ids) > 0:
    print(f"Examples of missing IDs: {list(missing_ids)[:5]}")
    print("Glaciers found in target but no matching images in clean")

if total_tiles > 0:
    if DRY_RUN:
        print(f"\ntest: {total_tiles} files")
    else:
        moved_count = 0
        
        for filename in tqdm(files_to_move, desc="Déplacement"):
            src_img = os.path.join(SRC_IMG_DIR, filename)
            src_msk = os.path.join(SRC_MASK_DIR, filename)
            
            dst_img = os.path.join(DEST_IMG_DIR, filename)
            dst_msk = os.path.join(DEST_MASK_DIR, filename)
            
            try:
                shutil.move(src_img, dst_img)
                
                if os.path.exists(src_msk):
                    shutil.move(src_msk, dst_msk)
                moved_count += 1
            except Exception as e:
                print(f"Error in {filename}: {e}")
                
        print(f"\n{moved_count} tiles in dataset/test")
else:
    print("\nNothing to move")