### Given the manual outlines for the 2023 data release, this notebook processes them (clipping to the inventory, computing areas, etc.)

In [None]:
import os
import sys
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xarray as xr
from tqdm import tqdm

# make the project sources (the sibling `dl4gam` directory) importable:
# add their path to the module search path, once
path_src = Path.cwd().parent / 'dl4gam'
if sys.path.count(str(path_src)) == 0:
    sys.path.append(str(path_src))

from utils.data_prep import build_binary_mask

#### First read the inventory outlines

In [None]:
# load the inventory outlines from the shapefile
outlines_fp = '../data/outlines/paul_et_al_2020/c3s_gi_rgi11_s2_2015_v2.shp'
gl_df = gpd.read_file(outlines_fp)

# derive the columns used downstream: parsed inventory date, inventory area and inventory year
gl_df['date_inv'] = gl_df['date_inv'].apply(pd.to_datetime)
gl_df['area_inv'] = gl_df['area_km2']
gl_df['year_inv'] = gl_df['date_inv'].apply(lambda ts: ts.year)

# keep only the glaciers whose inventory area is at least 0.1 (km2, going by the source column name)
gl_df = gl_df.loc[gl_df['area_inv'] >= 0.1]
gl_df

#### Read the manual outlines for 2023, clip to the inventory, and build the binary mask (for consistency with how the inventory was processed)

In [None]:
# hack the 2023 preds and overlay my labels
# `subdir` selects which set of rasters/predictions is processed:
#   - '2023': the 2023 rasters; the ground-truth mask is later replaced by the manual outlines
#   - anything else (i.e. 'inv'): the inventory rasters, with their original masks
subdir = '2023'
# subdir = 'inv'

if subdir == '2023':
    dir_rasters = Path('../data/external/wd/s2_alps_plus/2023/glacier_wide_sel')
    use_my_2023 = True
else:
    dir_rasters = Path('../data/external/wd/s2_alps_plus/inv/glacier_wide_sel')
    use_my_2023 = False

# manual 2023 outlines; `entry_id_i` is the integer part of the ID (the first two characters are dropped)
gl_sdf_2023 = gpd.read_file('../data/outlines/manual_delineation/preds_2023_corrected.shp')
gl_sdf_2023['entry_id_i'] = gl_sdf_2023.entry_id.apply(lambda s: int(s[2:]))

# collect the prediction files (*.nc) from the test fold of each of the five splits
res_dir_root = Path('../data/external/_experiments/s2_alps_plus/unet')
fp_preds_all = []
for i_split in range(1, 5 + 1):
    crt_dir = res_dir_root / f'split_{i_split}/seed_all/version_0/output/preds_calib/s2_alps_plus/{subdir}/s_test'
    assert crt_dir.exists(), f"Directory {crt_dir} does not exist"
    fp_preds_all.extend(crt_dir.rglob('*.nc'))

# keep only the glaciers which have a manual outline (the parent directory name is the glacier ID);
# the IDs are put in a set once so that each membership test is O(1)
entry_ids_2023 = set(gl_sdf_2023.entry_id)
fp_preds_all = sorted(fp for fp in fp_preds_all if fp.parent.name in entry_ids_2023)
print(len(fp_preds_all))

In [None]:
# Compute per-glacier area statistics.
# For every prediction file: load the predictions and the matching raster, optionally replace the
# ground-truth mask with the manual 2023 outline (clipped to the inventory mask), then compute the
# ground-truth / predicted areas within the b0, b20 and b50 masks. One dict per glacier is collected
# in `stats_all` and finally turned into the `df_glacier` dataframe (sorted by `entry_id`).
stats_all = []
# NOTE(review): `mask_gt_name` is not used anywhere in this cell
mask_gt_name = 'mask_crt_g'
for fp in tqdm(fp_preds_all):
    # mask_and_scale=False: read the stored values as-is (no fill-value masking / scaling by xarray)
    # NOTE(review): neither dataset is explicitly closed in this loop
    nc_pred = xr.open_dataset(fp, decode_coords='all', mask_and_scale=False)

    # the raster with the input data & masks is expected under the same <entry_id>/<filename> as the prediction
    fp_data = dir_rasters / fp.parent.name / fp.name
    nc_data = xr.open_dataset(fp_data, decode_coords='all', mask_and_scale=False)

    # get all the preds from nc_pred
    for c in nc_pred.data_vars:
        if 'pred' in c:
            nc_data[c] = nc_pred[c]
    del nc_pred

    # the glacier ID is the name of the directory which contains the file
    entry_id = fp.parent.name

    if use_my_2023:
        # rasterize the manual 2023 outline of the current glacier on the raster grid
        # (only the first row matching the ID is used)
        sdf = gl_sdf_2023[gl_sdf_2023.entry_id == entry_id].to_crs(nc_data.rio.crs)
        row = sdf.iloc[0]
        mask_crt_g = build_binary_mask(nc_data, geoms=[row.geometry])

        # set to zero pixels outside the 2015 inventory
        mask_inv = (nc_data['mask_crt_g'].values == 1)
        mask_crt_g &= mask_inv

        # keep the original (inventory) mask as 'mask_crt_g_inv' and overwrite 'mask_crt_g' with the 2023 one
        nc_data['mask_crt_g_inv'] = nc_data['mask_crt_g'].copy()
        nc_data['mask_crt_g'] = (('y', 'x'), mask_crt_g.astype(np.int8))
        nc_data['mask_crt_g'].attrs['_FillValue'] = -1
        nc_data['mask_crt_g'].rio.write_crs(nc_data.rio.crs, inplace=True)

    stats = {'entry_id': fp.parent.name}

    # get the ground truth for the current glacier
    nc = nc_data
    mask_gt = (nc.mask_crt_g.values == 1)

    # get the mask for the non-glacierized area
    mask_non_g = (nc.mask_all_g_id.values == -1)
    # NOTE(review): `s1` is not used anywhere else in this cell
    s1 = mask_non_g.sum() * 100

    # TODO: do this cleaner
    # add the area which became ice free
    # (i.e. pixels within the inventory mask but outside the 2023 mask are treated as non-glacier)
    if use_my_2023:
        mask_diff = (nc_data.mask_crt_g_inv.values == 1) & (~mask_gt)
        mask_non_g[mask_diff] = True

    # get the mask of all the other glaciers except the current one
    # (must be computed after `mask_non_g` was updated above)
    mask_other_g = (~mask_non_g) & (~mask_gt)

    # prepare the scaling constant for area computation in km2
    # (assumes square pixels and a resolution expressed in meters -- TODO confirm)
    dx = nc.rio.resolution()[0]
    f_area = (dx ** 2) / 1e6

    # extract the mask for no-data pixels (which depends on the training yaml settings)
    # here no pixel is excluded (all-False mask), so 'area_nok' is always 0 and 'area_inv' equals 'area_ok';
    # note that when `use_my_2023` is set, 'area_inv' is the area of the (clipped) 2023 outline
    mask_exclude = np.zeros_like(mask_gt)
    area_ok = np.sum(mask_gt & (~mask_exclude)) * f_area
    area_excluded = np.sum(mask_gt & mask_exclude) * f_area
    stats['area_ok'] = area_ok
    stats['area_nok'] = area_excluded
    stats['area_inv'] = area_ok + area_excluded

    # expose the glacier mask under the 'b0' name such that it can be processed together with the buffers
    nc['mask_crt_g_b0'] = nc['mask_crt_g']
    # `k` is an (empty) suffix which selects the prediction variable, i.e. 'pred_b' here
    k = ''
    pred_band = f'pred{k}_b'
    preds = nc[pred_band].values.copy()

    # compute the predicted area using multiple buffers which will be later used for various metrics
    # NOTE(review): 'mask_crt_g_b20' and 'mask_crt_g_b50' are read as stored in the raster file
    for b in ['b0', 'b20', 'b50']:
        mask_crt_b = (nc[f'mask_crt_g_{b}'].values == 1)

        # exclude the other glacier pixels (i.e., keeping the same ice divides)
        mask_crt_b &= (~mask_other_g)

        # compute the total area in the current buffer and the corresponding predicted area
        # (assumes `preds` is a boolean / 0-1 array -- TODO confirm)
        stats[f"area_{b}"] = np.sum(mask_crt_b) * f_area
        stats[f"area_pred{k}_{b}"] = np.sum(preds & mask_crt_b) * f_area

        # compute the total non-glacier area in the current buffer, and the corresponding FP area
        # skip if the buffer is completely within the glacier
        # (of the names listed below, only 'b0' occurs in the current loop)
        if b in ['b-20', 'b-10', 'b0']:
            continue

        mask_non_g_crt_b = mask_non_g & mask_crt_b
        stats[f"area_non_g_{b}"] = np.sum(mask_non_g_crt_b) * f_area
        stats[f"area_non_g_pred{k}_{b}"] = np.sum(preds & mask_non_g_crt_b) * f_area

    stats_all.append(stats)
# one row per glacier, ordered by ID
df_glacier = pd.DataFrame(stats_all)
df_glacier = df_glacier.sort_values('entry_id')
df_glacier

In [None]:
# Derive the per-glacier metrics (recall, false-positive rate, predicted area) from the areas
# computed per buffer, then export the table to CSV.

# buffer sizes which select the per-buffer columns (suffix `_m`: presumably meters -- TODO confirm):
#   - predictions on non-glacier pixels within `buffer_pred_m` are still counted as predicted area
#   - predictions on non-glacier pixels between `buffer_pred_m` and `buffer_fp_m` are false positives
buffer_pred_m = 20
buffer_fp_m = 50

# dummy columns in case buffer_pred_m is zero (no non-glacier area within the glacier mask itself)
df_glacier['area_non_g_b0'] = 0.0
df_glacier['area_non_g_pred_b0'] = 0.0

# non-glacierized area between the limit of OK predictions & the false positive limit
df_glacier[f'area_non_g_b{buffer_pred_m}_{buffer_fp_m}'] = (
        df_glacier[f'area_non_g_b{buffer_fp_m}'] -
        df_glacier[f'area_non_g_b{buffer_pred_m}']
)

# compute the recall and the false positive rates
# (`s` is the suffix of the prediction columns; empty here)
s = ''

# recall: the predicted area within the glacier mask, relative to the glacier area
df_glacier[f'area_recalled{s}'] = df_glacier[f'area_pred{s}_b0']
df_glacier[f'recall{s}'] = df_glacier[f'area_recalled{s}'] / df_glacier.area_inv

# false-positive area & rate
# (the rate is NaN/inf for a glacier without any non-glacier area between the two buffers)
df_glacier[f'area_non_g_pred{s}_b{buffer_pred_m}_{buffer_fp_m}'] = (
        df_glacier[f'area_non_g_pred{s}_b{buffer_fp_m}'] -
        df_glacier[f'area_non_g_pred{s}_b{buffer_pred_m}']
)
df_glacier[f'fp_rate{s}'] = (
        df_glacier[f'area_non_g_pred{s}_b{buffer_pred_m}_{buffer_fp_m}'] /
        df_glacier[f'area_non_g_b{buffer_pred_m}_{buffer_fp_m}']
)

# include the buffer area in the predicted area
df_glacier[f'area_pred{s}'] = (
        df_glacier[f'area_recalled{s}'] +
        df_glacier[f'area_non_g_pred{s}_b{buffer_pred_m}']
)
display(df_glacier)

# export the table (without the dataframe index); make sure the output directory exists first,
# such that the results of the (long) processing above are not lost on a missing directory
fp_out = res_dir_root / f'stats_all_splits/df_stats_calib_agg_s2_alps_plus_manual_{subdir}_version_0_ensemble.csv'
fp_out.parent.mkdir(parents=True, exist_ok=True)
df_glacier.to_csv(fp_out, index=False)
print(f"Wrote {fp_out}")

In [None]:
# Compare the relative area changes (inventory vs. 2023): reference outlines vs. predictions.
# Both tables are the CSVs exported by the previous cell, once with subdir = 'inv' and once with '2023'.
df_glacier_inv = pd.read_csv(
    res_dir_root / 'stats_all_splits/df_stats_calib_agg_s2_alps_plus_manual_inv_version_0_ensemble.csv'
)
df_glacier_2023 = pd.read_csv(
    res_dir_root / 'stats_all_splits/df_stats_calib_agg_s2_alps_plus_manual_2023_version_0_ensemble.csv'
)

# pair the two tables by glacier ID instead of relying on the row order, such that a glacier which is
# missing from one of the runs cannot silently shift the comparison; `validate` raises on duplicated IDs
df_cmp = df_glacier_inv.merge(
    df_glacier_2023,
    on='entry_id',
    suffixes=('_ref', '_2023'),
    validate='one_to_one',
)

# relative change of the reference area (c1) and of the predicted area (c2)
c1 = ((df_cmp.area_inv_2023 - df_cmp.area_inv_ref) / df_cmp.area_inv_ref).values
c2 = ((df_cmp.area_pred_2023 - df_cmp.area_pred_ref) / df_cmp.area_pred_ref).values

plt.figure(dpi=120, figsize=[8, 8])
plt.scatter(c2 * 100, c1 * 100)
plt.xlim([-110, 10])
plt.ylim([-110, 10])
# 1:1 line, i.e. identical predicted and reference changes
plt.axline([-110, -110], [10, 10], linestyle='--', color='r')
plt.xlabel('relative change in predicted area, inventory to 2023 (%)')
plt.ylabel('relative change in reference area, inventory to 2023 (%)')
plt.gca().set_aspect('equal')
plt.grid()
plt.show()