In [1]:
import geopandas as gpd

## Load SUFOSAT clear-cuts enriched with BDFORET overlapping areas


In [2]:
# Load only the attributes this analysis uses: the clear-cut group id, the
# clear-cut area and the four per-essence BDFORET area columns.
sufosat_path = "../../data_pipeline/bootstrap/data/sufosat/sufosat_clusters_enriched.fgb"
sufosat_columns = [
    "clear_cut_group",
    "area_ha",
    "bdf_deciduous_area_ha",
    "bdf_mixed_area_ha",
    "bdf_poplar_area_ha",
    "bdf_resinous_area_ha",
]
gdf = gpd.read_file(sufosat_path, columns=sufosat_columns)
gdf

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry
0,0.520004,,,,,0,"MULTIPOLYGON (((1219789.689 6054526.697, 12197..."
1,0.520004,,,,,1,"MULTIPOLYGON (((1221859.689 6054536.697, 12218..."
2,0.060001,,,,,239709,"MULTIPOLYGON (((1221739.689 6054546.697, 12217..."
3,0.920005,,,,,2,"MULTIPOLYGON (((1221159.688 6055586.697, 12211..."
4,11.550035,,,,,3,"MULTIPOLYGON (((1219859.689 6055366.697, 12198..."
...,...,...,...,...,...,...,...
255611,1.220009,,0.272163,,,239704,"MULTIPOLYGON (((315009.689 6262906.697, 315009..."
255612,4.690016,3.731160,,,,239706,"MULTIPOLYGON (((315369.688 6261636.697, 315369..."
255613,1.100006,,,,1.06371,239705,"MULTIPOLYGON (((315689.689 6262016.697, 315689..."
255614,0.820007,0.675716,,,,239707,"MULTIPOLYGON (((314999.689 6264926.697, 314999..."


## Dealing with BDFORET data quality issues


In [3]:
# Share of clear-cuts with no BDFORET overlap at all, i.e. every per-essence
# area is missing. Only the BDFORET columns are inspected, so a missing value
# in any other column (geometry, area_ha, ...) cannot make a row qualify.
bdf_area_cols = [
    "bdf_deciduous_area_ha",
    "bdf_mixed_area_ha",
    "bdf_poplar_area_ha",
    "bdf_resinous_area_ha",
]
no_bdforet_share = gdf[bdf_area_cols].isna().all(axis=1).mean()
f"{no_bdforet_share:.1%} of the clear-cuts don't overlap with any BDFORET polygon"

"12.5% of the clear-cuts don't overlap with any BDFORET polygon"

In [4]:
# Total BDFORET area per clear-cut: add up the per-essence areas, counting a
# missing essence as 0 ha, then express the total as a fraction of the
# clear-cut area.
bdf_total = gdf["bdf_deciduous_area_ha"].fillna(0)
for essence_col in ("bdf_mixed_area_ha", "bdf_poplar_area_ha", "bdf_resinous_area_ha"):
    bdf_total = bdf_total + gdf[essence_col].fillna(0)
gdf["bdf_area_ha"] = bdf_total
gdf["bdf_coverage"] = gdf["bdf_area_ha"] / gdf["area_ha"]
gdf

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage
0,0.520004,,,,,0,"MULTIPOLYGON (((1219789.689 6054526.697, 12197...",0.000000,0.000000
1,0.520004,,,,,1,"MULTIPOLYGON (((1221859.689 6054536.697, 12218...",0.000000,0.000000
2,0.060001,,,,,239709,"MULTIPOLYGON (((1221739.689 6054546.697, 12217...",0.000000,0.000000
3,0.920005,,,,,2,"MULTIPOLYGON (((1221159.688 6055586.697, 12211...",0.000000,0.000000
4,11.550035,,,,,3,"MULTIPOLYGON (((1219859.689 6055366.697, 12198...",0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
255611,1.220009,,0.272163,,,239704,"MULTIPOLYGON (((315009.689 6262906.697, 315009...",0.272163,0.223082
255612,4.690016,3.731160,,,,239706,"MULTIPOLYGON (((315369.688 6261636.697, 315369...",3.731160,0.795554
255613,1.100006,,,,1.06371,239705,"MULTIPOLYGON (((315689.689 6262016.697, 315689...",1.063710,0.967004
255614,0.820007,0.675716,,,,239707,"MULTIPOLYGON (((314999.689 6264926.697, 314999...",0.675716,0.824037


In [5]:
# Summary statistics of the coverage ratio, restricted to the clear-cuts that
# overlap BDFORET at all, rendered as whole percentages.
positive_coverage = gdf.loc[gdf["bdf_coverage"] > 0, "bdf_coverage"]
coverage_stats = positive_coverage.describe().drop("count")
coverage_stats.map("{:.0%}".format)

mean     86%
std      26%
min       0%
25%      84%
50%      96%
75%     100%
max     200%
Name: bdf_coverage, dtype: object

Among the clear-cuts overlapping with BDFORET, the average overlap is 86% of the clear-cut area, and the median overlap is 96%, that's great!
However, the maximum coverage exceeds 100%, indicating that some BDFORET polygons representing different wood types overlap.
That might be an issue...


In [6]:
# Clear-cuts whose summed BDFORET area is larger than the clear-cut itself.
over_covered = gdf["bdf_coverage"].gt(1)
gdf.loc[over_covered]

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage
290,0.920007,0.754807,0.165199,,,282,"MULTIPOLYGON (((1168629.689 6157586.697, 11686...",0.920007,1.0
291,0.670005,0.557268,0.112738,,,283,"MULTIPOLYGON (((1168399.689 6157366.697, 11683...",0.670005,1.0
298,0.600007,0.600007,,,,288,"MULTIPOLYGON (((1167939.688 6170256.697, 11679...",0.600007,1.0
304,0.600005,0.189570,0.410435,,,295,"MULTIPOLYGON (((1175619.689 6179096.697, 11756...",0.600005,1.0
305,0.980006,0.602087,,,0.377919,297,"MULTIPOLYGON (((1174019.688 6178206.697, 11740...",0.980006,1.0
...,...,...,...,...,...,...,...,...,...
255551,0.430004,0.280038,,,0.149966,239649,"MULTIPOLYGON (((328419.689 6265316.697, 328419...",0.430004,1.0
255552,0.670005,0.009178,,,0.660827,239650,"MULTIPOLYGON (((328369.689 6265246.697, 328369...",0.670005,1.0
255592,0.820005,0.607471,0.212534,,,239687,"MULTIPOLYGON (((320739.688 6259476.697, 320709...",0.820005,1.0
255593,2.910010,1.030851,,,1.879159,239688,"MULTIPOLYGON (((318499.688 6258166.697, 318489...",2.910010,1.0


In [7]:
# Skip the records whose coverage is only marginally greater than 1:
# keep those exceeding the clear-cut area by more than 0.1%.
coverage_tolerance = 1.001
gdf.loc[gdf["bdf_coverage"] > coverage_tolerance]

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage
1221,0.810005,0.348756,0.564119,,,1164,"MULTIPOLYGON (((1015799.689 6281756.697, 10157...",0.912874,1.126999
1496,0.610005,0.666979,,,,239789,"MULTIPOLYGON (((1004129.688 6295976.697, 10041...",0.666979,1.093399
1662,3.120024,,,,4.962028,1582,"MULTIPOLYGON (((985539.689 6304406.697, 985539...",4.962028,1.590381
1663,2.030013,,0.258696,,2.683765,1583,"MULTIPOLYGON (((985579.688 6303766.697, 985569...",2.942461,1.449479
1669,1.820007,,,,1.833747,1586,"MULTIPOLYGON (((986279.689 6304506.697, 986279...",1.833747,1.007549
...,...,...,...,...,...,...,...,...,...
251582,1.130009,0.400579,,1.707437,,236241,"MULTIPOLYGON (((381949.689 6278746.697, 381949...",2.108016,1.865487
251587,0.830007,0.809670,,,0.024209,236246,"MULTIPOLYGON (((382899.689 6279466.697, 382899...",0.833878,1.004664
251621,1.130007,1.401339,,,,236279,"MULTIPOLYGON (((384839.689 6279886.697, 384839...",1.401339,1.240115
251622,1.890012,2.123945,,,,236278,"MULTIPOLYGON (((384699.688 6279996.697, 384699...",2.123945,1.123773


3,500 clear-cuts have an abnormal coverage, i.e. a total BDFORET area more than 0.1% larger than the clear-cut area.


In [8]:
# Clear-cuts where at least one per-essence area, taken alone, exceeds the
# clear-cut area by more than 0.1%. Missing areas compare as False.
area_limit = 1.001 * gdf["area_ha"]
essence_area_cols = [
    "bdf_deciduous_area_ha",
    "bdf_mixed_area_ha",
    "bdf_poplar_area_ha",
    "bdf_resinous_area_ha",
]
exceeds_limit = gdf[essence_area_cols].gt(area_limit, axis=0).any(axis=1)
gdf[exceeds_limit]

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage
1496,0.610005,0.666979,,,,239789,"MULTIPOLYGON (((1004129.688 6295976.697, 10041...",0.666979,1.093399
1662,3.120024,,,,4.962028,1582,"MULTIPOLYGON (((985539.689 6304406.697, 985539...",4.962028,1.590381
1663,2.030013,,0.258696,,2.683765,1583,"MULTIPOLYGON (((985579.688 6303766.697, 985569...",2.942461,1.449479
1669,1.820007,,,,1.833747,1586,"MULTIPOLYGON (((986279.689 6304506.697, 986279...",1.833747,1.007549
1670,1.710012,,,,3.126222,1587,"MULTIPOLYGON (((985389.688 6304476.697, 985359...",3.126222,1.828188
...,...,...,...,...,...,...,...,...,...
251441,0.990008,1.868215,,,,236109,"MULTIPOLYGON (((370649.689 6276446.697, 370649...",1.868215,1.887070
251518,2.830015,3.007410,,,,236179,"MULTIPOLYGON (((376169.689 6275106.697, 376169...",3.007410,1.062683
251582,1.130009,0.400579,,1.707437,,236241,"MULTIPOLYGON (((381949.689 6278746.697, 381949...",2.108016,1.865487
251621,1.130007,1.401339,,,,236279,"MULTIPOLYGON (((384839.689 6279886.697, 384839...",1.401339,1.240115


It can also happen that a single wood essence area is, on its own, larger than the clear-cut area, which means that BDFORET polygons of the same wood essence can overlap each other too!
This could be fixed by dissolving the BDFORET polygons before the join.


For now, let's apply a quick normalization fix: scale down the per-essence areas of the affected clear-cuts proportionally, so that the coverage is never greater than 100%.


In [9]:
# Rescale the per-essence areas of the clear-cuts whose total BDFORET area
# exceeds the clear-cut area, so that they add up to the clear-cut area.
#
# The coverage is recomputed here from the per-essence columns instead of being
# read from the "bdf_coverage" column: that column is only refreshed in a later
# cell, so relying on it would divide the areas a second time if this cell
# were executed again.
current_coverage = (
    gdf["bdf_deciduous_area_ha"].fillna(0)
    + gdf["bdf_mixed_area_ha"].fillna(0)
    + gdf["bdf_poplar_area_ha"].fillna(0)
    + gdf["bdf_resinous_area_ha"].fillna(0)
) / gdf["area_ha"]
needs_to_be_normalized = current_coverage > 1
for bdf_col in [
    "bdf_deciduous_area_ha",
    "bdf_mixed_area_ha",
    "bdf_poplar_area_ha",
    "bdf_resinous_area_ha",
]:
    # Missing areas stay missing: NaN divided by the coverage is still NaN.
    gdf.loc[needs_to_be_normalized, bdf_col] = (
        gdf.loc[needs_to_be_normalized, bdf_col]
        / current_coverage[needs_to_be_normalized]
    )

In [10]:
# Refresh the total BDFORET area and the coverage ratio from the per-essence
# columns (a missing essence counts as 0 ha).
deciduous_ha, mixed_ha, poplar_ha, resinous_ha = (
    gdf[area_col].fillna(0)
    for area_col in (
        "bdf_deciduous_area_ha",
        "bdf_mixed_area_ha",
        "bdf_poplar_area_ha",
        "bdf_resinous_area_ha",
    )
)
gdf["bdf_area_ha"] = deciduous_ha + mixed_ha + poplar_ha + resinous_ha
gdf["bdf_coverage"] = gdf["bdf_area_ha"] / gdf["area_ha"]
gdf

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage
0,0.520004,,,,,0,"MULTIPOLYGON (((1219789.689 6054526.697, 12197...",0.000000,0.000000
1,0.520004,,,,,1,"MULTIPOLYGON (((1221859.689 6054536.697, 12218...",0.000000,0.000000
2,0.060001,,,,,239709,"MULTIPOLYGON (((1221739.689 6054546.697, 12217...",0.000000,0.000000
3,0.920005,,,,,2,"MULTIPOLYGON (((1221159.688 6055586.697, 12211...",0.000000,0.000000
4,11.550035,,,,,3,"MULTIPOLYGON (((1219859.689 6055366.697, 12198...",0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
255611,1.220009,,0.272163,,,239704,"MULTIPOLYGON (((315009.689 6262906.697, 315009...",0.272163,0.223082
255612,4.690016,3.731160,,,,239706,"MULTIPOLYGON (((315369.688 6261636.697, 315369...",3.731160,0.795554
255613,1.100006,,,,1.06371,239705,"MULTIPOLYGON (((315689.689 6262016.697, 315689...",1.063710,0.967004
255614,0.820007,0.675716,,,,239707,"MULTIPOLYGON (((314999.689 6264926.697, 314999...",0.675716,0.824037


In [11]:
# Sanity check: list any clear-cut still covered at more than 100%.
# The bound sits slightly above 1 to absorb float summing errors.
float_tolerant_limit = 1.00000001
gdf.loc[gdf["bdf_coverage"] > float_tolerant_limit]

Unnamed: 0,area_ha,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,clear_cut_group,geometry,bdf_area_ha,bdf_coverage


Now that we fixed this issue, let's do the stats again


In [12]:
def _as_percent(ratio):
    """Render a ratio as a whole-number percentage string."""
    return f"{ratio:.0%}"


# Same summary as before, computed on the clear-cuts with a non-zero coverage.
overlaps_bdforet = gdf["bdf_coverage"] > 0
gdf.loc[overlaps_bdforet, "bdf_coverage"].describe().drop("count").apply(_as_percent)

mean     85%
std      25%
min       0%
25%      84%
50%      96%
75%     100%
max     100%
Name: bdf_coverage, dtype: object

So now the mean coverage is 85% and the median coverage is 96%, that's still pretty good!


## Combination of BDFORET overlaps


In [13]:
# Label every clear-cut with the wood essences covering more than `threshold`
# of its area, e.g. "deciduous - resinous"; an empty label means no essence
# reaches the threshold.
threshold = 0.1
essences = ["deciduous", "mixed", "poplar", "resinous"]
gdf["combination"] = ""
for essence in essences:
    essence_share = gdf[f"bdf_{essence}_area_ha"] / gdf["area_ha"]
    above_threshold = essence_share > threshold
    # Append the essence name (space-separated) on the qualifying rows only.
    gdf.loc[above_threshold, "combination"] = (
        gdf.loc[above_threshold, "combination"] + f" {essence}"
    )
# Turn " a b" into "a - b"; an empty string stays empty.
gdf["combination"] = gdf["combination"].str.split().str.join(" - ")
combination_vc = gdf["combination"].value_counts()
combination_vc

combination
deciduous                                88562
resinous                                 53057
                                         41120
mixed                                    19362
poplar                                   17102
deciduous - resinous                     11636
deciduous - mixed                         7841
deciduous - poplar                        7539
mixed - resinous                          7252
deciduous - mixed - resinous              1428
mixed - poplar                             244
poplar - resinous                          243
deciduous - poplar - resinous              120
deciduous - mixed - poplar                  79
mixed - poplar - resinous                   21
deciduous - mixed - poplar - resinous       10
Name: count, dtype: int64

In [14]:
# Fraction of all clear-cuts whose combination label is exactly one essence.
single_essence_labels = ["deciduous", "resinous", "mixed", "poplar"]
single_essence_count = combination_vc.loc[single_essence_labels].sum()
single_essence = single_essence_count / len(gdf)
print(
    f"{single_essence:.0%} of the clear-cuts overlap with a single wood essence (> {threshold:.0%})"
)

70% of the clear-cuts overlap with a single wood essence (> 10%)


In [15]:
# Fraction of all clear-cuts with an empty combination label, i.e. no essence
# above the threshold. `.get` falls back to 0 when no such clear-cut exists,
# where a plain `.loc[""]` lookup would raise a KeyError.
missing_bdforet = combination_vc.get("", 0) / len(gdf)
print(
    f"{missing_bdforet:.0%} of the clear-cuts don't overlap with any wood essence (> {threshold:.0%})"
)

16% of the clear-cuts don't overlap with any wood essence (> 10%)
