The goal of this notebook is to seed the following tables:

- `ecological_zonings`
- `clear_cuts_reports`
- `clear_cuts`
- `clear_cut_ecological_zoning`

Here is the backend's database schema:

![](database_schema.png)


In [1]:
import ast
import sys

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, inspect, text

sys.path.append("..")

from scripts import DATA_DIR

In [2]:
# Create a connection to the database
# TODO: use env var
DATABASE_URL = "postgresql://devuser:devuser@localhost:5432/local"
engine = create_engine(DATABASE_URL, plugins=["geoalchemy2"])

## ⚠️ Danger zone - wipe the database ⚠️


In [3]:
# ⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️
# ⚠️ Danger zone - wipe the database ⚠️
# ⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️

table_names = [
    table_name
    for table_name in inspect(engine).get_table_names()
    if table_name
    not in [
        "cities",
        "departments",
        "spatial_ref_sys",
    ]  # Leave cities, departments, and spatial_ref_sys untouched
]
# Check row counts in each table
with engine.connect() as conn:
    for table_name in table_names:
        row_count = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).one()[0]
        print(f"Number of rows in the {table_name} table: {row_count}")

    print("\nWiping the database...\n")
    truncate_stmt = f"TRUNCATE TABLE {', '.join(table_names)} RESTART IDENTITY CASCADE"
    conn.execute(text(truncate_stmt))
    conn.commit()

    # Check row counts in each table
    for table_name in table_names:
        row_count = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).one()[0]
        print(f"Number of rows in the {table_name} table: {row_count}")

Number of rows in the alembic_version table: 0
Number of rows in the user_department table: 0
Number of rows in the clear_cut_ecological_zoning table: 2202
Number of rows in the users table: 0
Number of rows in the clear_cuts_reports table: 9999
Number of rows in the ecological_zonings table: 1762
Number of rows in the clear_cuts table: 9999

Wiping the database...

Number of rows in the alembic_version table: 0
Number of rows in the user_department table: 0
Number of rows in the clear_cut_ecological_zoning table: 0
Number of rows in the users table: 0
Number of rows in the clear_cuts_reports table: 0
Number of rows in the ecological_zonings table: 0
Number of rows in the clear_cuts table: 0


## Seed the `ecological_zonings` table


In [4]:
# Load the Natura 2000 codes
natura2000_concat = gpd.read_file(DATA_DIR / "natura2000/natura2000_concat.fgb").drop(
    columns="geometry"
)
natura2000_concat = natura2000_concat.rename(columns={"type": "sub_type"})
natura2000_concat["type"] = "Natura 2000"
natura2000_concat

Unnamed: 0,sub_type,code,name,type
0,SIC,FR9400591,Plateau de Pertusato/ Bonifacio et îles Lavezzi,Natura 2000
1,SIC,FR9402009,Mare temporaire de Musella/Bonifacio,Natura 2000
2,SIC,FR9400608,Mares temporaires du terrain militaire de Fras...,Natura 2000
3,SIC,FR9400592,Ventilegne-la Trinite de Bonifacio-Fazzio,Natura 2000
4,SIC,FR9402015,"Bouches de Bonifacio, Iles des Moines",Natura 2000
...,...,...,...,...
1757,ZPS,FR7212015,Haute Cize : Pic d'Herrozate et forêt d'Orion,Natura 2000
1758,SIC,FR7200753,Forêt d'Iraty,Natura 2000
1759,ZPS,FR7212005,"Haute Soule : forêt d'Iraty, Orgambidexka et P...",Natura 2000
1760,SIC,FR7200756,Montagnes des Aldudes,Natura 2000


In [5]:
# INSERT them in the database
natura2000_concat.to_sql("ecological_zonings", con=engine, if_exists="append", index=False)

762

In [6]:
# Check the result
pd.read_sql("SELECT * FROM ecological_zonings LIMIT 10", con=engine)

Unnamed: 0,id,type,sub_type,name,code
0,1,Natura 2000,SIC,Plateau de Pertusato/ Bonifacio et îles Lavezzi,FR9400591
1,2,Natura 2000,SIC,Mare temporaire de Musella/Bonifacio,FR9402009
2,3,Natura 2000,SIC,Mares temporaires du terrain militaire de Fras...,FR9400608
3,4,Natura 2000,SIC,Ventilegne-la Trinite de Bonifacio-Fazzio,FR9400592
4,5,Natura 2000,SIC,"Bouches de Bonifacio, Iles des Moines",FR9402015
5,6,Natura 2000,ZPS,"Iles Lavezzi, Bouches de Bonifacio",FR9410021
6,7,Natura 2000,SIC,"Iles et pointe Bruzzi, étangs de Chevanu et d'...",FR9400609
7,8,Natura 2000,SIC,"Tre Padule de Suartone, Rondinara",FR9400590
8,9,Natura 2000,SIC,Iles Cerbicale et frange littoral,FR9400587
9,10,Natura 2000,ZPS,Iles Cerbicale,FR9410022


## Seed the `clear_cuts_reports` table


In [7]:
# Load our Sufosat enriched dataframe
# Sample 10k clear-cuts for PROD tests
sufosat = gpd.read_file(DATA_DIR / "sufosat/sufosat_clusters_enriched.fgb").sample(10000)
sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,slope_area_ha,geometry
248379,233244,2023-05-23,2023-08-15,84,14,1.000000,0.810005,['11227'],,,0.810005,,,,,"MULTIPOLYGON (((663379.688 6215656.697, 663379..."
188984,249859,2021-03-26,2021-03-26,0,1,1.000000,0.020001,['23223'],,,0.003591,,,0.021839,,"MULTIPOLYGON (((596639.688 6527756.697, 596639..."
221998,209988,2019-06-24,2019-09-09,77,13,1.000000,0.510004,['24426'],,,,0.483031,,0.018945,,"MULTIPOLYGON (((502139.688 6440696.697, 502129..."
249347,234166,2021-09-23,2022-07-26,306,5,1.000000,0.780006,['31375'],,,0.250128,,,,,"MULTIPOLYGON (((553399.689 6230466.697, 553399..."
73736,70456,2023-02-15,2024-05-11,451,69,0.733231,7.050023,"['08449', '08468']",,,7.050023,,,,,"MULTIPOLYGON (((806589.689 6961386.697, 806589..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70237,67084,2022-06-16,2022-09-27,103,14,1.000000,1.210008,['59227'],,,0.908611,,,,,"MULTIPOLYGON (((720879.688 7030346.697, 720869..."
151703,144754,2022-07-10,2022-09-29,81,4,0.354846,0.110003,['62008'],,,0.088281,,,,,"MULTIPOLYGON (((635169.689 7070776.697, 635169..."
26254,24713,2020-04-03,2021-07-27,480,26,1.000000,2.500011,['42172'],0.016151,['FR8201762'],,0.137560,,1.933018,1.83122,"MULTIPOLYGON (((811919.689 6478616.697, 811919..."
116817,111622,2022-03-21,2022-04-29,39,17,0.787099,1.220008,['41059'],,,1.040900,,,,,"MULTIPOLYGON (((586259.688 6703626.697, 586239..."


In [8]:
# Retrieve the generated `id` for the `cities` table
city_ids = pd.read_sql(
    "SELECT id AS city_id, zip_code AS city_insee_code FROM cities", con=engine
)
city_ids

Unnamed: 0,city_id,city_insee_code
0,1,01002
1,2,01004
2,3,01005
3,4,01006
4,5,01007
...,...,...
37538,37539,97613
37539,37540,97614
37540,37541,97615
37541,37542,97616


In [9]:
# TODO: In the dataeng data model, we have several cities that can intersect with a clear-cut
# However, in the backend model we have just one. For now, we arbitrarily take the first one in the list
sufosat["city_insee_code"] = sufosat["cities"].apply(ast.literal_eval).str[0]

In [10]:
# Add the city_id column to our sufosat clear-cuts
length_before_merge = len(sufosat)
sufosat = sufosat.merge(city_ids, on="city_insee_code", how="left")

# TODO: There is a discrepency between the cities codes used in the dataeng pipeline and in the backend
# For example, some in the backend table, some zip_code are associated with multiple city names, e.g.,

#   id	    zip_code    name	        department_id
# 0	13884	34246	    Entre-Vignes	1
# 1	13885	34246	    Saint-Christol	1

# There is also the "01001" insee_code that is missing from the backend table

# For now, to mitigate this,
# we'll drop records with missing city_id and
# we'll drop records that have different city_id for the same insee_code
# Therefore we have to select a city_id randomly because the join causes duplicated records otherwise
sufosat = sufosat.dropna(subset="city_id").drop_duplicates("clear_cut_group")

# Make sure we don't remove too much records
assert sufosat["city_id"].isna().sum() == 0 and len(sufosat) >= length_before_merge - 100

sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,slope_area_ha,geometry,city_insee_code,city_id
0,233244,2023-05-23,2023-08-15,84,14,1.000000,0.810005,['11227'],,,0.810005,,,,,"MULTIPOLYGON (((663379.688 6215656.697, 663379...",11227,3961
1,249859,2021-03-26,2021-03-26,0,1,1.000000,0.020001,['23223'],,,0.003591,,,0.021839,,"MULTIPOLYGON (((596639.688 6527756.697, 596639...",23223,8389
2,209988,2019-06-24,2019-09-09,77,13,1.000000,0.510004,['24426'],,,,0.483031,,0.018945,,"MULTIPOLYGON (((502139.688 6440696.697, 502129...",24426,8852
3,234166,2021-09-23,2022-07-26,306,5,1.000000,0.780006,['31375'],,,0.250128,,,,,"MULTIPOLYGON (((553399.689 6230466.697, 553399...",31375,12412
4,70456,2023-02-15,2024-05-11,451,69,0.733231,7.050023,"['08449', '08468']",,,7.050023,,,,,"MULTIPOLYGON (((806589.689 6961386.697, 806589...",08449,2905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483,67084,2022-06-16,2022-09-27,103,14,1.000000,1.210008,['59227'],,,0.908611,,,,,"MULTIPOLYGON (((720879.688 7030346.697, 720869...",59227,23788
10484,144754,2022-07-10,2022-09-29,81,4,0.354846,0.110003,['62008'],,,0.088281,,,,,"MULTIPOLYGON (((635169.689 7070776.697, 635169...",62008,25449
10485,24713,2020-04-03,2021-07-27,480,26,1.000000,2.500011,['42172'],0.016151,['FR8201762'],,0.137560,,1.933018,1.83122,"MULTIPOLYGON (((811919.689 6478616.697, 811919...",42172,16743
10486,111622,2022-03-21,2022-04-29,39,17,0.787099,1.220008,['41059'],,,1.040900,,,,,"MULTIPOLYGON (((586259.688 6703626.697, 586239...",41059,16331


In [11]:
# Add the "slope_area_ratio_percentage" field
# TODO: This field is a bit confusing, maybe we could use slope_area_ha instead
# TODO: "1 validation error for ClearCutReportPreviewSchema\nslope_area_ratio_percentage\n  Input should be a valid number [type=float_type, input_value=None, input_type=NoneType]\n    For further information visit https://errors.pydantic.dev/2.10/v/float_type"
sufosat["slope_area_ratio_percentage"] = sufosat["slope_area_ha"].fillna(0) / sufosat["area_ha"]
sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,bdf_deciduous_area_ha,bdf_mixed_area_ha,bdf_poplar_area_ha,bdf_resinous_area_ha,slope_area_ha,geometry,city_insee_code,city_id,slope_area_ratio_percentage
0,233244,2023-05-23,2023-08-15,84,14,1.000000,0.810005,['11227'],,,0.810005,,,,,"MULTIPOLYGON (((663379.688 6215656.697, 663379...",11227,3961,0.000000
1,249859,2021-03-26,2021-03-26,0,1,1.000000,0.020001,['23223'],,,0.003591,,,0.021839,,"MULTIPOLYGON (((596639.688 6527756.697, 596639...",23223,8389,0.000000
2,209988,2019-06-24,2019-09-09,77,13,1.000000,0.510004,['24426'],,,,0.483031,,0.018945,,"MULTIPOLYGON (((502139.688 6440696.697, 502129...",24426,8852,0.000000
3,234166,2021-09-23,2022-07-26,306,5,1.000000,0.780006,['31375'],,,0.250128,,,,,"MULTIPOLYGON (((553399.689 6230466.697, 553399...",31375,12412,0.000000
4,70456,2023-02-15,2024-05-11,451,69,0.733231,7.050023,"['08449', '08468']",,,7.050023,,,,,"MULTIPOLYGON (((806589.689 6961386.697, 806589...",08449,2905,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483,67084,2022-06-16,2022-09-27,103,14,1.000000,1.210008,['59227'],,,0.908611,,,,,"MULTIPOLYGON (((720879.688 7030346.697, 720869...",59227,23788,0.000000
10484,144754,2022-07-10,2022-09-29,81,4,0.354846,0.110003,['62008'],,,0.088281,,,,,"MULTIPOLYGON (((635169.689 7070776.697, 635169...",62008,25449,0.000000
10485,24713,2020-04-03,2021-07-27,480,26,1.000000,2.500011,['42172'],0.016151,['FR8201762'],,0.137560,,1.933018,1.83122,"MULTIPOLYGON (((811919.689 6478616.697, 811919...",42172,16743,0.732485
10486,111622,2022-03-21,2022-04-29,39,17,0.787099,1.220008,['41059'],,,1.040900,,,,,"MULTIPOLYGON (((586259.688 6703626.697, 586239...",41059,16331,0.000000


In [12]:
# TODO: We also need the created_at, updated_at, and status fields, maybe these could be auto-generated by the database?
sufosat["created_at"] = pd.Timestamp.utcnow()
sufosat["updated_at"] = pd.Timestamp.utcnow()
sufosat["status"] = "to_validate"

In [13]:
# Format our Sufosat dataframe for the clear_cuts_reports table
clear_cuts_reports = sufosat.rename(columns={"clear_cut_group": "id"})[
    ["id", "slope_area_ratio_percentage", "city_id", "created_at", "updated_at", "status"]
]
clear_cuts_reports

Unnamed: 0,id,slope_area_ratio_percentage,city_id,created_at,updated_at,status
0,233244,0.000000,3961,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
1,249859,0.000000,8389,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
2,209988,0.000000,8852,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
3,234166,0.000000,12412,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
4,70456,0.000000,2905,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
...,...,...,...,...,...,...
10483,67084,0.000000,23788,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
10484,144754,0.000000,25449,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
10485,24713,0.732485,16743,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate
10486,111622,0.000000,16331,2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,to_validate


In [14]:
# INSERT them in the database
clear_cuts_reports.to_sql("clear_cuts_reports", con=engine, if_exists="append", index=False)

1000

In [15]:
clear_cuts_reports["id"].max()

np.int64(255604)

In [16]:
# Update the sequence for the SERIAL column so that it doesn't generate an ID that already exists
with engine.connect() as conn:
    conn.execute(
        text(
            "SELECT setval('clear_cuts_reports_id_seq', (SELECT MAX(id) FROM clear_cuts_reports))"
        )
    )
    print(conn.execute(text("SELECT currval('clear_cuts_reports_id_seq')")).scalar())

255604


In [17]:
# Check the result
pd.read_sql("SELECT * FROM clear_cuts_reports LIMIT 10", con=engine)

Unnamed: 0,id,slope_area_ratio_percentage,created_at,updated_at,status,city_id,user_id
0,233244,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,3961,
1,249859,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,8389,
2,209988,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,8852,
3,234166,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,12412,
4,70456,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,2905,
5,144466,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,25664,
6,141845,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,24827,
7,98923,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,23297,
8,99843,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,29910,
9,91871,0.0,2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,to_validate,9217,


## Seed the `clear_cuts` table


In [18]:
# Transform Sufosat into the `clear_cuts` table format

# TODO: The CRS of the target table (EPSG:4326) differs from the CRS of current GeoDataFrame (EPSG:2154).
# We should use Lambert93 (EPSG:2154) as it's more accurate for France
sufosat = sufosat.to_crs(4326)

# TODO: representative_point vs centroid?
sufosat["location"] = sufosat.representative_point()

# Since this is the first seed of the database, the clear_cuts_reports.id is equal to clear_cuts.id
sufosat["report_id"] = sufosat["clear_cut_group"]

# Transform Sufosat into the `clear_cuts` table format
clear_cuts = sufosat.rename(
    columns={
        "clear_cut_group": "id",
        "area_ha": "area_hectare",
        # TODO: add ecological_zonings_area_ha here?
        "geometry": "boundary",
        "date_min": "observation_start_date",
        "date_max": "observation_end_date",
        # TODO: add concave_hull_score here?
    }
).set_geometry("boundary")

clear_cuts = clear_cuts[
    [
        "id",
        "area_hectare",
        "location",
        "boundary",
        "created_at",
        "updated_at",
        "observation_start_date",
        "observation_end_date",
        "report_id",
    ]
]
clear_cuts

Unnamed: 0,id,area_hectare,location,boundary,created_at,updated_at,observation_start_date,observation_end_date,report_id
0,233244,0.810005,POINT (2.5516 43.03925),"MULTIPOLYGON (((2.55099 43.03929, 2.55099 43.0...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2023-05-23,2023-08-15,233244
1,249859,0.020001,POINT (1.66825 45.84152),"MULTIPOLYGON (((1.66824 45.84161, 1.66824 45.8...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2021-03-26,2021-03-26,249859
2,209988,0.510004,POINT (0.48741 45.03732),"MULTIPOLYGON (((0.48697 45.03718, 0.48684 45.0...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2019-06-24,2019-09-09,209988
3,234166,0.780006,POINT (1.19802 43.15888),"MULTIPOLYGON (((1.19865 43.15839, 1.19865 43.1...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2021-09-23,2022-07-26,234166
4,70456,7.050023,POINT (4.47815 49.74318),"MULTIPOLYGON (((4.47792 49.74248, 4.47792 49.7...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2023-02-15,2024-05-11,70456
...,...,...,...,...,...,...,...,...,...
10483,67084,1.210008,POINT (3.29358 50.37107),"MULTIPOLYGON (((3.29309 50.37045, 3.29295 50.3...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2022-06-16,2022-09-27,67084
10484,144754,0.110003,POINT (2.08308 50.72971),"MULTIPOLYGON (((2.08342 50.73029, 2.08342 50.7...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2022-07-10,2022-09-29,144754
10485,24713,2.500011,POINT (4.43045 45.39892),"MULTIPOLYGON (((4.43055 45.39784, 4.43055 45.3...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2020-04-03,2021-07-27,24713
10486,111622,1.220008,POINT (1.49198 47.42296),"MULTIPOLYGON (((1.49124 47.42321, 1.49097 47.4...",2025-04-13 19:56:55.366173+00:00,2025-04-13 19:56:55.366672+00:00,2022-03-21,2022-04-29,111622


In [19]:
# INSERT them in the database
clear_cuts.to_postgis(
    "clear_cuts", con=engine, if_exists="append", index=False, chunksize=10000
)

In [20]:
# Update the sequence for the SERIAL column so that it doesn't generate an ID that already exists
with engine.connect() as conn:
    conn.execute(text("SELECT setval('clear_cuts_id_seq', (SELECT MAX(id) FROM clear_cuts))"))
    print(conn.execute(text("SELECT currval('clear_cuts_id_seq')")).scalar())

255604


In [21]:
# Check the result
gpd.read_postgis(
    "SELECT * FROM clear_cuts LIMIT 10", con=engine, geom_col="boundary", crs="EPSG:4326"
)

Unnamed: 0,id,area_hectare,location,boundary,created_at,updated_at,observation_start_date,observation_end_date,report_id
0,233244,0.810005,0101000020E6100000C82D5C7AAF690440C8F4A4200685...,"MULTIPOLYGON (((2.55099 43.03929, 2.55099 43.0...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2023-05-23,2023-08-15,233244
1,249859,0.020001,0101000020E61000004E0E0E5922B1FA3FCB0C2710B7EB...,"MULTIPOLYGON (((1.66824 45.84161, 1.66824 45.8...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2021-03-26,2021-03-26,249859
2,209988,0.510004,0101000020E6100000A0FE9ACFB531DF3FA8E9CFEFC684...,"MULTIPOLYGON (((0.48697 45.03718, 0.48684 45.0...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2019-06-24,2019-09-09,209988
3,7577,5.380021,0101000020E61000000C7BEBB8509B1340D0FD9D084BE5...,"MULTIPOLYGON (((4.89932 43.79153, 4.89932 43.7...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2021-11-18,2022-03-23,7577
4,234166,0.780006,0101000020E6100000DA1B2EFE1A2BF33FDEB3C4065694...,"MULTIPOLYGON (((1.19865 43.15839, 1.19865 43.1...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2021-09-23,2022-07-26,234166
5,70456,7.050023,0101000020E610000096BBC84D9FE911403C4ABE5D20DF...,"MULTIPOLYGON (((4.47792 49.74248, 4.47792 49.7...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2023-02-15,2024-05-11,70456
6,144466,0.730005,0101000020E61000002A7DAFE08D82FF3F42A861D38465...,"MULTIPOLYGON (((1.96873 50.79355, 1.96873 50.7...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2018-03-30,2019-08-07,144466
7,249230,0.01,0101000020E61000001A7113B46180EEBF4CD033A6D2A2...,"MULTIPOLYGON (((-0.95324 45.2721, -0.95311 45....",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2021-03-31,2021-03-31,249230
8,197006,5.760026,0101000020E6100000A6EC93123623EB3F54E3D5CCF28C...,"MULTIPOLYGON (((0.8464 45.09929, 0.84615 45.09...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2020-02-20,2022-03-28,197006
9,103600,7.400016,0101000020E6100000D2E55698E4D30F402C0857315180...,"MULTIPOLYGON (((3.9772 47.00186, 3.9772 47.001...",2025-04-13 19:56:55.366173,2025-04-13 19:56:55.366672,2021-02-16,2022-08-02,103600


## Seed the `clear_cut_ecological_zoning` table


In [22]:
# Retrieve the generated `ecological_zoning_id`
ecological_zonings_ids = pd.read_sql(
    "SELECT code AS natura2000_code, id AS ecological_zoning_id FROM ecological_zonings",
    con=engine,
)
ecological_zonings_ids

Unnamed: 0,natura2000_code,ecological_zoning_id
0,FR9400591,1
1,FR9402009,2
2,FR9400608,3
3,FR9400592,4
4,FR9402015,5
...,...,...
1757,FR7212015,1758
1758,FR7200753,1759
1759,FR7212005,1760
1760,FR7200756,1761


In [23]:
clear_cut_ecological_zoning = (
    sufosat.rename(columns={"clear_cut_group": "clear_cut_id"})
    .set_index("clear_cut_id")["natura2000_codes"]
    .dropna()
    .apply(ast.literal_eval)  # List of strings don't seem to be supported by FlatGeoBuf
    .explode()  # Explode the list of zones into individual rows
    .rename("natura2000_code")
).reset_index()
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,natura2000_code
0,144466,FR3100485
1,141845,FR2200373
2,53525,FR4312028
3,53525,FR4301346
4,98599,FR2601017
...,...,...
2105,165829,FR7200680
2106,166883,FR5400450
2107,21116,FR8312002
2108,6620,FR9301595


In [24]:
# Join the "natura2000_code" from Sufosat with the "ecological_zoning_id" from the database
clear_cut_ecological_zoning = clear_cut_ecological_zoning.merge(
    ecological_zonings_ids, on="natura2000_code"
).drop(columns="natura2000_code")
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,ecological_zoning_id
0,144466,1087
1,141845,1117
2,53525,859
3,53525,860
4,98599,1303
...,...,...
2105,165829,1625
2106,166883,1586
2107,21116,359
2108,6620,300


In [25]:
# TODO: we don't have "area_hectare" for each zone in the dataeng model
# Anyways this field wouldn't be useful as we cannot sum it since some ecological zones overlap
clear_cut_ecological_zoning["area_hectare"] = 0
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,ecological_zoning_id,area_hectare
0,144466,1087,0
1,141845,1117,0
2,53525,859,0
3,53525,860,0
4,98599,1303,0
...,...,...,...
2105,165829,1625,0
2106,166883,1586,0
2107,21116,359,0
2108,6620,300,0


In [26]:
clear_cut_ecological_zoning.to_sql(
    "clear_cut_ecological_zoning", con=engine, if_exists="append", index=False
)

110

In [27]:
# Check the result
pd.read_sql("SELECT * FROM clear_cut_ecological_zoning LIMIT 10", con=engine)

Unnamed: 0,clear_cut_id,ecological_zoning_id,area_hectare
0,144466,1087,0.0
1,141845,1117,0.0
2,53525,859,0.0
3,53525,860,0.0
4,98599,1303,0.0
5,98599,1302,0.0
6,252084,1644,0.0
7,105709,1309,0.0
8,105709,1308,0.0
9,65472,1023,0.0
