The goal of this notebook is to seed the following tables:

- `ecological_zonings`
- `clear_cuts_reports`
- `clear_cuts`
- `clear_cut_ecological_zoning`

Here is the backend's database schema:

![](database_schema.png)


In [1]:
import ast
import sys

import geopandas as gpd
import pandas as pd
from sqlalchemy import create_engine, inspect, text

sys.path.append("..")

from scripts import DATA_DIR

In [2]:
# !pip install sqlalchemy psycopg2 geoalchemy2

In [None]:
# Create a connection to the database
# TODO: use env var
DATABASE_URL = "postgresql://devuser:devuser@localhost:5432/local"
engine = create_engine(DATABASE_URL, plugins=["geoalchemy2"])

## ⚠️ Danger zone - wipe the database ⚠️


In [None]:
# ⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️
# ⚠️ Danger zone - wipe the database ⚠️
# ⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️⚠️

table_names = [
    table_name
    for table_name in inspect(engine).get_table_names()
    if table_name
    not in [
        "cities",
        "departments",
        "spatial_ref_sys",
    ]  # Leave cities, departments, and spatial_ref_sys untouched
]
# Check row counts in each table
with engine.connect() as conn:
    for table_name in table_names:
        row_count = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).one()[0]
        print(f"Number of rows in the {table_name} table: {row_count}")

    print("\nWiping the database...\n")
    truncate_stmt = f"TRUNCATE TABLE {', '.join(table_names)} RESTART IDENTITY CASCADE"
    conn.execute(text(truncate_stmt))
    conn.commit()

    # Check row counts in each table
    for table_name in table_names:
        row_count = conn.execute(text(f"SELECT COUNT(*) FROM {table_name}")).one()[0]
        print(f"Number of rows in the {table_name} table: {row_count}")

Number of rows in the clear_cut_ecological_zoning table: 0
Number of rows in the ecological_zonings table: 0
Number of rows in the alembic_version table: 0
Number of rows in the users table: 0
Number of rows in the clear_cuts table: 0
Number of rows in the user_department table: 0
Number of rows in the clear_cuts_reports table: 0

Wiping the database...

Number of rows in the clear_cut_ecological_zoning table: 0
Number of rows in the ecological_zonings table: 0
Number of rows in the alembic_version table: 0
Number of rows in the users table: 0
Number of rows in the clear_cuts table: 0
Number of rows in the user_department table: 0
Number of rows in the clear_cuts_reports table: 0


## Seed the `ecological_zonings` table


In [6]:
# Load the Natura 2000 codes
natura2000_concat = gpd.read_file(DATA_DIR / "natura2000/natura2000_concat.fgb").drop(
    columns="geometry"
)
natura2000_concat = natura2000_concat.rename(columns={"type": "sub_type"})
natura2000_concat["type"] = "Natura 2000"
natura2000_concat

Unnamed: 0,sub_type,code,name,type
0,SIC,FR9400591,Plateau de Pertusato/ Bonifacio et îles Lavezzi,Natura 2000
1,SIC,FR9402009,Mare temporaire de Musella/Bonifacio,Natura 2000
2,SIC,FR9400608,Mares temporaires du terrain militaire de Fras...,Natura 2000
3,SIC,FR9400592,Ventilegne-la Trinite de Bonifacio-Fazzio,Natura 2000
4,SIC,FR9402015,"Bouches de Bonifacio, Iles des Moines",Natura 2000
...,...,...,...,...
1757,ZPS,FR7212015,Haute Cize : Pic d'Herrozate et forêt d'Orion,Natura 2000
1758,SIC,FR7200753,Forêt d'Iraty,Natura 2000
1759,ZPS,FR7212005,"Haute Soule : forêt d'Iraty, Orgambidexka et P...",Natura 2000
1760,SIC,FR7200756,Montagnes des Aldudes,Natura 2000


In [7]:
# INSERT them in the database
natura2000_concat.to_sql("ecological_zonings", con=engine, if_exists="append", index=False)

762

In [8]:
# Check the result
pd.read_sql("SELECT * FROM ecological_zonings LIMIT 10", con=engine)

Unnamed: 0,id,type,sub_type,name,code
0,1,Natura 2000,SIC,Plateau de Pertusato/ Bonifacio et îles Lavezzi,FR9400591
1,2,Natura 2000,SIC,Mare temporaire de Musella/Bonifacio,FR9402009
2,3,Natura 2000,SIC,Mares temporaires du terrain militaire de Fras...,FR9400608
3,4,Natura 2000,SIC,Ventilegne-la Trinite de Bonifacio-Fazzio,FR9400592
4,5,Natura 2000,SIC,"Bouches de Bonifacio, Iles des Moines",FR9402015
5,6,Natura 2000,ZPS,"Iles Lavezzi, Bouches de Bonifacio",FR9410021
6,7,Natura 2000,SIC,"Iles et pointe Bruzzi, étangs de Chevanu et d'...",FR9400609
7,8,Natura 2000,SIC,"Tre Padule de Suartone, Rondinara",FR9400590
8,9,Natura 2000,SIC,Iles Cerbicale et frange littoral,FR9400587
9,10,Natura 2000,ZPS,Iles Cerbicale,FR9410022


## Seed the `clear_cuts_reports` table


In [None]:
# Load our Sufosat enriched dataframe
# Sample 10k clear-cuts for PROD tests
sufosat = gpd.read_file(DATA_DIR / "sufosat/sufosat_clusters_enriched.fgb").sample(10000)
sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,slope_area_ha,geometry
98601,94359,2021-02-16,2023-08-04,899,87,0.683684,7.500026,['39120'],7.500026,"['FR4312027', 'FR4301330']",0.251076,"MULTIPOLYGON (((923399.689 6620646.697, 923399..."
27238,25664,2021-05-04,2023-03-02,667,23,0.792394,2.500010,['42107'],0.000000,,0.000000,"MULTIPOLYGON (((775939.689 6490576.697, 775939..."
248804,233646,2024-02-23,2024-07-03,131,20,0.873304,1.930008,['09047'],1.930008,['FR7312008'],0.000000,"MULTIPOLYGON (((614109.688 6201126.697, 614099..."
135533,129259,2024-08-21,2024-11-01,72,19,0.824905,1.060006,['49347'],0.000000,,0.000000,"MULTIPOLYGON (((440859.688 6728926.697, 440849..."
169654,161320,2020-08-16,2020-11-03,79,5,1.000000,0.530005,['40075'],0.000000,,0.000000,"MULTIPOLYGON (((366799.689 6317556.697, 366799..."
...,...,...,...,...,...,...,...,...,...,...,...,...
17955,16944,2021-02-08,2021-12-30,325,24,1.000000,2.210009,['30067'],0.000000,,0.000000,"MULTIPOLYGON (((822229.689 6328976.697, 822229..."
56660,54006,2019-07-08,2020-09-08,428,21,0.719596,2.130011,['88498'],0.014899,['FR4112003'],1.649671,"MULTIPOLYGON (((975919.689 6768756.697, 975919..."
98525,94289,2022-04-23,2022-08-10,109,9,1.000000,0.660004,['39228'],0.660004,"['FR4301328', 'FR4312023']",0.000000,"MULTIPOLYGON (((935069.689 6624156.697, 935069..."
144097,137439,2019-05-12,2021-09-01,843,90,0.574541,6.070032,['45346'],0.000000,,0.000000,"MULTIPOLYGON (((640889.688 6757086.697, 640889..."


In [10]:
# Retrieve the generated `id` for the `cities` table
city_ids = pd.read_sql(
    "SELECT id AS city_id, zip_code AS city_insee_code FROM cities", con=engine
)
city_ids

Unnamed: 0,city_id,city_insee_code
0,1,01002
1,2,01004
2,3,01005
3,4,01006
4,5,01007
...,...,...
37538,37539,97613
37539,37540,97614
37540,37541,97615
37541,37542,97616


In [11]:
# TODO: In the dataeng data model, we have several cities that can intersect with a clear-cut
# However, in the backend model we have just one. For now, we arbitrarily take the first one in the list
sufosat["city_insee_code"] = sufosat["cities"].apply(ast.literal_eval).str[0]

In [12]:
# Add the city_id column to our sufosat clear-cuts
length_before_merge = len(sufosat)
sufosat = sufosat.merge(city_ids, on="city_insee_code", how="left")

# TODO: There is a discrepency between the cities codes used in the dataeng pipeline and in the backend
# For example, some in the backend table, some zip_code are associated with multiple city names, e.g.,

#   id	    zip_code    name	        department_id
# 0	13884	34246	    Entre-Vignes	1
# 1	13885	34246	    Saint-Christol	1

# There is also the "01001" insee_code that is missing from the backend table

# For now, to mitigate this,
# we'll drop records with missing city_id and
# we'll drop records that have different city_id for the same insee_code
# Therefore we have to select a city_id randomly because the join causes duplicated records otherwise
sufosat = sufosat.dropna(subset="city_id").drop_duplicates("clear_cut_group")

# Make sure we don't remove too much records
assert sufosat["city_id"].isna().sum() == 0 and len(sufosat) >= length_before_merge - 100

sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,slope_area_ha,geometry,city_insee_code,city_id
0,94359,2021-02-16,2023-08-04,899,87,0.683684,7.500026,['39120'],7.500026,"['FR4312027', 'FR4301330']",0.251076,"MULTIPOLYGON (((923399.689 6620646.697, 923399...",39120,15518
1,25664,2021-05-04,2023-03-02,667,23,0.792394,2.500010,['42107'],0.000000,,0.000000,"MULTIPOLYGON (((775939.689 6490576.697, 775939...",42107,16680
2,233646,2024-02-23,2024-07-03,131,20,0.873304,1.930008,['09047'],1.930008,['FR7312008'],0.000000,"MULTIPOLYGON (((614109.688 6201126.697, 614099...",09047,3006
3,129259,2024-08-21,2024-11-01,72,19,0.824905,1.060006,['49347'],0.000000,,0.000000,"MULTIPOLYGON (((440859.688 6728926.697, 440849...",49347,18975
4,161320,2020-08-16,2020-11-03,79,5,1.000000,0.530005,['40075'],0.000000,,0.000000,"MULTIPOLYGON (((366799.689 6317556.697, 366799...",40075,16012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483,16944,2021-02-08,2021-12-30,325,24,1.000000,2.210009,['30067'],0.000000,,0.000000,"MULTIPOLYGON (((822229.689 6328976.697, 822229...",30067,11754
10484,54006,2019-07-08,2020-09-08,428,21,0.719596,2.130011,['88498'],0.014899,['FR4112003'],1.649671,"MULTIPOLYGON (((975919.689 6768756.697, 975919...",88498,36298
10485,94289,2022-04-23,2022-08-10,109,9,1.000000,0.660004,['39228'],0.660004,"['FR4301328', 'FR4312023']",0.000000,"MULTIPOLYGON (((935069.689 6624156.697, 935069...",39228,15614
10486,137439,2019-05-12,2021-09-01,843,90,0.574541,6.070032,['45346'],0.000000,,0.000000,"MULTIPOLYGON (((640889.688 6757086.697, 640889...",45346,17734


In [13]:
# Add the "slope_area_ratio_percentage" field
# TODO: This field is a bit confusing, maybe we could use slope_area_ha instead
sufosat["slope_area_ratio_percentage"] = sufosat["slope_area_ha"] / sufosat["area_ha"]
sufosat

Unnamed: 0,clear_cut_group,date_min,date_max,days_delta,clear_cut_group_size,concave_hull_score,area_ha,cities,natura2000_area_ha,natura2000_codes,slope_area_ha,geometry,city_insee_code,city_id,slope_area_ratio_percentage
0,94359,2021-02-16,2023-08-04,899,87,0.683684,7.500026,['39120'],7.500026,"['FR4312027', 'FR4301330']",0.251076,"MULTIPOLYGON (((923399.689 6620646.697, 923399...",39120,15518,0.033477
1,25664,2021-05-04,2023-03-02,667,23,0.792394,2.500010,['42107'],0.000000,,0.000000,"MULTIPOLYGON (((775939.689 6490576.697, 775939...",42107,16680,0.000000
2,233646,2024-02-23,2024-07-03,131,20,0.873304,1.930008,['09047'],1.930008,['FR7312008'],0.000000,"MULTIPOLYGON (((614109.688 6201126.697, 614099...",09047,3006,0.000000
3,129259,2024-08-21,2024-11-01,72,19,0.824905,1.060006,['49347'],0.000000,,0.000000,"MULTIPOLYGON (((440859.688 6728926.697, 440849...",49347,18975,0.000000
4,161320,2020-08-16,2020-11-03,79,5,1.000000,0.530005,['40075'],0.000000,,0.000000,"MULTIPOLYGON (((366799.689 6317556.697, 366799...",40075,16012,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10483,16944,2021-02-08,2021-12-30,325,24,1.000000,2.210009,['30067'],0.000000,,0.000000,"MULTIPOLYGON (((822229.689 6328976.697, 822229...",30067,11754,0.000000
10484,54006,2019-07-08,2020-09-08,428,21,0.719596,2.130011,['88498'],0.014899,['FR4112003'],1.649671,"MULTIPOLYGON (((975919.689 6768756.697, 975919...",88498,36298,0.774489
10485,94289,2022-04-23,2022-08-10,109,9,1.000000,0.660004,['39228'],0.660004,"['FR4301328', 'FR4312023']",0.000000,"MULTIPOLYGON (((935069.689 6624156.697, 935069...",39228,15614,0.000000
10486,137439,2019-05-12,2021-09-01,843,90,0.574541,6.070032,['45346'],0.000000,,0.000000,"MULTIPOLYGON (((640889.688 6757086.697, 640889...",45346,17734,0.000000


In [14]:
# TODO: We also need the created_at, updated_at, and status fields, maybe these could be auto-generated by the database?
sufosat["created_at"] = pd.Timestamp.utcnow()
sufosat["updated_at"] = pd.Timestamp.utcnow()
sufosat["status"] = "to_validate"

In [15]:
# Format our Sufosat dataframe for the clear_cuts_reports table
clear_cuts_reports = sufosat.rename(columns={"clear_cut_group": "id"})[
    ["id", "slope_area_ratio_percentage", "city_id", "created_at", "updated_at", "status"]
]
clear_cuts_reports

Unnamed: 0,id,slope_area_ratio_percentage,city_id,created_at,updated_at,status
0,94359,0.033477,15518,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
1,25664,0.000000,16680,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
2,233646,0.000000,3006,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
3,129259,0.000000,18975,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
4,161320,0.000000,16012,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
...,...,...,...,...,...,...
10483,16944,0.000000,11754,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
10484,54006,0.774489,36298,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
10485,94289,0.000000,15614,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate
10486,137439,0.000000,17734,2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,to_validate


In [16]:
# INSERT them in the database
clear_cuts_reports.to_sql("clear_cuts_reports", con=engine, if_exists="append", index=False)

1000

In [17]:
clear_cuts_reports["id"].max()

np.int64(255625)

In [18]:
# Update the sequence for the SERIAL column so that it doesn't generate an ID that already exists
with engine.connect() as conn:
    conn.execute(
        text(
            "SELECT setval('clear_cuts_reports_id_seq', (SELECT MAX(id) FROM clear_cuts_reports))"
        )
    )
    print(conn.execute(text("SELECT currval('clear_cuts_reports_id_seq')")).scalar())

255625


In [19]:
# Check the result
pd.read_sql("SELECT * FROM clear_cuts_reports LIMIT 10", con=engine)

Unnamed: 0,id,slope_area_ratio_percentage,created_at,updated_at,status,city_id,user_id
0,94359,0.033477,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,15518,
1,25664,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,16680,
2,233646,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,3006,
3,129259,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,18975,
4,161320,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,16012,
5,93394,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,15841,
6,57615,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,28539,
7,211659,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,8652,
8,27813,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,17138,
9,223510,0.0,2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,to_validate,16267,


## Seed the `clear_cuts` table


In [20]:
# Transform Sufosat into the `clear_cuts` table format

# TODO: The CRS of the target table (EPSG:4326) differs from the CRS of current GeoDataFrame (EPSG:2154).
# We should use Lambert93 (EPSG:2154) as it's more accurate for France
sufosat = sufosat.to_crs(4326)

# TODO: representative_point vs centroid?
sufosat["location"] = sufosat.representative_point()

# Since this is the first seed of the database, the clear_cuts_reports.id is equal to clear_cuts.id
sufosat["report_id"] = sufosat["clear_cut_group"]

# Transform Sufosat into the `clear_cuts` table format
clear_cuts = sufosat.rename(
    columns={
        "clear_cut_group": "id",
        "area_ha": "area_hectare",
        # TODO: add ecological_zonings_area_ha here?
        "geometry": "boundary",
        "date_min": "observation_start_date",
        "date_max": "observation_end_date",
        # TODO: add concave_hull_score here?
    }
).set_geometry("boundary")

clear_cuts = clear_cuts[
    [
        "id",
        "area_hectare",
        "location",
        "boundary",
        "created_at",
        "updated_at",
        "observation_start_date",
        "observation_end_date",
        "report_id",
    ]
]
clear_cuts

Unnamed: 0,id,area_hectare,location,boundary,created_at,updated_at,observation_start_date,observation_end_date,report_id
0,94359,7.500026,POINT (5.92415 46.64998),"MULTIPOLYGON (((5.92166 46.64869, 5.92166 46.6...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2021-02-16,2023-08-04,94359
1,25664,2.500010,POINT (3.973 45.51175),"MULTIPOLYGON (((3.97259 45.51045, 3.97259 45.5...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2021-05-04,2023-03-02,25664
2,233646,1.930008,POINT (1.94991 42.90446),"MULTIPOLYGON (((1.9493 42.90442, 1.94918 42.90...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2024-02-23,2024-07-03,233646
3,129259,1.060006,POINT (-0.45016 47.61039),"MULTIPOLYGON (((-0.45038 47.60976, -0.45052 47...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2024-08-21,2024-11-01,129259
4,161320,0.530005,POINT (-1.14714 43.87881),"MULTIPOLYGON (((-1.14735 43.87825, -1.14735 43...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2020-08-16,2020-11-03,161320
...,...,...,...,...,...,...,...,...,...
10483,16944,2.210009,POINT (4.52518 44.04975),"MULTIPOLYGON (((4.52535 44.04917, 4.52535 44.0...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2021-02-08,2021-12-30,16944
10484,54006,2.130011,POINT (6.69865 47.96123),"MULTIPOLYGON (((6.6983 47.96104, 6.6983 47.961...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2019-07-08,2020-09-08,54006
10485,94289,0.660004,POINT (6.07661 46.67681),"MULTIPOLYGON (((6.07592 46.67628, 6.07592 46.6...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2022-04-23,2022-08-10,94289
10486,137439,6.070032,POINT (2.21038 47.91291),"MULTIPOLYGON (((2.20873 47.91151, 2.20873 47.9...",2025-04-08 17:14:34.307819+00:00,2025-04-08 17:14:34.308333+00:00,2019-05-12,2021-09-01,137439


In [21]:
# INSERT them in the database
clear_cuts.to_postgis(
    "clear_cuts", con=engine, if_exists="append", index=False, chunksize=10000
)

In [22]:
# Update the sequence for the SERIAL column so that it doesn't generate an ID that already exists
with engine.connect() as conn:
    conn.execute(text("SELECT setval('clear_cuts_id_seq', (SELECT MAX(id) FROM clear_cuts))"))
    print(conn.execute(text("SELECT currval('clear_cuts_id_seq')")).scalar())

255625


In [23]:
# Check the result
gpd.read_postgis(
    "SELECT * FROM clear_cuts LIMIT 10", con=engine, geom_col="boundary", crs="EPSG:4326"
)

Unnamed: 0,id,area_hectare,location,boundary,created_at,updated_at,observation_start_date,observation_end_date,report_id
0,94359,7.500026,0101000020E6100000A2BBCC5655B21740C19623883253...,"MULTIPOLYGON (((5.92166 46.64869, 5.92166 46.6...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2021-02-16,2023-08-04,94359
1,25664,2.50001,0101000020E6100000C8A5FD3FB5C80F407437732881C1...,"MULTIPOLYGON (((3.97259 45.51045, 3.97259 45.5...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2021-05-04,2023-03-02,25664
2,57615,0.500004,0101000020E61000003010DA26454220400A64A6E26E73...,"MULTIPOLYGON (((8.12892 48.90147, 8.12891 48.9...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2024-05-06,2024-07-29,57615
3,211659,3.950021,0101000020E6100000B904609B2A74A23F1CCD0630966C...,"MULTIPOLYGON (((0.03449 44.84773, 0.03449 44.8...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2023-02-08,2024-07-20,211659
4,143015,3.160015,0101000020E610000062850B350979FF3F1A1AB2BDC9FD...,"MULTIPOLYGON (((1.96583 49.98119, 1.96583 49.9...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2019-07-02,2020-07-12,143015
5,15773,8.540026,0101000020E6100000E8829DAA200B13409EF0E9302841...,"MULTIPOLYGON (((4.75767 44.50747, 4.75767 44.5...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2021-09-24,2024-05-31,15773
6,233646,1.930008,0101000020E6100000181CFF91D832FF3F1FF75158C573...,"MULTIPOLYGON (((1.9493 42.90442, 1.94918 42.90...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2024-02-23,2024-07-03,233646
7,97260,0.470004,0101000020E61000002080F7D9A3C71340A8FDF032D04C...,"MULTIPOLYGON (((4.94561 46.59988, 4.94561 46.5...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2022-04-04,2022-07-29,97260
8,160946,4.150015,0101000020E6100000A6F4CC0E5AECF4BF24163C90D50A...,"MULTIPOLYGON (((-1.30939 44.08377, -1.30976 44...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2020-12-14,2022-01-01,160946
9,58299,1.880013,0101000020E61000003012449921921D404669220B7A72...,"MULTIPOLYGON (((7.3923 48.89439, 7.3923 48.894...",2025-04-08 17:14:34.307819,2025-04-08 17:14:34.308333,2019-06-10,2020-08-03,58299


## Seed the `clear_cut_ecological_zoning` table


In [24]:
# Retrieve the generated `ecological_zoning_id`
ecological_zonings_ids = pd.read_sql(
    "SELECT code AS natura2000_code, id AS ecological_zoning_id FROM ecological_zonings",
    con=engine,
)
ecological_zonings_ids

Unnamed: 0,natura2000_code,ecological_zoning_id
0,FR9400591,1
1,FR9402009,2
2,FR9400608,3
3,FR9400592,4
4,FR9402015,5
...,...,...
1757,FR7212015,1758
1758,FR7200753,1759
1759,FR7212005,1760
1760,FR7200756,1761


In [25]:
clear_cut_ecological_zoning = (
    sufosat.rename(columns={"clear_cut_group": "clear_cut_id"})
    .set_index("clear_cut_id")["natura2000_codes"]
    .dropna()
    .apply(ast.literal_eval)  # List of strings don't seem to be supported by FlatGeoBuf
    .explode()  # Explode the list of zones into individual rows
    .rename("natura2000_code")
).reset_index()
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,natura2000_code
0,94359,FR4312027
1,94359,FR4301330
2,233646,FR7312008
3,57615,FR4201797
4,57615,FR4211811
...,...,...
2101,84732,FR4312015
2102,84732,FR4301344
2103,54006,FR4112003
2104,94289,FR4301328


In [26]:
# Join the "natura2000_code" from Sufosat with the "ecological_zoning_id" from the database
clear_cut_ecological_zoning = clear_cut_ecological_zoning.merge(
    ecological_zonings_ids, on="natura2000_code"
).drop(columns="natura2000_code")
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,ecological_zoning_id
0,94359,775
1,94359,774
2,233646,181
3,57615,926
4,57615,929
...,...,...
2101,84732,872
2102,84732,871
2103,54006,844
2104,94289,798


In [27]:
# TODO: we don't have "area_hectare" for each zone in the dataeng model
# Anyways this field wouldn't be useful as we cannot sum it since some ecological zones overlap
clear_cut_ecological_zoning["area_hectare"] = 0
clear_cut_ecological_zoning

Unnamed: 0,clear_cut_id,ecological_zoning_id,area_hectare
0,94359,775,0
1,94359,774,0
2,233646,181,0
3,57615,926,0
4,57615,929,0
...,...,...,...
2101,84732,872,0
2102,84732,871,0
2103,54006,844,0
2104,94289,798,0


In [28]:
clear_cut_ecological_zoning.to_sql(
    "clear_cut_ecological_zoning", con=engine, if_exists="append", index=False
)

106

In [29]:
# Check the result
pd.read_sql("SELECT * FROM clear_cut_ecological_zoning LIMIT 10", con=engine)

Unnamed: 0,clear_cut_id,ecological_zoning_id,area_hectare
0,94359,775,0.0
1,94359,774,0.0
2,233646,181,0.0
3,57615,926,0.0
4,57615,929,0.0
5,223510,1675,0.0
6,234814,185,0.0
7,234814,186,0.0
8,53508,859,0.0
9,53508,860,0.0
