In [8]:
import duckdb
from pipelines.tasks.config.common import DUCKDB_FILE
from pipelines.tasks.client.https_client import HTTPSClient
from pipelines.tasks.config.common import CACHE_FOLDER
import json
import os
from tqdm import tqdm
import pandas as pd
import geopandas as gpd

In [9]:
# la donnée geo est telechargé depuis https://public.opendatasoft.com/explore/dataset/georef-france-commune/information
https_client = HTTPSClient(
    "https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/"
)

In [6]:
path = "georef-france-commune/exports/geojson?lang=fr&timezone=Africa%2FLagos"
filepath = os.path.join(CACHE_FOLDER, "georef-france-commune.geojson")
https_client.download_file_from_https(path, filepath)

Processing file georef-france-commune.geojson: |                                         | 285M/0.00


'georef-france-commune.geojson'

In [7]:
print(CACHE_FOLDER)
print(filepath)

/Users/jaouadsalahy/Documents/IT/13_pollution_eau/database/cache
/Users/jaouadsalahy/Documents/IT/13_pollution_eau/database/cache/georef-france-commune.geojson


In [10]:
con = duckdb.connect(database=DUCKDB_FILE, read_only=True)

In [11]:
# Show all tables in the database
query = """
SHOW TABLES;
"""

tables = con.sql(query)
tables_df = tables.df()
tables_df

Unnamed: 0,name
0,cog_communes
1,edc_communes
2,edc_prelevements
3,edc_resultats
4,laposte_communes


In [None]:
query_2024 = """
select * from ana__resultats_communes
"""

prelevements_2024 = con.sql(query_2024)
prelevements_2024_df = prelevements_2024.df()
prelevements_2024_df.head(2)

In [13]:
with open(os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"), "r") as file:
    data_geo = json.load(file)
    print("Top-level GeoJSON structure:")
    print("-" * 40)
    for key, value in data_geo.items():
        if key == "features":
            print(f"- {key}: Array with {len(value)} features")
        elif key == "crs":
            print(f"- {key}:")
            print(json.dumps(value, indent=2))
        else:
            print(f"- {key}: {value}")


Top-level GeoJSON structure:
----------------------------------------
- type: FeatureCollection
- features: Array with 34948 features


In [14]:
def parse_geojson_structure(data, level=0, max_items=1):
    """
    Recursively parse GeoJSON structure and print key-value pairs
    Args:
        data: dictionary or value to parse
        level: current indentation level
        max_items: maximum number of items to show for lists
    """
    indent = "  " * level

    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, (dict, list)):
                print(f"{indent}- {key}:")
                parse_geojson_structure(value, level + 1, max_items)
            else:
                print(f"{indent}- {key}: {value}")

    elif isinstance(data, list):
        print(f"{indent}List with {len(data)} items")
        if data and max_items > 0:
            print(f"{indent}First item:")
            parse_geojson_structure(data[0], level + 1, max_items)


# Read and parse the GeoJSON
with open(os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"), "r") as file:
    data_geo = json.load(file)
    print("GeoJSON Structure:")
    print("-" * 40)
    parse_geojson_structure(data_geo)


GeoJSON Structure:
----------------------------------------
- type: FeatureCollection
- features:
  List with 34948 items
  First item:
    - type: Feature
    - geometry:
      - coordinates:
        List with 1 items
        First item:
          List with 254 items
          First item:
            List with 2 items
            First item:
      - type: Polygon
    - properties:
      - geo_point_2d:
        - lon: 2.063226732075124
        - lat: 45.445282333838946
      - year: 2024
      - reg_code:
        List with 1 items
        First item:
      - reg_name:
        List with 1 items
        First item:
      - dep_code:
        List with 1 items
        First item:
      - dep_name:
        List with 1 items
        First item:
      - arrdep_code:
        List with 1 items
        First item:
      - arrdep_name:
        List with 1 items
        First item:
      - ze2020_code:
        List with 1 items
        First item:
      - ze2020_name:
        List with 1 items
   

In [None]:
with open(os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"), "r") as file:
    data_geo = json.load(file)
    # Read the GeoJSON file
    # gdf = gpd.read_file(os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"))

    gdf = gpd.read_file(
        os.path.join(CACHE_FOLDER, "georef-france-commune.geojson"),
        dtype={
            "reg_code": str,
            "reg_name": str,
            "dep_code": str,
            "dep_name": str,
            "arrdep_code": str,
            "arrdep_name": str,
            "ze2020_code": str,
            "ze2020_name": str,
            "bv2022_code": str,
            "bv2022_name": str,
            "epci_code": str,
            "epci_name": str,
            "ept_code": str,
            "ept_name": str,
            "com_code": str,
            "com_current_code": str,
            "com_name": str,
        },
    )

    # Print basic information about the GeoJSON
    # print("GeoJSON Info:")
    # print("-" * 40)
    # print(f"Number of features: {len(gdf)}")

In [15]:
# Print data types of all columns
print("Column Data Types:")
print("-" * 50)
print(gdf.dtypes)


Column Data Types:
--------------------------------------------------


NameError: name 'gdf' is not defined

In [None]:
# For a more detailed view with null counts
print("\nDetailed DataFrame Info:")
print("-" * 50)
gdf.info()


In [None]:
# Specific information about the geometry column
print("\nGeometry Column Info:")
print("-" * 50)
print(f"Geometry type: {gdf.geometry.geom_type.unique()}")
print(f"CRS: {gdf.crs}")


In [None]:
# Si la solution est validée, il faudra optimiser ce code qui est bien trop lent
data_geo_features = data_geo["features"]
default_properties = {"annee": "2024", "resultat_cvm": "None"}
for i in tqdm(range(len(data_geo_features))):
    elem = data_geo_features[i]
    code_insee = elem["properties"]["com_code"]
    name_insee = elem["properties"]["com_name"]
    if code_insee is not None:
        code_insee = code_insee[0]
        name_insee = name_insee[0]
        prelevement = prelevements_2024_df[
            (prelevements_2024_df.commune_code_insee == code_insee)
        ]
        if len(prelevement) >= 0:
            properties = {}
            properties["commune_code_insee"] = code_insee
            properties["commune_nom"] = name_insee
            properties["resultat_cvm"] = {}
            for index, row in prelevement.iterrows():
                properties["resultat_cvm"][row["annee"]] = row["resultat_cvm"]
            elem["properties"] = properties
        else:
            # print("elem not found")
            elem["properties"] = default_properties
            elem["properties"]["commune_code_insee"] = code_insee
            elem["properties"]["commune_nom"] = name_insee

In [None]:
new_geo_json = {"type": "FeatureCollection"}
new_geo_json["features"] = data_geo_features

In [None]:
# Write the dictionary to a GeoJSON file
filename = "georef-france-commune-prelevement.geojson"
write_filepath = os.path.join(CACHE_FOLDER, filename)
with open(
    write_filepath,
    "w",
    encoding="utf-8",
) as file:
    json.dump(new_geo_json, file)

In [None]:
from pipelines.utils.storage_client import ObjectStorageClient
from pipelines.config.config import load_env_variables

load_env_variables()
s3 = ObjectStorageClient()

db_path = DUCKDB_FILE  # Fichier local
s3_path = "dev/geojson/georef-france-commune-prelevement.geojson.removeme"  # Destination sur S3

s3.upload_object(local_path=write_filepath, file_key=s3_path, public_read=True)
print(f"✅ geojson uploadée sur s3://{s3.bucket_name}/{s3_path}")
# La donnée peut maintenant être récupéré depuis https://pollution-eau-s3.s3.fr-par.scw.cloud/dev/geojson/georef-france-commune-prelevement.geojson.removeme

# Tests


In [None]:
df_geo = pd.DataFrame(new_geo_json["features"])
df_geo_flatten = df_geo.join(pd.json_normalize(df_geo["properties"]))
df_geo_flatten

In [None]:
df_geo_flatten.commune_code_insee.unique()

In [None]:
len(df_geo_flatten[df_geo_flatten["resultat_cvm.2024"].isna()])

In [None]:
# s3.delete_object("dev/geojson/georef-france-commune-prelevement.csv")

In [None]:
# from pipelines.tasks.config.common import download_file_from_https

# download_file_from_https(
#     url="https://pollution-eau-s3.s3.fr-par.scw.cloud/dev/geojson/georef-france-commune-prelevement-small.geojson.removeme",
#     filepath="test.geojson",
# )

In [None]:
df_geo = pd.DataFrame(data_geo["features"])
df_geo_flatten = df_geo.join(pd.json_normalize(df_geo["properties"]))
df_geo_flatten.com_code.map(lambda x: len(x) if x is not None else 0).value_counts()

In [None]:
df_geo_flatten[
    df_geo_flatten.dep_code.map(lambda x: x[0] == "2A" if len(x) == 1 else False)
].com_code