Paramètrage notebook

In [343]:
#importation package
import pandas as pd
import duckdb

#config affichage
pd.set_option("display.max_columns", None)  # Affiche toutes les colonnes sans limitation
pd.set_option("display.max_colwidth", None)  # Affiche la largeur complète des colonnes sans couper le texte
pd.set_option("display.expand_frame_repr", False)  # Empêche le repliement des colonnes lors de l'affichage du DataFrame

# Connexion
from pipelines.tasks.config.common import DUCKDB_FILE
con = duckdb.connect(database=DUCKDB_FILE, read_only=True)

In [344]:
#affichage tables & vues
con.sql('show tables')

┌──────────────────────────────────┐
│               name               │
│             varchar              │
├──────────────────────────────────┤
│ ana__resultats_communes          │
│ cog_communes                     │
│ edc_communes                     │
│ edc_prelevements                 │
│ edc_resultats                    │
│ int__lien_cdreseau_refreneceprel │
│ int__lien_commune_cdreseau       │
│ int__mapping_category_simple     │
│ int__prelevements_uniques        │
│ int__resultats_udi_communes      │
│ laposte_communes                 │
│ mapping_categories               │
│ stg_communes__cog                │
│ stg_communes__laposte            │
│ stg_edc__communes                │
│ stg_edc__prevelevements          │
│ stg_edc__resultats               │
├──────────────────────────────────┤
│             17 rows              │
└──────────────────────────────────┘

Objectif : Création d'un modèle dbt pour le résultat des nitrites  
Tâches : création d'un fichier qui remprend le resultat du dernier prélèvement contenant les colonnes 
- cdreseau
- période
- catégorie
- résultat
- date

Catégorie Nitrates
3 paramètres à regarder:  
nitrates (en no3)  
nitrites (en no2)  
nitrates/50 + nitrites/3  


Filtre de la table sur les catégories nitrites  et ajout ordre de prélèvement

In [345]:
#paramètres à regarder NO2, NO3, prélèvement inf à 1 AN
query_nitrates = """ 
SELECT *, ROW_NUMBER() OVER(PARTITION BY cdreseau, cdparametresiseeaux  ORDER BY datetimeprel DESC, valtraduite DESC) AS row_number
    -- valtraduite DESC pour prendre les max de valtraduite en cas de résultat contradictoire d'un même paramètre
FROM int__resultats_udi_communes
WHERE cdparametresiseeaux IN ('NO2','NO3')
AND CURRENT_DATE - datetimeprel < INTERVAL 1 YEAR
ORDER BY cdreseau, cdparametresiseeaux
  """

nitrates = con.sql(query_nitrates).df()
nitrates

Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel,row_number
0,00100143925,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
1,00100143925,NO3,14.00,<=50 mg/L,2025,50.0,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
2,00100143918,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01139,2025-01-17 11:10:00,1
3,00100143918,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01050,2025-01-17 11:10:00,2
4,00100143918,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01175,2025-01-17 11:10:00,3
...,...,...,...,...,...,...,...,...,...,...,...,...
73513,97400140947,NO3,0.71,<=50 mg/L,2025,50.0,mg/L,nitrite,974004294,97411,2025-01-09 08:35:00,1
73514,97400140966,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
73515,97400140966,NO3,3.40,<=50 mg/L,2025,50.0,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
73516,97400140948,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004298,97420,2025-01-13 09:00:00,1


Récupération des derniers résultats des derniers prélèvements

In [346]:
#dernier prélèvement paramètre
query_dernier_prel = """ 
SELECT *
FROM nitrates
WHERE row_number = 1
 """
dernier_prel = con.sql(query_dernier_prel).df()
dernier_prel


Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel,row_number
0,00100143925,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
1,00100143925,NO3,14.00,<=50 mg/L,2025,50.0,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
2,00100143918,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01139,2025-01-17 11:10:00,1
3,00100143918,NO3,3.30,<=50 mg/L,2025,50.0,mg/L,nitrite,001000241,01040,2025-01-17 11:10:00,1
4,00100143923,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000248,01004,2025-01-21 12:10:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...
11823,97400140947,NO3,0.71,<=50 mg/L,2025,50.0,mg/L,nitrite,974004294,97411,2025-01-09 08:35:00,1
11824,97400140966,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11825,97400140966,NO3,3.40,<=50 mg/L,2025,50.0,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11826,97400140948,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004298,97420,2025-01-13 09:00:00,1


Traitement intermédiaire des référence prélèvements pour NO2 et NO3 issus de prélèvement différent

In [347]:
#Tri des referenceprel selon UDI, datetimeprel
query_order_prel = """ 
SELECT referenceprel, cdreseau, ROW_NUMBER() OVER(PARTITION BY cdreseau ORDER BY datetimeprel DESC, valtraduite DESC) AS row_number
FROM dernier_prel

 """
order_prel = con.sql(query_order_prel).df()
order_prel

Unnamed: 0,referenceprel,cdreseau,row_number
0,00100143931,001000268,1
1,00100143931,001000268,2
2,00100143908,001000641,1
3,00100143908,001000641,2
4,00100143934,001000655,1
...,...,...,...
11823,97400141064,974000092,2
11824,97400141261,974003516,1
11825,97400141261,974003516,2
11826,97400140890,974003649,1


In [348]:
#récupération du referenceprel du dernier prélèvement pour éliminer les referencepel différents
query_last_prel_ref = """
SELECT referenceprel, cdreseau
FROM order_prel
WHERE row_number = 1
 """
last_prel_ref = con.sql(query_last_prel_ref).df()
last_prel_ref


Unnamed: 0,referenceprel,cdreseau
0,00100143931,001000268
1,00100143908,001000641
2,00100143934,001000655
3,00100143880,001000691
4,00100143872,001000758
...,...,...
7115,97400141261,974000031
7116,97400141275,974000068
7117,97400141064,974000092
7118,97400141261,974003516


In [349]:
#récupération des derniers prélèvement par paramètres sans referenceprel
query_prel_param_ss_ref = """
SELECT * EXCLUDE referenceprel
FROM dernier_prel
"""
prel_param_ss_ref = con.sql(query_prel_param_ss_ref).df()
prel_param_ss_ref


Unnamed: 0,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel,row_number
0,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
1,NO3,14.00,<=50 mg/L,2025,50.0,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
2,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01139,2025-01-17 11:10:00,1
3,NO3,3.30,<=50 mg/L,2025,50.0,mg/L,nitrite,001000241,01040,2025-01-17 11:10:00,1
4,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000248,01004,2025-01-21 12:10:00,1
...,...,...,...,...,...,...,...,...,...,...,...
11823,NO3,0.71,<=50 mg/L,2025,50.0,mg/L,nitrite,974004294,97411,2025-01-09 08:35:00,1
11824,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11825,NO3,3.40,<=50 mg/L,2025,50.0,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11826,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004298,97420,2025-01-13 09:00:00,1


In [None]:
#jointure UDI et référenceprel
query_prel_param_ac_ref = """

SELECT lpr.referenceprel, psr.*
FROM prel_param_ss_ref AS psr
LEFT JOIN last_prel_ref lpr
USING (cdreseau)
"""
prel_param_ac_ref = con.sql(query_prel_param_ac_ref).df()
prel_param_ac_ref

Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel,row_number
0,00100143925,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
1,00100143925,NO3,14.00,<=50 mg/L,2025,50.0,mg/L,nitrite,001000003,01007,2025-01-21 12:35:00,1
2,00100143918,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000241,01139,2025-01-17 11:10:00,1
3,00100143918,NO3,3.30,<=50 mg/L,2025,50.0,mg/L,nitrite,001000241,01040,2025-01-17 11:10:00,1
4,00100143923,NO2,0.00,"<=0,5 mg/L",2025,0.5,mg/L,nitrite,001000248,01004,2025-01-21 12:10:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...
11823,97400140947,NO3,0.71,<=50 mg/L,2025,50.0,mg/L,nitrite,974004294,97411,2025-01-09 08:35:00,1
11824,97400140966,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11825,97400140966,NO3,3.40,<=50 mg/L,2025,50.0,mg/L,nitrite,974004295,97418,2025-01-20 07:47:00,1
11826,97400140948,NO2,0.00,"<=0,1 mg/L",2025,0.1,mg/L,nitrite,974004298,97420,2025-01-13 09:00:00,1


Aggrégation des résultats  : Table intermédiaire  - traitement des valtraduites  
2 situations:  
- Nitrates < 50 mg/L et nitrites < 0,5 mg/L et nitrate/50 + nitrites/3 < 1 mg/L (eau conforme)  
- Nitrates > 50 mg/L et/ou nitrites > 0,5 mg/L et/ou nitrate/50 + nitrites/3 > 1 mg/L (eau non conforme) (fait passer l’affichage total polluant en rouge)

In [372]:
#définition des résultats
query_resultats_nitrites = """ 

SELECT referenceprel, cdreseau, MAX(datetimeprel) AS datetimeprel,
SUM(CASE
    WHEN cdparametresiseeaux = 'NO2' THEN valtraduite
    -- mettre 0 si aucune analyse NO2
    ELSE 0
END) AS valtraduite_NO2,

SUM(CASE
    WHEN cdparametresiseeaux = 'NO3' THEN valtraduite
    -- mettre 0 si aucune analyse NO3
    ELSE 0
END) AS valtraduite_NO3,

ROUND((valtraduite_NO2/50 + valtraduite_NO3/3),2) AS valtraduite_NO2_NO3

FROM prel_param_ac_ref
GROUP BY referenceprel, cdreseau
 """
resultats_nitrites = con.sql(query_resultats_nitrites).df()
resultats_nitrites


Unnamed: 0,referenceprel,cdreseau,datetimeprel,valtraduite_NO2,valtraduite_NO3,valtraduite_NO2_NO3
0,00100143877,001000258,2025-01-21 09:45:00,0.0,0.00,0.00
1,00100143931,001000268,2025-01-13 09:57:00,0.0,6.00,2.00
2,00100143998,001000278,2025-01-22 11:24:00,0.0,13.00,4.33
3,00100144051,001000289,2025-01-21 11:14:00,0.0,0.00,0.00
4,00100143912,001000293,2025-01-28 11:29:00,0.0,3.70,1.23
...,...,...,...,...,...,...
7115,97400141070,974000781,2025-01-06 09:45:00,0.0,0.86,0.29
7116,97400141255,974001245,2025-01-13 10:35:00,0.0,0.00,0.00
7117,97400141301,974003623,2025-01-30 08:10:00,0.0,4.10,1.37
7118,97400141307,974004288,2025-01-30 08:45:00,0.0,0.35,0.12


Aggrégation des résultats  : Table finale  
 

2 situations:  
- Nitrates < 50 mg/L et nitrites < 0,5 mg/L et nitrate/50 + nitrites/3 < 1 mg/L (eau conforme)  
- Nitrates > 50 mg/L et/ou nitrites > 0,5 mg/L et/ou nitrate/50 + nitrites/3 > 1 mg/L (eau non conforme) (fait passer l’affichage total polluant en rouge)  

La table d'origine présente des 'valtraduite' Null => traduit par 'aucun résultat' dans la table finale : ==> est-ce que c'est OK ou faut t'il enlever les prélèvements à valtraduite NULL?

In [352]:

query_resultat_nitrite_dernier = """

SELECT cdreseau, referenceprel, 'dernier relevé' AS periode, 'nitrites' AS categorie, 
CASE 
    WHEN valtraduite_NO2 < 50 AND valtraduite_NO3 < 0.5 AND valtraduite_NO2_NO3 < 1
    THEN 'eau conforme'
    WHEN valtraduite_NO2 >= 50 OR valtraduite_NO3 >= 0.5 OR valtraduite_NO2_NO3 >= 1
    THEN 'eau non conforme'
    ELSE 'aucun résultat'
END AS resultat ,
datetimeprel
FROM resultats_nitrites
ORDER BY datetimeprel
 """
resultat_nitrite_dernier = con.sql(query_resultat_nitrite_dernier).df()
resultat_nitrite_dernier


Unnamed: 0,cdreseau,referenceprel,periode,categorie,resultat,datetimeprel
0,086000622,08600133560,dernier relevé,nitrites,eau non conforme,2025-01-02 08:47:00
1,026000675,02600171794,dernier relevé,nitrites,eau non conforme,2025-01-02 08:51:00
2,028001276,02800125276,dernier relevé,nitrites,eau non conforme,2025-01-02 09:00:00
3,086000333,08600133561,dernier relevé,nitrites,eau non conforme,2025-01-02 09:03:00
4,083001260,08300290363,dernier relevé,nitrites,eau non conforme,2025-01-02 09:04:00
...,...,...,...,...,...,...
7115,034001311,03400327052,dernier relevé,nitrites,eau conforme,2025-01-31 14:36:00
7116,062000735,06200288259,dernier relevé,nitrites,eau non conforme,2025-01-31 15:34:00
7117,062004264,06200288259,dernier relevé,nitrites,eau non conforme,2025-01-31 15:34:00
7118,072000572,07200139826,dernier relevé,nitrites,eau non conforme,2025-01-31 15:37:00


FIN CREATION MODELE  
Remarques (voir explo ci-dessous):  
- j'ai modifié > par >=  dans la deuxième condition  
- j'ai mis valtraduite à 0 pour les analyses N02 inexistant
- écart de prélèvement entre NO2 et NO3, 27 jours, problème ou pas ???  
- les prélèvements à valtraduites NULL, à garder, à compléter par zéro, à enlever ???



----------------------------------------------  
---------------------------------------------

EXPLO VERIF RESULTAT

In [353]:
#vérification des valtraduite NULL dans la table d'origine
con.sql('SELECT * FROM dernier_prel WHERE valtraduite IS NULL LIMIT 2' ).df()

Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel,row_number
0,3400327115,NO3,,<=50 mg/L,2025,50.0,mg/L,nitrite,34000005,34041,2025-01-10 11:32:00,1
1,3400327103,NO3,,<=50 mg/L,2025,50.0,mg/L,nitrite,34000006,34056,2025-01-07 10:44:00,1


In [None]:
#vérification du nobmre de cdreseau >1 dans le résultat final
verif_double_cdreseau = con.sql('select  cdreseau, COUNT(*) AS nbcdreseau FROM resultat_nitrite_dernier GROUP BY cdreseau HAVING nbcdreseau>1 ORDER BY nbcdreseau DESC').df()
verif_double_cdreseau

#==> OK on a des UDI uniques

Unnamed: 0,cdreseau,nbcdreseau


In [355]:
#verif : affichage table qui présente deux prélèvements différent pour dernier NO2 et dernier NO3
query_cdreseau = """ 
SELECT *
FROM int__resultats_udi_communes
WHERE cdreseau = '076001796' AND cdparametresiseEaux IN ('NO2','NO3')
ORDER BY datetimeprel DESC
  """

cdreseau = con.sql(query_cdreseau).df()
cdreseau

Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel
0,7600298380,NO3,35.3,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-30 10:30:00
1,7600298379,NO3,30.6,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-30 10:00:00
2,7600298292,NO3,28.3,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-27 11:35:00
3,7600298294,NO3,32.1,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-27 11:35:00
4,7600298270,NO3,31.3,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-23 10:10:00
5,7600298080,NO3,29.2,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-15 10:35:00
6,7600297999,NO3,33.9,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-13 13:30:00
7,7600297998,NO3,35.2,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-13 11:00:00
8,7600297886,NO3,30.0,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-09 10:05:00
9,7600297823,NO3,33.7,<=50 mg/L,2025,50.0,mg/L,nitrite,76001796,76351,2025-01-06 10:35:00


In [356]:
#VERIFICATION écart de date entre les prélèvements
con.sql(' SELECT cdreseau, MAX(datetimeprel) - MIN(datetimeprel)  AS ecartprel  FROM dernier_prel GROUP BY cdreseau HAVING ecartprel <> INTERVAL \'0 days\' ORDER BY ecartprel DESC').df()

#==> il y a 27 jours d'écart entre le prélèvement du dernier NO2 et dernier NO3

Unnamed: 0,cdreseau,ecartprel
0,076000358,27 days 23:25:00
1,076001796,27 days 22:40:00
2,076000359,25 days 22:05:00
3,044000170,24 days 01:00:00
4,017000286,24 days 00:27:00
...,...,...
461,086000367,0 days 00:05:00
462,086000325,0 days 00:04:00
463,086000335,0 days 00:04:00
464,086000359,0 days 00:04:00


In [None]:
#définition des résultats
query_count_param_analyse = """ 

SELECT cdparametresiseeaux, COUNT(*) nbparam_dern_prel
FROM prel_param_ac_ref
GROUP BY cdparametresiseeaux
 """
count_param_analyse = con.sql(query_count_param_analyse).df()
count_param_analyse 



Unnamed: 0,cdparametresiseeaux,nbparam_dern_prel
0,NO2,5084
1,NO3,6744


In [None]:
#recherche UDI qui n'a pas de NO2 analysé depuis 1 an
query_count_param_analyse = """ 

SELECT cdreseau, COUNT(*) nbparam_dern_prel
FROM prel_param_ac_ref
GROUP BY cdreseau
HAVING nbparam_dern_prel<2
 """
count_param_analyse = con.sql(query_count_param_analyse).df()
count_param_analyse 

Unnamed: 0,cdreseau,nbparam_dern_prel
0,001000258,1
1,001000598,1
2,001000960,1
3,003000367,1
4,003000917,1
...,...,...
2407,091000228,1
2408,095000183,1
2409,095000351,1
2410,971000096,1


In [None]:
#Explo cdreseau 974003500
query_cdreseau_974003500 = """
SELECT *
FROM int__resultats_udi_communes
WHERE cdparametresiseeaux IN ('NO2','NO3') AND cdreseau = '974003500'
ORDER BY cdparametresiseeaux, datetimeprel DESC
"""
cdreseau_974003500 = con.sql(query_cdreseau_974003500).df()
cdreseau_974003500


Unnamed: 0,referenceprel,cdparametresiseeaux,valtraduite,limitequal,de_partition,limitequal_float,unite,categorie,cdreseau,inseecommune,datetimeprel
0,97400140904,NO3,25.0,<=50 mg/L,2025,50.0,mg/L,nitrite,974003500,97415,2025-01-10 10:23:00



-------------------------------------  
-------------------------------------  
-------------------------------------  