# Calcul du nombre de prélèvements CVM non conforme par commune et par année

L'objectif de ce notebook est de partir de la liste de communes cog_communes, et pour chaque commune et chaque année, calculer le nombre de prélèvements non conformes pour le CVM.

Il y aura plusieurs aggrégations à faire :

- commune (inseecommune) peut avoir plusieurs UDIs (cdreseau) **ET** un UID peut avoir plusieurs communes (inseecommune) 
- un prélèvement (referenceprel) peut être rattaché à plusieurs UDIs (cdreseau)
- un prélèvement (referenceprel) peut être composé de plusieurs paramètres (cdparametresiseeaux) ; mais dans le cas du CVM, il y a un seul paramètre selon la catégorisation de Pauline, donc c'est plus simple




In [1]:
%load_ext sql
%sql duckdb:///../../database/data.duckdb
%config SqlMagic.displaylimit = 10

### Les communes

In [2]:
%%sql 
WITH
udi AS (
    SELECT
      de_partition AS year,
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
    GROUP BY 
    1,2,3
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
)
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      udi.year,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee

commune_code_insee,cdreseau,year,code_departement,code_region
1001,1000556,2020,1,84
1002,1000369,2023,1,84
1004,1000249,2020,1,84
1005,1000850,2022,1,84
1006,1000235,2021,1,84
1007,1000003,2022,1,84
1008,1000254,2024,1,84
1009,1000338,2020,1,84
1010,1000260,2022,1,84
1011,1000870,2020,1,84


#### Pour chaque catégorie et années, on veut la liste complète des commmunes

In [3]:
%%sql  
WITH
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    )

SELECT
 annee, categorie
FROM 
annees
CROSS JOIN
cat

annee,categorie
2020,métabolite de pesticide
2020,hydrocarbure
2020,pcb
2020,dioxine et furane
2020,médicament
2020,minéral
2021,métabolite de pesticide
2021,hydrocarbure
2021,pcb
2021,dioxine et furane


In [4]:
%%sql 
WITH
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    ),

year_cat AS (   
    SELECT
     annee, categorie
    FROM 
    annees
    CROSS JOIN
    cat 
),

udi AS (
    SELECT
      de_partition AS year,
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
    GROUP BY 
    1,2,3
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
),

udi_cog AS (
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      udi.year,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee
)
    
SELECT DISTINCT
    annee, 
    categorie,
    commune_code_insee ,
    cdreseau,
    code_departement,
    code_region
 FROM
      udi_cog
 FULL OUTER JOIN
       year_cat
 ON
    udi_cog.year = year_cat.annee

annee,categorie,commune_code_insee,cdreseau,code_departement,code_region
2022,nitrite,1005,1000850,1,84
2020,nitrite,1035,1001219,1,84
2021,nitrite,1049,1000761,1,84
2022,nitrite,1240,1000527,1,84
2022,nitrite,1279,1000395,1,84
2021,nitrite,1386,1000382,1,84
2022,nitrite,2065,2000091,2,32
2021,nitrite,2097,2000106,2,32
2022,nitrite,2422,2000658,2,32
2021,nitrite,2450,2000878,2,32


### Les résultats

#### mesures_cat

In [5]:
%%sql
WITH
resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    )
    
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux

referenceprel,cdparametresiseeaux,valtraduite,limitequal,limitequal_float,unite,categorie
400121459,CDT25,586.0,,,,paramètre organoleptique
400121459,CHINE,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CHINOME,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CHLORB,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CHLPM,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CHLX,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CINOSUL,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
400121459,CL,2.8,,,,sous produit désinfection
400121459,CL2LIB,0.0,,,,sous produit désinfection
400121459,CLAHA,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides


#### mesures_cat_communes : on associe aux resultats la ref de prelevement et les uid associés

In [6]:
%%sql --save mesures_cat_communes
WITH 
udi AS (
    SELECT
      de_partition AS year,
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
    GROUP BY 
    1,2,3
),
   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
)
    
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi
    ON
    	udi.cdreseau = prelevement.cdreseau
        AND udi.year = extract( YEAR FROM prelevement.dateprel)

referenceprel,cdparametresiseeaux,valtraduite,limitequal,limitequal_float,unite,categorie,dateprel,commune_code_insee
100119085,12DCLE,0.0,<=3 µg/L,3.0,µg/L,hydrocarbure,2020-02-14,1333
100119085,ACTIK40,0.034,,,,radioactivité,2020-02-14,1333
100119085,ACTITR,0.0,,,,radioactivité,2020-02-14,1333
100119085,ADET,0.013,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADET2,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADETD,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADSP,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ALTMICR,0.0,,,,minéral,2020-02-14,1333
100119085,AMTH,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides,2020-02-14,1333
100119085,ATRZ,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides,2020-02-14,1333


#### mesures_cat_communes_year

In [7]:
%%sql --with mesures_cat_communes --save mesures_cat_communes_year

SELECT 
 extract( YEAR FROM mesures_cat_communes.dateprel) as annee,
 categorie,
 commune_code_insee,
 SUM(1) AS nb_analyses,
 SUM(case
        when limitequal_float is not NULL and valtraduite >= limitequal_float then 1
        else 0
    end ) AS nb_analyses_not_ok,
 SUM(case
        when limitequal_float is not NULL and valtraduite < limitequal_float then 1
        else 0
    end ) AS nb_analyses_ok,    
FROM
 mesures_cat_communes
GROUP BY 
    1,2,3

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

annee,categorie,commune_code_insee,nb_analyses,nb_analyses_not_ok,nb_analyses_ok
2020,non classé,2728,182,0,35
2020,minéral,2255,25,0,1
2020,non classé,2168,190,0,39
2020,paramètre organoleptique,2774,112,0,0
2020,sous produit désinfection,2353,33,0,11
2020,microbio,2685,61,24,0
2020,sous produit désinfection,2329,38,0,11
2020,nitrite,2312,12,0,12
2020,non classé,2524,190,0,39
2020,paramètre organoleptique,2166,379,0,0


#### mesures_cat_communes_year_cvm

In [8]:
%%sql --with mesures_cat_communes_year --save mesures_cat_communes_year_cvm
SELECT
    annee,
    commune_code_insee,
    CASE WHEN nb_analyses IS NULL OR nb_analyses = 0 THEN 'Pas recherché'
         WHEN nb_analyses_ok =0 THEN 'jamais quantifié'
         WHEN nb_analyses_ok >0 THEN '<= 0,5 µg/L'
         WHEN nb_analyses_not_ok >1 THEN '> 0,5 µg/L'
    END AS resultat
FROM  
 mesures_cat_communes_year
WHERE
 categorie = 'cvm'

annee,commune_code_insee,resultat
2020,1034,"<= 0,5 µg/L"
2020,1193,"<= 0,5 µg/L"
2020,1311,"<= 0,5 µg/L"
2020,1036,"<= 0,5 µg/L"
2020,1361,"<= 0,5 µg/L"
2020,1027,"<= 0,5 µg/L"
2020,1431,"<= 0,5 µg/L"
2020,1225,"<= 0,5 µg/L"
2020,1188,"<= 0,5 µg/L"
2020,1175,"<= 0,5 µg/L"


**Pour un cas plus générique il faudra ajouter une condition sur categorie dans le CASE WHEN resultat**

#### mesures_cat_communes_year_cvm + Joint list annee_cat_communes list

In [9]:
%%sql --save mesures_cat_communes_year_cvm
WITH 
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    ),

year_cat AS (   
    SELECT
     annee, categorie
    FROM 
    annees
    CROSS JOIN
    cat 
),

udi AS (
    SELECT
      de_partition AS year,
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
    GROUP BY 
    1,2,3
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
),

udi_cog AS (
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      udi.year,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee
),

LIST_REF_UDI_YEAR AS (    
    SELECT DISTINCT
        annee, 
        categorie,
        commune_code_insee ,
        cdreseau,
        code_departement,
        code_region
     FROM
          udi_cog
     FULL OUTER JOIN
           year_cat
     ON
        udi_cog.year = year_cat.annee
),

   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
),

 mesures_cat_communes AS (
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi_cog.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi_cog
    ON
    	udi_cog.cdreseau = prelevement.cdreseau
    ),

 mesures_cat_communes_year AS(
    SELECT 
     extract( YEAR FROM mesures_cat_communes.dateprel) as annee,
     categorie,
     commune_code_insee,
     SUM(1) AS nb_analyses,
     SUM(case
            when limitequal_float is not NULL and valtraduite >= limitequal_float then 1
            else 0
        end ) AS nb_analyses_not_ok,
     SUM(case
            when limitequal_float is not NULL and valtraduite < limitequal_float then 1
            else 0
        end ) AS nb_analyses_ok,    
    FROM
     mesures_cat_communes
    GROUP BY 
        1,2,3
    )

SELECT
    LIST_REF_UDI_YEAR.annee,
    LIST_REF_UDI_YEAR.commune_code_insee,
    LIST_REF_UDI_YEAR.categorie,
    CASE WHEN coalesce(nb_analyses,0) = 0 THEN 'Pas recherché'
         WHEN coalesce(nb_analyses_ok,0) =0 THEN 'jamais quantifié'
         WHEN coalesce(nb_analyses_ok,0) >0 THEN '<= 0,5 µg/L'
         WHEN coalesce(nb_analyses_not_ok,0) >1 THEN '> 0,5 µg/L'
    END AS resultat
FROM  
 LIST_REF_UDI_YEAR
LEFT JOIN 
 mesures_cat_communes_year
ON
    LIST_REF_UDI_YEAR.annee =  mesures_cat_communes_year.annee
    AND LIST_REF_UDI_YEAR.categorie =  mesures_cat_communes_year.categorie
    AND  LIST_REF_UDI_YEAR.commune_code_insee  =  mesures_cat_communes_year.commune_code_insee
WHERE
 LIST_REF_UDI_YEAR.categorie = 'cvm'

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

annee,commune_code_insee,categorie,resultat
2020,29025,cvm,"<= 0,5 µg/L"
2020,29083,cvm,"<= 0,5 µg/L"
2020,29274,cvm,"<= 0,5 µg/L"
2020,29298,cvm,"<= 0,5 µg/L"
2020,29029,cvm,"<= 0,5 µg/L"
2020,29065,cvm,"<= 0,5 µg/L"
2020,29252,cvm,"<= 0,5 µg/L"
2020,29035,cvm,"<= 0,5 µg/L"
2020,29076,cvm,"<= 0,5 µg/L"
2020,29202,cvm,"<= 0,5 µg/L"


# Check

In [10]:
%%sql
SELECT
    de_partition as year,
    COUNT(DISTINCT inseecommune)
FROM 
 edc_communes
GROUP BY 
    1 
ORDER BY 
   1

year,count(DISTINCT inseecommune)
2020,34788
2021,34833
2022,34874
2023,34852
2024,34809


In [11]:
%%sql --with mesures_cat_communes_year_cvm  
 SELECT 
    annee,
    COUNT(DISTINCT commune_code_insee)
 FROM
   mesures_cat_communes_year_cvm
GROUP BY 
    1

annee,count(DISTINCT commune_code_insee)
2020,34788
2022,34874
2024,34809
2021,34833
2023,34852


In [14]:
%%sql --with mesures_cat_communes_year_cvm  
 SELECT 
   *
 FROM
   mesures_cat_communes_year_cvm
WHERE
    commune_code_insee = '07194'
ORDER BY 
    annee

annee,commune_code_insee,categorie,resultat
2020,7194,cvm,"<= 0,5 µg/L"
2020,7194,cvm,"<= 0,5 µg/L"
2021,7194,cvm,"<= 0,5 µg/L"
2021,7194,cvm,"<= 0,5 µg/L"
2022,7194,cvm,"<= 0,5 µg/L"
2022,7194,cvm,"<= 0,5 µg/L"
2023,7194,cvm,"<= 0,5 µg/L"
2023,7194,cvm,"<= 0,5 µg/L"
2024,7194,cvm,"<= 0,5 µg/L"
2024,7194,cvm,"<= 0,5 µg/L"


**Ici on a des années en double**

In [15]:
%%sql --with mesures_cat_communes_year 
 SELECT 
   *
 FROM
   mesures_cat_communes_year
WHERE
    commune_code_insee = '07194'
    AND categorie = 'cvm'
ORDER BY 
    annee

annee,categorie,commune_code_insee,nb_analyses,nb_analyses_not_ok,nb_analyses_ok
2020,cvm,7194,3,0,3
2021,cvm,7194,3,0,3
2022,cvm,7194,11,8,3
2023,cvm,7194,16,11,5
2024,cvm,7194,10,5,5
