# Calcul du nombre de prélèvements CVM non conforme par commune et par année

L'objectif de ce notebook est de partir de la liste de communes cog_communes, et pour chaque commune et chaque année, calculer le nombre de prélèvements non conformes pour le CVM.

Il y aura plusieurs aggrégations à faire :

- commune (inseecommune) peut avoir plusieurs UDIs (cdreseau) **ET** un UID peut avoir plusieurs communes (inseecommune) 
- un prélèvement (referenceprel) peut être rattaché à plusieurs UDIs (cdreseau)
- un prélèvement (referenceprel) peut être composé de plusieurs paramètres (cdparametresiseeaux) ; mais dans le cas du CVM, il y a un seul paramètre selon la catégorisation de Pauline, donc c'est plus simple




In [1]:
%load_ext sql
%sql duckdb:///../../database/data.duckdb
%config SqlMagic.displaylimit = 10

### Les communes

In [2]:
%%sql 
WITH
udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
)
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee

commune_code_insee,cdreseau,code_departement,code_region
1001,1000556,1,84
1002,1000369,1,84
1004,1000248,1,84
1004,1000249,1,84
1004,1000251,1,84
1005,1000850,1,84
1005,1000850,1,84
1006,1000235,1,84
1007,1000003,1,84
1008,1000254,1,84


#### Pour chaque catégorie et années, on veut la liste complète des commmunes

In [3]:
%%sql  
WITH
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    )

SELECT
 annee, categorie
FROM 
annees
CROSS JOIN
cat

annee,categorie
2020,métabolite de pesticide
2020,hydrocarbure
2020,pcb
2020,dioxine et furane
2020,médicament
2020,minéral
2021,métabolite de pesticide
2021,hydrocarbure
2021,pcb
2021,dioxine et furane


In [4]:
%%sql 
WITH
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    ),

year_cat AS (   
    SELECT
     annee, categorie
    FROM 
    annees
    CROSS JOIN
    cat 
),

udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
),

udi_cog AS (
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee
)
    
SELECT DISTINCT
    annee, 
    categorie,
    commune_code_insee ,
    cdreseau,
    code_departement,
    code_region
 FROM
      udi_cog
CROSS  JOIN
       year_cat

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

annee,categorie,commune_code_insee,cdreseau,code_departement,code_region
2020,phénol,89008,89000765,89,27
2020,phénol,89029,89000700,89,27
2020,phénol,89035,89000447,89,27
2020,phénol,89059,89000457,89,27
2020,phénol,89081,89000738,89,27
2020,phénol,89115,89000490,89,27
2020,phénol,89133,89000500,89,27
2020,phénol,89276,89000745,89,27
2020,phénol,89348,89000751,89,27
2020,phénol,89385,89000736,89,27


### Les résultats

#### mesures_cat

In [5]:
%%sql
WITH
resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    )
    
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux

referenceprel,cdparametresiseeaux,valtraduite,limitequal,limitequal_float,unite,categorie
100119085,12DCLE,0.0,<=3 µg/L,3.0,µg/L,hydrocarbure
100119085,ACTIK40,0.034,,,,radioactivité
100119085,ACTITR,0.0,,,,radioactivité
100119085,ADET,0.013,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide
100119085,ADET2,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide
100119085,ADETD,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide
100119085,ADSP,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide
100119085,ALTMICR,0.0,,,,minéral
100119085,AMTH,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides
100119085,ATRZ,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides


#### mesures_cat_communes : on associe aux resultats la ref de prelevement et les uid associés

In [6]:
%%sql
WITH 
udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),
   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
)
    
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi
    ON
    	udi.cdreseau = prelevement.cdreseau

referenceprel,cdparametresiseeaux,valtraduite,limitequal,limitequal_float,unite,categorie,dateprel,commune_code_insee
100119085,12DCLE,0.0,<=3 µg/L,3.0,µg/L,hydrocarbure,2020-02-14,1333
100119085,ACTIK40,0.034,,,,radioactivité,2020-02-14,1333
100119085,ACTITR,0.0,,,,radioactivité,2020-02-14,1333
100119085,ADET,0.013,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADET2,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADETD,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ADSP,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,métabolite de pesticide,2020-02-14,1333
100119085,ALTMICR,0.0,,,,minéral,2020-02-14,1333
100119085,AMTH,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides,2020-02-14,1333
100119085,ATRZ,0.0,"<=0,1 µg/L",0.1000000014901161,µg/L,pesticides,2020-02-14,1333


#### mesures_cat_communes_year

In [7]:
%%sql
WITH 
udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),
   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
),

 mesures_cat_communes AS (
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi
    ON
    	udi.cdreseau = prelevement.cdreseau)

SELECT 
 extract( YEAR FROM mesures_cat_communes.dateprel) as annee,
 categorie,
 commune_code_insee,
 SUM(1) AS nb_analyses,
 SUM(case
        when limitequal_float is not NULL and valtraduite >= limitequal_float then 1
        else 0
    end ) AS nb_analyses_not_ok,
 SUM(case
        when limitequal_float is not NULL and valtraduite < limitequal_float then 1
        else 0
    end ) AS nb_analyses_ok,    
FROM
 mesures_cat_communes
GROUP BY 
    1,2,3

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

annee,categorie,commune_code_insee,nb_analyses,nb_analyses_not_ok,nb_analyses_ok
2020,sous produit désinfection,4058,150,0,30
2020,paramètre organoleptique,4226,2370,10,70
2020,métaux lourds,4217,105,0,105
2020,paramètre organoleptique,4184,220,0,0
2020,sous produit désinfection,4026,285,0,60
2020,sous produit désinfection,4016,105,0,55
2020,métabolite de pesticide,4121,235,0,235
2020,microbio,4124,360,120,0
2020,microbio,4167,330,110,0
2020,microbio,4126,960,320,0


#### mesures_cat_communes_year_cvm

**Pour un cas plus générique il faudra ajouter une condition sur categorie dans le CASE WHEN resultat**

In [8]:
%%sql
WITH 
udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),
   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
),

 mesures_cat_communes AS (
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi
    ON
    	udi.cdreseau = prelevement.cdreseau
    ),

 mesures_cat_communes_year AS(
SELECT 
 extract( YEAR FROM mesures_cat_communes.dateprel) as annee,
 categorie,
 commune_code_insee,
 SUM(1) AS nb_analyses,
 SUM(case
        when limitequal_float is not NULL and valtraduite >= limitequal_float then 1
        else 0
    end ) AS nb_analyses_not_ok,
 SUM(case
        when limitequal_float is not NULL and valtraduite < limitequal_float then 1
        else 0
    end ) AS nb_analyses_ok,    
FROM
 mesures_cat_communes
GROUP BY 
    1,2,3
    )

SELECT
    annee,
    commune_code_insee,
    CASE WHEN nb_analyses IS NULL OR nb_analyses = 0 THEN 'Pas recherché'
         WHEN nb_analyses_ok =0 THEN 'jamais quantifié'
         WHEN nb_analyses_ok >0 THEN '<= 0,5 µg/L'
         WHEN nb_analyses_not_ok >1 THEN '> 0,5 µg/L'
    END AS resultat
FROM  
 mesures_cat_communes_year
WHERE
 categorie = 'cvm'

annee,commune_code_insee,resultat
2020,29083,"<= 0,5 µg/L"
2020,29274,"<= 0,5 µg/L"
2020,29298,"<= 0,5 µg/L"
2020,29025,"<= 0,5 µg/L"
2020,29029,"<= 0,5 µg/L"
2020,29065,"<= 0,5 µg/L"
2020,29252,"<= 0,5 µg/L"
2020,29035,"<= 0,5 µg/L"
2020,29076,"<= 0,5 µg/L"
2020,29202,"<= 0,5 µg/L"


#### mesures_cat_communes_year_cvm + Joint list annee_cat_communes list

In [9]:
%%sql --save mesures_cat_communes_year_cvm
WITH 
annees AS (
    SELECT unnest(generate_series(2020, 2024)) as annee
    ),

cat AS (
    SELECT categorie FROM int__mapping_category_simple GROUP BY 1 
    ),

year_cat AS (   
    SELECT
     annee, categorie
    FROM 
    annees
    CROSS JOIN
    cat 
),

udi AS (
    SELECT
      inseecommune AS commune_code_insee,
      cdreseau,
    FROM
      edc_communes
),

cog AS (
    SELECT
      DEP AS code_departement,
      REG AS code_region,
      COM AS commune_code_insee,
    FROM 
      cog_communes
),

udi_cog AS (
    SELECT
      udi.commune_code_insee ,
      udi.cdreseau,
      cog.code_departement,
      cog.code_region
    FROM
      udi
    LEFT JOIN 
      cog
    ON 
      udi.commune_code_insee = cog.commune_code_insee
),

LIST_REF_UDI_YEAR AS (    
SELECT DISTINCT
    annee, 
    categorie,
    commune_code_insee ,
    cdreseau,
    code_departement,
    code_region
 FROM
      udi_cog
CROSS  JOIN
       year_cat

),

   
prelevement AS (
	SELECT
	  referenceprel,
	  cdreseau,
      dateprel,
	FROM   
      edc_prelevements
),

resultats AS (
    SELECT
      referenceprel,
      cdparametresiseeaux,
      valtraduite,
      limitequal,
      CAST(regexp_extract(REPLACE(limitequal, ',', '.'), '-?\d+(\.\d+)?') AS FLOAT) AS limitequal_float,
      regexp_extract(limitequal, '[a-zA-Zµg]+/?[a-zA-Z/L]+$') AS unite,
    FROM  
        edc_resultats 
    ),

mesures_cat AS (
    SELECT
        resultats.*,
        int__mapping_category_simple.categorie
    FROM 
        resultats
    LEFT JOIN 
        int__mapping_category_simple 
    ON
        resultats.cdparametresiseeaux = int__mapping_category_simple .cdparametresiseeaux
),

 mesures_cat_communes AS (
    SELECT
        mesures_cat.*,
        prelevement.dateprel,
        udi_cog.commune_code_insee AS commune_code_insee,
    FROM 
        mesures_cat 
    LEFT JOIN
    	prelevement
    ON
       mesures_cat.referenceprel = prelevement.referenceprel
    LEFT JOIN
        udi_cog
    ON
    	udi_cog.cdreseau = prelevement.cdreseau
    ),

 mesures_cat_communes_year AS(
    SELECT 
     extract( YEAR FROM mesures_cat_communes.dateprel) as annee,
     categorie,
     commune_code_insee,
     SUM(1) AS nb_analyses,
     SUM(case
            when limitequal_float is not NULL and valtraduite >= limitequal_float then 1
            else 0
        end ) AS nb_analyses_not_ok,
     SUM(case
            when limitequal_float is not NULL and valtraduite < limitequal_float then 1
            else 0
        end ) AS nb_analyses_ok,    
    FROM
     mesures_cat_communes
    GROUP BY 
        1,2,3
    )

SELECT
    LIST_REF_UDI_YEAR.annee,
    LIST_REF_UDI_YEAR.commune_code_insee,
    LIST_REF_UDI_YEAR.categorie,
    CASE WHEN coalesce(nb_analyses,0) = 0 THEN 'Pas recherché'
         WHEN coalesce(nb_analyses_ok,0) =0 THEN 'jamais quantifié'
         WHEN coalesce(nb_analyses_ok,0) >0 THEN '<= 0,5 µg/L'
         WHEN coalesce(nb_analyses_not_ok,0) >1 THEN '> 0,5 µg/L'
    END AS resultat
FROM  
 LIST_REF_UDI_YEAR
LEFT JOIN 
 mesures_cat_communes_year
ON
    LIST_REF_UDI_YEAR.annee =  mesures_cat_communes_year.annee
    AND LIST_REF_UDI_YEAR.categorie =  mesures_cat_communes_year.categorie
    AND  LIST_REF_UDI_YEAR.commune_code_insee  =  mesures_cat_communes_year.commune_code_insee
WHERE
 LIST_REF_UDI_YEAR.categorie = 'cvm'

annee,commune_code_insee,categorie,resultat
2020,88518,cvm,"<= 0,5 µg/L"
2020,89004,cvm,"<= 0,5 µg/L"
2020,89025,cvm,"<= 0,5 µg/L"
2020,89100,cvm,"<= 0,5 µg/L"
2020,89108,cvm,"<= 0,5 µg/L"
2020,89112,cvm,"<= 0,5 µg/L"
2020,89159,cvm,"<= 0,5 µg/L"
2020,89160,cvm,"<= 0,5 µg/L"
2020,89169,cvm,"<= 0,5 µg/L"
2020,89187,cvm,"<= 0,5 µg/L"


# Check

In [10]:
%%sql
SELECT
    de_partition as year,
    COUNT(DISTINCT inseecommune)
FROM 
 edc_communes
GROUP BY 
    1 
ORDER BY 
   1

year,count(DISTINCT inseecommune)
2020,34788
2021,34833
2022,34874
2023,34852
2024,34809


In [11]:
%%sql --with mesures_cat_communes_year_cvm  
 SELECT 
    annee,
    COUNT(DISTINCT commune_code_insee)
 FROM
   mesures_cat_communes_year_cvm
GROUP BY 
    1

annee,count(DISTINCT commune_code_insee)
2022,34914
2024,34914
2020,34914
2021,34914
2023,34914


**Il y a creation de communes.... :/  A creuser: je pense que j'ai du oublier quelque part la notion d'année pour la liste de commune**

In [12]:
%%sql
SELECT
    COUNT(DISTINCT inseecommune)
FROM 
 edc_communes

count(DISTINCT inseecommune)
34914
