In [26]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
from matplotlib.colors import Normalize
import shapely

# Import des bibliothèques supplémentaires
from scipy import stats
from scipy.stats import shapiro, levene
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cdist

In [27]:
# Importation des données
data_aheme = pd.read_excel("Data_Aheme.xlsx", sheet_name="Data_Aheme")

# Création d'une copie du dataframe
df_copy = data_aheme.copy()

In [28]:
df_copy

Unnamed: 0,Date,Mois_jr,Saison,Heure,Code,Stations,X,Y,Temperature,Conductivity,...,Turbidity,Chl_a,N_NO2,N_NH4,N_NO3,P_PO4,PT,NT,Long,Lat
0,2023-10-05,10-05,PSP,09:05:00,1,A1,1.93618,6.3918,29.1,2.70,...,67.86,,8.23,150.40,0.17,167.98,250.98,22.36,382769.630037,706304.353377
1,2023-10-05,10-05,PSP,09:31:00,1,A2,1.95130,6.4095,29.2,2.27,...,49.66,,12.47,131.76,0.16,169.26,233.10,24.59,384024.157078,708590.427515
2,2023-10-05,10-05,PSP,10:00:00,1,A3,1.96690,6.4546,29.4,2.76,...,33.14,,10.21,104.36,0.20,190.97,230.55,24.96,385759.620745,713573.256780
3,2023-10-05,10-05,PSP,10:31:00,1,A4,1.99330,6.5007,29.7,2.06,...,32.76,,7.38,86.83,0.14,166.71,222.89,26.40,388689.340098,718664.272053
4,2023-10-05,10-05,PSP,10:58:00,1,A5,1.98400,6.5295,29.9,0.74,...,31.56,,10.49,71.48,0.20,153.94,192.24,20.31,387667.342624,721850.495347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024-10-10,10-10,PSP,11:03:00,13,A6,1.98220,6.5602,30.1,2.10,...,15.88,9.61,30.65,35.95,0.14,199.77,410.58,6.58,387475.180794,725245.137356
126,2024-10-10,10-10,PSP,11:16:00,13,A7,1.98090,6.5894,30.1,0.70,...,40.14,3.20,63.85,39.90,0.13,158.07,409.30,6.24,387338.017558,728473.832198
127,2024-10-10,10-10,PSP,12:18:00,13,A8,1.96550,6.4878,29.4,2.83,...,15.02,5.34,32.96,37.36,0.11,198.23,392.70,10.19,385612.262295,717244.223525
128,2024-10-10,10-10,PSP,12:39:00,13,A9,1.93730,6.4568,29.7,3.20,...,18.96,7.48,28.33,39.34,0.21,181.24,395.25,11.10,382486.591631,713823.225992


In [39]:
df_copy.columns

Index(['Date', 'Mois_jr', 'Saison', 'Heure', 'Code', 'Stations', 'X', 'Y',
       'Temperature', 'Conductivity', 'Salinity', 'O2', 'Saturation', 'pH',
       'Redox', 'Transparence', 'Turbidity', 'Chl_a', 'N_NO2', 'N_NH4',
       'N_NO3', 'P_PO4', 'PT', 'NT', 'Long', 'Lat', 'Periode', 'Month_Year'],
      dtype='object')

In [40]:
# Extraction de la colonne des mois (en s'assurant qu'elle est au format string)
HE = ['05', '06', '07', '10']
mois_col = df_copy['Mois_jr'].str.split('-').str[0]

# Définition des conditions et des choix correspondants
conditions = [
    mois_col.isin(HE)
]
choix = ['HE']

# Application des conditions
df_copy['Periode'] = np.select(conditions, choix, default='BE')

In [42]:
# Supprimer les colonnes spécifiques
cols_to_drop = ['Saison', 'Code', 'Redox','N_NO3','Heure','Code', 'X', 'Y','Long', 'Lat']
df_clean = df_copy.drop(columns=[col for col in cols_to_drop if col in df_copy.columns])

In [43]:
df_clean

Unnamed: 0,Date,Mois_jr,Stations,Temperature,Conductivity,Salinity,O2,Saturation,pH,Transparence,Turbidity,Chl_a,N_NO2,N_NH4,P_PO4,PT,NT,Periode,Month_Year
0,2023-10-05,10-05,A1,29.1,2.70,1.4,6.05,78.4,8.08,,67.86,,8.23,150.40,167.98,250.98,22.36,HE,OCT 23
1,2023-10-05,10-05,A2,29.2,2.27,1.1,6.62,86.0,8.37,,49.66,,12.47,131.76,169.26,233.10,24.59,HE,OCT 23
2,2023-10-05,10-05,A3,29.4,2.76,1.4,7.01,91.4,8.43,,33.14,,10.21,104.36,190.97,230.55,24.96,HE,OCT 23
3,2023-10-05,10-05,A4,29.7,2.06,1.0,7.60,92.5,8.41,,32.76,,7.38,86.83,166.71,222.89,26.40,HE,OCT 23
4,2023-10-05,10-05,A5,29.9,0.74,0.3,6.91,90.8,8.44,,31.56,,10.49,71.48,153.94,192.24,20.31,HE,OCT 23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2024-10-10,10-10,A6,30.1,2.10,1.1,7.87,104.1,8.25,70.0,15.88,9.61,30.65,35.95,199.77,410.58,6.58,HE,OCT 24
126,2024-10-10,10-10,A7,30.1,0.70,0.3,6.82,90.0,7.80,45.0,40.14,3.20,63.85,39.90,158.07,409.30,6.24,HE,OCT 24
127,2024-10-10,10-10,A8,29.4,2.83,1.4,7.84,102.4,8.26,80.0,15.02,5.34,32.96,37.36,198.23,392.70,10.19,HE,OCT 24
128,2024-10-10,10-10,A9,29.7,3.20,1.7,8.00,105.1,8.46,64.0,18.96,7.48,28.33,39.34,181.24,395.25,11.10,HE,OCT 24


In [37]:
df_copy['Date'] = pd.to_datetime(df_copy['Date'], format='mixed', dayfirst=True)
df_copy['Heure'] = df_copy['Heure'].astype(str)
# Créer une liste des mois sous forme de chaînes de 3 lettres
df_copy['Month_Year'] = df_copy['Date'].dt.strftime('%b %y').str.upper()
#df_copy['Year_month'] = df_copy['Date'].dt.strftime('%Y-%m')

In [38]:
# Calcul des moyennes mensuelles par station et par paramètre
period_means = df_copy.groupby(['Stations', 'Periode'])[parameters].mean().reset_index()

NameError: name 'parameters' is not defined

In [34]:
def impute_missing(group):
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        group[col] = group[col].fillna(group[col].mean())
    return group

df_clean = df_copy.groupby(['Periode', 'Stations']).apply(impute_missing).reset_index(drop=True)

In [35]:
df_clean

Unnamed: 0,Date,Mois_jr,Saison,Heure,Code,Stations,X,Y,Temperature,Conductivity,...,N_NO2,N_NH4,N_NO3,P_PO4,PT,NT,Long,Lat,Periode,Month_Year
0,2023-11-02,11-02,GSS,09:06:00,2,A1,1.93618,6.3918,29.1,0.11,...,6.82,86.83,0.175,183.31,249.70,24.01,382769.630037,706304.353377,BE,NOV 23
1,2023-12-06,12-06,GSS,08:54:00,3,A1,1.93618,6.3918,29.1,0.90,...,10.49,132.86,0.200,111.81,317.37,17.04,382769.630037,706304.353377,BE,DEC 23
2,2024-01-18,01-18,GSS,09:18:00,4,A1,1.93618,6.3918,29.1,14.18,...,1.73,40.80,0.175,115.64,156.49,29.21,382769.630037,706304.353377,BE,JAN 24
3,2024-02-08,02-08,GSS,08:57:00,5,A1,1.93618,6.3918,26.8,26.10,...,2.58,0.24,0.175,128.40,179.48,18.97,382769.630037,706304.353377,BE,FEB 24
4,2024-03-07,03-07,GSS,10:33:00,6,A1,1.93618,6.3918,30.1,34.30,...,1.73,245.75,0.175,77.33,174.37,19.36,382769.630037,706304.353377,BE,MAR 24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2023-10-05,10-05,PSP,13:21:00,1,A9,1.93730,6.4568,29.8,1.44,...,10.78,79.16,0.140,155.22,221.61,26.25,382486.591631,713823.225992,HE,OCT 23
126,2024-05-08,05-08,GSP,13:20:00,8,A9,1.93730,6.4568,31.5,17.12,...,6.25,51.76,0.190,119.47,151.39,17.31,382486.591631,713823.225992,HE,MAY 24
127,2024-06-06,06-06,GSP,13:49:00,9,A9,1.93730,6.4568,30.4,13.26,...,2.29,63.30,0.190,61.54,137.22,32.72,382486.591631,713823.225992,HE,JUN 24
128,2024-07-09,07-09,GSS,13:30:00,10,A9,1.93730,6.4568,27.9,4.05,...,2.58,36.41,0.220,146.28,174.37,1.50,382486.591631,713823.225992,HE,JUL 24


In [4]:
## 2) Imputation des valeurs manquantes par la moyenne par groupe

In [5]:
def impute_missing(group):
    numeric_cols = group.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        group[col] = group[col].fillna(group[col].mean())
    return group

df_clean = df_copy.groupby(['Saison', 'Stations']).apply(impute_missing).reset_index(drop=True)

In [6]:
# 3) Sélection et renommage des colonnes
cols_to_keep = [col for col in df_clean.columns if not col.startswith('cleaned_')]
df_clean = df_clean[cols_to_keep]

# Supprimer les colonnes spécifiques
cols_to_drop = ['Saison', 'Code', 'Redox','N_NO3']
df_clean = df_clean.drop(columns=[col for col in cols_to_drop if col in df_clean.columns])

In [7]:
# 4) Conversion des types
df_clean['Stations'] = df_clean['Stations'].astype('category')
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

In [8]:
mv = df_clean.isna().sum()
mv

Date            0
Mois_jr         0
Heure           0
Stations        0
X               0
Y               0
Temperature     0
Conductivity    0
Salinity        0
O2              0
Saturation      0
pH              0
Transparence    0
Turbidity       0
Chl_a           0
N_NO2           0
N_NH4           0
P_PO4           0
PT              0
NT              0
Long            0
Lat             0
Year_month      0
dtype: int64

In [9]:
# 6) Création d'un dataframe numérique seulement
df_num = df_clean.select_dtypes(include=[np.number])
cols_to_exclude = ['X', 'Y', 'Long', 'Lat']
df_num = df_num.drop(columns=[col for col in cols_to_exclude if col in df_num.columns])

In [10]:
df_num['Stations'] = df_clean['Stations']

In [11]:
df_num['N_NH4'] = df_num['N_NH4'] /1000
df_num['N_NO2'] = df_num['N_NO2'] /1000
df_num['PT'] = df_num['PT'] /1000
df_num['P_PO4'] = df_num['P_PO4'] /1000
df_num['Conductivity'] = df_num['Conductivity'] *1000

In [12]:
df_num.head()

Unnamed: 0,Temperature,Conductivity,Salinity,O2,Saturation,pH,Transparence,Turbidity,Chl_a,N_NO2,N_NH4,P_PO4,PT,NT,Stations
0,30.7,48100.0,31.7,6.43,85.9,7.95,70.57,26.94,7.12,0.00173,0.05176,0.03451,0.15394,9.51,A1
1,30.6,19080.0,11.4,4.9,65.8,7.24,46.0,24.72,2.14,0.00625,0.08792,0.04541,0.1616,19.67,A1
2,30.0,14500.0,8.5,6.18,80.8,8.01,20.0,27.34,3.2,0.00173,0.05118,0.03528,0.06385,37.63,A1
3,33.0,24800.0,15.2,8.04,112.1,8.4,45.0,30.28,8.54,0.00144,0.09012,0.17274,0.39398,25.18,A10
4,32.5,18360.0,11.0,7.16,99.8,8.01,29.0,56.4,6.41,0.0054,0.09998,0.08882,0.1233,18.32,A10


In [13]:
df_num.columns

Index(['Temperature', 'Conductivity', 'Salinity', 'O2', 'Saturation', 'pH',
       'Transparence', 'Turbidity', 'Chl_a', 'N_NO2', 'N_NH4', 'P_PO4', 'PT',
       'NT', 'Stations'],
      dtype='object')

In [48]:
dfgp = round(df_num.groupby("Stations").mean(),3)
dfgp.head()

Unnamed: 0_level_0,Temperature,Conductivity,Salinity,O2,Saturation,pH,Transparence,Turbidity,Chl_a,N_NO2,N_NH4,P_PO4,PT,NT
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A1,28.854,13829.231,8.892,6.495,83.8,7.942,42.916,48.891,4.268,0.006,0.086,0.119,0.219,17.283
A10,30.638,8596.923,5.154,7.455,99.454,8.251,40.372,52.576,6.795,0.008,0.07,0.149,0.26,21.864
A2,28.754,8705.385,5.085,7.328,94.368,8.301,65.192,44.245,5.806,0.007,0.069,0.13,0.261,17.904
A3,29.115,6029.231,3.415,7.378,95.2,8.262,76.077,30.055,3.862,0.006,0.061,0.168,0.254,18.591
A4,29.331,4544.615,2.554,7.486,97.592,8.279,65.372,28.698,4.05,0.007,0.051,0.163,0.281,17.386


In [49]:
#dfgp.describe()

In [50]:
# Seuils Norme marocaine eau de surface et lac
normes = {
    'Temperature': 30,
    'Conductivity': 2700,
    'O2': 3,
    'pH': 9,
    'Turbidity': 5,
    'Chl_a': 30,
    'N_NO2': 0.1,
    'N_NH4': 2,
    'P_PO4': 1,
    'PT': 0.5,
    'NT': 1.0
}

In [53]:
df = pd.DataFrame(normes)
print(df)

ValueError: If using all scalar values, you must pass an index

In [15]:
# Fonction qui vérifie pour une ligne si un paramètre dépasse la norme
def compare_avec_normes(row, normes):
    for parametre, valeur_norme in normes.items():
        if parametre in row and row[parametre] > valeur_norme:
            return 0  # au moins une valeur dépasse la norme
    return 1  # toutes les valeurs sont conformes

# Application de la fonction à chaque ligne du dataframe
df_num['Conformite'] = df_num.apply(compare_avec_normes, axis=1, normes=normes)

df_num.head()

Unnamed: 0,Temperature,Conductivity,Salinity,O2,Saturation,pH,Transparence,Turbidity,Chl_a,N_NO2,N_NH4,P_PO4,PT,NT,Stations,Conformite
0,30.7,48100.0,31.7,6.43,85.9,7.95,70.57,26.94,7.12,0.00173,0.05176,0.03451,0.15394,9.51,A1,0
1,30.6,19080.0,11.4,4.9,65.8,7.24,46.0,24.72,2.14,0.00625,0.08792,0.04541,0.1616,19.67,A1,0
2,30.0,14500.0,8.5,6.18,80.8,8.01,20.0,27.34,3.2,0.00173,0.05118,0.03528,0.06385,37.63,A1,0
3,33.0,24800.0,15.2,8.04,112.1,8.4,45.0,30.28,8.54,0.00144,0.09012,0.17274,0.39398,25.18,A10,0
4,32.5,18360.0,11.0,7.16,99.8,8.01,29.0,56.4,6.41,0.0054,0.09998,0.08882,0.1233,18.32,A10,0


In [16]:
df_num['Conformite'].unique()

array([0], dtype=int64)

In [44]:
# Seuils Ifremer pour un lac
normes_lac = {
    'Temperature': 30,
    'Conductivity': 2700,
    'O2': 5,
    'pH_min': 6.5,
    'pH_max': 9.0,
    'Turbidity': 5,
    'Chl_a': 30,
    'N_NO2': 0.1,
    'N_NH4': 1,
    'P_PO4': 1,
    'PT': 0.5,
    'NT': 1.0
}

def compare_lac(row, normes):
    # Test des paramètres avec seuils supérieurs
    for param in ['Temperature', 'Conductivity', 'Turbidity', 'Chl_a', 'N_NO2', 'N_NH4', 'P_PO4', 'PT', 'NT']:
        if param in row:
            val = row[param]
            seuil = normes[param]
            if param == 'Transparence':  # Transparence seuil minimum
                if val < seuil:
                    return 1
            else:  # Seuil maximum
                if val > seuil:
                    return 1
    # Cas particulier du pH entre min et max
    if 'pH' in row:
        if row['pH'] < normes['pH_min'] or row['pH'] > normes['pH_max']:
            return 1
    # Cas oxygène seuil minimum
    if 'O2' in row and row['O2'] < normes['O2']:
        return 1

    return 0

# Application au dataframe
dfgp['Conformite'] = dfgp.apply(compare_lac, axis=1, normes=normes_lac)

dfgp.head()

Unnamed: 0_level_0,Temperature,Conductivity,Salinity,O2,Saturation,pH,Transparence,Turbidity,Chl_a,N_NO2,N_NH4,P_PO4,PT,NT,Conformite
Stations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A1,28.854,13829.231,8.892,6.495,83.8,7.942,42.916,48.891,4.268,0.006,0.086,0.119,0.219,17.283,0
A10,30.638,8596.923,5.154,7.455,99.454,8.251,40.372,52.576,6.795,0.008,0.07,0.149,0.26,21.864,0
A2,28.754,8705.385,5.085,7.328,94.368,8.301,65.192,44.245,5.806,0.007,0.069,0.13,0.261,17.904,0
A3,29.115,6029.231,3.415,7.378,95.2,8.262,76.077,30.055,3.862,0.006,0.061,0.168,0.254,18.591,0
A4,29.331,4544.615,2.554,7.486,97.592,8.279,65.372,28.698,4.05,0.007,0.051,0.163,0.281,17.386,0


In [45]:
dfgp['Conformite'].unique()

array([0], dtype=int64)

In [None]:
import numpy as np
import pandas as pd
from typing import Optional, Union, List, Tuple
import warnings

def get_excursions(value: np.ndarray, lower: Optional[np.ndarray] = None, 
                   upper: Optional[np.ndarray] = None) -> np.ndarray:
    """
    Calcule les excursions par rapport aux limites supérieures et inférieures.
    
    Args:
        value: Valeurs numériques
        lower: Limites inférieures (optionnel)
        upper: Limites supérieures (optionnel)
    
    Returns:
        Array numpy des excursions
    """
    # Convertir en arrays numpy et gérer les valeurs manquantes
    value = np.array(value, dtype=float)
    
    if lower is None:
        lower = np.full_like(value, np.nan)
    else:
        lower = np.array(lower, dtype=float)
    
    if upper is None:
        upper = np.full_like(value, np.nan)
    else:
        upper = np.array(upper, dtype=float)
    
    # Vérifier que lower et upper sont numériques
    if not (np.issubdtype(lower.dtype, np.number) and np.issubdtype(upper.dtype, np.number)):
        raise ValueError("lower et upper doivent être numériques")
    
    # Identifier les valeurs en dehors des limites
    less = ~np.isnan(lower) & (value <= lower)
    more = ~np.isnan(upper) & (value >= upper)
    
    # Vérifier la cohérence des limites
    if np.any(less & more):
        raise ValueError("la limite inférieure doit être inférieure à la limite supérieure")
    
    # Calculer les excursions
    excursion = np.zeros_like(value)
    excursion[more] = value[more] / upper[more] - 1
    excursion[less] = lower[less] / value[less] - 1
    
    return excursion


def categorize_wqi(x: np.ndarray) -> pd.Categorical:
    """
    Catégorise les indices de qualité de l'eau entre 1 et 100 selon le manuel CCME.
    
    Args:
        x: Vecteur numérique des valeurs WQI à catégoriser
    
    Returns:
        Catégories pandas
    """
    x = np.array(x, dtype=float)
    
    labels = ["Poor", "Marginal", "Fair", "Good", "Excellent"]
    bins = [-1, 44, 64, 79, 94, 100]
    
    # Créer les catégories
    categories = pd.cut(x, bins=bins, labels=labels, ordered=True)
    # Inverser l'ordre des niveaux comme dans le code R
    categories = pd.Categorical(categories, categories=reversed(labels), ordered=True)
    
    return categories


def wqif(nv: int, nt: int, nfv: int, nft: int, nse: float) -> dict:
    """
    Calcule les composantes F1, F2, F3 et le WQI.
    
    Args:
        nv: Nombre de variables
        nt: Nombre total de tests
        nfv: Nombre de variables défaillantes
        nft: Nombre de tests défaillants
        nse: Somme normalisée des excursions
    
    Returns:
        Dictionnaire avec WQI, F1, F2, F3
    """
    F1 = nfv / nv * 100
    F2 = nft / nt * 100
    F3 = nse / (nse + 1) * 100
    
    WQI = 100 - np.sqrt(F1**2 + F2**2 + F3**2) / 1.732
    
    return {"WQI": WQI, "F1": F1, "F2": F2, "F3": F3}


def wqif_excursion_variable(x: np.ndarray, v: np.ndarray, nv: int, nt: int) -> dict:
    """
    Calcule le WQI à partir des excursions et variables.
    
    Args:
        x: Array des excursions
        v: Array des variables
        nv: Nombre de variables
        nt: Nombre total de tests
    
    Returns:
        Dictionnaire avec WQI, F1, F2, F3
    """
    nft = np.sum(x != 0)
    nfv = len(np.unique(v[x != 0]))
    nse = np.sum(x) / nt
    
    return wqif(nv=nv, nt=nt, nfv=nfv, nft=nft, nse=nse)


def wqif_matrix(x: np.ndarray) -> dict:
    """
    Calcule le WQI à partir d'une matrice d'excursions.
    
    Args:
        x: Matrice 2D des excursions
    
    Returns:
        Dictionnaire avec WQI, F1, F2, F3
    """
    not_missing = ~np.isnan(x)
    nt = np.sum(not_missing)
    not_all_missing = ~np.all(np.isnan(x), axis=0)
    nv = np.sum(not_all_missing)
    nft = np.sum(not_missing & (x != 0))
    
    # Compter les variables avec au moins une excursion non nulle
    nfv = 0
    for col_idx in range(x.shape[1]):
        if not_all_missing[col_idx]:
            col_data = x[:, col_idx]
            if np.any(~np.isnan(col_data) & (col_data != 0)):
                nfv += 1
    
    nse = np.sum(x[not_missing]) / nt
    
    return wqif(nv=nv, nt=nt, nfv=nfv, nft=nft, nse=nse)


def wqi_matrix(x: np.ndarray, i: Optional[np.ndarray] = None) -> float:
    """
    Calcule uniquement le WQI à partir d'une matrice.
    
    Args:
        x: Matrice 2D des excursions
        i: Indices des lignes à utiliser
    
    Returns:
        Valeur WQI
    """
    if i is None:
        i = np.arange(x.shape[0])
    
    return wqif_matrix(x[i, :])["WQI"]


def four(x: pd.DataFrame) -> pd.DataFrame:
    """
    Vérifie si un groupe a au moins 4 observations.
    
    Args:
        x: DataFrame avec les données
    
    Returns:
        DataFrame avec une colonne booléenne 'Four'
    """
    return pd.DataFrame({"Four": [len(x) >= 4]})


def fourtimesfour(x: pd.DataFrame) -> bool:
    """
    Vérifie s'il y a au moins 4 variables avec au moins 4 observations chacune.
    
    Args:
        x: DataFrame avec les données
    
    Returns:
        Booléen indiquant si la condition est satisfaite
    """
    # Grouper par variable et appliquer la fonction four
    grouped = x.groupby("Variable").apply(four).reset_index()
    # Compter les variables avec au moins 4 observations
    variables_with_four = np.sum(grouped["Four"])
    
    return variables_with_four >= 4


def set_detection_limits(x: pd.DataFrame, messages: bool = True) -> pd.DataFrame:
    """
    Remplace les valeurs zéro par leur limite de détection si disponible.
    
    Args:
        x: DataFrame avec les données
        messages: Afficher les messages
    
    Returns:
        DataFrame modifié
    """
    x_copy = x.copy()
    
    if "DetectionLimit" not in x_copy.columns:
        x_copy["DetectionLimit"] = 0
    
    # Identifier les lignes où Value = 0 et DetectionLimit > 0
    mask = (x_copy["Value"] == 0) & ~np.isnan(x_copy["DetectionLimit"]) & (x_copy["DetectionLimit"] > 0)
    
    if np.any(mask):
        x_copy.loc[mask, "Value"] = x_copy.loc[mask, "DetectionLimit"]
        if messages:
            count = np.sum(mask)
            plural = "s" if count > 1 else ""
            message = f"Replaced {count} of the value{plural} in column Value with the detection limit in column DetectionLimit."
            warnings.warn(message)
    
    return x_copy


def calc_wqi_by(x: pd.DataFrame, messages: bool = True) -> pd.DataFrame:
    """
    Calcule le WQI pour un sous-ensemble de données.
    
    Args:
        x: DataFrame avec les données
        messages: Afficher les messages
    
    Returns:
        DataFrame avec les résultats WQI
    """
    # Cette fonction nécessiterait l'implémentation complète de boot_wqis
    # Pour l'instant, retournons un DataFrame vide comme placeholder
    return pd.DataFrame()


def calc_wqi(x: pd.DataFrame, by: Optional[List[str]] = None, 
             messages: bool = True) -> pd.DataFrame:
    """
    Calcule l'indice de qualité de l'eau (WQI) selon la méthode CCME.
    
    Args:
        x: DataFrame avec les données
        by: Colonnes pour grouper les calculs
        messages: Afficher les messages
    
    Returns:
        DataFrame avec les résultats WQI
    """
    # Vérifications de base
    if not isinstance(x, pd.DataFrame):
        raise ValueError("x doit être un DataFrame")
    
    if by is not None and (not isinstance(by, list) or not all(isinstance(col, str) for col in by)):
        raise ValueError("by doit être une liste de chaînes de caractères")
    
    # Vérifier les colonnes requises
    required_cols = ["Variable", "Value", "UpperLimit"]
    missing_cols = [col for col in required_cols if col not in x.columns]
    if missing_cols:
        raise ValueError(f"x doit contenir les colonnes: {missing_cols}")
    
    if messages:
        print("Calculating water quality indices...")
    
    # Ajouter les colonnes manquantes avec des valeurs par défaut
    if "Date" not in x.columns:
        x = x.copy()
        x["Date"] = pd.to_datetime("2000-01-01")
    
    if "LowerLimit" not in x.columns:
        x = x.copy()
        x["LowerLimit"] = np.nan
    
    if "DetectionLimit" not in x.columns:
        x = x.copy()
        x["DetectionLimit"] = 0
    
    # Vérifier les types de colonnes
    if not pd.api.types.is_numeric_dtype(x["Value"]):
        raise ValueError("La colonne Value doit être numérique")
    
    # Nettoyer les données
    x_clean = x.dropna(subset=["Date", "Variable", "Value"])
    
    # Appliquer les limites de détection
    x_clean = set_detection_limits(x_clean, messages=messages)
    
    # Calculer le WQI
    if by is None:
        result = calc_wqi_by(x_clean, messages=messages)
    else:
        # Grouper par les colonnes spécifiées
        result = x_clean.groupby(by).apply(calc_wqi_by, messages=messages).reset_index()
    
    if messages:
        print("Calculated water quality indices.")
    
    return result