In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
%load_ext autoreload
%autoreload 1
from imports import *
from functions import *

Connection to PostgreSQL

In [4]:
path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"

with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

db = "Oceanography_ML_Project"
schema_bronze = "Bronze"
schema_silver = "Silver"
conn_string = f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}"
# Créer l'engine PostgreSQL
engine = create_engine(conn_string)
conn = engine.connect()

Charger les Données des Tables

In [37]:
# Charger les métadonnées du schéma existant
metadata = MetaData(schema=schema_bronze)

print("\n🔍 Chargement des métadonnées du schéma...")
metadata.reflect(bind=conn)
print("✅ Métadonnées chargées avec succès.\n")

# Récupérer les noms des tables
table_names = [t.name for t in metadata.sorted_tables]
print(f"🔢 Nombre total de tables dans le schéma : {len(table_names)}\n")

# Filtrer les tables en fonction du contenu de leur nom
marine_tables = {t for t in table_names if "marine" in t.lower()}
meteo_tables = {t for t in table_names if "meteo" in t.lower()}
buoys_data_table = {t for t in table_names if "buoy" in t.lower()}

print(f"🌊 Tables marines trouvées : {len(marine_tables)}")
print(f"🌧️ Tables météo trouvées : {len(meteo_tables)}")
print(f"🐋 Tables de bouées trouvées : {len(buoys_data_table)}\n")

# Initialiser le dictionnaire des résultats
buoys_datas = {}

# Compteurs pour suivre le nombre de tables chargées avec succès
marine_data_count = 0
meteo_data_count = 0
buoys_data_count = 0

# Compteur pour le nombre total de lignes
total_marine_rows = 0
total_meteo_rows = 0
total_buoys_rows = 0  # Changer ici pour compter le nombre de lignes (bouées)

# Vérifier et récupérer les données de la table "buoys_datas"
if buoys_data_table:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    print("🔄 Chargement des données de la table 'buoys_datas'...")

    try:
        buoys_datas_raw = fetch_table_data(schema=schema_bronze, conn=conn, table_name=next(iter(buoys_data_table)), as_df=True)

        if buoys_datas_raw is not None:
            print("📦 Données récupérées pour 'buoys_datas'.")

            # Conversion JSON → dict si nécessaire
            if isinstance(buoys_datas_raw, str):
                buoys_datas_raw = json.loads(buoys_datas_raw)

            elif isinstance(buoys_datas_raw, pd.DataFrame) and "Station ID" in buoys_datas_raw.columns:
                # Convertir en dictionnaire avec "Station ID" comme clé
                buoys_datas_raw = buoys_datas_raw.set_index("Station ID").to_dict(orient="index")

            # Ajouter au dictionnaire principal directement avec les Station ID comme clés
            buoys_datas.update(buoys_datas_raw)
            buoys_data_count += 1
            total_buoys_rows += len(buoys_datas_raw)  # Compter le nombre de bouées
            print(f"✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : {total_buoys_rows}\n")
        else:
            print("⚠️ Aucun résultat trouvé dans 'buoys_datas'.\n")

    except Exception as e:
        print(f"❌ Erreur lors du chargement de 'buoys_datas': {e}\n")

# Associer les tables marine et meteo en fonction du station_id et récupérer leurs données
for table_set, label, icon, counter, total_rows in [
    (marine_tables, "Marine", "🌊", marine_data_count, total_marine_rows),
    (meteo_tables, "Meteo", "🌧️", meteo_data_count, total_meteo_rows)
]:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    for table_name in table_set:
        print(f"🔄 Chargement des données pour la table {label} : {table_name}...")

        try:
            station_id = table_name.split("_")[1]

            # Vérifier si la station existe déjà dans buoys_datas, sinon initialiser un dictionnaire
            if station_id not in buoys_datas:
                buoys_datas[station_id] = {}

            # Récupérer les données
            data = fetch_table_data(schema=schema_bronze, conn=conn, table_name=table_name, as_df=True)

            if data is not None:
                print(f"📦 Données récupérées pour la station {station_id} ({label}).")

                if isinstance(data, str):
                    data = pd.DataFrame(json.loads(data))
                elif isinstance(data, dict):
                    data = pd.DataFrame(data)
                # Ajouter les données au dictionnaire de bouées sous la station_id
                buoys_datas[station_id][f"{label} DataFrame"] = data
                counter += 1
                total_rows += len(data)  # Ajouter le nombre de lignes collectées
                print(f"{icon} Données {label} chargées pour la station {station_id}! Nombre de lignes collectées : {len(data)}\n")
            else:
                print(f"⚠️ Aucun résultat trouvé pour la station {station_id} ({label}).\n")

        except Exception as e:
            print(f"❌ Erreur lors du chargement des données {label} pour {table_name} : {e}\n")

    # Mise à jour des compteurs après le chargement des données pour chaque catégorie
    if label == "Marine":
        marine_data_count = counter
        total_marine_rows = total_rows
    elif label == "Meteo":
        meteo_data_count = counter
        total_meteo_rows = total_rows

# Finalement, afficher un récapitulatif global
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f"🏆 Chargement des données terminé avec succès !")
print(f"🐋 Total des données bouées chargées : {buoys_data_count} - Nombre de bouées (lignes) : {total_buoys_rows}")
print(f"🌊 Total des données marines chargées : {marine_data_count} - Nombre total de lignes : {total_marine_rows}")
print(f"🌧️ Total des données météorologiques chargées : {meteo_data_count} - Nombre total de lignes : {total_meteo_rows}")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")


🔍 Chargement des métadonnées du schéma...
✅ Métadonnées chargées avec succès.

🔢 Nombre total de tables dans le schéma : 79

🌊 Tables marines trouvées : 39
🌧️ Tables météo trouvées : 39
🐋 Tables de bouées trouvées : 1

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données de la table 'buoys_datas'...
📦 Données récupérées pour 'buoys_datas'.
✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : 39

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données pour la table Marine : station_POTA2_marine_potato point, ak...
📦 Données récupérées pour la station POTA2 (Marine).
🌊 Données Marine chargées pour la station POTA2! Nombre de lignes collectées : 2569

🔄 Chargement des données pour la table Marine : station_46071_marine_western aleutians...
📦 Données récupérées pour la station 46071 (Marine).
🌊 Données Marine chargées pour la station 46071! Nombre de lignes collectées : 7644

🔄 Chargement des données pour la table Marine : station_LONF1_marine_long key, 

In [38]:
buoys_datas["42058"]["Marine DataFrame"].dtypes

id                           int64
wind_direction             float64
wind_speed                 float64
wind_gust                  float64
wave_height                float64
dominant_wave_period       float64
average_wave_period        float64
dominant_wave_direction    float64
pressure                   float64
air_temperature            float64
water_temperature          float64
dewpoint                   float64
visibility                  object
3hr_pressure_tendency      float64
water_level_above_mean      object
time                        object
Station ID                  object
dtype: object

In [39]:
buoys_datas["42058"]["Meteo DataFrame"].columns

Index(['id', 'date', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'precipitation', 'rain', 'showers', 'pressure_msl', 'surface_pressure',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'visibility', 'wind_speed_10m', 'soil_temperature_0cm',
       'soil_moisture_0_to_1cm', 'is_day'],
      dtype='object')

In [40]:
buoys_datas["42058"]["Meteo DataFrame"].dtypes

id                         int64
date                      object
temperature_2m            object
relative_humidity_2m      object
dew_point_2m              object
precipitation             object
rain                      object
showers                   object
pressure_msl              object
surface_pressure          object
cloud_cover               object
cloud_cover_low           object
cloud_cover_mid           object
cloud_cover_high          object
visibility                object
wind_speed_10m            object
soil_temperature_0cm      object
soil_moisture_0_to_1cm    object
is_day                    object
dtype: object

In [41]:
list_silver_merged_df = []  
list_failed_dfs = []        

number_marine_data = 0
number_meteo_data = 0
number_merged_data = 0

marine_data_conversion = 0
meteo_data_conversion = 0

In [42]:
marine_cols = [
    "wind_direction", "wind_speed", "wind_gust", "wave_height",
    "dominant_wave_period", "average_wave_period", "dominant_wave_direction",
    "pressure", "air_temperature", "water_temperature", "dewpoint",
    "visibility", "3hr_pressure_tendency", "water_level_above_mean"
]

meteo_cols = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "rain",
    "showers", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low",
    "cloud_cover_mid", "cloud_cover_high", "visibility", "wind_speed_10m",
    "soil_temperature_0cm", "soil_moisture_0_to_1cm"
]

col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (km)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

meteo_cols_to_delete = ['soil_temperature_0cm','rain', 'showers', 'is_day',
                  'soil_moisture_0_to_1cm']

for station_id, tables in buoys_datas.items():
    marine_df = tables["Marine DataFrame"]
    marine_df = rename_columns(marine_df, col_to_rename)

    marine_df = drop_columns_if_exist

    meteo_df = tables["Meteo DataFrame"]
    meteo_df = rename_columns(meteo_df,col_to_rename)
    meteo_df = drop_columns_if_exist(meteo_df, meteo_cols_to_delete)

Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_te

HOUR RESAMPLING

In [43]:
# Resampling des données et stockage dans un nouveau compartiment du dictionnaire 
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔁 Processing and resampling marine data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Marine DataFrame"] = process_datetime_column(tables["Marine DataFrame"], column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    try:
        print(f"🔁 Processing and resampling weather data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Meteo DataFrame"] = process_datetime_column(tables["Meteo DataFrame"], column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")

🔁 Processing and resampling marine data for station 41008...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41008...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 41044...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41044...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 42001...
📌 La colonne 'time' est maintenant convertie en chaîne de

Adding Coordinates

In [44]:
# Ajout des coordonnées
for station_id, tables in buoys_datas.items():
    try:
        tables["Marine DataFrame"]["Lat"] = tables["Lat"]
        tables["Marine DataFrame"]["Lon"] = tables["Lon"]
        print(f"🌐 Coordinates (Lat/Lon) added for station {station_id}.")
    except Exception as e:
        print(f"Error adding coordinates for {station_id}: {e}")

🌐 Coordinates (Lat/Lon) added for station 41008.
🌐 Coordinates (Lat/Lon) added for station 41044.
🌐 Coordinates (Lat/Lon) added for station 42001.
🌐 Coordinates (Lat/Lon) added for station 42002.
🌐 Coordinates (Lat/Lon) added for station 42012.
🌐 Coordinates (Lat/Lon) added for station 42036.
🌐 Coordinates (Lat/Lon) added for station 42056.
🌐 Coordinates (Lat/Lon) added for station 42058.
🌐 Coordinates (Lat/Lon) added for station 44020.
🌐 Coordinates (Lat/Lon) added for station 44025.
🌐 Coordinates (Lat/Lon) added for station 44027.
🌐 Coordinates (Lat/Lon) added for station 44065.
🌐 Coordinates (Lat/Lon) added for station 46001.
🌐 Coordinates (Lat/Lon) added for station 46006.
🌐 Coordinates (Lat/Lon) added for station 46014.
🌐 Coordinates (Lat/Lon) added for station 46022.
🌐 Coordinates (Lat/Lon) added for station 46025.
🌐 Coordinates (Lat/Lon) added for station 46027.
🌐 Coordinates (Lat/Lon) added for station 46029.
🌐 Coordinates (Lat/Lon) added for station 46053.
🌐 Coordinates (Lat/L

Merging Based on Station ID

In [45]:
number_merged_data = 0
list_silver_merged_df=[]
# Fusion des DataFrames
for station_id, tables in buoys_datas.items():
    try:

        metadatas = get_station_metadata(station_id=station_id)
        print(f"🔗 Merging marine and weather data for station {station_id}...")
        df_merged = pd.merge(
            tables["Marine DataFrame"], tables["Meteo DataFrame"], on='Datetime', how='inner'
        )
        tables["Merged DataFrame"] = df_merged
        number_merged_data += df_merged.shape[0]
        list_silver_merged_df.append(df_merged)
    except Exception as e:
        print(f"Error merging data for station {station_id}: {e}")

# print(f'{len(list_silver_merged_df)} DataFrames Merged :\n{number_merged_data} rows in total !')


🔗 Merging marine and weather data for station 41008...
🔗 Merging marine and weather data for station 41044...
🔗 Merging marine and weather data for station 42001...
🔗 Merging marine and weather data for station 42002...
🔗 Merging marine and weather data for station 42012...
🔗 Merging marine and weather data for station 42036...
🔗 Merging marine and weather data for station 42056...
🔗 Merging marine and weather data for station 42058...
🔗 Merging marine and weather data for station 44020...
🔗 Merging marine and weather data for station 44025...
🔗 Merging marine and weather data for station 44027...
🔗 Merging marine and weather data for station 44065...
🔗 Merging marine and weather data for station 46001...
🔗 Merging marine and weather data for station 46006...
🔗 Merging marine and weather data for station 46014...
🔗 Merging marine and weather data for station 46022...
🔗 Merging marine and weather data for station 46025...
🔗 Merging marine and weather data for station 46027...
🔗 Merging 

In [46]:
for key, value in buoys_datas.items():
    if value['id'] == 1:
        for key, val in value.items():
            print(key)


id
Zone
Lat
Lon
Marine DataFrame
Meteo DataFrame
Merged DataFrame


In [47]:
df_merged.columns

Index(['id_x', 'Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Wave Height (m)', 'dominant_wave_period', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint', ' Visibility (km)_x', '3hr_pressure_tendency',
       'water_level_above_mean', 'Datetime', 'Station ID', 'Lat', 'Lon',
       'id_y', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', 'rain', 'showers', ' Sea Level Pressure (hPa)',
       'surface_pressure', 'cloud_cover', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (km)_y',
       'wind_speed_10m', 'soil_temperature_0cm', 'soil_moisture_0_to_1cm',
       'is_day'],
      dtype='object')

In [48]:
show_first_row(df_merged)

id_x                           1  (int64)
Wind Direction (°)             240.0  (float64)
Wind Speed (km/h)              9.3  (float64)
Wind Gusts (km/h)              9.8  (float64)
Wave Height (m)                None  (object)
dominant_wave_period           None  (object)
Average Wave Period (s)        None  (object)
Dominant Wave Direction (°)    None  (object)
Pressure (hPA)                 1002.7  (float64)
Air T°                         7.6  (float64)
Water T°                       None  (object)
dewpoint                       None  (object)
 Visibility (km)_x             None  (object)
3hr_pressure_tendency          0.4  (float64)
water_level_above_mean         None  (object)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     SBIO1  (object)
Lat                            41.63N  (object)
Lon                            82.84W  (object)
id_y                           2220  (int64)
T°(C°)                         5.110000133514404  (o

Test Adding MetaData

In [49]:
# Ajout des coordonnées
for station_id, tables in buoys_datas.items():
    try:

        df_merged = tables["Merged DataFrame"]
        station_id = str(df_merged["Station ID"])
        print(station_id)
        metadatas = get_station_metadata(station_id)
        
    except Exception as e:
        print(f"Error adding Metadata for {station_id}: {e}")

0       41008
1       41008
2       41008
3       41008
4       41008
        ...  
7671    41008
7672    41008
7673    41008
7674    41008
7675    41008
Name: Station ID, Length: 7676, dtype: object
0       41044
1       41044
2       41044
3       41044
4       41044
        ...  
7631    41044
7632    41044
7633    41044
7634    41044
7635    41044
Name: Station ID, Length: 7636, dtype: object
0       42001
1       42001
2       42001
3       42001
4       42001
        ...  
2489    42001
2490    42001
2491    42001
2492    42001
2493    42001
Name: Station ID, Length: 2494, dtype: object
0       42002
1       42002
2       42002
3       42002
4       42002
        ...  
2643    42002
2644    42002
2645    42002
2646    42002
2647    42002
Name: Station ID, Length: 2648, dtype: object
0       42012
1       42012
2       42012
3       42012
4       42012
        ...  
7623    42012
7624    42012
7625    42012
7626    42012
7627    42012
Name: Station ID, Length: 7628, dtype: object


In [50]:
metadatas

{}

Changing Data Types

In [51]:
def convert_df_columns(df):
    """
    Convertit chaque colonne en son type approprié sans modifier les données
    ou introduire des NaN.
    
    Args:
    - df: pd.DataFrame. Le DataFrame à traiter.
    
    Returns:
    - pd.DataFrame: Le DataFrame avec les types de données convertis.
    """
    
    # Traitement des colonnes avec les types appropriés
    for col in df.columns:
        # Convertir les colonnes numériques
        if df[col].dtype == 'object':
            # Tenter de convertir en float si c'est un nombre représenté par des strings
            try:
                # Convertir en float pour les colonnes qui peuvent l'être (ex: "Wind Speed (km/h)", "Pressure (hPa)", etc.)
                df[col] = pd.to_numeric(df[col], errors='raise')
            except ValueError:
                # Si la conversion échoue, laisser la colonne intacte
                pass
                
        # Convertir des dates si la colonne contient des chaînes de caractères représentant des dates
        if df[col].dtype == 'object' and 'date' in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
            except ValueError:
                pass
        
        # Convertir les booléens (is_day) en int
        if col == "is_day":
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Assurer les types numériques pour les colonnes déjà numériques mais mal typées
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        df[col] = df[col].astype(pd.Float64Dtype())  # Garantir une gestion correcte des NaN dans les colonnes numériques

    return df

for station_id, tables in buoys_datas.items():
    try:
        df_merged = tables["Merged DataFrame"]
        print(f"🔗 Changing Data Types  for station {station_id}...")
        df_converted = convert_df_columns(df_merged)
        tables["Converted DataFrame"] = df_converted
        
        print(f"Successfully Changed Data Types for Station {station_id}")

    except Exception as e:
        
        print(f"Error changing data types for station {station_id}: {e}")

🔗 Changing Data Types  for station 41008...
Successfully Changed Data Types for Station 41008
🔗 Changing Data Types  for station 41044...
Successfully Changed Data Types for Station 41044
🔗 Changing Data Types  for station 42001...
Successfully Changed Data Types for Station 42001
🔗 Changing Data Types  for station 42002...
Successfully Changed Data Types for Station 42002
🔗 Changing Data Types  for station 42012...
Successfully Changed Data Types for Station 42012
🔗 Changing Data Types  for station 42036...
Successfully Changed Data Types for Station 42036
🔗 Changing Data Types  for station 42056...
Successfully Changed Data Types for Station 42056
🔗 Changing Data Types  for station 42058...
Successfully Changed Data Types for Station 42058
🔗 Changing Data Types  for station 44020...
Successfully Changed Data Types for Station 44020
🔗 Changing Data Types  for station 44025...
Successfully Changed Data Types for Station 44025
🔗 Changing Data Types  for station 44027...
Successfully Cha

Cleaning Null Values

In [52]:
def clean_dataframe(df):
    for column in df.columns:
        # Calculer le pourcentage de valeurs manquantes
        missing_percentage = df[column].isnull().mean() * 100
        
        # Supprimer la colonne si elle est totalement nulle
        if df[column].isnull().sum() == len(df[column]):
            df = df.drop(columns=[column])
            continue
        
        # Si plus de 50% des valeurs sont manquantes, on retire la colonne sauf si c'est numérique
        if missing_percentage > 50:
            if df[column].dtype not in ['float64', 'int64']:  # Ne pas supprimer les colonnes numériques
                df = df.drop(columns=[column])
        else:
            # Si la colonne est numérique, on remplace les NaN par la médiane
            if df[column].dtype in ['float64', 'int64']:  # vérifier si c'est une colonne numérique
                median_value = df[column].median()
                df[column].fillna(median_value, inplace=True)
            else:
                pass
    return df

In [53]:
for station_id, tables in buoys_datas.items():
    try:

        print(f"🔗 Cleaning DataFrame for station {station_id}...")
        df_converted = tables["Converted DataFrame"]
        
        df_cleaned = clean_dataframe(df_converted)

        tables["Cleaned DataFrame"] = df_cleaned

        print(f"Successfully Cleaned DataFrame for Station {station_id}")

    except Exception as e:
        print(f"Error Cleaning DataFrame for station {station_id}: {e}")

🔗 Cleaning DataFrame for station 41008...
Successfully Cleaned DataFrame for Station 41008
🔗 Cleaning DataFrame for station 41044...
Successfully Cleaned DataFrame for Station 41044
🔗 Cleaning DataFrame for station 42001...
Successfully Cleaned DataFrame for Station 42001
🔗 Cleaning DataFrame for station 42002...
Successfully Cleaned DataFrame for Station 42002
🔗 Cleaning DataFrame for station 42012...
Successfully Cleaned DataFrame for Station 42012
🔗 Cleaning DataFrame for station 42036...
Successfully Cleaned DataFrame for Station 42036
🔗 Cleaning DataFrame for station 42056...
Successfully Cleaned DataFrame for Station 42056
🔗 Cleaning DataFrame for station 42058...
Successfully Cleaned DataFrame for Station 42058
🔗 Cleaning DataFrame for station 44020...
Successfully Cleaned DataFrame for Station 44020
🔗 Cleaning DataFrame for station 44025...
Successfully Cleaned DataFrame for Station 44025
🔗 Cleaning DataFrame for station 44027...
Successfully Cleaned DataFrame for Station 44027

Concatenating All in One Final DataFrame

In [54]:
# Fusion finale de tous les DataFrames
try:
    print("🔀 Merging all DataFrames into a final DataFrame...")
    dataframes_to_concat = [tables["Cleaned DataFrame"] for tables in buoys_datas.values()]

    df_final = pd.concat(dataframes_to_concat, ignore_index=True)

except Exception as e:
    print(f"Error during final merge: {e}")
    df_final = None

# Résumé final
print("\n⭐🏆 Processing complete!")
print(f"🔢 Total stations processed: {len(buoys_datas)}")

if df_final is not None and not df_final.empty:
    print(f"📝 Final merged DataFrame size: {df_final.shape}")
else:
    print("The DataFrame is either None or empty.")

🔀 Merging all DataFrames into a final DataFrame...

⭐🏆 Processing complete!
🔢 Total stations processed: 39
📝 Final merged DataFrame size: (252711, 34)


In [55]:
df_final.dtypes

id_x                                  Float64
Wind Direction (°)                    Float64
Wind Speed (km/h)                     Float64
Wind Gusts (km/h)                     Float64
Wave Height (m)                       Float64
Average Wave Period (s)               Float64
Dominant Wave Direction (°)           Float64
Pressure (hPA)                        Float64
Air T°                                Float64
Water T°                              Float64
dewpoint                              Float64
Datetime                       datetime64[ns]
Station ID                             object
Lat                                    object
Lon                                    object
id_y                                  Float64
T°(C°)                                Float64
Relative Humidity (%)                 Float64
Dew Point (°C)                        Float64
Precipitation (mm)                    Float64
rain                                  Float64
showers                           

In [56]:
# Parcourir toutes les colonnes contenant "Station ID" dans leur nom
for column in df_final.columns:
    if "Station ID" in column:
        try:
            # Tenter de convertir la colonne en numérique (en utilisant pd.to_numeric avec errors='coerce')
            df_final[column] = pd.to_numeric(df_final[column], errors='raise')
             # Si la conversion est réussie, convertir en int
            df_final[column] = df_final[column].astype(int) 

        except Exception as e:
                print(f"Error in Conversion Step 1 for column: {column}:\n{e}")
        
        try:
            
            df_final[column] = df_final[column].astype(str)

        except Exception as e:
                print(f"Error in Conversion Step 2 for column: {column}:\n{e}")
            

show_first_row(df_final)

Error in Conversion Step 1 for column: Station ID:
Unable to parse string "BURL1" at position 227260
id_x                           1.0  (Float64)
Wind Direction (°)             240.0  (Float64)
Wind Speed (km/h)              5.0  (Float64)
Wind Gusts (km/h)              6.0  (Float64)
Wave Height (m)                0.7  (Float64)
Average Wave Period (s)        3.7  (Float64)
Dominant Wave Direction (°)    218.0  (Float64)
Pressure (hPA)                 1020.3  (Float64)
Air T°                         14.1  (Float64)
Water T°                       15.3  (Float64)
dewpoint                       10.4  (Float64)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     41008.0  (object)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220.0  (Float64)
T°(C°)                         5.110000133514404  (Float64)
Relative Humidity (%)          56.0  (Float64)
Dew Point (°C

In [57]:
df_final = clean_dataframe(df_final)
df_final.isnull().sum()
df_final2 = df_final.dropna()
print(f'{df_final2.shape}\n\n{df_final2.isnull().sum()}')

(160309, 30)

id_x                         0
Wind Direction (°)           0
Wind Speed (km/h)            0
Wind Gusts (km/h)            0
Pressure (hPA)               0
Air T°                       0
Water T°                     0
dewpoint                     0
Datetime                     0
Station ID                   0
Lat                          0
Lon                          0
id_y                         0
T°(C°)                       0
Relative Humidity (%)        0
Dew Point (°C)               0
Precipitation (mm)           0
rain                         0
showers                      0
 Sea Level Pressure (hPa)    0
surface_pressure             0
cloud_cover                  0
Low Clouds (%)               0
Middle Clouds (%)            0
High Clouds (%)              0
 Visibility (km)_y           0
wind_speed_10m               0
soil_temperature_0cm         0
soil_moisture_0_to_1cm       0
is_day                       0
dtype: int64


In [58]:

col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (km)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

meteo_cols_to_delete = ['soil_temperature_0cm','rain', 'showers', 'id_x','id_y' 'is_day',
                  'soil_moisture_0_to_1cm']


df_final2 = rename_columns(df_final2,{' Visibility (km)_y':'Visibility (km)'})

df_final2['Visibility (km)'] = df_final2['Visibility (km)']/1000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns=existing_columns, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final2['Visibility (km)'] = df_final2['Visibility (km)']/1000


In [59]:
df_final2 = df_final2.round(2)
show_first_row(df_final2)

id_x                           1.0  (Float64)
Wind Direction (°)             240.0  (Float64)
Wind Speed (km/h)              5.0  (Float64)
Wind Gusts (km/h)              6.0  (Float64)
Pressure (hPA)                 1020.3  (Float64)
Air T°                         14.1  (Float64)
Water T°                       15.3  (Float64)
dewpoint                       10.4  (Float64)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     41008.0  (object)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220.0  (Float64)
T°(C°)                         5.11  (Float64)
Relative Humidity (%)          56.0  (Float64)
Dew Point (°C)                 -2.96  (Float64)
Precipitation (mm)             0.0  (Float64)
rain                           0.0  (Float64)
showers                        0.0  (Float64)
 Sea Level Pressure (hPa)      1003.7  (Float64)
surface_pressure               9

In [60]:
df_final2[['Daytime', 'Month']] = df_final2['Datetime'].apply(lambda x: get_day_time(x)).apply(pd.Series)

In [61]:
df_final2=df_final2.round(2)
show_first_row(df_final2)

id_x                           1.0  (Float64)
Wind Direction (°)             240.0  (Float64)
Wind Speed (km/h)              5.0  (Float64)
Wind Gusts (km/h)              6.0  (Float64)
Pressure (hPA)                 1020.3  (Float64)
Air T°                         14.1  (Float64)
Water T°                       15.3  (Float64)
dewpoint                       10.4  (Float64)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     41008.0  (object)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220.0  (Float64)
T°(C°)                         5.11  (Float64)
Relative Humidity (%)          56.0  (Float64)
Dew Point (°C)                 -2.96  (Float64)
Precipitation (mm)             0.0  (Float64)
rain                           0.0  (Float64)
showers                        0.0  (Float64)
 Sea Level Pressure (hPa)      1003.7  (Float64)
surface_pressure               9

Renaming, Dropping Useless Columns

Third API Test

In [62]:
show_first_row(df_final2)

id_x                           1.0  (Float64)
Wind Direction (°)             240.0  (Float64)
Wind Speed (km/h)              5.0  (Float64)
Wind Gusts (km/h)              6.0  (Float64)
Pressure (hPA)                 1020.3  (Float64)
Air T°                         14.1  (Float64)
Water T°                       15.3  (Float64)
dewpoint                       10.4  (Float64)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     41008.0  (object)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220.0  (Float64)
T°(C°)                         5.11  (Float64)
Relative Humidity (%)          56.0  (Float64)
Dew Point (°C)                 -2.96  (Float64)
Precipitation (mm)             0.0  (Float64)
rain                           0.0  (Float64)
showers                        0.0  (Float64)
 Sea Level Pressure (hPa)      1003.7  (Float64)
surface_pressure               9

Test Envoi Vers PostgreSQL

In [63]:
load_data_in_table(engine=engine, schema = schema_silver, table_name='Silver_Table', df=df_final2, key_column='Datetime')

Table 'Silver_Table' does not exist. Creating...
Table 'Silver_Table' created in schema 'Silver'.
Error retrieving existing values: This Connection is closed
An error occurred: This Connection is closed


In [64]:
df_final2.dtypes

id_x                                Float64
Wind Direction (°)                  Float64
Wind Speed (km/h)                   Float64
Wind Gusts (km/h)                   Float64
Pressure (hPA)                      Float64
Air T°                              Float64
Water T°                            Float64
dewpoint                            Float64
Datetime                     datetime64[ns]
Station ID                           object
Lat                                  object
Lon                                  object
id_y                                Float64
T°(C°)                              Float64
Relative Humidity (%)               Float64
Dew Point (°C)                      Float64
Precipitation (mm)                  Float64
rain                                Float64
showers                             Float64
 Sea Level Pressure (hPa)           Float64
surface_pressure                    Float64
cloud_cover                         Float64
Low Clouds (%)                  

In [65]:
# Filtrer le dataframe pour la Station ID 42058
df_42058 = df_final2[df_final2["Station ID"] == 42058]
df_42058.head()

Unnamed: 0,id_x,Wind Direction (°),Wind Speed (km/h),Wind Gusts (km/h),Pressure (hPA),Air T°,Water T°,dewpoint,Datetime,Station ID,...,Low Clouds (%),Middle Clouds (%),High Clouds (%),Visibility (km),wind_speed_10m,soil_temperature_0cm,soil_moisture_0_to_1cm,is_day,Daytime,Month


In [66]:
# import requests
# import json
# from datetime import datetime, timedelta

# # Variables de contrôle des appels API
# vc_api_key_path = r"c:\Credentials\visual_crossing_weather_api.json"
# with open(vc_api_key_path, 'r') as file:
#     content = json.load(file)
#     vc_api_key = content["api_key"]

# # Ajouter un index numérique automatique (0, 1, 2, ...)
# df_42058.index = range(len(df_42058))

# # Assurez-vous que lat et lon sont définis
# lat, lon = None, None

# # Si le DataFrame n'est pas vide, récupérer les coordonnées de la première ligne
# if not df_42058.empty:
#     first_row = df_42058.iloc[0]
#     lat = first_row["Lat"]
#     lon = first_row["Lon"]

#     # Convertir les coordonnées (si nécessaire)
#     lat, lon = convert_coordinates(lat, lon)

# print(f'{lat}\n{lon}\n{vc_api_key}')

# # Définition des dates dynamiques
# today = datetime.now().strftime("%Y-%m-%d")  # Aujourd'hui
# last_month = (datetime.now() - timedelta(days=31)).strftime("%Y-%m-%d")  # 31 jours avant aujourd'hui

# # Vérifier si lat et lon ont été correctement définis avant de continuer
# if lat is not None and lon is not None:
#     # Construction de l'URL
#     url_last_month = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{last_month}/{today}?unitGroup=metric&key={vc_api_key}&contentType=json"
    
#     # Tentative de récupération des données météo
#     try:
#         response = requests.get(url_last_month)
        
#         # Vérifier si la requête a réussi (code 200)
#         if response.status_code == 200:
#             vc_meteo_data = response.json()  # Essayer de décoder le JSON
#             print(f"Données météo récupérées : {vc_meteo_data}")
#         else:
#             print(f"Erreur lors de l'appel à l'API, code de statut : {response.status_code}")
    
#     except Exception as e:
#         print(f"Erreur lors de la récupération des données météo : {e}")
# else:
#     print("Les coordonnées (lat, lon) ne sont pas définies.")


In [67]:
# # Assuming the df_cleaned DataFrame already exists and contains the required data

# # First, load your Visual Crossing Weather Data (example, you may already have it)
# # Assuming vc_meteo_data is the JSON response from Visual Crossing
# # Example of flattening the JSON
# df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# # Convert the datetimeEpoch from Visual Crossing Weather data into Date and Hour columns
# df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
# df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

# # Filter data from df_vc_meteo for the last 30 days
# today = datetime.now()
# thirty_days_ago = today - timedelta(days=30)

# today_str = today.strftime("%Y-%m-%d")
# thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

# # Filter df_vc_meteo for the last 30 days
# df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
# df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
#                                         (df_test_last_month['Date'] <= today_str)]

# # Prepare df_cleaned for merging (add Date and Hour columns)
# df_cleaned['Date'] = df_cleaned['Datetime'].dt.strftime("%Y-%m-%d")
# df_cleaned['Hour'] = df_cleaned['Datetime'].dt.strftime("%H")

# # Filter df_cleaned for the last 30 days
# df_cleaned_last_month = df_cleaned[(df_cleaned['Date'] >= thirty_days_ago_str) & 
#                                    (df_cleaned['Date'] <= today_str)]

# # Merge df_vc_meteo and df_cleaned based on Date and Hour
# df_merged = df_test_last_month.merge(df_cleaned_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
#                                     on=['Date', 'Hour'], 
#                                     how='inner')

# # Display the merged dataframe
# print(df_merged.head(100))


In [68]:
col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

df_cleaned = rename_columns(df_cleaned, col_to_rename)
df_cleaned = drop_columns_if_exist(df_cleaned,['soil_temperature_0cm','rain', 'showers', 'is_day', 'id_x', 'id_y','soil_moisture_0_to_1cm'])
df_cleaned.columns

⚠️ Aucune colonne à renommer pour ce spécification : {'temperature_2m': 'T°(C°)', 'relative_humidity_2m': 'Relative Humidity (%)', 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)', 'pressure_msl': ' Sea Level Pressure (hPa)', 'cloud_cover_low': 'Low Clouds (%)', 'cloud_cover_mid': 'Middle Clouds (%)', 'cloud_cover_high': 'High Clouds (%)', 'visibility': ' Visibility (%)', 'wind_direction': 'Wind Direction (°)', 'wind_speed': 'Wind Speed (km/h)', 'wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)', 'average_wave_period': 'Average Wave Period (s)', 'dominant_wave_direction': 'Dominant Wave Direction (°)', 'pressure': 'Pressure (hPA)', 'air_temperature': 'Air T°', 'water_temperature': 'Water T°'}
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'id_x' Supprimée
Colonne 'id_y' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée


Index(['Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Pressure (hPA)', 'Air T°', '3hr_pressure_tendency', 'Datetime',
       'Station ID', 'Lat', 'Lon', 'T°(C°)', 'Relative Humidity (%)',
       'Dew Point (°C)', 'Precipitation (mm)', ' Sea Level Pressure (hPa)',
       'surface_pressure', 'cloud_cover', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (km)_y',
       'wind_speed_10m'],
      dtype='object')

In [69]:
#  Récupérer les données de l'API
# vc_meteo_data = response.json()
# print(vc_meteo_data)  # Vérifiez les données récupérées

In [70]:
# # Normaliser les données JSON en DataFrame
# df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# # Afficher la première ligne des données
# df_vc_meteo.head(1)

In [71]:
# Conversion du timestamp en datetime
df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

NameError: name 'df_vc_meteo' is not defined

In [None]:
# Définir les dates de filtrage pour les 30 derniers jours
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

# Convertir les dates en format YYYY-MM-DD
today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

In [None]:
# Filtrer les données des 30 derniers jours de df_vc_meteo
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
                                        (df_test_last_month['Date'] <= today_str)]

# Ajouter les colonnes Date et Hour à df_42058
df_42058.loc[:, 'Date'] = df_42058['Datetime'].dt.strftime("%Y-%m-%d")
df_42058.loc[:, 'Hour'] = df_42058['Datetime'].dt.strftime("%H")

# Filtrer les données des 30 derniers jours dans df_42058
df_42058_last_month = df_42058[(df_42058['Date'] >= thirty_days_ago_str) & 
                                (df_42058['Date'] <= today_str)]

# Fusionner les deux DataFrames sur Date et Hour
df_test_merged = df_test_last_month.merge(df_42058_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
                                     on=['Date', 'Hour'], 
                                     how='inner')

df_test_merged.head()

In [None]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprimées
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non numérique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprimées dans les listes appropriées
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes entièrement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non numériques
    
#     # Supprimer les colonnes identifiées
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les résultats
#     print("Colonnes supprimées pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprimées pour avoir plus de 50% de valeurs nulles et être non numériques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [None]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [None]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire récursivement pour obtenir toutes les clés, y compris les sous-clés,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire à explorer
#     :param parent_key: La clé parent qui est utilisée pour concaténer les sous-clés
#     :param sep: Le séparateur utilisé pour concaténer les clés (par défaut '_')
#     :return: Une liste des clés (et sous-clés)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore récursivement
#             keys.append(new_key)  # Ajouter la clé, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la clé finale
#     return keys

In [None]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche récursive d'une clé dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: clé recherchée
#     :param path: liste pour stocker le chemin jusqu'à la clé
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqué
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la clé est trouvée, retourner le chemin
#                     return result
#     return None  # Retourne None si la clé n'a pas été trouvée



# # Recherche du chemin pour la clé 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [None]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (clé, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # Vérifier si l'index est égal à 1

Counting Rows of all Dataframes in total