In [None]:
%pip install -q -r requirements.txt

In [1]:
from imports import *
from functions import *

Connection to PostgreSQL

In [2]:
import os
from urllib.parse import quote

# Location of the JSON credentials file. Set the POSTGRESQL_CREDS_PATH
# environment variable to run the notebook on another machine; when it is
# unset, the original machine-specific path is used.
path_postgresql_creds = os.environ.get(
    "POSTGRESQL_CREDS_PATH",
    r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json",
)

# The file must provide the keys "user", "password", "host" and "port".
with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)

user = content["user"]
password = content["password"]
host = content["host"]
port = content["port"]

db = "Oceanography_ML_Project"
schema_bronze = "Bronze"
schema_silver = "Silver"

# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", ":" or "/" in a user name or password would otherwise be read
# as URL delimiters. `safe=""` also encodes "/", and spaces become "%20".
url_user = quote(str(user), safe="")
url_password = quote(str(password), safe="")

# Create the PostgreSQL engine and open the connection shared by later cells.
engine = create_engine(f"postgresql+psycopg2://{url_user}:{url_password}@{host}:{port}/{db}")
conn = engine.connect()

Charger les Données des Tables

In [3]:
# Load every table of the Bronze schema into the `buoys_datas` dictionary,
# keyed by station id. The buoy table is loaded first, then each marine and
# meteo table is attached to its station as a DataFrame.

# Reflect the metadata of the existing schema
metadata = MetaData(schema=schema_bronze)

print("\n🔍 Chargement des métadonnées du schéma...")
metadata.reflect(bind=conn)
print("✅ Métadonnées chargées avec succès.\n")

# Collect the table names
table_names = [t.name for t in metadata.sorted_tables]
print(f"🔢 Nombre total de tables dans le schéma : {len(table_names)}\n")

# Group the tables by a case-insensitive substring of their name
marine_tables = {t for t in table_names if "marine" in t.lower()}
meteo_tables = {t for t in table_names if "meteo" in t.lower()}
buoys_data_table = {t for t in table_names if "buoy" in t.lower()}

print(f"🌊 Tables marines trouvées : {len(marine_tables)}")
print(f"🌧️ Tables météo trouvées : {len(meteo_tables)}")
print(f"🐋 Tables de bouées trouvées : {len(buoys_data_table)}\n")

# Result dictionary: station id -> station attributes and DataFrames
buoys_datas = {}

# Number of tables successfully loaded, per category
marine_data_count = 0
meteo_data_count = 0
buoys_data_count = 0

# Total number of rows loaded, per category
total_marine_rows = 0
total_meteo_rows = 0
total_buoys_rows = 0  # counts rows of the buoy table (one per buoy)

# Load the "buoys_datas" table if a matching table exists
if buoys_data_table:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    print("🔄 Chargement des données de la table 'buoys_datas'...")

    try:
        # Only one matching table is read: an arbitrary element of the set.
        buoys_datas_raw = fetch_table_data(schema=schema_bronze, conn=conn, table_name=next(iter(buoys_data_table)), as_df=True)

        if buoys_datas_raw is not None:
            print("📦 Données récupérées pour 'buoys_datas'.")

            # Decode JSON text into a dict when the helper returned a string
            if isinstance(buoys_datas_raw, str):
                buoys_datas_raw = json.loads(buoys_datas_raw)

            elif isinstance(buoys_datas_raw, pd.DataFrame) and "Station ID" in buoys_datas_raw.columns:
                # Turn the frame into {station id: {column: value}}
                buoys_datas_raw = buoys_datas_raw.set_index("Station ID").to_dict(orient="index")

            # Merge into the main dictionary, keyed by station id
            # NOTE(review): a DataFrame without a "Station ID" column reaches
            # this point unconverted — confirm that case cannot occur.
            buoys_datas.update(buoys_datas_raw)
            buoys_data_count += 1
            total_buoys_rows += len(buoys_datas_raw)  # number of buoys
            print(f"✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : {total_buoys_rows}\n")
        else:
            print("⚠️ Aucun résultat trouvé dans 'buoys_datas'.\n")

    except Exception as e:
        print(f"❌ Erreur lors du chargement de 'buoys_datas': {e}\n")

# Attach the marine and meteo tables to their station and load their data.
# `counter` and `total_rows` are local copies of the integers in the tuples;
# they are copied back to the module-level counters at the end of each pass.
for table_set, label, icon, counter, total_rows in [
    (marine_tables, "Marine", "🌊", marine_data_count, total_marine_rows),
    (meteo_tables, "Meteo", "🌧️", meteo_data_count, total_meteo_rows)
]:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    for table_name in table_set:
        print(f"🔄 Chargement des données pour la table {label} : {table_name}...")

        try:
            # The station id is the second "_"-separated token of the table
            # name, e.g. "station_42001_marine_mid gulf" -> "42001".
            station_id = table_name.split("_")[1]

            # Create the station entry if the buoy table did not provide one
            if station_id not in buoys_datas:
                buoys_datas[station_id] = {}

            # Fetch the table content
            data = fetch_table_data(schema=schema_bronze, conn=conn, table_name=table_name, as_df=True)

            if data is not None:
                print(f"📦 Données récupérées pour la station {station_id} ({label}).")

                # Normalise JSON text or a dict into a DataFrame
                if isinstance(data, str):
                    data = pd.DataFrame(json.loads(data))
                elif isinstance(data, dict):
                    data = pd.DataFrame(data)
                # Store the data under "<label> DataFrame" for this station
                buoys_datas[station_id][f"{label} DataFrame"] = data
                counter += 1
                total_rows += len(data)  # add the number of rows collected
                print(f"{icon} Données {label} chargées pour la station {station_id}! Nombre de lignes collectées : {len(data)}\n")
            else:
                print(f"⚠️ Aucun résultat trouvé pour la station {station_id} ({label}).\n")

        except Exception as e:
            print(f"❌ Erreur lors du chargement des données {label} pour {table_name} : {e}\n")

    # Copy the per-category totals back to the module-level counters
    if label == "Marine":
        marine_data_count = counter
        total_marine_rows = total_rows
    elif label == "Meteo":
        meteo_data_count = counter
        total_meteo_rows = total_rows

# Print the global summary (printed even when some tables failed to load)
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f"🏆 Chargement des données terminé avec succès !")
print(f"🐋 Total des données bouées chargées : {buoys_data_count} - Nombre de bouées (lignes) : {total_buoys_rows}")
print(f"🌊 Total des données marines chargées : {marine_data_count} - Nombre total de lignes : {total_marine_rows}")
print(f"🌧️ Total des données météorologiques chargées : {meteo_data_count} - Nombre total de lignes : {total_meteo_rows}")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")


🔍 Chargement des métadonnées du schéma...
✅ Métadonnées chargées avec succès.

🔢 Nombre total de tables dans le schéma : 79

🌊 Tables marines trouvées : 39
🌧️ Tables météo trouvées : 39
🐋 Tables de bouées trouvées : 1

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données de la table 'buoys_datas'...
📦 Données récupérées pour 'buoys_datas'.
✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : 39

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données pour la table Marine : station_42001_marine_mid gulf...
📦 Données récupérées pour la station 42001 (Marine).
🌊 Données Marine chargées pour la station 42001! Nombre de lignes collectées : 2322

🔄 Chargement des données pour la table Marine : station_46088_marine_new dungeness...
📦 Données récupérées pour la station 46088 (Marine).
🌊 Données Marine chargées pour la station 46088! Nombre de lignes collectées : 7520

🔄 Chargement des données pour la table Marine : station_POTA2_marine_potato point, ak...
📦 

In [None]:
buoys_datas["42058"]["Marine DataFrame"].columns

In [None]:
buoys_datas["42058"]["Meteo DataFrame"].columns

In [None]:
# Accumulators for the Silver-layer processing.
# NOTE(review): `list_silver_merged_df` and `number_merged_data` are
# re-initialised by the merge cell; the other names are not referenced in the
# visible part of the notebook — confirm they are still needed.
list_silver_merged_df = []  # merged per-station DataFrames
list_failed_dfs = []        # presumably frames that failed processing — TODO confirm

# Row counters
number_marine_data = 0
number_meteo_data = 0
number_merged_data = 0

# Conversion counters
marine_data_conversion = 0
meteo_data_conversion = 0

In [4]:
# Raw column names of the marine (buoy) tables.
marine_cols = [
    "wind_direction", "wind_speed", "wind_gust", "wave_height",
    "dominant_wave_period", "average_wave_period", "dominant_wave_direction",
    "pressure", "air_temperature", "water_temperature", "dewpoint",
    "visibility", "3hr_pressure_tendency", "water_level_above_mean"
]

# Raw column names of the meteo tables.
meteo_cols = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "rain",
    "showers", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low",
    "cloud_cover_mid", "cloud_cover_high", "visibility", "wind_speed_10m",
    "soil_temperature_0cm", "soil_moisture_0_to_1cm"
]

# Raw column name -> display name, shared by the marine and meteo frames.
# NOTE(review): two display names start with a space (' Sea Level Pressure
# (hPa)' and ' Visibility (%)'). They are kept unchanged because later cells
# and stored data may rely on these exact labels.
col_to_rename = {
    'temperature_2m': 'T°(C°)',
    'relative_humidity_2m': 'Relative Humidity (%)',
    'dew_point_2m': 'Dew Point (°C)',
    'precipitation': 'Precipitation (mm)',
    'pressure_msl': ' Sea Level Pressure (hPa)',
    'cloud_cover_low': 'Low Clouds (%)',
    'cloud_cover_mid': 'Middle Clouds (%)',
    'cloud_cover_high': 'High Clouds (%)',
    'visibility': ' Visibility (%)',
    'wind_direction': 'Wind Direction (°)',
    'wind_speed': 'Wind Speed (km/h)',
    'wind_gust': 'Wind Gusts (km/h)',
    'wave_height': 'Wave Height (m)',
    'average_wave_period': 'Average Wave Period (s)',
    'dominant_wave_direction': 'Dominant Wave Direction (°)',
    'pressure': 'Pressure (hPA)',
    'air_temperature': 'Air T°',
    'water_temperature': 'Water T°',
}

# Meteo columns removed from every station.
meteo_cols_to_delete = ['soil_temperature_0cm', 'rain', 'showers', 'is_day',
                        'soil_moisture_0_to_1cm']

# Rename the columns of both frames and drop the unwanted meteo columns.
for station_id, tables in buoys_datas.items():
    # The original cell then executed `marine_df = drop_columns_if_exist`,
    # which bound the function object itself to `marine_df`. No list of marine
    # columns to drop is defined, so the marine frame is only renamed.
    marine_df = rename_columns(tables["Marine DataFrame"], col_to_rename)

    meteo_df = rename_columns(tables["Meteo DataFrame"], col_to_rename)
    meteo_df = drop_columns_if_exist(meteo_df, meteo_cols_to_delete)

    # Store the results explicitly so later cells see the processed frames
    # even if the helpers return a new object rather than editing in place.
    # The isinstance guard keeps the existing entry if a helper returns None.
    if isinstance(marine_df, pd.DataFrame):
        tables["Marine DataFrame"] = marine_df
    if isinstance(meteo_df, pd.DataFrame):
        tables["Meteo DataFrame"] = meteo_df

Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_te

HOUR RESAMPLING

In [5]:
# Run `process_datetime_column` on the timestamp column of each station's
# marine and weather frame and store the result back under the same key.
# Judging by the cell output the helper parses the column as datetime and
# renames it to "Datetime"; no numeric conversion is performed in this cell.
# Each entry: (dictionary key, timestamp column, progress label, error label).
datetime_jobs = (
    ("Marine DataFrame", "time", "marine", "Marine"),
    ("Meteo DataFrame", "date", "weather", "Meteo"),
)

for station_id, tables in buoys_datas.items():
    for frame_key, time_column, progress_label, error_label in datetime_jobs:
        # A failure on one frame is reported and does not stop the others.
        try:
            print(f"🔁 Processing and resampling {progress_label} data for station {station_id}...")
            tables[frame_key] = process_datetime_column(tables[frame_key], column=time_column)
        except Exception as e:
            print(f"Error processing {error_label} Data for {station_id}: {e}")

🔁 Processing and resampling marine data for station 41008...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41008...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 41044...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41044...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 42001...
📌 La colonne 'time' est maintenant convertie en chaîne de

Adding Coordinates

In [6]:
# Copy the station-level "Lat" and "Lon" entries into the marine frame as
# columns of the same name.
for station_id, tables in buoys_datas.items():
    try:
        for coordinate in ("Lat", "Lon"):
            # Read the station value first, then assign it to the frame.
            coordinate_value = tables[coordinate]
            tables["Marine DataFrame"][coordinate] = coordinate_value
        print(f"🌐 Coordinates (Lat/Lon) added for station {station_id}.")
    except Exception as e:
        print(f"Error adding coordinates for {station_id}: {e}")

🌐 Coordinates (Lat/Lon) added for station 41008.
🌐 Coordinates (Lat/Lon) added for station 41044.
🌐 Coordinates (Lat/Lon) added for station 42001.
🌐 Coordinates (Lat/Lon) added for station 42002.
🌐 Coordinates (Lat/Lon) added for station 42012.
🌐 Coordinates (Lat/Lon) added for station 42036.
🌐 Coordinates (Lat/Lon) added for station 42056.
🌐 Coordinates (Lat/Lon) added for station 42058.
🌐 Coordinates (Lat/Lon) added for station 44020.
🌐 Coordinates (Lat/Lon) added for station 44025.
🌐 Coordinates (Lat/Lon) added for station 44027.
🌐 Coordinates (Lat/Lon) added for station 44065.
🌐 Coordinates (Lat/Lon) added for station 46001.
🌐 Coordinates (Lat/Lon) added for station 46006.
🌐 Coordinates (Lat/Lon) added for station 46014.
🌐 Coordinates (Lat/Lon) added for station 46022.
🌐 Coordinates (Lat/Lon) added for station 46025.
🌐 Coordinates (Lat/Lon) added for station 46027.
🌐 Coordinates (Lat/Lon) added for station 46029.
🌐 Coordinates (Lat/Lon) added for station 46053.
🌐 Coordinates (Lat/L

Merging Based on Station ID

In [None]:
# Inner-join the marine and weather frames of each station on "Datetime".
# The result is stored under "Merged DataFrame" and collected in a list.
list_silver_merged_df = []

for station_id, tables in buoys_datas.items():
    try:
        print(f"🔗 Merging marine and weather data for station {station_id}...")
        marine_frame = tables["Marine DataFrame"]
        meteo_frame = tables["Meteo DataFrame"]
        df_merged = pd.merge(marine_frame, meteo_frame, on='Datetime', how='inner')
        tables["Merged DataFrame"] = df_merged
        list_silver_merged_df.append(df_merged)
    except Exception as e:
        print(f"Error merging data for station {station_id}: {e}")

# Total row count over the stations that merged successfully.
number_merged_data = sum(frame.shape[0] for frame in list_silver_merged_df)

print(f'{len(list_silver_merged_df)} DataFrames Merged :\n{number_merged_data} rows in total !')


🔗 Merging marine and weather data for station 41008...
🔗 Merging marine and weather data for station 41044...
🔗 Merging marine and weather data for station 42001...
🔗 Merging marine and weather data for station 42002...
🔗 Merging marine and weather data for station 42012...
🔗 Merging marine and weather data for station 42036...
🔗 Merging marine and weather data for station 42056...
🔗 Merging marine and weather data for station 42058...
🔗 Merging marine and weather data for station 44020...
🔗 Merging marine and weather data for station 44025...
🔗 Merging marine and weather data for station 44027...
🔗 Merging marine and weather data for station 44065...
🔗 Merging marine and weather data for station 46001...
🔗 Merging marine and weather data for station 46006...
🔗 Merging marine and weather data for station 46014...
🔗 Merging marine and weather data for station 46022...
🔗 Merging marine and weather data for station 46025...
🔗 Merging marine and weather data for station 46027...
🔗 Merging 

In [26]:
def convert_df_columns(df):
    """
    Return a copy of ``df`` with its columns cast to more specific dtypes.

    The input frame is not modified; the function works on a copy, as
    ``convert_columns_to_numeric`` does. Conversions applied, in order:

    1. ``object`` columns are converted with ``pd.to_numeric`` when every
       value parses; otherwise the column is left untouched.
    2. ``object`` columns whose name contains "date" are converted with
       ``pd.to_datetime`` when every value parses.
    3. A column named ``is_day`` is coerced to numeric, unparsable or
       missing values are replaced by 0, and the result is cast to ``int``.
    4. Every ``float64`` / ``int64`` column is cast to the nullable
       ``Float64`` dtype. This includes integer columns such as ids.

    Args:
    - df: pd.DataFrame. The DataFrame to process.

    Returns:
    - pd.DataFrame: a new DataFrame with converted dtypes.
    """
    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()

    for col in df.columns:
        # Numeric values stored as strings
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col], errors='raise')
            except (ValueError, TypeError):
                # Not entirely numeric (or not scalar values): leave as is.
                pass

        # Date strings, detected by column name only
        if df[col].dtype == 'object' and 'date' in str(col).lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='raise')
            except (ValueError, TypeError):
                pass

        # Boolean flag stored as text/float: force to integer, missing -> 0
        if col == "is_day":
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    # Use the nullable Float64 dtype so missing values are represented as
    # pd.NA consistently across all numeric columns.
    for col in df.select_dtypes(include=['float64', 'int64']).columns:
        df[col] = df[col].astype(pd.Float64Dtype())

    return df

# Apply `convert_df_columns` to each station's merged frame and keep the
# result under "Converted DataFrame". A failing station is reported and skipped.
for station_id, tables in buoys_datas.items():
    try:
        merged_frame = tables["Merged DataFrame"]
        print(f"🔗 Changing Data Types  for station {station_id}...")
        tables["Converted DataFrame"] = convert_df_columns(merged_frame)
        print(f"Successfully Changed Data Types for Station {station_id}")
    except Exception as e:
        print(f"Error changing data types for station {station_id}: {e}")

🔗 Changing Data Types  for station 41008...
Successfully Changed Data Types for Station 41008
🔗 Changing Data Types  for station 41044...
Successfully Changed Data Types for Station 41044
🔗 Changing Data Types  for station 42001...
Successfully Changed Data Types for Station 42001
🔗 Changing Data Types  for station 42002...
Successfully Changed Data Types for Station 42002
🔗 Changing Data Types  for station 42012...
Successfully Changed Data Types for Station 42012
🔗 Changing Data Types  for station 42036...
Successfully Changed Data Types for Station 42036
🔗 Changing Data Types  for station 42056...
Successfully Changed Data Types for Station 42056
🔗 Changing Data Types  for station 42058...
Successfully Changed Data Types for Station 42058
🔗 Changing Data Types  for station 44020...
Successfully Changed Data Types for Station 44020
🔗 Changing Data Types  for station 44025...
Successfully Changed Data Types for Station 44025
🔗 Changing Data Types  for station 44027...
Successfully Cha

In [None]:
buoys_datas["42058"]["Converted DataFrame"].shape

(7452, 37)

In [None]:
def clean_dataframe(df):
    """
    Return a cleaned copy of ``df``; the input frame is left unchanged.

    Rules applied to each column:

    - entirely missing: the column is dropped;
    - more than 50% missing: the column is dropped unless it is numeric
      (numeric columns are kept as they are, without imputation);
    - at most 50% missing and numeric: missing values are replaced by the
      column median;
    - at most 50% missing and non-numeric: the column is left untouched.

    "Numeric" covers every numeric dtype except booleans, including the
    nullable ``Float64`` / ``Int64`` extension dtypes.

    Args:
    - df: pd.DataFrame. The DataFrame to clean.

    Returns:
    - pd.DataFrame: a new DataFrame with columns dropped / imputed.
    """
    cleaned = df.copy()
    columns_to_drop = []

    for column in cleaned.columns:
        series = cleaned[column]
        total = len(series)
        missing_count = int(series.isnull().sum())

        # Drop columns with no value at all (also true for an empty frame)
        if missing_count == total:
            columns_to_drop.append(column)
            continue

        missing_percentage = missing_count / total * 100
        is_numeric = (
            pd.api.types.is_numeric_dtype(series.dtype)
            and not pd.api.types.is_bool_dtype(series.dtype)
        )

        if missing_percentage > 50:
            # Mostly empty: only numeric columns are kept
            if not is_numeric:
                columns_to_drop.append(column)
        elif is_numeric and missing_count:
            # Assign the filled column back. A chained
            # `df[column].fillna(..., inplace=True)` acts on a temporary
            # object and leaves the frame unchanged under Copy-on-Write.
            cleaned[column] = series.fillna(series.median())

    return cleaned.drop(columns=columns_to_drop)


In [31]:
# Run `handle_null_values` on each station's converted frame and keep the
# result under "Cleaned DataFrame". A failing station is reported and skipped.
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔗 Cleaning DataFrame for station {station_id}...")
        converted_frame = tables["Converted DataFrame"]
        tables["Cleaned DataFrame"] = handle_null_values(converted_frame)
        print(f"Successfully DataFrame for Station {station_id}")
    except Exception as e:
        print(f"Error Cleaning DataFrame for station {station_id}: {e}")

🔗 Cleaning DataFrame for station 41008...
Successfully DataFrame for Station 41008
🔗 Cleaning DataFrame for station 41044...
Successfully DataFrame for Station 41044
🔗 Cleaning DataFrame for station 42001...
Successfully DataFrame for Station 42001
🔗 Cleaning DataFrame for station 42002...
Successfully DataFrame for Station 42002
🔗 Cleaning DataFrame for station 42012...
Successfully DataFrame for Station 42012
🔗 Cleaning DataFrame for station 42036...
Successfully DataFrame for Station 42036
🔗 Cleaning DataFrame for station 42056...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Successfully DataFrame for Station 42056
🔗 Cleaning DataFrame for station 42058...
Successfully DataFrame for Station 42058
🔗 Cleaning DataFrame for station 44020...
Successfully DataFrame for Station 44020
🔗 Cleaning DataFrame for station 44025...
Successfully DataFrame for Station 44025
🔗 Cleaning DataFrame for station 44027...
Successfully DataFrame for Station 44027
🔗 Cleaning DataFrame for station 44065...
Successfully DataFrame for Station 44065
🔗 Cleaning DataFrame for station 46001...
Successfully DataFrame for Station 46001
🔗 Cleaning DataFrame for station 46006...
Successfully DataFrame for Station 46006
🔗 Cleaning DataFrame for station 46014...
Successfully DataFrame for Station 46014
🔗 Cleaning DataFrame for station 46022...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Successfully DataFrame for Station 46022
🔗 Cleaning DataFrame for station 46025...
Successfully DataFrame for Station 46025
🔗 Cleaning DataFrame for station 46027...
Successfully DataFrame for Station 46027
🔗 Cleaning DataFrame for station 46029...
Successfully DataFrame for Station 46029
🔗 Cleaning DataFrame for station 46053...
Successfully DataFrame for Station 46053
🔗 Cleaning DataFrame for station 46069...
Successfully DataFrame for Station 46069
🔗 Cleaning DataFrame for station 46071...
Successfully DataFrame for Station 46071
🔗 Cleaning DataFrame for station 46072...
Successfully DataFrame for Station 46072
🔗 Cleaning DataFrame for station 46078...
Successfully DataFrame for Station 46078
🔗 Cleaning DataFrame for station 46084...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Successfully DataFrame for Station 46084
🔗 Cleaning DataFrame for station 46086...
Successfully DataFrame for Station 46086
🔗 Cleaning DataFrame for station 46087...
Successfully DataFrame for Station 46087
🔗 Cleaning DataFrame for station 46088...
Successfully DataFrame for Station 46088
🔗 Cleaning DataFrame for station 51000...
Successfully DataFrame for Station 51000
🔗 Cleaning DataFrame for station 51001...
Successfully DataFrame for Station 51001
🔗 Cleaning DataFrame for station 51002...
Successfully DataFrame for Station 51002
🔗 Cleaning DataFrame for station BURL1...
Successfully DataFrame for Station BURL1
🔗 Cleaning DataFrame for station FFIA2...
Successfully DataFrame for Station FFIA2
🔗 Cleaning DataFrame for station LONF1...
Successfully DataFrame for Station LONF1
🔗 Cleaning DataFrame for station MDRM1...


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Successfully DataFrame for Station MDRM1
🔗 Cleaning DataFrame for station MRKA2...
Successfully DataFrame for Station MRKA2
🔗 Cleaning DataFrame for station POTA2...
Successfully DataFrame for Station POTA2
🔗 Cleaning DataFrame for station SANF1...
Successfully DataFrame for Station SANF1
🔗 Cleaning DataFrame for station SBIO1...
Successfully DataFrame for Station SBIO1


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

In [19]:
# Final concatenation of all per-station merged DataFrames.
# NOTE(review): this stacks the "Merged DataFrame" entries, not the
# "Converted" or "Cleaned" ones — confirm this is the intended input.
df_final = None

# Stations whose merge failed have no "Merged DataFrame" entry. They are
# skipped and reported, so one bad station does not discard the whole result.
stations_without_merge = [
    station_id
    for station_id, tables in buoys_datas.items()
    if "Merged DataFrame" not in tables
]

try:
    print("🔀 Merging all DataFrames into a final DataFrame...")
    dataframes_to_concat = [
        tables["Merged DataFrame"]
        for tables in buoys_datas.values()
        if "Merged DataFrame" in tables
    ]

    # pd.concat raises on an empty list; keep df_final = None in that case.
    if dataframes_to_concat:
        df_final = pd.concat(dataframes_to_concat, ignore_index=True)

except Exception as e:
    print(f"Error during final merge: {e}")
    df_final = None

# Final summary
print("\n⭐🏆 Processing complete!")
print(f"🔢 Total stations processed: {len(buoys_datas)}")

if stations_without_merge:
    print(f"⚠️ Stations skipped (no merged DataFrame): {stations_without_merge}")

if df_final is not None and not df_final.empty:
    print(f"📝 Final merged DataFrame size: {df_final.shape}")
else:
    print("The DataFrame is either None or empty.")

🔀 Merging all DataFrames into a final DataFrame...

⭐🏆 Processing complete!
🔢 Total stations processed: 39
📝 Final merged DataFrame size: (246782, 37)


In [21]:
df_final.dtypes

id_x                                    int64
Wind Direction (°)                    float64
Wind Speed (km/h)                     float64
Wind Gusts (km/h)                     float64
Wave Height (m)                       float64
dominant_wave_period                  float64
Average Wave Period (s)               float64
Dominant Wave Direction (°)           float64
Pressure (hPA)                        float64
Air T°                                float64
Water T°                              float64
dewpoint                              float64
 Visibility (%)_x                      object
3hr_pressure_tendency                 float64
water_level_above_mean                 object
Datetime                       datetime64[ns]
Station ID                             object
Lat                                    object
Lon                                    object
id_y                                    int64
T°(C°)                                 object
Relative Humidity (%)             

In [24]:
df_final = convert_final_df_columns(df_final)

In [25]:
df_final.dtypes

id_x                                  Float64
Wind Direction (°)                    Float64
Wind Speed (km/h)                     Float64
Wind Gusts (km/h)                     Float64
Wave Height (m)                       Float64
dominant_wave_period                  Float64
Average Wave Period (s)               Float64
Dominant Wave Direction (°)           Float64
Pressure (hPA)                        Float64
Air T°                                Float64
Water T°                              Float64
dewpoint                              Float64
 Visibility (%)_x                     Float64
3hr_pressure_tendency                 Float64
water_level_above_mean                Float64
Datetime                       datetime64[ns]
Station ID                             object
Lat                                    object
Lon                                    object
id_y                                  Float64
T°(C°)                                Float64
Relative Humidity (%)             

In [None]:
show_first_row(buoys_datas["42058"]["Marine DataFrame"])

HANDLING NULL VALUES

In [None]:
 clean_dataframe(meteo_df, meteo_cols, verbose=True)

In [None]:
show_first_row(buoys_datas["42058"]["Cleaned Marine DataFrame"])

In [None]:
show_first_row(buoys_datas["42058"]["Cleaned Meteo DataFrame"])

Cleaning DataFrame Values

In [None]:
def convert_columns_to_numeric(df, cols_to_convert):
    """Return a copy of ``df`` with the requested columns cast to numeric dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied first and never modified.
    cols_to_convert : iterable of str
        Names of the columns to convert. Names missing from ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The converted copy. A column that cannot be converted as a whole is
        left unchanged and a warning is printed for it.

    Notes
    -----
    When an ``is_day`` column exists it is additionally cast to ``int``,
    whether or not it is listed in ``cols_to_convert``. The resulting dtypes
    are printed before returning.
    """
    df = df.copy()  # Work on a copy so the caller's frame is untouched

    for col in cols_to_convert:
        if col not in df.columns:
            continue
        try:
            # errors="raise": either the whole column converts or nothing changes
            df[col] = pd.to_numeric(df[col], errors="raise")
        except (ValueError, TypeError):
            # ValueError: unparsable strings. TypeError: non-scalar cell values,
            # or a duplicated column name (df[col] is then a DataFrame).
            print(f"⚠️ Impossible de convertir la colonne {col}, elle reste inchangée.")

    # 'is_day' is stored as a plain integer flag when it can be converted
    if "is_day" in df.columns:
        try:
            df["is_day"] = pd.to_numeric(df["is_day"], errors="raise").astype(int)
        except (ValueError, TypeError):
            print("⚠️ Impossible de convertir 'is_day' en entier, elle reste inchangée.")

    print("Columns after conversion:", df.dtypes)
    return df


In [None]:
# Cast the cleaned frames of every station to numeric dtypes and store the
# results under the "Converted ..." keys of the station's dictionary.
for station_id, tables in buoys_datas.items():

    try:
        cleaned_meteo_df = tables["Cleaned Meteo DataFrame"]
        converted_meteo_df = convert_columns_to_numeric(cleaned_meteo_df, meteo_cols)
        tables["Converted Meteo DataFrame"] = converted_meteo_df
        print(f'Successfully Converted {station_id} Meteo Dataframe Types!')
    except Exception as e:
        # Report the cause and move on: one failing station must not stop the
        # conversion of all the remaining ones.
        print(f'Error converting Meteo Dataframe Types for {station_id}: {e}')

    try:
        cleaned_marine_df = tables["Cleaned Marine DataFrame"]
        # NOTE(review): the marine frame is converted with `meteo_cols`; only
        # the columns present in both lists are affected. Confirm whether a
        # marine-specific column list should be used here instead.
        converted_marine_df = convert_columns_to_numeric(cleaned_marine_df, meteo_cols)
        tables["Converted Marine DataFrame"] = converted_marine_df
        print(f'Successfully Converted {station_id} Marine Dataframe Types!')
    except Exception as e:
        print(f'Error converting Marine Dataframe Types for {station_id}: {e}')

In [None]:
print(f'{converted_marine_df.shape}\n{converted_marine_df.isnull().sum()}')

In [None]:
show_first_row(converted_marine_df)

In [None]:
# For every station, pass the raw marine and weather frames through
# process_datetime_column and keep the results under the "Resampled ..." keys.
for station_id in buoys_datas:
    tables = buoys_datas[station_id]

    # Marine frame: the datetime column handed to the helper is 'time'.
    try:
        print(f"🔁 Processing and resampling marine data for station {station_id}...")
        marine_source = tables["Marine DataFrame"]
        tables["Resampled Marine DataFrame"] = process_datetime_column(marine_source, column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    # Weather frame: the datetime column handed to the helper is 'date'.
    try:
        print(f"🔁 Processing and resampling weather data for station {station_id}...")
        meteo_source = tables["Meteo DataFrame"]
        tables["Resampled Meteo DataFrame"] = process_datetime_column(meteo_source, column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")

In [None]:
# Count the stations whose raw weather / marine frame is None.
marine_none = 0
meteo_none = 0

for station_id, tables in buoys_datas.items():
    meteo_df = tables["Meteo DataFrame"]
    meteo_none += int(meteo_df is None)
    marine_df = tables["Marine DataFrame"]
    marine_none += int(marine_df is None)

print(f"Meteo None number :{meteo_none}\nMarine None number :{marine_none}")

In [None]:
marine_df = buoys_datas["42058"]["Marine DataFrame"]
marine_df.columns

In [None]:
buoys_datas["42058"]["Meteo DataFrame"].columns

In [None]:
# Bind the raw weather and marine frames of each station to `meteo_df` and
# `marine_df`.
# NOTE(review): nothing else happens inside the loop, so once it finishes the
# two names simply hold the frames of the last station; no resampling or
# storing is done in this cell.
for station_id, tables in buoys_datas.items():
    meteo_df = tables["Meteo DataFrame"]
    marine_df = tables["Marine DataFrame"]

In [None]:
buoys_datas["41008"]["Merged DataFrame"]

Concatenating All DataFrames into a Single Final DataFrame

In [None]:
df_final.shape

In [None]:
show_first_row(df_final)

In [None]:
df_final.isnull().sum()

Renaming Columns

Renaming Columns

In [None]:
df_cleaned.head()

In [None]:
show_first_row(df_cleaned)

In [None]:
# Apply get_day_time to every timestamp, then expand each result into the
# 'Daytime' and 'Month' columns (presumably a pair per row; verify against
# get_day_time).
day_parts = df_cleaned['Datetime'].apply(get_day_time)
df_cleaned[['Daytime', 'Month']] = day_parts.apply(pd.Series)

In [None]:
df_cleaned.head()

Third API Test

In [None]:
# Preview the rows of station 42058; the filter now matches the variable name.
# NOTE(review): "Station ID" is compared with an integer; confirm the column
# is not stored as strings, otherwise the selection is empty.
df_42058 = df_cleaned[df_cleaned["Station ID"] == 42058]
df_42058.head()

In [None]:
# Load the Visual Crossing API key from a local credentials file.
vc_api_key_path = r"c:\Credentials\visual_crossing_weather_api.json"
with open(vc_api_key_path, 'r') as file:
    content = json.load(file)
    vc_api_key = content["api_key"]

# Coordinates of station 42058, read from the first row of its data.
df_42058 = df_cleaned[df_cleaned["Station ID"] == 42058]
lat = df_42058["Lat"].iloc[0]
lon = df_42058["Lon"].iloc[0]

lat, lon = convert_coordinates(lat, lon)

# Dynamic date window: from 31 days ago up to and including the current day.
today = datetime.now().strftime("%Y-%m-%d")
start_date = (datetime.now() - timedelta(days=31)).strftime("%Y-%m-%d")
last_call_time = None
vc_meteo_data = None

# Build the timeline request URL (metric units, JSON payload).
url_last_month = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{start_date}/{today}?unitGroup=metric&key={vc_api_key}&contentType=json"
try:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url_last_month, timeout=30)
    # Surface HTTP errors (bad key, quota exceeded, ...) instead of trying to
    # parse an error page as weather data.
    response.raise_for_status()
    vc_meteo_data = response.json()
except (requests.RequestException, ValueError) as e:
    # The exception text can contain the request URL, which embeds the API
    # key: mask it before it ends up in the notebook output.
    error_message = str(e)
    if vc_api_key:
        error_message = error_message.replace(vc_api_key, "***")
    print(error_message)

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Compare the Visual Crossing hourly wind speed with the wind columns of
# df_cleaned over the last 30 days. Requires `vc_meteo_data` (decoded JSON
# response) and `df_cleaned` to exist already.

# Flatten the payload using the days -> hours record path.
df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# Derive string 'Date' (YYYY-MM-DD) and 'Hour' (HH) keys from the epoch seconds.
vc_timestamps = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s")
df_vc_meteo["Date"] = vc_timestamps.dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = vc_timestamps.dt.strftime("%H")

# Bounds of the 30-day window, also as YYYY-MM-DD strings so they can be
# compared directly with the 'Date' columns.
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)
today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

# Visual Crossing side: keep the merge keys and the wind speed, inside the window.
vc_wind = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = vc_wind[vc_wind['Date'].between(thirty_days_ago_str, today_str)]

# df_cleaned side: add the same merge keys (this writes two new columns into df_cleaned).
df_cleaned['Date'] = df_cleaned['Datetime'].dt.strftime("%Y-%m-%d")
df_cleaned['Hour'] = df_cleaned['Datetime'].dt.strftime("%H")

df_cleaned_last_month = df_cleaned[df_cleaned['Date'].between(thirty_days_ago_str, today_str)]

# Inner join on Date and Hour: only hours present on both sides are kept.
local_wind = df_cleaned_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']]
df_merged = df_test_last_month.merge(local_wind, on=['Date', 'Hour'], how='inner')

# Show the first 100 merged rows.
print(df_merged.head(100))


In [None]:
# Mapping from raw column names to display names.
# NOTE(review): the targets ' Sea Level Pressure (hPa)' and ' Visibility (%)'
# start with a space. They are kept exactly as they are because other cells
# may reference these names; confirm before normalising them.
col_to_rename = {
    'temperature_2m': 'T°(C°)',
    'relative_humidity_2m': 'Relative Humidity (%)',
    'dew_point_2m': 'Dew Point (°C)',
    'precipitation': 'Precipitation (mm)',
    'pressure_msl': ' Sea Level Pressure (hPa)',
    'cloud_cover_low': 'Low Clouds (%)',
    'cloud_cover_mid': 'Middle Clouds (%)',
    'cloud_cover_high': 'High Clouds (%)',
    'visibility': ' Visibility (%)',
    'wind_direction': 'Wind Direction (°)',
    'wind_speed': 'Wind Speed (km/h)',
    'wind_gust': 'Wind Gusts (km/h)',
    'wave_height': 'Wave Height (m)',
    'average_wave_period': 'Average Wave Period (s)',
    'dominant_wave_direction': 'Dominant Wave Direction (°)',
    'pressure': 'Pressure (hPA)',
    'air_temperature': 'Air T°',
    'water_temperature': 'Water T°',
}

# Columns removed after renaming, when they are present.
cols_to_drop = [
    'soil_temperature_0cm',
    'rain',
    'showers',
    'is_day',
    'id_x',
    'id_y',
    'soil_moisture_0_to_1cm',
]

df_cleaned = rename_columns(df_cleaned, col_to_rename)
df_cleaned = drop_columns_if_exist(df_cleaned, cols_to_drop)
df_cleaned.columns

In [None]:
# Récupérer les données de l'API
vc_meteo_data = response.json()
print(vc_meteo_data)  # Vérifiez les données récupérées

In [None]:
# Normaliser les données JSON en DataFrame
df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# Afficher la première ligne des données
df_vc_meteo.head(1)

In [None]:
# Turn the epoch-seconds timestamps into string 'Date' (YYYY-MM-DD) and
# 'Hour' (HH) columns; the timestamps are parsed once and reused.
epoch_as_datetime = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s")
df_vc_meteo["Date"] = epoch_as_datetime.dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = epoch_as_datetime.dt.strftime("%H")

In [None]:
# Définir les dates de filtrage pour les 30 derniers jours
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

# Convertir les dates en format YYYY-MM-DD
today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

In [None]:
# Keep the Visual Crossing wind speed of the last 30 days. Dates are
# YYYY-MM-DD strings, so plain string comparison follows chronological order.
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) &
                                        (df_test_last_month['Date'] <= today_str)]

# Add the 'Date' and 'Hour' merge keys to the station frame. df_42058 is a
# filtered selection of df_cleaned, so a new frame is built with assign()
# instead of writing into the selection, which can trigger
# SettingWithCopyWarning.
df_42058 = df_42058.assign(
    Date=df_42058['Datetime'].dt.strftime("%Y-%m-%d"),
    Hour=df_42058['Datetime'].dt.strftime("%H"),
)

# Restrict the station data to the same 30-day window.
df_42058_last_month = df_42058[(df_42058['Date'] >= thirty_days_ago_str) &
                                (df_42058['Date'] <= today_str)]

# Inner join on Date and Hour: only hours present in both sources are kept.
df_test_merged = df_test_last_month.merge(df_42058_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']],
                                     on=['Date', 'Hour'],
                                     how='inner')

df_test_merged.head()

In [None]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprimées
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non numérique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprimées dans les listes appropriées
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes entièrement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non numériques
    
#     # Supprimer les colonnes identifiées
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les résultats
#     print("Colonnes supprimées pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprimées pour avoir plus de 50% de valeurs nulles et être non numériques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [None]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [None]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire récursivement pour obtenir toutes les clés, y compris les sous-clés,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire à explorer
#     :param parent_key: La clé parent qui est utilisée pour concaténer les sous-clés
#     :param sep: Le séparateur utilisé pour concaténer les clés (par défaut '_')
#     :return: Une liste des clés (et sous-clés)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore récursivement
#             keys.append(new_key)  # Ajouter la clé, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la clé finale
#     return keys

In [None]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche récursive d'une clé dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: clé recherchée
#     :param path: liste pour stocker le chemin jusqu'à la clé
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqué
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la clé est trouvée, retourner le chemin
#                     return result
#     return None  # Retourne None si la clé n'a pas été trouvée



# # Recherche du chemin pour la clé 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [None]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (clé, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # Vérifier si l'index est égal à 1

Counting Rows of all Dataframes in total