In [18]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [1]:
from imports import *
from functions import *

Connection to PostgreSQL

In [2]:
path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"

with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

db = "Oceanography_ML_Project"
schema_bronze = "Bronze"
schema_silver = "Silver"

# Créer l'engine PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
conn = engine.connect()

Charger les Données des Tables

In [3]:
# Charger les métadonnées du schéma existant
metadata = MetaData(schema=schema_bronze)

print("\n🔍 Chargement des métadonnées du schéma...")
metadata.reflect(bind=conn)
print("✅ Métadonnées chargées avec succès.\n")

# Récupérer les noms des tables
table_names = [t.name for t in metadata.sorted_tables]
print(f"🔢 Nombre total de tables dans le schéma : {len(table_names)}\n")

# Filtrer les tables en fonction du contenu de leur nom
marine_tables = {t for t in table_names if "marine" in t.lower()}
meteo_tables = {t for t in table_names if "meteo" in t.lower()}
buoys_data_table = {t for t in table_names if "buoy" in t.lower()}

print(f"🌊 Tables marines trouvées : {len(marine_tables)}")
print(f"🌧️ Tables météo trouvées : {len(meteo_tables)}")
print(f"🐋 Tables de bouées trouvées : {len(buoys_data_table)}\n")

# Initialiser le dictionnaire des résultats
buoys_datas = {}

# Compteurs pour suivre le nombre de tables chargées avec succès
marine_data_count = 0
meteo_data_count = 0
buoys_data_count = 0

# Compteur pour le nombre total de lignes
total_marine_rows = 0
total_meteo_rows = 0
total_buoys_rows = 0  # Changer ici pour compter le nombre de lignes (bouées)

# Vérifier et récupérer les données de la table "buoys_datas"
if buoys_data_table:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    print("🔄 Chargement des données de la table 'buoys_datas'...")

    try:
        buoys_datas_raw = fetch_table_data(schema=schema_bronze, conn=conn, table_name=next(iter(buoys_data_table)), as_df=True)

        if buoys_datas_raw is not None:
            print("📦 Données récupérées pour 'buoys_datas'.")

            # Conversion JSON → dict si nécessaire
            if isinstance(buoys_datas_raw, str):
                buoys_datas_raw = json.loads(buoys_datas_raw)

            elif isinstance(buoys_datas_raw, pd.DataFrame) and "Station ID" in buoys_datas_raw.columns:
                # Convertir en dictionnaire avec "Station ID" comme clé
                buoys_datas_raw = buoys_datas_raw.set_index("Station ID").to_dict(orient="index")

            # Ajouter au dictionnaire principal directement avec les Station ID comme clés
            buoys_datas.update(buoys_datas_raw)
            buoys_data_count += 1
            total_buoys_rows += len(buoys_datas_raw)  # Compter le nombre de bouées
            print(f"✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : {total_buoys_rows}\n")
        else:
            print("⚠️ Aucun résultat trouvé dans 'buoys_datas'.\n")

    except Exception as e:
        print(f"❌ Erreur lors du chargement de 'buoys_datas': {e}\n")

# Associer les tables marine et meteo en fonction du station_id et récupérer leurs données
for table_set, label, icon, counter, total_rows in [
    (marine_tables, "Marine", "🌊", marine_data_count, total_marine_rows),
    (meteo_tables, "Meteo", "🌧️", meteo_data_count, total_meteo_rows)
]:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    for table_name in table_set:
        print(f"🔄 Chargement des données pour la table {label} : {table_name}...")

        try:
            station_id = table_name.split("_")[1]

            # Vérifier si la station existe déjà dans buoys_datas, sinon initialiser un dictionnaire
            if station_id not in buoys_datas:
                buoys_datas[station_id] = {}

            # Récupérer les données
            data = fetch_table_data(schema=schema_bronze, conn=conn, table_name=table_name, as_df=True)

            if data is not None:
                print(f"📦 Données récupérées pour la station {station_id} ({label}).")

                if isinstance(data, str):
                    data = pd.DataFrame(json.loads(data))
                elif isinstance(data, dict):
                    data = pd.DataFrame(data)
                # Ajouter les données au dictionnaire de bouées sous la station_id
                buoys_datas[station_id][f"{label} DataFrame"] = data
                counter += 1
                total_rows += len(data)  # Ajouter le nombre de lignes collectées
                print(f"{icon} Données {label} chargées pour la station {station_id}! Nombre de lignes collectées : {len(data)}\n")
            else:
                print(f"⚠️ Aucun résultat trouvé pour la station {station_id} ({label}).\n")

        except Exception as e:
            print(f"❌ Erreur lors du chargement des données {label} pour {table_name} : {e}\n")

    # Mise à jour des compteurs après le chargement des données pour chaque catégorie
    if label == "Marine":
        marine_data_count = counter
        total_marine_rows = total_rows
    elif label == "Meteo":
        meteo_data_count = counter
        total_meteo_rows = total_rows

# Finalement, afficher un récapitulatif global
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f"🏆 Chargement des données terminé avec succès !")
print(f"🐋 Total des données bouées chargées : {buoys_data_count} - Nombre de bouées (lignes) : {total_buoys_rows}")
print(f"🌊 Total des données marines chargées : {marine_data_count} - Nombre total de lignes : {total_marine_rows}")
print(f"🌧️ Total des données météorologiques chargées : {meteo_data_count} - Nombre total de lignes : {total_meteo_rows}")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")


🔍 Chargement des métadonnées du schéma...
✅ Métadonnées chargées avec succès.

🔢 Nombre total de tables dans le schéma : 79

🌊 Tables marines trouvées : 39
🌧️ Tables météo trouvées : 39
🐋 Tables de bouées trouvées : 1

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données de la table 'buoys_datas'...
📦 Données récupérées pour 'buoys_datas'.
✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : 39

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données pour la table Marine : station_44027_marine_jonesport, me...
📦 Données récupérées pour la station 44027 (Marine).
🌊 Données Marine chargées pour la station 44027! Nombre de lignes collectées : 7470

🔄 Chargement des données pour la table Marine : station_46025_marine_santa monica basin...
📦 Données récupérées pour la station 46025 (Marine).
🌊 Données Marine chargées pour la station 46025! Nombre de lignes collectées : 7515

🔄 Chargement des données pour la table Marine : station_BURL1_marine_southwest pa

In [4]:
buoys_datas["42058"]["Marine DataFrame"].columns

Index(['id', 'wind_direction', 'wind_speed', 'wind_gust', 'wave_height',
       'dominant_wave_period', 'average_wave_period',
       'dominant_wave_direction', 'pressure', 'air_temperature',
       'water_temperature', 'dewpoint', 'visibility', '3hr_pressure_tendency',
       'water_level_above_mean', 'time', 'Station ID'],
      dtype='object')

In [5]:
buoys_datas["42058"]["Meteo DataFrame"].columns

Index(['id', 'date', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'precipitation', 'rain', 'showers', 'pressure_msl', 'surface_pressure',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'visibility', 'wind_speed_10m', 'soil_temperature_0cm',
       'soil_moisture_0_to_1cm', 'is_day'],
      dtype='object')

In [6]:
list_silver_merged_df = []  
list_failed_dfs = []        

number_marine_data = 0
number_meteo_data = 0
number_merged_data = 0

marine_data_conversion = 0
meteo_data_conversion = 0

In [7]:
marine_cols = [
    "wind_direction", "wind_speed", "wind_gust", "wave_height",
    "dominant_wave_period", "average_wave_period", "dominant_wave_direction",
    "pressure", "air_temperature", "water_temperature", "dewpoint",
    "visibility", "3hr_pressure_tendency", "water_level_above_mean"
]

meteo_cols = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "rain",
    "showers", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low",
    "cloud_cover_mid", "cloud_cover_high", "visibility", "wind_speed_10m",
    "soil_temperature_0cm", "soil_moisture_0_to_1cm"
]


In [8]:
col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

meteo_cols_to_delete = ['soil_temperature_0cm','rain', 'showers', 'is_day',
                  'soil_moisture_0_to_1cm']

for station_id, tables in buoys_datas.items():
    marine_df = tables["Marine DataFrame"]
    marine_df = rename_columns(marine_df, col_to_rename)

    marine_df = drop_columns_if_exist

    meteo_df = tables["Meteo DataFrame"]
    meteo_df = rename_columns(meteo_df,col_to_rename)
    meteo_df = drop_columns_if_exist(meteo_df, meteo_cols_to_delete)

Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'is_day' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'soil_te

In [9]:
show_first_row(buoys_datas["42058"]["Marine DataFrame"])

id                             1  (int64)
Wind Direction (°)             80.0  (float64)
Wind Speed (km/h)              11.0  (float64)
Wind Gusts (km/h)              13.0  (float64)
Wave Height (m)                2.2  (float64)
dominant_wave_period           7.0  (float64)
Average Wave Period (s)        5.5  (float64)
Dominant Wave Direction (°)    92.0  (float64)
Pressure (hPA)                 1012.6  (float64)
Air T°                         27.7  (float64)
Water T°                       27.7  (float64)
dewpoint                       24.3  (float64)
 Visibility (%)                None  (object)
3hr_pressure_tendency          nan  (float64)
water_level_above_mean         None  (object)
time                           2025-03-22 12:20:00+01  (object)
Station ID                     42058  (object)


HANDLING NULL VALUES

In [None]:
for station_id, tables in buoys_datas.items():

    try:
        meteo_df = tables["Meteo DataFrame"].copy()
        cleaned_meteo_df = clean_dataframe(meteo_df, meteo_cols, verbose=True)
        tables["Cleaned Meteo DataFrame"] = cleaned_meteo_df
        print(f'Succesfully cleaned Meteo Dataframe {station_id} !')
    except Exception as e:
        print(f'Error cleaning Meteo Dataframe {station_id}:')
        print(f'{e}')
        break
    
    try:
        marine_df = tables["Marine DataFrame"].copy()
        cleaned_marine_df = clean_dataframe(marine_df, marine_cols, verbose=True)
        tables["Cleaned Marine DataFrame"] = cleaned_marine_df
        print(f'Succesfully cleaned Marine Dataframe {station_id} !')

    except Exception as e:
        print(f'Error cleaning Meteo Dataframe {station_id}:')
        print(f'{e}')
        break

⚠️ Il reste encore des NaN !
id                             int64
date                          object
T°(C°)                        object
Relative Humidity (%)         object
Dew Point (°C)                object
Precipitation (mm)            object
rain                         float64
showers                      float64
 Sea Level Pressure (hPa)     object
surface_pressure             float64
cloud_cover                  float64
Low Clouds (%)                object
Middle Clouds (%)             object
High Clouds (%)               object
 Visibility (%)               object
wind_speed_10m               float64
soil_temperature_0cm         float64
soil_moisture_0_to_1cm       float64
is_day                        object
dtype: object
✅ Toutes les valeurs ont été remplacées avec succès !
id                               int64
Wind Direction (°)             float64
Wind Speed (km/h)              float64
Wind Gusts (km/h)              float64
Wave Height (m)                float64
domin

In [50]:
show_first_row(buoys_datas["42058"]["Cleaned Marine DataFrame"])

id                             1  (int64)
Wind Direction (°)             80.0  (float64)
Wind Speed (km/h)              11.0  (float64)
Wind Gusts (km/h)              13.0  (float64)
Wave Height (m)                2.2  (float64)
dominant_wave_period           7.0  (float64)
Average Wave Period (s)        5.5  (float64)
Dominant Wave Direction (°)    92.0  (float64)
Pressure (hPA)                 1012.6  (float64)
Air T°                         27.7  (float64)
Water T°                       27.7  (float64)
dewpoint                       24.3  (float64)
3hr_pressure_tendency          0.0  (float64)
time                           2025-03-22 12:20:00+01  (object)
Station ID                     42058  (object)


In [57]:
print(f'{show_first_row(buoys_datas["42058"]["Cleaned Marine DataFrame"])}')

id                             1  (int64)
Wind Direction (°)             80.0  (float64)
Wind Speed (km/h)              11.0  (float64)
Wind Gusts (km/h)              13.0  (float64)
Wave Height (m)                2.2  (float64)
dominant_wave_period           7.0  (float64)
Average Wave Period (s)        5.5  (float64)
Dominant Wave Direction (°)    92.0  (float64)
Pressure (hPA)                 1012.6  (float64)
Air T°                         27.7  (float64)
Water T°                       27.7  (float64)
dewpoint                       24.3  (float64)
3hr_pressure_tendency          0.0  (float64)
time                           2025-03-22 12:20:00+01  (object)
Station ID                     42058  (object)
None


In [56]:
print(f'{show_first_row(buoys_datas["42058"]["Cleaned Meteo DataFrame"])}')

id                             1  (int64)
date                           2024-12-20 01:00:00+01  (object)
T°(C°)                         None  (object)
Relative Humidity (%)          None  (object)
Dew Point (°C)                 None  (object)
Precipitation (mm)             None  (object)
rain                           0.0  (float64)
showers                        0.0  (float64)
 Sea Level Pressure (hPa)      None  (object)
surface_pressure               991.4349975585938  (float64)
cloud_cover                    98.0  (float64)
Low Clouds (%)                 None  (object)
Middle Clouds (%)              None  (object)
High Clouds (%)                None  (object)
 Visibility (%)                None  (object)
wind_speed_10m                 19.3700008392334  (float64)
soil_temperature_0cm           0.0900000035762786  (float64)
soil_moisture_0_to_1cm         0.3199999928474426  (float64)
is_day                         0.0  (object)
None


In [None]:
for station_id, tables in buoys_datas.items():
    
    try:
        cleaned_meteo_df = tables["Cleaned Meteo DataFrame"]
        converted_meteo_df = convert_columns_to_numeric(cleaned_meteo_df, meteo_cols)
        tables["Converted Meteo DataFrame"] = converted_meteo_df
        print(f'Successfully Converted {station_id} Meteo Dataframe Types!')

    except Exception as e:
        print(f'Error converting Meteo Dataframe Types for {station_id}!')
        break

    try:
        cleaned_marine_df = tables["Cleaned Marine DataFrame"]
        converted_marine_df = convert_columns_to_numeric(cleaned_marine_df, meteo_cols)
        tables["Converted Marine DataFrame"] = converted_marine_df
        print(f'Successfully Converted {station_id} Marine Dataframe Types!')
    except Exception as e:
        print(f'Error converting Marine Dataframe Types for {station_id}!')
        break

In [22]:
print(f'{converted_marine_df.shape}\n{converted_marine_df.isnull().sum()}')

(1256, 9)
id                       0
wind_direction           0
wind_speed               0
wind_gust                0
pressure                 0
air_temperature          0
3hr_pressure_tendency    0
time                     0
Station ID               0
dtype: int64


In [23]:
show_first_row(converted_marine_df)

id                             1  (int64)
wind_direction                 240.0  (float64)
wind_speed                     9.3  (float64)
wind_gust                      9.8  (float64)
pressure                       1002.7  (float64)
air_temperature                7.6  (float64)
3hr_pressure_tendency          0.4  (float64)
time                           2025-03-22 12:00:00+01  (object)
Station ID                     SBIO1  (object)


HOUR RESAMPLING

In [29]:
# Resampling des données et stockage dans un nouveau compartiment du dictionnaire 
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔁 Processing and resampling marine data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Resampled Marine DataFrame"] = process_datetime_column(tables["Marine DataFrame"], column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    try:
        print(f"🔁 Processing and resampling weather data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Resampled Meteo DataFrame"] = process_datetime_column(tables["Meteo DataFrame"], column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")

🔁 Processing and resampling marine data for station 41008...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41008...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 41044...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling weather data for station 41044...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
✅ Successfully renamed column to "Datetime"!
🔁 Processing and resampling marine data for station 42001...
📌 La colonne 'time' est maintenant convertie en chaîne de

In [30]:
# Resampling des données et stockage dans un nouveau compartiment du dictionnaire 
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔁 Processing and resampling marine data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Resampled Marine DataFrame"] = process_datetime_column(tables["Marine DataFrame"], column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    try:
        print(f"🔁 Processing and resampling weather data for station {station_id}...")
        # Convert columns to numeric types (float or int) excluding datetime columns using pandas to_numeric
        tables["Resampled Meteo DataFrame"] = process_datetime_column(tables["Meteo DataFrame"], column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")

🔁 Processing and resampling marine data for station 41008...
Error processing Marine Data for 41008: 'time'
🔁 Processing and resampling weather data for station 41008...
Error processing Meteo Data for 41008: 'date'
🔁 Processing and resampling marine data for station 41044...
Error processing Marine Data for 41044: 'time'
🔁 Processing and resampling weather data for station 41044...
Error processing Meteo Data for 41044: 'date'
🔁 Processing and resampling marine data for station 42001...
Error processing Marine Data for 42001: 'time'
🔁 Processing and resampling weather data for station 42001...
Error processing Meteo Data for 42001: 'date'
🔁 Processing and resampling marine data for station 42002...
Error processing Marine Data for 42002: 'time'
🔁 Processing and resampling weather data for station 42002...
Error processing Meteo Data for 42002: 'date'
🔁 Processing and resampling marine data for station 42012...
Error processing Marine Data for 42012: 'time'
🔁 Processing and resampling 

In [31]:
marine_none=0
meteo_none=0

# Resampling des données et stockage dans un nouveau compartiment du dictionnaire 
for station_id, tables in buoys_datas.items():
    meteo_df = tables["Meteo DataFrame"]
    if meteo_df is None:
        meteo_none +=1
    marine_df = tables["Marine DataFrame"]
    if marine_df is None:
        marine_none +=1

print(f"Meteo None number :{meteo_none}\nMarine None number :{marine_none}")

Meteo None number :0
Marine None number :0


In [32]:
marine_df = buoys_datas["42058"]["Marine DataFrame"]
marine_df.columns

Index(['id', 'wind_direction', 'wind_speed', 'wind_gust', 'wave_height',
       'dominant_wave_period', 'average_wave_period',
       'dominant_wave_direction', 'pressure', 'air_temperature',
       'water_temperature', 'dewpoint', 'visibility', '3hr_pressure_tendency',
       'water_level_above_mean', 'Datetime', 'Station ID'],
      dtype='object')

In [34]:
buoys_datas["42058"]["Meteo DataFrame"].columns

Index(['id', 'Datetime', 'temperature_2m', 'relative_humidity_2m',
       'dew_point_2m', 'precipitation', 'rain', 'showers', 'pressure_msl',
       'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid',
       'cloud_cover_high', 'visibility', 'wind_speed_10m',
       'soil_temperature_0cm', 'soil_moisture_0_to_1cm', 'is_day'],
      dtype='object')

In [35]:
# Resampling des données et stockage dans un nouveau compartiment du dictionnaire 
for station_id, tables in buoys_datas.items():
    meteo_df = tables["Meteo DataFrame"]
    marine_df = tables["Marine DataFrame"]

ADDING COORDINATES

In [36]:
# Ajout des coordonnées
for station_id, tables in buoys_datas.items():
    try:
        tables["Marine DataFrame"]["Lat"] = tables["Lat"]
        tables["Marine DataFrame"]["Lon"] = tables["Lon"]
        print(f"🌐 Coordinates (Lat/Lon) added for station {station_id}.")
    except Exception as e:
        print(f"Error adding coordinates for {station_id}: {e}")

🌐 Coordinates (Lat/Lon) added for station 41008.
🌐 Coordinates (Lat/Lon) added for station 41044.
🌐 Coordinates (Lat/Lon) added for station 42001.
🌐 Coordinates (Lat/Lon) added for station 42002.
🌐 Coordinates (Lat/Lon) added for station 42012.
🌐 Coordinates (Lat/Lon) added for station 42036.
🌐 Coordinates (Lat/Lon) added for station 42056.
🌐 Coordinates (Lat/Lon) added for station 42058.
🌐 Coordinates (Lat/Lon) added for station 44020.
🌐 Coordinates (Lat/Lon) added for station 44025.
🌐 Coordinates (Lat/Lon) added for station 44027.
🌐 Coordinates (Lat/Lon) added for station 44065.
🌐 Coordinates (Lat/Lon) added for station 46001.
🌐 Coordinates (Lat/Lon) added for station 46006.
🌐 Coordinates (Lat/Lon) added for station 46014.
🌐 Coordinates (Lat/Lon) added for station 46022.
🌐 Coordinates (Lat/Lon) added for station 46025.
🌐 Coordinates (Lat/Lon) added for station 46027.
🌐 Coordinates (Lat/Lon) added for station 46029.
🌐 Coordinates (Lat/Lon) added for station 46053.
🌐 Coordinates (Lat/L

PAIRS DATAFRAMES FUSION

In [37]:
# Fusion des DataFrames
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔗 Merging marine and weather data for station {station_id}...")
        df_merged = pd.merge(
            tables["Marine DataFrame"], tables["Meteo DataFrame"], on='Datetime', how='inner'
        )
        tables["Merged DataFrame"] = df_merged
        number_merged_data += df_merged.shape[0]
        list_silver_merged_df.append(df_merged)
    except Exception as e:
        print(f"Error merging data for station {station_id}: {e}")

🔗 Merging marine and weather data for station 41008...
🔗 Merging marine and weather data for station 41044...
🔗 Merging marine and weather data for station 42001...
🔗 Merging marine and weather data for station 42002...
🔗 Merging marine and weather data for station 42012...
🔗 Merging marine and weather data for station 42036...
🔗 Merging marine and weather data for station 42056...
🔗 Merging marine and weather data for station 42058...
🔗 Merging marine and weather data for station 44020...
🔗 Merging marine and weather data for station 44025...
🔗 Merging marine and weather data for station 44027...
🔗 Merging marine and weather data for station 44065...
🔗 Merging marine and weather data for station 46001...
🔗 Merging marine and weather data for station 46006...
🔗 Merging marine and weather data for station 46014...
🔗 Merging marine and weather data for station 46022...
🔗 Merging marine and weather data for station 46025...
🔗 Merging marine and weather data for station 46027...
🔗 Merging 

In [38]:
buoys_datas["41008"]["Merged DataFrame"]

Unnamed: 0,id_x,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,air_temperature,...,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,visibility_y,wind_speed_10m,soil_temperature_0cm,soil_moisture_0_to_1cm,is_day
0,1,240.0,5.0,6.0,0.7,4.0,3.7,218.0,1020.3,14.1,...,979.2899780273438,0.0,0.0,0.0,0.0,34300.0,22.06999969482422,6.739999771118164,0.2800000011920929,0.0
1,2,240.0,6.0,7.0,0.7,,3.7,218.0,1020.3,14.1,...,979.2899780273438,0.0,0.0,0.0,0.0,34300.0,22.06999969482422,6.739999771118164,0.2800000011920929,0.0
2,3,240.0,6.0,7.0,,,,,1020.3,14.2,...,979.2899780273438,0.0,0.0,0.0,0.0,34300.0,22.06999969482422,6.739999771118164,0.2800000011920929,0.0
3,4,240.0,6.0,8.0,0.7,11.0,3.7,82.0,1020.3,14.3,...,978.9600219726562,56.0,0.0,56.0,0.0,38000.0,25.809999465942383,7.039999961853027,0.2800000011920929,0.0
4,5,240.0,6.0,8.0,0.7,,3.7,82.0,1020.2,14.3,...,978.9600219726562,56.0,0.0,56.0,0.0,38000.0,25.809999465942383,7.039999961853027,0.2800000011920929,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7497,7498,360.0,1.0,2.0,,,,,1019.5,17.1,...,991.5999755859375,100.0,71.0,5.0,100.0,24140.0,23.260000228881836,10.289999961853027,0.28999999165534973,1.0
7498,7499,360.0,2.0,2.0,0.3,5.0,4.1,119.0,1019.4,17.0,...,991.72998046875,70.0,65.0,0.0,9.0,24140.0,26.760000228881836,9.239999771118164,0.28999999165534973,1.0
7499,7500,350.0,2.0,3.0,,,,,1019.5,16.8,...,991.72998046875,70.0,65.0,0.0,9.0,24140.0,26.760000228881836,9.239999771118164,0.28999999165534973,1.0
7500,7501,350.0,3.0,3.0,,,,,1019.5,16.7,...,991.72998046875,70.0,65.0,0.0,9.0,24140.0,26.760000228881836,9.239999771118164,0.28999999165534973,1.0


Concatenating all DataFrames into a Final one

In [39]:
# Fusion finale de tous les DataFrames
try:
    print("🔀 Merging all DataFrames into a final DataFrame...")
    dataframes_to_concat = [tables["Merged DataFrame"] for tables in buoys_datas.values()]
    df_final = pd.concat(dataframes_to_concat, ignore_index=True)
    print(f"📝 Final merged DataFrame size: {df_final.shape}")
except Exception as e:
    print(f"Error during final merge: {e}")
    df_final = None

# Résumé final
print("\n⭐🏆 Processing complete!")
print(f"🔢 Total stations processed: {len(buoys_datas)}")
print(f"Marine data rows collected = {number_marine_data}\nMeteo data rows collected = {number_meteo_data}")
print(f"Marine Data Successfully Converted: {marine_data_conversion}")
print(f"Meteo Data Successfully Converted: {meteo_data_conversion}")

if df_final is not None and not df_final.empty:
    print(f"Total Number of merged rows: {number_merged_data}")
    print(f"Final DataFrame rows number: {df_final.shape[0]}")
else:
    print("The DataFrame is either None or empty.")

print(f"❌ Number of failed stations: {len(list_failed_dfs)}")
if list_failed_dfs:
    print(f"⚠️ Failed stations: {', '.join(map(str, list_failed_dfs))}")

🔀 Merging all DataFrames into a final DataFrame...
📝 Final merged DataFrame size: (246782, 37)

⭐🏆 Processing complete!
🔢 Total stations processed: 39
Marine data rows collected = 0
Meteo data rows collected = 0
Marine Data Successfully Converted: 0
Meteo Data Successfully Converted: 0
Total Number of merged rows: 246782
Final DataFrame rows number: 246782
❌ Number of failed stations: 0


In [40]:
df_final.shape

(246782, 37)

In [41]:
show_first_row(df_final)

id_x                           1  (int64)
wind_direction                 240.0  (float64)
wind_speed                     5.0  (float64)
wind_gust                      6.0  (float64)
wave_height                    0.7  (float64)
dominant_wave_period           4.0  (float64)
average_wave_period            3.7  (float64)
dominant_wave_direction        218.0  (float64)
pressure                       1020.3  (float64)
air_temperature                14.1  (float64)
water_temperature              15.3  (float64)
dewpoint                       10.4  (float64)
visibility_x                   None  (object)
3hr_pressure_tendency          nan  (float64)
water_level_above_mean         None  (object)
Datetime                       2025-03-22 11:00:00  (datetime64[ns])
Station ID                     41008  (object)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220  (int64)
temperature_2m                 5.11000013351440

In [42]:
df_final.isnull().sum()

id_x                            0
wind_direction               8158
wind_speed                   2881
wind_gust                   10097
wave_height                138089
dominant_wave_period       182728
average_wave_period        138150
dominant_wave_direction    140578
pressure                     1208
air_temperature             42021
water_temperature           43845
dewpoint                    51720
visibility_x               246782
3hr_pressure_tendency      199937
water_level_above_mean     246782
Datetime                        0
Station ID                      0
Lat                             0
Lon                             0
id_y                            0
temperature_2m                  0
relative_humidity_2m            0
dew_point_2m                    0
precipitation                   0
rain                            0
showers                         0
pressure_msl                    0
surface_pressure                0
cloud_cover                     0
cloud_cover_lo

Renaming Columns

Renaming Columns

In [None]:
for idx, row in df_cleaned.iterrows():
    if idx == 0:
        Datetime = row["Datetime"]
        print(Datetime)

In [None]:
df_cleaned.head()

In [None]:
show_first_row(df_cleaned)

In [None]:
df_cleaned[['Daytime', 'Month']] = df_cleaned['Datetime'].apply(lambda x: get_day_time(x)).apply(pd.Series)

In [None]:
df_cleaned.head()

Third API Test

In [None]:
df_42058 = df_cleaned[df_cleaned["Station ID"] == 41008]
df_42058.head()

In [None]:
# Variables de contrôle des appels API
vc_api_key_path = r"c:\Credentials\visual_crossing_weather_api.json"
with open(vc_api_key_path, 'r') as file:
    content = json.load(file)
    vc_api_key = content["api_key"]

df_42058 = df_cleaned[df_cleaned["Station ID"] == 42058]
lat = df_42058["Lat"].iloc[0]  # Récupérer la première valeur de la colonne "Lat"
lon = df_42058["Lon"].iloc[0]  # Récupérer la première valeur de la colonne "Lon"

lat, lon = convert_coordinates(lat, lon)

# Définition des dates dynamiques
today = datetime.now().strftime("%Y-%m-%d")  # Hier pour éviter les données incomplètes d'aujourd'hui
start_date = (datetime.now() - timedelta(days=31)).strftime("%Y-%m-%d")  # 31 jours avant aujourd'hui
last_call_time = None
vc_meteo_data = None

# Construction de l'URL
url_last_month = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{start_date}/{today}?unitGroup=metric&key={vc_api_key}&contentType=json"
try :
    response = requests.get(url_last_month)
    vc_meteo_data = response.json()  # Essayer de décoder le JSON
except Exception as e:
    print(f"{e}")            

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Assuming the df_cleaned DataFrame already exists and contains the required data

# First, load your Visual Crossing Weather Data (example, you may already have it)
# Assuming vc_meteo_data is the JSON response from Visual Crossing
# Example of flattening the JSON
df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# Convert the datetimeEpoch from Visual Crossing Weather data into Date and Hour columns
df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

# Filter data from df_vc_meteo for the last 30 days
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

# Filter df_vc_meteo for the last 30 days
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
                                        (df_test_last_month['Date'] <= today_str)]

# Prepare df_cleaned for merging (add Date and Hour columns)
df_cleaned['Date'] = df_cleaned['Datetime'].dt.strftime("%Y-%m-%d")
df_cleaned['Hour'] = df_cleaned['Datetime'].dt.strftime("%H")

# Filter df_cleaned for the last 30 days
df_cleaned_last_month = df_cleaned[(df_cleaned['Date'] >= thirty_days_ago_str) & 
                                   (df_cleaned['Date'] <= today_str)]

# Merge df_vc_meteo and df_cleaned based on Date and Hour
df_merged = df_test_last_month.merge(df_cleaned_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
                                    on=['Date', 'Hour'], 
                                    how='inner')

# Display the merged dataframe
print(df_merged.head(100))


In [None]:
col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

df_cleaned = rename_columns(df_cleaned, col_to_rename)
df_cleaned = drop_columns_if_exist(df_cleaned,['soil_temperature_0cm','rain', 'showers', 'is_day', 'id_x', 'id_y','soil_moisture_0_to_1cm'])
df_cleaned.columns

In [None]:
# Récupérer les données de l'API
vc_meteo_data = response.json()
print(vc_meteo_data)  # Vérifiez les données récupérées

In [None]:
# Normaliser les données JSON en DataFrame
df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# Afficher la première ligne des données
df_vc_meteo.head(1)

In [None]:
# Conversion du timestamp en datetime
df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

In [None]:
# Définir les dates de filtrage pour les 30 derniers jours
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

# Convertir les dates en format YYYY-MM-DD
today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

In [None]:
# Filtrer les données des 30 derniers jours de df_vc_meteo
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
                                        (df_test_last_month['Date'] <= today_str)]

# Ajouter les colonnes Date et Hour à df_42058
df_42058.loc[:, 'Date'] = df_42058['Datetime'].dt.strftime("%Y-%m-%d")
df_42058.loc[:, 'Hour'] = df_42058['Datetime'].dt.strftime("%H")

# Filtrer les données des 30 derniers jours dans df_42058
df_42058_last_month = df_42058[(df_42058['Date'] >= thirty_days_ago_str) & 
                                (df_42058['Date'] <= today_str)]

# Fusionner les deux DataFrames sur Date et Hour
df_test_merged = df_test_last_month.merge(df_42058_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
                                     on=['Date', 'Hour'], 
                                     how='inner')

df_test_merged.head()

In [None]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprimées
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non numérique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprimées dans les listes appropriées
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes entièrement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non numériques
    
#     # Supprimer les colonnes identifiées
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les résultats
#     print("Colonnes supprimées pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprimées pour avoir plus de 50% de valeurs nulles et être non numériques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [None]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [None]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire récursivement pour obtenir toutes les clés, y compris les sous-clés,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire à explorer
#     :param parent_key: La clé parent qui est utilisée pour concaténer les sous-clés
#     :param sep: Le séparateur utilisé pour concaténer les clés (par défaut '_')
#     :return: Une liste des clés (et sous-clés)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore récursivement
#             keys.append(new_key)  # Ajouter la clé, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la clé finale
#     return keys

In [None]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche récursive d'une clé dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: clé recherchée
#     :param path: liste pour stocker le chemin jusqu'à la clé
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqué
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la clé est trouvée, retourner le chemin
#                     return result
#     return None  # Retourne None si la clé n'a pas été trouvée



# # Recherche du chemin pour la clé 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [None]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (clé, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # Vérifier si l'index est égal à 1

Counting Rows of all Dataframes in total