In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
from imports import *
from functions import *

Connection to PostgreSQL

In [3]:
path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"

with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

db = "Oceanography_ML_Project"
schema_bronze = "Bronze"
schema_silver = "Silver"

# Créer l'engine PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
conn = engine.connect()

Charger les Données des Tables de la couche de Bronze

In [4]:
# Charger les métadonnées du schéma existant
metadata = MetaData(schema=schema_bronze)

print("\n🔍 Chargement des métadonnées du schéma...")
metadata.reflect(bind=conn)
print("✅ Métadonnées chargées avec succès.\n")

# Récupérer les noms des tables
table_names = [t.name for t in metadata.sorted_tables]
print(f"🔢 Nombre total de tables dans le schéma : {len(table_names)}\n")

# Filtrer les tables en fonction du contenu de leur nom
marine_tables = {t for t in table_names if "marine" in t.lower()}
meteo_tables = {t for t in table_names if "meteo" in t.lower()}
buoys_data_table = {t for t in table_names if "buoy" in t.lower()}

print(f"🌊 Tables marines trouvées : {len(marine_tables)}")
print(f"🌧️ Tables météo trouvées : {len(meteo_tables)}")
print(f"🐋 Tables de bouées trouvées : {len(buoys_data_table)}\n")

# Initialiser le dictionnaire des résultats
buoys_datas = {}

# Compteurs pour suivre le nombre de tables chargées avec succès
marine_data_count = 0
meteo_data_count = 0
buoys_data_count = 0

# Compteur pour le nombre total de lignes
total_marine_rows = 0
total_meteo_rows = 0
total_buoys_rows = 0  # Changer ici pour compter le nombre de lignes (bouées)

# Vérifier et récupérer les données de la table "buoys_datas"
if buoys_data_table:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    print("🔄 Chargement des données de la table 'buoys_datas'...")

    try:
        buoys_datas_raw = fetch_table_data(schema=schema_bronze, conn=conn, table_name=next(iter(buoys_data_table)), as_df=True)

        if buoys_datas_raw is not None:
            print("📦 Données récupérées pour 'buoys_datas'.")

            # Conversion JSON → dict si nécessaire
            if isinstance(buoys_datas_raw, str):
                buoys_datas_raw = json.loads(buoys_datas_raw)

            elif isinstance(buoys_datas_raw, pd.DataFrame) and "Station ID" in buoys_datas_raw.columns:
                # Convertir en dictionnaire avec "Station ID" comme clé
                buoys_datas_raw = buoys_datas_raw.set_index("Station ID").to_dict(orient="index")

            # Ajouter au dictionnaire principal directement avec les Station ID comme clés
            buoys_datas.update(buoys_datas_raw)
            buoys_data_count += 1
            total_buoys_rows += len(buoys_datas_raw)  # Compter le nombre de bouées
            print(f"✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : {total_buoys_rows}\n")
        else:
            print("⚠️ Aucun résultat trouvé dans 'buoys_datas'.\n")

    except Exception as e:
        print(f"❌ Erreur lors du chargement de 'buoys_datas': {e}\n")

# Associer les tables marine et meteo en fonction du station_id et récupérer leurs données
for table_set, label, icon, counter, total_rows in [
    (marine_tables, "Marine", "🌊", marine_data_count, total_marine_rows),
    (meteo_tables, "Meteo", "🌧️", meteo_data_count, total_meteo_rows)
]:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    for table_name in table_set:
        print(f"🔄 Chargement des données pour la table {label} : {table_name}...")

        try:
            station_id = table_name.split("_")[1]

            # Vérifier si la station existe déjà dans buoys_datas, sinon initialiser un dictionnaire
            if station_id not in buoys_datas:
                buoys_datas[station_id] = {}

            # Récupérer les données
            data = fetch_table_data(schema=schema_bronze, conn=conn, table_name=table_name, as_df=True)

            if data is not None:
                print(f"📦 Données récupérées pour la station {station_id} ({label}).")

                if isinstance(data, str):
                    data = pd.DataFrame(json.loads(data))
                elif isinstance(data, dict):
                    data = pd.DataFrame(data)

                # Ajouter les données au dictionnaire de bouées sous la station_id
                buoys_datas[station_id][f"{label} DataFrame"] = data
                counter += 1
                total_rows += len(data)  # Ajouter le nombre de lignes collectées
                print(f"{icon} Données {label} chargées pour la station {station_id}! Nombre de lignes collectées : {len(data)}\n")
            else:
                print(f"⚠️ Aucun résultat trouvé pour la station {station_id} ({label}).\n")

        except Exception as e:
            print(f"❌ Erreur lors du chargement des données {label} pour {table_name} : {e}\n")

    # Mise à jour des compteurs après le chargement des données pour chaque catégorie
    if label == "Marine":
        marine_data_count = counter
        total_marine_rows = total_rows
    elif label == "Meteo":
        meteo_data_count = counter
        total_meteo_rows = total_rows

# Finalement, afficher un récapitulatif global
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f"🏆 Chargement des données terminé avec succès !")
print(f"🐋 Total des données bouées chargées : {buoys_data_count} - Nombre de bouées (lignes) : {total_buoys_rows}")
print(f"🌊 Total des données marines chargées : {marine_data_count} - Nombre total de lignes : {total_marine_rows}")
print(f"🌧️ Total des données météorologiques chargées : {meteo_data_count} - Nombre total de lignes : {total_meteo_rows}")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")


🔍 Chargement des métadonnées du schéma...
✅ Métadonnées chargées avec succès.

🔢 Nombre total de tables dans le schéma : 79

🌊 Tables marines trouvées : 39
🌧️ Tables météo trouvées : 39
🐋 Tables de bouées trouvées : 1

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données de la table 'buoys_datas'...
📦 Données récupérées pour 'buoys_datas'.
✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : 39

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données pour la table Marine : station_44020_marine_nantucket sound...
📦 Données récupérées pour la station 44020 (Marine).
🌊 Données Marine chargées pour la station 44020! Nombre de lignes collectées : 6519

🔄 Chargement des données pour la table Marine : station_POTA2_marine_potato point, ak...
📦 Données récupérées pour la station POTA2 (Marine).
🌊 Données Marine chargées pour la station POTA2! Nombre de lignes collectées : 2190

🔄 Chargement des données pour la table Marine : station_46084_marine_cape edgecum

Fuse all the Dataframe in one Final Dataframe

In [5]:
list_silver_merged_df = []  # List to store merged DataFrames
list_failed_dfs = []        # List to store failed DataFrame pairs

number_marine_data = 0
number_meteo_data = 0
number_merged_data = 0

marine_data_conversion = 0
meteo_data_conversion = 0

# Loop through each station in the dictionary
for idx, (station_id, tables) in enumerate(buoys_datas.items()):
    print(f"\n🔄 Processing station {station_id} ({idx + 1}/{len(buoys_datas)})...")

    # Check if the station's buoy data is excluded from normal processing
    # Retrieve weather and marine DataFrames for the station
    df_meteo = buoys_datas[station_id].get("Meteo DataFrame", None)
    df_marine = buoys_datas[station_id].get("Marine DataFrame", None)
    
    # Ensure all necessary data is present
    if df_meteo is None or df_marine is None or buoys_datas[station_id] is None:
        print(f"⚠️ Missing data for station {station_id}. Skipping this station.")
        list_failed_dfs.append(station_id)
        continue

    print(f"📦 Data retrieved for station {station_id}:\n")
    number_marine_data += int(df_marine.shape[0])
    number_meteo_data += int(df_meteo.shape[0])

    # Add buoy coordinates (Lat/Lon) to marine data
    print("📍 Merging buoy coordinates (Lat/Lon) with marine data...")

                ### HANDLING DATA NULL VALUE ###

    ### HANDLING MARINE DATA NULL VALUE
    print(f"Handling Null Values in Marine data for Buoy {station_id}")
    try:
        df_marine = handle_null_values(df_marine)
    except Exception as e:
        print(f"Error: {e}")
    
    ### HANDLING METEO DATA NULL VALUE

    print(f"Handling Null Values in weather data for Buoy {station_id}")
    try:
        df_meteo = handle_null_values(df_meteo)
    except Exception as e:
        print(f"Error: {e}")


                    ### PROCESS AND RESAMPLE ###

    ### PROCESS AND RESAMPLE MARINE DATA
    print(f"🔁 Processing and resampling marine data for station {station_id}...")
    df_marine = process_and_resample(df_marine, column_name='time')
    print(f"✅ Marine data resampled for station {station_id}.")

    ### PROCESS AND RESAMPLE METEO DATA
    print(f"🔁 Processing and resampling weather data for station {station_id}...")
    df_meteo = process_and_resample(df_meteo, column_name='date')
    print(f"✅ Weather data resampled for station {station_id}.")


                    #### AUTOCONVERT PROCESS ###

    ### Trying Autoconvert on Marine Data
    print(f"🔁 Processing to Marine Data Conversion for station {station_id}...")
    try:
        df_marine = auto_convert(df_marine)
        print(f"✅ Marine Data Successfully Converted for Station {station_id}.")
        marine_data_conversion += 0

    except Exception as e:
        print(f"Conversion Failed :\nError: {e}")

    ### Trying Autoconvert on Meteo Data
    print(f"🔁 Processing to Meteo Data Conversion for station {station_id}...")
    try:
        df_meteo = auto_convert(df_meteo)
        print(f"✅ Meteo Data Successfully Converted for Station {station_id}.")
        meteo_data_conversion += 1

    except Exception as e:
        print(f"Conversion Failed :\nError: {e}")

###### ADDING COORDINATES TO MARINE DATA ##################################################

    df_marine["Lat"] = buoys_datas[station_id]["Lat"]
    df_marine["Lon"] = buoys_datas[station_id]["Lon"]
    print(f"🌐 Coordinates (Lat/Lon) added for station {station_id}: {df_marine['Lat'].iloc[0]} / {df_marine['Lon'].iloc[0]}.")

    ##### MERGING DATAFRAMES

    # Merge marine and weather DataFrames
    print(f"🔗 Merging marine and weather data for station {station_id}...")
    df_merged = pd.merge(df_marine, df_meteo, on='Datetime', how='inner')
    buoys_datas[str(station_id)]["Merged Dataframe"] = df_merged
    df_merged_row_count = int(df_merged.shape[0])
    number_merged_data += df_merged_row_count
    
    print(f"🔢 Buoy {station_id} Merged Dataframe row count: {df_merged_row_count}"
            
          f"💾 Merged data added to dictionary for station {station_id}.")


    list_silver_merged_df.append(df_merged)

# Final merge of all DataFrames
print("🔀 Merging all DataFrames into a final DataFrame...")
# Display the size of DataFrames before final merge

# Extract only the merged DataFrames from each station
dataframes_to_concat = [station_data["Merged Dataframe"] for station_data in buoys_datas.values()]

# Merge the DataFrames
df_final = pd.concat(dataframes_to_concat, ignore_index=True)

# Print some details about the final merge
print(f"📝 Final merged DataFrame size: {df_final.shape}")
print("✅ Final merge completed successfully!")

# Final summary
print("\n⭐🏆 Processing complete!")
print(f"🔢 Total stations processed: {len(buoys_datas)}")
print(f"Marine data rows collected = {number_marine_data}\nMeteo data rows collected = {number_meteo_data}")
print(f"Marine Data Successfully Converted:  {marine_data_conversion}")
print(f"Meteo Data Successfully Converted:  {meteo_data_conversion}")

if df_final is not None and not df_final.empty:
    print(f"Total Number of merged rows: {number_merged_data}")
    print(f"Final DataFrame rows number: {df_final.shape[0]}")

else:
    print("The DataFrame is either None or empty.")

print(f"❌ Number of failed stations: {len(list_failed_dfs)}")
if list_failed_dfs:
    print(f"⚠️ Failed stations: {', '.join(list_failed_dfs)}")


🔄 Processing station 41008 (1/39)...
📦 Data retrieved for station 41008:

📍 Merging buoy coordinates (Lat/Lon) with marine data...
Handling Null Values in Marine data for Buoy 41008
Dropped columns (100% missing): visibility, water_level_above_mean
Imputed columns (<50% missing, median): wind_direction, wind_speed, wind_gust, wave_height, dominant_wave_period, average_wave_period, dominant_wave_direction, pressure, air_temperature, water_temperature, dewpoint, 3hr_pressure_tendency
Handling Null Values in weather data for Buoy 41008
Skipped non-numeric columns: temperature_2m, relative_humidity_2m, dew_point_2m, precipitation, rain, showers, pressure_msl, surface_pressure, cloud_cover, cloud_cover_low, cloud_cover_mid, cloud_cover_high, visibility, wind_speed_10m, soil_temperature_0cm, soil_moisture_0_to_1cm
🔁 Processing and resampling marine data for station 41008...
✅ Marine data resampled for station 41008.
🔁 Processing and resampling weather data for station 41008...
✅ Weather dat

In [6]:
df_final.dtypes

id_x                         int64
wind_direction             float64
wind_speed                 float64
wind_gust                  float64
wave_height                float64
dominant_wave_period       float64
average_wave_period        float64
dominant_wave_direction    float64
pressure                   float64
air_temperature            float64
water_temperature          float64
dewpoint                   float64
3hr_pressure_tendency      float64
Datetime                     int64
Station ID                 float64
Lat                         object
Lon                         object
id_y                         int64
temperature_2m             float64
relative_humidity_2m       float64
dew_point_2m               float64
precipitation              float64
rain                       float64
showers                    float64
pressure_msl               float64
surface_pressure           float64
cloud_cover                float64
cloud_cover_low            float64
cloud_cover_mid     

Renaming Columns

In [7]:
df_final.columns

Index(['id_x', 'wind_direction', 'wind_speed', 'wind_gust', 'wave_height',
       'dominant_wave_period', 'average_wave_period',
       'dominant_wave_direction', 'pressure', 'air_temperature',
       'water_temperature', 'dewpoint', '3hr_pressure_tendency', 'Datetime',
       'Station ID', 'Lat', 'Lon', 'id_y', 'temperature_2m',
       'relative_humidity_2m', 'dew_point_2m', 'precipitation', 'rain',
       'showers', 'pressure_msl', 'surface_pressure', 'cloud_cover',
       'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'visibility',
       'wind_speed_10m', 'soil_temperature_0cm', 'soil_moisture_0_to_1cm',
       'is_day'],
      dtype='object')

In [8]:
col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_speed_10m' : 'Wind Speed (km/h)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}
df_final = rename_columns(df_final, col_to_rename)
df_final.columns

Index(['id_x', 'Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Wave Height (m)', 'dominant_wave_period', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint', '3hr_pressure_tendency', 'Datetime', 'Station ID', 'Lat',
       'Lon', 'id_y', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', 'rain', 'showers', ' Sea Level Pressure (hPa)',
       'surface_pressure', 'cloud_cover', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (%)',
       'Wind Speed (km/h)', 'soil_temperature_0cm', 'soil_moisture_0_to_1cm',
       'is_day'],
      dtype='object')

In [9]:
df_final.columns

Index(['id_x', 'Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Wave Height (m)', 'dominant_wave_period', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint', '3hr_pressure_tendency', 'Datetime', 'Station ID', 'Lat',
       'Lon', 'id_y', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', 'rain', 'showers', ' Sea Level Pressure (hPa)',
       'surface_pressure', 'cloud_cover', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (%)',
       'Wind Speed (km/h)', 'soil_temperature_0cm', 'soil_moisture_0_to_1cm',
       'is_day'],
      dtype='object')

In [10]:
df_final = drop_columns_if_exist(df_final,['soil_temperature_0cm', 'soil_moisture_0_to_1cm'])

Colonne 'soil_temperature_0cm' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée


In [11]:
show_null_counts(df_final)

id_x                                    0   / 41047
Wind Direction (°)                      0   / 41047
Wind Speed (km/h)                       0   / 41047
Wind Gusts (km/h)                       1096/ 41047
Wave Height (m)                         9855/ 41047
dominant_wave_period                    9855/ 41047
Average Wave Period (s)                 9855/ 41047
Dominant Wave Direction (°)             9855/ 41047
Pressure (hPA)                          0   / 41047
Air T°                                  5485/ 41047
Water T°                                8764/ 41047
dewpoint                                9867/ 41047
3hr_pressure_tendency                   0   / 41047
Datetime                                0   / 41047
Station ID                              8764/ 41047
Lat                                     0   / 41047
Lon                                     0   / 41047
id_y                                    0   / 41047
T°(C°)                                  0   / 41047
Relative Hum

In [12]:
df_cleaned = df_final.dropna().round(2)
show_null_counts(df_cleaned)

id_x                                    0   / 25707
Wind Direction (°)                      0   / 25707
Wind Speed (km/h)                       0   / 25707
Wind Gusts (km/h)                       0   / 25707
Wave Height (m)                         0   / 25707
dominant_wave_period                    0   / 25707
Average Wave Period (s)                 0   / 25707
Dominant Wave Direction (°)             0   / 25707
Pressure (hPA)                          0   / 25707
Air T°                                  0   / 25707
Water T°                                0   / 25707
dewpoint                                0   / 25707
3hr_pressure_tendency                   0   / 25707
Datetime                                0   / 25707
Station ID                              0   / 25707
Lat                                     0   / 25707
Lon                                     0   / 25707
id_y                                    0   / 25707
T°(C°)                                  0   / 25707
Relative Hum

In [13]:
df_cleaned['Datetime'] = pd.to_datetime(df_cleaned['Datetime'], errors='coerce').dt.floor('H')
show_first_row(df_cleaned)

id_x                          3
Wind Direction (°)            240.0
Wind Speed (km/h)             6.0
Wind Gusts (km/h)             7.0
Wave Height (m)               0.8
dominant_wave_period          8.0
Average Wave Period (s)       4.8
Dominant Wave Direction (°)   104.0
Pressure (hPA)                1020.3
Air T°                        14.2
Water T°                      15.3
dewpoint                      10.9
3hr_pressure_tendency         0.6
Datetime                      2025-03-22 11:00:00
Station ID                    41008.0
Lat                           31.40N
Lon                           80.87W
id_y                          2220
T°(C°)                        5.11
Relative Humidity (%)         56.0
Dew Point (°C)                -2.96
Precipitation (mm)            0.0
rain                          0.0
showers                       0.0
 Sea Level Pressure (hPa)     1003.7
surface_pressure              979.29
cloud_cover                   0.0
Low Clouds (%)                0.0
Mid

In [14]:
df_cleaned.dtypes

id_x                                    int64
Wind Direction (°)                    float64
Wind Speed (km/h)                     float64
Wind Gusts (km/h)                     float64
Wave Height (m)                       float64
dominant_wave_period                  float64
Average Wave Period (s)               float64
Dominant Wave Direction (°)           float64
Pressure (hPA)                        float64
Air T°                                float64
Water T°                              float64
dewpoint                              float64
3hr_pressure_tendency                 float64
Datetime                       datetime64[ns]
Station ID                            float64
Lat                                    object
Lon                                    object
id_y                                    int64
T°(C°)                                float64
Relative Humidity (%)                 float64
Dew Point (°C)                        float64
Precipitation (mm)                

In [15]:
show_null_counts(df_cleaned)

id_x                                    0   / 25707
Wind Direction (°)                      0   / 25707
Wind Speed (km/h)                       0   / 25707
Wind Gusts (km/h)                       0   / 25707
Wave Height (m)                         0   / 25707
dominant_wave_period                    0   / 25707
Average Wave Period (s)                 0   / 25707
Dominant Wave Direction (°)             0   / 25707
Pressure (hPA)                          0   / 25707
Air T°                                  0   / 25707
Water T°                                0   / 25707
dewpoint                                0   / 25707
3hr_pressure_tendency                   0   / 25707
Datetime                                0   / 25707
Station ID                              0   / 25707
Lat                                     0   / 25707
Lon                                     0   / 25707
id_y                                    0   / 25707
T°(C°)                                  0   / 25707
Relative Hum

In [16]:
print(f"Colonnes df_marine:\n{df_marine.columns}\n\nColonnes df_meteo:\n{df_meteo.columns}\n")

for col in df_marine.columns:
    print(f"{col}:  {type(col)}")


Colonnes df_marine:
Index(['id', 'wind_direction', 'wind_speed', 'wind_gust', 'pressure',
       'air_temperature', '3hr_pressure_tendency', 'Datetime', 'Station ID',
       'Lat', 'Lon'],
      dtype='object')

Colonnes df_meteo:
Index(['id', 'Datetime', 'temperature_2m', 'relative_humidity_2m',
       'dew_point_2m', 'precipitation', 'rain', 'showers', 'pressure_msl',
       'surface_pressure', 'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid',
       'cloud_cover_high', 'visibility', 'wind_speed_10m',
       'soil_temperature_0cm', 'soil_moisture_0_to_1cm', 'is_day'],
      dtype='object')

id:  <class 'str'>
wind_direction:  <class 'str'>
wind_speed:  <class 'str'>
wind_gust:  <class 'str'>
pressure:  <class 'str'>
air_temperature:  <class 'str'>
3hr_pressure_tendency:  <class 'str'>
Datetime:  <class 'str'>
Station ID:  <class 'str'>
Lat:  <class 'str'>
Lon:  <class 'str'>


In [17]:
print(f"{df_final.shape[0]} rows, {df_final.shape[1]} columns\n")
for col in df_final.columns:
    print(f"Col '{col}' : {df_final[col].isna().sum()}")
print(f"\n{df_final.dtypes}")

41047 rows, 33 columns

Col 'id_x' : 0
Col 'Wind Direction (°)' : 0
Col 'Wind Speed (km/h)' : Wind Speed (km/h)    0
Wind Speed (km/h)    0
dtype: int64
Col 'Wind Gusts (km/h)' : 1096
Col 'Wave Height (m)' : 9855
Col 'dominant_wave_period' : 9855
Col 'Average Wave Period (s)' : 9855
Col 'Dominant Wave Direction (°)' : 9855
Col 'Pressure (hPA)' : 0
Col 'Air T°' : 5485
Col 'Water T°' : 8764
Col 'dewpoint' : 9867
Col '3hr_pressure_tendency' : 0
Col 'Datetime' : 0
Col 'Station ID' : 8764
Col 'Lat' : 0
Col 'Lon' : 0
Col 'id_y' : 0
Col 'T°(C°)' : 0
Col 'Relative Humidity (%)' : 0
Col 'Dew Point (°C)' : 0
Col 'Precipitation (mm)' : 0
Col 'rain' : 0
Col 'showers' : 0
Col ' Sea Level Pressure (hPa)' : 0
Col 'surface_pressure' : 0
Col 'cloud_cover' : 0
Col 'Low Clouds (%)' : 0
Col 'Middle Clouds (%)' : 0
Col 'High Clouds (%)' : 0
Col ' Visibility (%)' : 0
Col 'Wind Speed (km/h)' : Wind Speed (km/h)    0
Wind Speed (km/h)    0
dtype: int64
Col 'is_day' : 0

id_x                             int64


In [18]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprimées
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non numérique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprimées dans les listes appropriées
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes entièrement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non numériques
    
#     # Supprimer les colonnes identifiées
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les résultats
#     print("Colonnes supprimées pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprimées pour avoir plus de 50% de valeurs nulles et être non numériques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [19]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [20]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire récursivement pour obtenir toutes les clés, y compris les sous-clés,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire à explorer
#     :param parent_key: La clé parent qui est utilisée pour concaténer les sous-clés
#     :param sep: Le séparateur utilisé pour concaténer les clés (par défaut '_')
#     :return: Une liste des clés (et sous-clés)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore récursivement
#             keys.append(new_key)  # Ajouter la clé, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la clé finale
#     return keys

In [21]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche récursive d'une clé dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: clé recherchée
#     :param path: liste pour stocker le chemin jusqu'à la clé
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqué
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la clé est trouvée, retourner le chemin
#                     return result
#     return None  # Retourne None si la clé n'a pas été trouvée



# # Recherche du chemin pour la clé 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [22]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (clé, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # Vérifier si l'index est égal à 1

Counting Rows of all Dataframes in total