In [33]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [34]:
from imports import *
from functions import *

Connection to PostgreSQL

In [35]:
path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"

with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

db = "Oceanography_ML_Project"
schema_bronze = "Bronze"
schema_silver = "Silver"

# Créer l'engine PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
conn = engine.connect()

Charger les Données des Tables de la couche de Bronze

In [36]:
# Charger les métadonnées du schéma existant
metadata = MetaData(schema=schema_bronze)

print("\n🔍 Chargement des métadonnées du schéma...")
metadata.reflect(bind=conn)
print("✅ Métadonnées chargées avec succès.\n")

# Récupérer les noms des tables
table_names = [t.name for t in metadata.sorted_tables]
print(f"🔢 Nombre total de tables dans le schéma : {len(table_names)}\n")

# Filtrer les tables en fonction du contenu de leur nom
marine_tables = {t for t in table_names if "marine" in t.lower()}
meteo_tables = {t for t in table_names if "meteo" in t.lower()}
buoys_data_table = {t for t in table_names if "buoy" in t.lower()}

print(f"🌊 Tables marines trouvées : {len(marine_tables)}")
print(f"🌧️ Tables météo trouvées : {len(meteo_tables)}")
print(f"🐋 Tables de bouées trouvées : {len(buoys_data_table)}\n")

# Initialiser le dictionnaire des résultats
buoys_datas = {}

# Compteurs pour suivre le nombre de tables chargées avec succès
marine_data_count = 0
meteo_data_count = 0
buoys_data_count = 0

# Compteur pour le nombre total de lignes
total_marine_rows = 0
total_meteo_rows = 0
total_buoys_rows = 0  # Changer ici pour compter le nombre de lignes (bouées)

# Vérifier et récupérer les données de la table "buoys_datas"
if buoys_data_table:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    print("🔄 Chargement des données de la table 'buoys_datas'...")

    try:
        buoys_datas_raw = fetch_table_data(schema=schema_bronze, conn=conn, table_name=next(iter(buoys_data_table)), as_df=True)

        if buoys_datas_raw is not None:
            print("📦 Données récupérées pour 'buoys_datas'.")

            # Conversion JSON → dict si nécessaire
            if isinstance(buoys_datas_raw, str):
                buoys_datas_raw = json.loads(buoys_datas_raw)

            elif isinstance(buoys_datas_raw, pd.DataFrame) and "Station ID" in buoys_datas_raw.columns:
                # Convertir en dictionnaire avec "Station ID" comme clé
                buoys_datas_raw = buoys_datas_raw.set_index("Station ID").to_dict(orient="index")

            # Ajouter au dictionnaire principal directement avec les Station ID comme clés
            buoys_datas.update(buoys_datas_raw)
            buoys_data_count += 1
            total_buoys_rows += len(buoys_datas_raw)  # Compter le nombre de bouées
            print(f"✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : {total_buoys_rows}\n")
        else:
            print("⚠️ Aucun résultat trouvé dans 'buoys_datas'.\n")

    except Exception as e:
        print(f"❌ Erreur lors du chargement de 'buoys_datas': {e}\n")

# Associer les tables marine et meteo en fonction du station_id et récupérer leurs données
for table_set, label, icon, counter, total_rows in [
    (marine_tables, "Marine", "🌊", marine_data_count, total_marine_rows),
    (meteo_tables, "Meteo", "🌧️", meteo_data_count, total_meteo_rows)
]:
    print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
    for table_name in table_set:
        print(f"🔄 Chargement des données pour la table {label} : {table_name}...")

        try:
            station_id = table_name.split("_")[1]

            # Vérifier si la station existe déjà dans buoys_datas, sinon initialiser un dictionnaire
            if station_id not in buoys_datas:
                buoys_datas[station_id] = {}

            # Récupérer les données
            data = fetch_table_data(schema=schema_bronze, conn=conn, table_name=table_name, as_df=True)

            if data is not None:
                print(f"📦 Données récupérées pour la station {station_id} ({label}).")

                if isinstance(data, str):
                    data = pd.DataFrame(json.loads(data))
                elif isinstance(data, dict):
                    data = pd.DataFrame(data)

                # Ajouter les données au dictionnaire de bouées sous la station_id
                buoys_datas[station_id][f"{label} DataFrame"] = data
                counter += 1
                total_rows += len(data)  # Ajouter le nombre de lignes collectées
                print(f"{icon} Données {label} chargées pour la station {station_id}! Nombre de lignes collectées : {len(data)}\n")
            else:
                print(f"⚠️ Aucun résultat trouvé pour la station {station_id} ({label}).\n")

        except Exception as e:
            print(f"❌ Erreur lors du chargement des données {label} pour {table_name} : {e}\n")

    # Mise à jour des compteurs après le chargement des données pour chaque catégorie
    if label == "Marine":
        marine_data_count = counter
        total_marine_rows = total_rows
    elif label == "Meteo":
        meteo_data_count = counter
        total_meteo_rows = total_rows

# Finalement, afficher un récapitulatif global
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
print(f"🏆 Chargement des données terminé avec succès !")
print(f"🐋 Total des données bouées chargées : {buoys_data_count} - Nombre de bouées (lignes) : {total_buoys_rows}")
print(f"🌊 Total des données marines chargées : {marine_data_count} - Nombre total de lignes : {total_marine_rows}")
print(f"🌧️ Total des données météorologiques chargées : {meteo_data_count} - Nombre total de lignes : {total_meteo_rows}")
print("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")


🔍 Chargement des métadonnées du schéma...


✅ Métadonnées chargées avec succès.

🔢 Nombre total de tables dans le schéma : 79

🌊 Tables marines trouvées : 39
🌧️ Tables météo trouvées : 39
🐋 Tables de bouées trouvées : 1

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données de la table 'buoys_datas'...
📦 Données récupérées pour 'buoys_datas'.
✅ Table 'buoys_datas' chargée avec succès! Nombre de bouées (lignes) : 39

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
🔄 Chargement des données pour la table Marine : station_42058_marine_central caribbean...
📦 Données récupérées pour la station 42058 (Marine).
🌊 Données Marine chargées pour la station 42058! Nombre de lignes collectées : 6942

🔄 Chargement des données pour la table Marine : station_POTA2_marine_potato point, ak...
📦 Données récupérées pour la station POTA2 (Marine).
🌊 Données Marine chargées pour la station POTA2! Nombre de lignes collectées : 2334

🔄 Chargement des données pour la table Marine : station_LONF1_marine_long key, fl...
📦 Données récupérées pour la station 

In [37]:
list_silver_merged_df = []  
list_failed_dfs = []        

number_marine_data = 0
number_meteo_data = 0
number_merged_data = 0

marine_data_conversion = 0
meteo_data_conversion = 0

HANDLING NULL VALUES

In [38]:
# Traitement des valeurs nulles
for station_id, tables in buoys_datas.items():
    try:
        print(f"Handling Null Values in Marine data for Buoy {station_id}")
        tables["Marine DataFrame"] = handle_null_values(tables["Marine DataFrame"])
    except Exception as e:
        print(f"Error handling Marine Null Values: {e}")

    try:
        print(f"Handling Null Values in Weather data for Buoy {station_id}")
        tables["Meteo DataFrame"] = handle_null_values(tables["Meteo DataFrame"])
    except Exception as e:
        print(f"Error handling Meteo Null Values: {e}")

Handling Null Values in Marine data for Buoy 41008
Dropped columns (100% missing): visibility, water_level_above_mean
Imputed columns (<50% missing, median): wind_direction, wind_speed, wind_gust, wave_height, dominant_wave_period, average_wave_period, dominant_wave_direction, pressure, air_temperature, water_temperature, dewpoint, 3hr_pressure_tendency
Handling Null Values in Weather data for Buoy 41008
Skipped non-numeric columns: temperature_2m, relative_humidity_2m, dew_point_2m, precipitation, rain, showers, pressure_msl, surface_pressure, cloud_cover, cloud_cover_low, cloud_cover_mid, cloud_cover_high, visibility, wind_speed_10m, soil_temperature_0cm, soil_moisture_0_to_1cm
Handling Null Values in Marine data for Buoy 41044
Dropped columns (100% missing): visibility, water_level_above_mean
Imputed columns (<50% missing, median): wind_direction, wind_speed, wind_gust, wave_height, dominant_wave_period, average_wave_period, dominant_wave_direction, pressure, air_temperature, water_

HOUR RESAMPLING

In [39]:
# Resampling des données et stockage dans un nouveau compartiment du dictionnaire
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔁 Processing and resampling marine data for station {station_id}...")
        tables["Resampled Marine DataFrame"] = process_datetime_column(tables["Marine DataFrame"], column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    try:
        print(f"🔁 Processing and resampling weather data for station {station_id}...")
        tables["Resampled Meteo DataFrame"] = process_datetime_column(tables["Meteo DataFrame"], column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")


🔁 Processing and resampling marine data for station 41008...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
🔁 Processing and resampling weather data for station 41008...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
🔁 Processing and resampling marine data for station 41044...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
🔁 Processing and resampling weather data for station 41044...
📌 La colonne 'date' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'date' en datetime.
🔁 Processing and resampling marine data for station 42001...
📌 La colonne 'time' est maintenant convertie en chaîne de caractères.
📌 Conversion réussie de 'time' en datetime.
🔁 Processing and resampling weather data for station 42001...
📌 La colonne 'date' est maintenant convertie en chaîne de car

AUTOCONVERT

In [40]:
# Conversion automatique des données
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔁 Processing Marine Data Conversion for station {station_id}...")
        tables["Autoconverted Marine DataFrame"] = auto_convert(tables["Resampled Marine DataFrame"])
        marine_data_conversion += 1
    except Exception as e:
        print(f"Marine Data Conversion Failed for {station_id}: {e}")

    try:
        print(f"🔁 Processing Meteo Data Conversion for station {station_id}...")
        tables["Autoconverted Meteo DataFrame"] = auto_convert(tables["Resampled Meteo DataFrame"])
        meteo_data_conversion += 1
    except Exception as e:
        print(f"Meteo Data Conversion Failed for {station_id}: {e}")

🔁 Processing Marine Data Conversion for station 41008...
🔁 Processing Meteo Data Conversion for station 41008...
🔁 Processing Marine Data Conversion for station 41044...
🔁 Processing Meteo Data Conversion for station 41044...
🔁 Processing Marine Data Conversion for station 42001...
🔁 Processing Meteo Data Conversion for station 42001...
🔁 Processing Marine Data Conversion for station 42002...
🔁 Processing Meteo Data Conversion for station 42002...
🔁 Processing Marine Data Conversion for station 42012...
🔁 Processing Meteo Data Conversion for station 42012...
🔁 Processing Marine Data Conversion for station 42036...
🔁 Processing Meteo Data Conversion for station 42036...
🔁 Processing Marine Data Conversion for station 42056...
🔁 Processing Meteo Data Conversion for station 42056...
🔁 Processing Marine Data Conversion for station 42058...
🔁 Processing Meteo Data Conversion for station 42058...
🔁 Processing Marine Data Conversion for station 44020...
🔁 Processing Meteo Data Conversion for 

In [41]:
buoys_datas["42058"]["Autoconverted Meteo DataFrame"].dtypes

id                                      int64
Datetime                  datetime64[ns, UTC]
temperature_2m                        float64
relative_humidity_2m                  float64
dew_point_2m                          float64
precipitation                         float64
rain                                  float64
showers                               float64
pressure_msl                          float64
surface_pressure                      float64
cloud_cover                           float64
cloud_cover_low                       float64
cloud_cover_mid                       float64
cloud_cover_high                      float64
visibility                            float64
wind_speed_10m                        float64
soil_temperature_0cm                  float64
soil_moisture_0_to_1cm                float64
is_day                                float64
dtype: object

ADDING COORDINATES

In [42]:
# Ajout des coordonnées
for station_id, tables in buoys_datas.items():
    try:
        tables["Autoconverted Marine DataFrame"]["Lat"] = tables["Lat"]
        tables["Autoconverted Marine DataFrame"]["Lon"] = tables["Lon"]
        print(f"🌐 Coordinates (Lat/Lon) added for station {station_id}.")
    except Exception as e:
        print(f"Error adding coordinates for {station_id}: {e}")

🌐 Coordinates (Lat/Lon) added for station 41008.
🌐 Coordinates (Lat/Lon) added for station 41044.
🌐 Coordinates (Lat/Lon) added for station 42001.
🌐 Coordinates (Lat/Lon) added for station 42002.
🌐 Coordinates (Lat/Lon) added for station 42012.
🌐 Coordinates (Lat/Lon) added for station 42036.
🌐 Coordinates (Lat/Lon) added for station 42056.
🌐 Coordinates (Lat/Lon) added for station 42058.
🌐 Coordinates (Lat/Lon) added for station 44020.
🌐 Coordinates (Lat/Lon) added for station 44025.
🌐 Coordinates (Lat/Lon) added for station 44027.
🌐 Coordinates (Lat/Lon) added for station 44065.
🌐 Coordinates (Lat/Lon) added for station 46001.
🌐 Coordinates (Lat/Lon) added for station 46006.
🌐 Coordinates (Lat/Lon) added for station 46014.
🌐 Coordinates (Lat/Lon) added for station 46022.
🌐 Coordinates (Lat/Lon) added for station 46025.
🌐 Coordinates (Lat/Lon) added for station 46027.
🌐 Coordinates (Lat/Lon) added for station 46029.
🌐 Coordinates (Lat/Lon) added for station 46053.
🌐 Coordinates (Lat/L

PAIRS DATAFRAMES FUSION

In [43]:
# Fusion des DataFrames
for station_id, tables in buoys_datas.items():
    try:
        print(f"🔗 Merging marine and weather data for station {station_id}...")
        df_merged = pd.merge(
            tables["Autoconverted Marine DataFrame"], tables["Autoconverted Meteo DataFrame"], on='Datetime', how='inner'
        )
        tables["Merged Dataframe"] = df_merged
        number_merged_data += df_merged.shape[0]
        list_silver_merged_df.append(df_merged)
    except Exception as e:
        print(f"Error merging data for station {station_id}: {e}")

🔗 Merging marine and weather data for station 41008...
🔗 Merging marine and weather data for station 41044...
🔗 Merging marine and weather data for station 42001...
🔗 Merging marine and weather data for station 42002...
🔗 Merging marine and weather data for station 42012...
🔗 Merging marine and weather data for station 42036...
🔗 Merging marine and weather data for station 42056...
🔗 Merging marine and weather data for station 42058...
🔗 Merging marine and weather data for station 44020...
🔗 Merging marine and weather data for station 44025...
🔗 Merging marine and weather data for station 44027...
🔗 Merging marine and weather data for station 44065...
🔗 Merging marine and weather data for station 46001...
🔗 Merging marine and weather data for station 46006...
🔗 Merging marine and weather data for station 46014...
🔗 Merging marine and weather data for station 46022...
🔗 Merging marine and weather data for station 46025...
🔗 Merging marine and weather data for station 46027...
🔗 Merging 

In [44]:
show_first_row(buoys_datas["41008"]["Merged Dataframe"])

id_x                           1  (int64)
wind_direction                 240.0  (float64)
wind_speed                     5.0  (float64)
wind_gust                      6.0  (float64)
wave_height                    0.7  (float64)
dominant_wave_period           4.0  (float64)
average_wave_period            3.7  (float64)
dominant_wave_direction        218.0  (float64)
pressure                       1020.3  (float64)
air_temperature                14.1  (float64)
water_temperature              15.3  (float64)
dewpoint                       10.4  (float64)
3hr_pressure_tendency          0.0  (float64)
Datetime                       2025-03-22 11:00:00+00:00  (datetime64[ns, UTC])
Station ID                     41008  (int64)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220  (int64)
temperature_2m                 5.110000133514404  (float64)
relative_humidity_2m           56.0  (float64)
dew_point_2m          

In [45]:
show_null_counts(buoys_datas["41008"]["Merged Dataframe"])

id_x                                    0   / 6973
wind_direction                          0   / 6973
wind_speed                              0   / 6973
wind_gust                               0   / 6973
wave_height                             0   / 6973
dominant_wave_period                    0   / 6973
average_wave_period                     0   / 6973
dominant_wave_direction                 0   / 6973
pressure                                0   / 6973
air_temperature                         0   / 6973
water_temperature                       0   / 6973
dewpoint                                0   / 6973
3hr_pressure_tendency                   0   / 6973
Datetime                                0   / 6973
Station ID                              0   / 6973
Lat                                     0   / 6973
Lon                                     0   / 6973
id_y                                    0   / 6973
temperature_2m                          0   / 6973
relative_humidity_2m           

Concatenating all DataFrames into a Final one

In [46]:
# Fusion finale de tous les DataFrames
try:
    print("🔀 Merging all DataFrames into a final DataFrame...")
    dataframes_to_concat = [tables["Merged Dataframe"] for tables in buoys_datas.values()]
    df_final = pd.concat(dataframes_to_concat, ignore_index=True)
    print(f"📝 Final merged DataFrame size: {df_final.shape}")
except Exception as e:
    print(f"Error during final merge: {e}")
    df_final = None

# Résumé final
print("\n⭐🏆 Processing complete!")
print(f"🔢 Total stations processed: {len(buoys_datas)}")
print(f"Marine data rows collected = {number_marine_data}\nMeteo data rows collected = {number_meteo_data}")
print(f"Marine Data Successfully Converted: {marine_data_conversion}")
print(f"Meteo Data Successfully Converted: {meteo_data_conversion}")

if df_final is not None and not df_final.empty:
    print(f"Total Number of merged rows: {number_merged_data}")
    print(f"Final DataFrame rows number: {df_final.shape[0]}")
else:
    print("The DataFrame is either None or empty.")

print(f"❌ Number of failed stations: {len(list_failed_dfs)}")
if list_failed_dfs:
    print(f"⚠️ Failed stations: {', '.join(map(str, list_failed_dfs))}")

🔀 Merging all DataFrames into a final DataFrame...
📝 Final merged DataFrame size: (228592, 35)

⭐🏆 Processing complete!
🔢 Total stations processed: 39
Marine data rows collected = 0
Meteo data rows collected = 0
Marine Data Successfully Converted: 39
Meteo Data Successfully Converted: 39
Total Number of merged rows: 228592
Final DataFrame rows number: 228592
❌ Number of failed stations: 0


In [48]:
show_first_row(df_final)

id_x                           1  (int64)
wind_direction                 240.0  (float64)
wind_speed                     5.0  (float64)
wind_gust                      6.0  (float64)
wave_height                    0.7  (float64)
dominant_wave_period           4.0  (float64)
average_wave_period            3.7  (float64)
dominant_wave_direction        218.0  (float64)
pressure                       1020.3  (float64)
air_temperature                14.1  (float64)
water_temperature              15.3  (float64)
dewpoint                       10.4  (float64)
3hr_pressure_tendency          0.0  (float64)
Datetime                       2025-03-22 11:00:00+00:00  (datetime64[ns, UTC])
Station ID                     41008.0  (float64)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220  (int64)
temperature_2m                 5.110000133514404  (float64)
relative_humidity_2m           56.0  (float64)
dew_point_2m      

In [62]:
df_cleaned = df_final.dropna().round(2)
show_null_counts(df_cleaned)

id_x                                    0   / 170723
Wind Direction (°)                      0   / 170723
Wind Speed (km/h)                       0   / 170723
Wind Gusts (km/h)                       0   / 170723
Wave Height (m)                         0   / 170723
dominant_wave_period                    0   / 170723
Average Wave Period (s)                 0   / 170723
Dominant Wave Direction (°)             0   / 170723
Pressure (hPA)                          0   / 170723
Air T°                                  0   / 170723
Water T°                                0   / 170723
dewpoint                                0   / 170723
3hr_pressure_tendency                   0   / 170723
Datetime                                0   / 170723
Station ID                              0   / 170723
Lat                                     0   / 170723
Lon                                     0   / 170723
id_y                                    0   / 170723
T°(C°)                                  0   / 

Renaming Columns

In [None]:
col_to_rename={'temperature_2m': 'T°(C°)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_direction': 'Wind Direction (°)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (°)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T°','water_temperature': 'Water T°'}

df_cleaned = rename_columns(df_cleaned, col_to_rename)
df_cleaned = drop_columns_if_exist(df_cleaned,['soil_temperature_0cm','rain', 'showers', 'is_day', 'id_x', 'id_y','soil_moisture_0_to_1cm'])
df_cleaned.columns

⚠️ Aucune colonne à renommer pour ce spécification : {'temperature_2m': 'T°(C°)', 'relative_humidity_2m': 'Relative Humidity (%)', 'dew_point_2m': 'Dew Point (°C)', 'precipitation': 'Precipitation (mm)', 'pressure_msl': ' Sea Level Pressure (hPa)', 'cloud_cover_low': 'Low Clouds (%)', 'cloud_cover_mid': 'Middle Clouds (%)', 'cloud_cover_high': 'High Clouds (%)', 'visibility': ' Visibility (%)', 'wind_direction': 'Wind Direction (°)', 'wind_speed': 'Wind Speed (km/h)', 'wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)', 'average_wave_period': 'Average Wave Period (s)', 'dominant_wave_direction': 'Dominant Wave Direction (°)', 'pressure': 'Pressure (hPA)', 'air_temperature': 'Air T°', 'water_temperature': 'Water T°'}
Colonne 'soil_temperature_0cm' Non Trouvée
Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'id_x' Non Trouvée
Colonne 'id_y' Supprimée
Colonne 'soil_moisture_0_to_1cm' Non Trouvée


Index(['Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Wave Height (m)', 'dominant_wave_period', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint', '3hr_pressure_tendency', 'Datetime', 'Station ID', 'Lat',
       'Lon', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', ' Sea Level Pressure (hPa)', 'surface_pressure',
       'cloud_cover', 'Low Clouds (%)', 'Middle Clouds (%)', 'High Clouds (%)',
       ' Visibility (%)', 'wind_speed_10m', 'is_day', 'Daytime', 'Month'],
      dtype='object')

In [64]:
df_cleaned[['Daytime', 'Month']] = df_cleaned['Datetime'].apply(lambda x: get_day_time(x)).apply(pd.Series)
df_cleaned["Station ID"] = df_cleaned["Station ID"].astype(int)
show_first_row(df_cleaned)

id_x                           1  (int64)
Wind Direction (°)             240.0  (float64)
Wind Speed (km/h)              5.0  (float64)
Wind Gusts (km/h)              6.0  (float64)
Wave Height (m)                0.7  (float64)
dominant_wave_period           4.0  (float64)
Average Wave Period (s)        3.7  (float64)
Dominant Wave Direction (°)    218.0  (float64)
Pressure (hPA)                 1020.3  (float64)
Air T°                         14.1  (float64)
Water T°                       15.3  (float64)
dewpoint                       10.4  (float64)
3hr_pressure_tendency          0.0  (float64)
Datetime                       2025-03-22 11:00:00+00:00  (datetime64[ns, UTC])
Station ID                     41008  (int64)
Lat                            31.40N  (object)
Lon                            80.87W  (object)
id_y                           2220  (int64)
T°(C°)                         5.11  (float64)
Relative Humidity (%)          56.0  (float64)
Dew Point (°C)                 -2.9

In [72]:
df_cleaned_comparing = df_cleaned[['Dew Point (°C)', 'dewpoint', 'Wind Speed (km/h)', 'wind_speed_10m','Air T°','T°(C°)']]
df_cleaned_comparing.head()

Unnamed: 0,Dew Point (°C),dewpoint,Wind Speed (km/h),wind_speed_10m,Air T°,T°(C°)
0,-2.96,10.4,5.0,22.07,14.1,5.11
1,-2.96,10.5,6.0,22.07,14.1,5.11
2,-2.96,10.9,6.0,22.07,14.2,5.11
3,-3.35,10.9,6.0,25.81,14.3,5.76
4,-3.35,11.1,6.0,25.81,14.3,5.76


Third API Test

In [None]:
print(f'{lat},{lon}')

In [76]:
vc_api_key_path = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\visual_crossing_weather_api.json"

with open(vc_api_key_path, 'r') as file:
    content = json.load(file)
    vc_api_key = content["api_key"]

df_42058 = df_cleaned[df_cleaned["Station ID"]==42058]
Lat = df_42058["Lat"].iloc[0]  # Récupérer la première valeur de la colonne "Lat"
Lon = df_42058["Lon"].iloc[0]  # Récupérer la première valeur de la colonne "Lon"

lat, lon = convert_coordinates(Lat, Lon)

# Définition des dates dynamiques
today = datetime.now().strftime("%Y-%m-%d")  # Hier pour éviter les données incomplètes d'aujourd'hui
start_date = (datetime.now() - timedelta(days=31)).strftime("%Y-%m-%d")  # 31 jours avant aujourd'hui
last_call_time = None
vc_meteo_data = None

# Construction de l'URL
url_last_month = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{lat},{lon}/{start_date}/{today}?unitGroup=metric&key={vc_api_key}&contentType=json"

# Vérifier si 2 heures se sont écoulées depuis le dernier appel
if last_call_time and (time.time() - last_call_time) < 20 * 60  * 60:
# = 2 * MIN * SEC
    print("Too soon to make another request. Returning previous response.")

    response = {'status_code': 200, 'data': vc_meteo_data}  # Retourner le dernier résultat
else:
    # Effectuer un nouvel appel si les 2 heures sont écoulées
    response = requests.get(url_last_month)
    vc_meteo_data = response.json()
    last_call_time = time.time()  # Mettre à jour l'heure du dernier appel

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [77]:
# Récupérer les données de l'API
vc_meteo_data = response.json()
print(vc_meteo_data)  # Vérifiez les données récupérées

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Normaliser les données JSON en DataFrame
df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# Afficher la première ligne des données
df_vc_meteo.head(1)

Unnamed: 0,datetime,datetimeEpoch,temp,feelslike,humidity,dew,precip,precipprob,snow,snowdepth,...,cloudcover,solarradiation,solarenergy,uvindex,severerisk,conditions,icon,stations,source,days
0,00:00:00,1742619600,26.7,26.7,86.71,24.3,0.0,0.0,0.0,,...,100.0,0.0,0.0,0.0,30.0,Overcast,cloudy,[remote],obs,"[{'datetime': '2025-03-22', 'datetimeEpoch': 1..."


In [None]:
# Conversion du timestamp en datetime
df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

In [None]:
# Définir les dates de filtrage pour les 30 derniers jours
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

# Convertir les dates en format YYYY-MM-DD
today_str = today.strftime("%Y-%m-%d")
thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

In [None]:
# Filtrer les données des 30 derniers jours de df_vc_meteo
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
                                        (df_test_last_month['Date'] <= today_str)]

# Ajouter les colonnes Date et Hour à df_42058 (si ce n'est pas déjà fait)
df_42058['Date'] = df_42058['Datetime'].dt.strftime("%Y-%m-%d")
df_42058['Hour'] = df_42058['Datetime'].dt.strftime("%H")

# Filtrer les données des 30 derniers jours dans df_42058
df_42058_last_month = df_42058[(df_42058['Date'] >= thirty_days_ago_str) & 
                                (df_42058['Date'] <= today_str)]

# Fusionner les deux DataFrames sur Date et Hour
df_merged = df_test_last_month.merge(df_42058_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
                                     on=['Date', 'Hour'], 
                                     how='inner')


NameError: name 'df_vc_meteo' is not defined

In [None]:
csv_path =r"C:\Users\f.gionnane\Documents\Data Engineering\NoSQL_DB_comparisons.csv"

nosql_comparison = pd.read_csv(csv_path, encoding='utf-8')
nosql_comparison

Unnamed: 0,Feature,MongoDB,Cassandra,Firebase Firestore,Redis,Couchbase,Amazon DynamoDB
0,Data Model,Document-based (BSON),Wide-column store (keyspace/column-family),Document-based (JSON),Key-value store (in-memory),Document & key-value store (JSON),Key-value & document store (JSON)
1,Scalability,"Horizontal scaling, sharding","Excellent horizontal scalability, designed for...",Horizontal scaling (via Google Cloud),Horizontal scalability (via Redis Cluster),Horizontal scaling via clusters,"Fully managed horizontal scaling, ideal for hi..."
2,Performance,"Good for read-heavy workloads, slower writes",Excellent for write-heavy workloads,"Excellent for mobile apps, scalable",Extremely fast (in-memory),"High-performance, low-latency","Low-latency, high-throughput (ideal for high-t..."
3,Real-time Support,Limited real-time support (with change streams),Not built for real-time data,"Built-in real-time synchronization (e.g., Fire...",High real-time support (pub/sub via channels),Good with N1QL queries and sync features,Good real-time support via DynamoDB Streams
4,Consistency,Tunable consistency (eventual or strong),Eventual consistency,Strong consistency,Eventual consistency (with persistence),Tunable consistency (via N1QL queries),Strong consistency (default)
5,Ease of Use,"Easy to use, rich documentation","Steeper learning curve, complex setup","Very easy to use, ideal for mobile games","Easy to use, focused on caching and speed","Moderate learning curve, powerful queries","Easy to use, fully managed solution"
6,Integration with Unity,Available SDKs and plugins (third-party),"No official SDK, requires custom setup",Direct SDK integration with Unity (Firebase Un...,Unity integration via third-party packages,"SDKs available for Unity, but might need confi...",SDK available for Unity (AWS SDK)
7,Integration with C#,C# libraries available,Integration via REST API,Native C# SDK for Firestore,Integration via third-party C# libraries,C# SDK available for integration,C# SDK for DynamoDB available
8,Cloud/On-Prem Support,Both (MongoDB Atlas for cloud),"Primarily for on-prem, but cloud options avail...",Fully managed cloud solution (Google Cloud),Managed via Redis Cloud or on-prem setup,Both (Couchbase Cloud for cloud),Fully managed cloud (AWS)
9,Use Case Suitability,"General-purpose, complex queries (ideal for us...",High-throughput applications (ideal for large-...,"Real-time user data syncing, especially for mo...",Caching and real-time session data,"High-availability, high-performance, mobile/we...","Ideal for highly scalable games (leaderboards,..."


In [None]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprimées
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non numérique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprimées dans les listes appropriées
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes entièrement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non numériques
    
#     # Supprimer les colonnes identifiées
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les résultats
#     print("Colonnes supprimées pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprimées pour avoir plus de 50% de valeurs nulles et être non numériques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [None]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [None]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire récursivement pour obtenir toutes les clés, y compris les sous-clés,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire à explorer
#     :param parent_key: La clé parent qui est utilisée pour concaténer les sous-clés
#     :param sep: Le séparateur utilisé pour concaténer les clés (par défaut '_')
#     :return: Une liste des clés (et sous-clés)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore récursivement
#             keys.append(new_key)  # Ajouter la clé, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la clé finale
#     return keys

In [None]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche récursive d'une clé dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: clé recherchée
#     :param path: liste pour stocker le chemin jusqu'à la clé
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqué
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la clé est trouvée, retourner le chemin
#                     return result
#     return None  # Retourne None si la clé n'a pas été trouvée



# # Recherche du chemin pour la clé 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [None]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (clé, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # Vérifier si l'index est égal à 1

Counting Rows of all Dataframes in total