In [1]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
%load_ext autoreload
%autoreload 1
from imports import *
from functions import *

Connection to MySQL

In [3]:
# Connection settings come from the environment so that no credential is stored
# in the notebook. Only the password has no default: it is prompted for when
# MYSQL_PASSWORD is not set.
import os
from getpass import getpass
from urllib.parse import quote_plus

mysql_user = os.environ.get("MYSQL_USER", "flosrv")
password = os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: ")
host = os.environ.get("MYSQL_HOST", "localhost")
port = int(os.environ.get("MYSQL_PORT", 3306))
database = os.environ.get("MYSQL_DATABASE", "Oceanography_data_analysis")
metadata = MetaData()
# Connect to the database.
# User and password are URL-escaped so special characters cannot corrupt the URL,
# and the configured port is now actually part of the connection string.
engine = create_engine(
    f"mysql+mysqlconnector://{quote_plus(mysql_user)}:{quote_plus(password)}@{host}:{port}/{database}",
    isolation_level='AUTOCOMMIT',
)

In [4]:
# Helper returning the tables whose name starts with a specific prefix
def get_tables_starting_with(engine, prefix: str):
    """List the table names visible through ``engine`` that begin with ``prefix``.

    Args:
        engine: Object accepted by ``inspect`` whose inspector exposes
            ``get_table_names()``.
        prefix: Leading string a table name must have to be kept.

    Returns:
        A list of matching table names, in the order the inspector reports them.
    """
    table_names = inspect(engine).get_table_names()
    return list(filter(lambda name: name.startswith(prefix), table_names))

# Helper loading the full content of one table (errors are reported, not raised)
def get_data_from_table(engine, table_name):
    """Read every row of ``table_name`` into a DataFrame.

    Args:
        engine: Connection or engine accepted by ``pd.read_sql``.
        table_name: Name of the table to read. It is quoted with backticks, and
            any backtick inside the name is doubled so the name cannot break out
            of the quoted identifier.

    Returns:
        The table content as a DataFrame, or ``None`` when the read fails
        (the error is printed so the calling loop can carry on with other tables).
    """
    try:
        quoted_name = str(table_name).replace("`", "``")
        return pd.read_sql(f"SELECT * FROM `{quoted_name}`", engine)
    except Exception as e:
        # Best-effort by design: a missing or unreadable table must not stop the load.
        print(f"‚ö†Ô∏è Erreur lors de la r√©cup√©ration des donn√©es pour la table {table_name}: {e}")
        return None
    
def clean_dataframe(df):
    """Drop unusable columns and fill numeric gaps with the column median.

    Rules, applied column by column:
      * a column with only missing values is dropped;
      * a column with more than 50% missing values is dropped unless its dtype
        is ``float64`` or ``int64`` (those are kept untouched);
      * any other ``float64``/``int64`` column has its missing values replaced
        by the column median; non-numeric columns are left as they are.

    Args:
        df: DataFrame to clean. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    numeric_dtypes = ['float64', 'int64']
    # Work on a copy: the previous in-place fill could alter the caller's frame.
    cleaned = df.copy()

    for column in df.columns:
        series = cleaned[column]
        missing_percentage = series.isnull().mean() * 100

        # Entirely empty column: nothing to keep.
        if series.isnull().sum() == len(series):
            cleaned = cleaned.drop(columns=[column])
            continue

        is_numeric = series.dtype in numeric_dtypes

        if missing_percentage > 50:
            # Sparse numeric columns are deliberately kept; sparse others go.
            if not is_numeric:
                cleaned = cleaned.drop(columns=[column])
        elif is_numeric:
            # Assign the filled column back instead of `fillna(inplace=True)` on a
            # selection, which is chained assignment and a no-op under Copy-on-Write.
            cleaned[column] = series.fillna(series.median())

    return cleaned

Charger les Données des Tables

In [5]:
# Load the data
# Dictionary holding the frames grouped by station ID
buoy_datas = {}

# Bronze tables are the ones whose name starts with 'br_'
bronze_tables = get_tables_starting_with(engine, prefix='br_')

# Maps the label found in the table name to its slot in the station entry
frame_slots = {"marine": "Marine Dataframe", "meteo": "Meteo Dataframe"}

# Walk through the tables and file each one under its station ID and type
for table in bronze_tables:
    parts = table.split('_')
    if len(parts) >= 4:  # shorter names do not follow the expected format
        station_id = str(parts[1])  # always a string, so lookups stay consistent
        label = parts[2].lower()  # "marine" or "meteo"

        df = get_data_from_table(engine, table)

        # Create the station entry on first sight, even if the label is unknown
        station_entry = buoy_datas.setdefault(
            station_id, {"Marine Dataframe": None, "Meteo Dataframe": None}
        )
        if label in frame_slots:
            station_entry[frame_slots[label]] = df

# Sanity check
print(f"üìä Nombre total de stations au d√©but : {len(buoy_datas)}")
print("üìù Stations initiales :")
for station_id in buoy_datas:
    print(f"  - Station ID: {station_id}")

üìä Nombre total de stations au d√©but : 41
üìù Stations initiales :
  - Station ID: 41008
  - Station ID: 41044
  - Station ID: 41049
  - Station ID: 42001
  - Station ID: 42002
  - Station ID: 42012
  - Station ID: 42036
  - Station ID: 42056
  - Station ID: 42058
  - Station ID: 44007
  - Station ID: 44020
  - Station ID: 44025
  - Station ID: 44027
  - Station ID: 44065
  - Station ID: 46001
  - Station ID: 46006
  - Station ID: 46014
  - Station ID: 46022
  - Station ID: 46025
  - Station ID: 46027
  - Station ID: 46029
  - Station ID: 46053
  - Station ID: 46069
  - Station ID: 46071
  - Station ID: 46072
  - Station ID: 46078
  - Station ID: 46084
  - Station ID: 46086
  - Station ID: 46087
  - Station ID: 46088
  - Station ID: 51000
  - Station ID: 51001
  - Station ID: 51002
  - Station ID: burl1
  - Station ID: ffia2
  - Station ID: lonf1
  - Station ID: mdrm1
  - Station ID: mrka2
  - Station ID: pota2
  - Station ID: sanf1
  - Station ID: sbio1


Deleting Missing Dataframes

In [6]:
# Discard the stations for which one of the two frames is missing
stations_to_remove = []

for station_id, data in buoy_datas.items():
    try:
        print(f"\nüîç Traitement de la station ID : {station_id}")

        marine_data, meteo_data = data.get("Marine Dataframe"), data.get("Meteo Dataframe")

        if marine_data is not None and meteo_data is not None:
            print(f"‚úÖ Marine Dataframe: {marine_data.shape[0]} lignes")
            print(f"‚úÖ Meteo Dataframe: {meteo_data.shape[0]} lignes")
        else:
            print(f"‚ö†Ô∏è Dataframe manquante pour la station {station_id}. Suppression.")
            stations_to_remove.append(station_id)

    except Exception as e:
        print(f"‚ùå Erreur sur la station {station_id}: {str(e)}")

# One removal was recorded per flagged station
stations_removed = len(stations_to_remove)

# Delete after the loop: the dictionary cannot change size while being iterated
for station_id in stations_to_remove:
    del buoy_datas[station_id]

print(f"\nüìä Nombre de stations restantes : {len(buoy_datas)}")
print(f"üóëÔ∏è Nombre de stations supprim√©es : {stations_removed}")


üîç Traitement de la station ID : 41008
‚úÖ Marine Dataframe: 7188 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 41044
‚úÖ Marine Dataframe: 7157 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 41049
‚úÖ Marine Dataframe: 7161 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42001
‚úÖ Marine Dataframe: 3727 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42002
‚úÖ Marine Dataframe: 3884 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42012
‚úÖ Marine Dataframe: 7142 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42036
‚úÖ Marine Dataframe: 7127 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42056
‚úÖ Marine Dataframe: 7166 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la station ID : 42058
‚úÖ Marine Dataframe: 7144 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de

Data Enrichment with MetaData

In [7]:
# Running total of merged rows, incremented by the merge step further down
total_merged_rows = 0

# ---------------------------- Data enrichment ----------------------------
# Each station's parsed metadata is copied into every row of its marine frame
# and into the station entry itself.
for station_id, data in buoy_datas.items():
    print(f"\nüîç Traitement de la Station ID: {station_id}")

    marine_df, meteo_df = data["Marine Dataframe"], data["Meteo Dataframe"]

    try:
        buoy_metadata = get_station_metadata(station_id)
        parsed_data = parse_buoy_json(buoy_metadata)

        # One constant column per metadata field in the marine frame
        if marine_df is not None:
            for key, value in parsed_data.items():
                marine_df[key] = value

        # Keep the metadata next to the frames in the station entry
        data.update(parsed_data)

    except Exception as e:
        print(f"‚ö†Ô∏è Erreur de r√©cup√©ration des m√©tadonn√©es pour {station_id}: {e}")

    if marine_df is not None:
        print(f"‚úÖ Marine Dataframe: {marine_df.shape[0]} lignes")
    else:
        print("‚ö†Ô∏è Marine Dataframe: Aucune donn√©e")

    if meteo_df is not None:
        print(f"‚úÖ Meteo Dataframe: {meteo_df.shape[0]} lignes")
    else:
        print("‚ö†Ô∏è Meteo Dataframe: Aucune donn√©e")



üîç Traitement de la Station ID: 41008

üîç D√©but du parsing de la bou√©e...
üåç Zone de la station : grays reef
üÜî Station ID : 41008
‚úÖ Coordonn√©es extraites : Latitude = 31.40N, Longitude = 80.87W
üåä Water Depth : 16 m
üå°Ô∏è Sea Temp Depth : 2
üå¨Ô∏è Barometer Elevation : 2.4
üí® Anemometer Height : 3.8
üå§Ô∏è Air Temp Height : 3.4
üîó URL de la bou√©e : https://www.ndbc.noaa.gov/station_page.php?station=41008
‚úÖ Parsing termin√© !

‚úÖ Marine Dataframe: 7188 lignes
‚úÖ Meteo Dataframe: 2496 lignes

üîç Traitement de la Station ID: 41044

üîç D√©but du parsing de la bou√©e...
üåç Zone de la station : ne st martin
üÜî Station ID : 41044
‚úÖ Coordonn√©es extraites : Latitude = 21.58N, Longitude = 58.63W
üåä Water Depth : 5419 m
üå°Ô∏è Sea Temp Depth : 2
üå¨Ô∏è Barometer Elevation : 2.4
üí® Anemometer Height : 3.8
üå§Ô∏è Air Temp Height : 3.4
üîó URL de la bou√©e : https://www.ndbc.noaa.gov/station_page.php?station=41044
‚úÖ Parsing termin√© !

‚úÖ Marine Da

In [8]:
# `marine_df` is the variable left over from the enrichment loop, i.e. the marine
# frame of the last station iterated; this shows its columns, null counts and dtypes.
marine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   wind_direction           1202 non-null   float64       
 1   wind_speed               1205 non-null   float64       
 2   wind_gust                1205 non-null   float64       
 3   wave_height              0 non-null      object        
 4   dominant_wave_period     0 non-null      object        
 5   average_wave_period      0 non-null      object        
 6   dominant_wave_direction  0 non-null      object        
 7   pressure                 1205 non-null   float64       
 8   air_temperature          1198 non-null   float64       
 9   water_temperature        0 non-null      object        
 10  dewpoint                 0 non-null      object        
 11  visibility               0 non-null      object        
 12  3hr_pressure_tendency    1202 non-

Cleaning

In [None]:
# Clean the marine frame of every station.
# The cleaned frame is ADDED to the station entry under 'Cleaned Marine Dataframe';
# the raw Marine/Meteo frames are kept because the merge step still needs them.
for station_id, data in buoy_datas.items():
    try:
        print(f"\nüîÑ Nettoyage des donn√©es pour la station {station_id}")

        marine_df = data.get("Marine Dataframe")
        meteo_df = data.get("Meteo Dataframe")

        if marine_df is None:
            print(f"‚ö†Ô∏è Station {station_id} ignor√©e: Marine DataFrame manquant)")
            continue
        if meteo_df is None:
            print(f"‚ö†Ô∏è Station {station_id} ignor√©e: Meteo DataFrame manquant)")
            continue

        cleaned_marine_df = clean_dataframe(marine_df)

        # Store alongside the existing frames instead of replacing the whole entry
        data['Cleaned Marine Dataframe'] = cleaned_marine_df
        print(f"‚úÖ Nettoyage r√©ussi pour la station {station_id} ({cleaned_marine_df.shape[0]} lignes)")

    except Exception as e:
        print(f"‚ùå Erreur lors du nettoyage pour {station_id}: {e}")

# Final summary: count the entries that actually received a cleaned frame
cleaned_station_count = sum(
    1 for data in buoy_datas.values() if 'Cleaned Marine Dataframe' in data
)
print("\nüìä R√âSUM√â DU NETTOYAGE:")
print(f"üìå Stations au d√©part : {len(buoy_datas)}")
print(f"‚úÖ Stations nettoy√©es : {cleaned_station_count}")

Merge

In [None]:
# ---------------------------- Merge marine and meteo frames ----------------------------
# For each station, inner-join its marine frame with its meteo frame on 'Datetime'
# and store the result under 'Merged Dataframe'.
list_ID = []
list_merged_df = []
# Reset here so that re-running this cell does not keep adding to an old total
total_merged_rows = 0

for station_id, data in buoy_datas.items():
    list_ID.append(station_id)
    try:
        print(f"\nüîÑ Fusion des DataFrames pour la station {station_id}")

        # Read THIS station's frames (prefer the cleaned marine frame when present)
        # rather than variables left over from a previous cell.
        marine_df = data.get("Cleaned Marine Dataframe")
        if marine_df is None:
            marine_df = data.get("Marine Dataframe")
        meteo_df = data.get("Meteo Dataframe")

        # Both frames are required
        if marine_df is None or meteo_df is None:
            print(f"‚ö†Ô∏è Station {station_id} ignor√©e (donn√©es manquantes)")
            continue

        # The join key must exist on both sides
        if 'Datetime' not in marine_df.columns or 'Datetime' not in meteo_df.columns:
            print(f"‚ö†Ô∏è Station {station_id} ignor√©e (colonne 'Datetime' manquante)")
            continue

        # Parse the join key on local copies; unparseable values become NaT
        marine_df = marine_df.assign(Datetime=pd.to_datetime(marine_df['Datetime'], errors='coerce'))
        meteo_df = meteo_df.assign(Datetime=pd.to_datetime(meteo_df['Datetime'], errors='coerce'))

        merged_df = marine_df.merge(meteo_df, how="inner", on="Datetime")
        data['Merged Dataframe'] = merged_df
        list_merged_df.append(merged_df)
        print(f"‚úÖ Fusion r√©ussie pour la station {station_id} ({merged_df.shape[0]} lignes)")

        # Only successful merges reach this point, so the total is never fed stale rows
        total_merged_rows += merged_df.shape[0]

    except Exception as e:
        print(f"‚ùå Erreur lors de la fusion pour {station_id}: {e}")

# Final merge summary
print("\nüìä R√âSUM√â DE LA FUSION:")
print(f"üìå Stations au d√©part : {len(buoy_datas)}")
print(f"üìä Total de lignes fusionn√©es : {total_merged_rows}")

# Inner quotes differ from the outer ones so the f-string also parses before Python 3.12
print(f'\ntest Df:\n{buoy_datas["42058"]["Merged Dataframe"]}')

In [None]:
# Shape and per-column null counts of the merged frame of station 42058.
# The frame is fetched once; the original reused the outer quote character inside
# the f-string, which is a syntax error before Python 3.12.
merged_42058 = buoy_datas["42058"]["Merged Dataframe"]
print(f'{merged_42058.shape}')
print(f'\n{merged_42058.isnull().sum()}')

Cleaning

In [None]:
# Shape and per-column null counts of the cleaned marine frame of station 42058.
# The key matches the one written by the cleaning step ('Cleaned Marine Dataframe');
# 'Cleaned Dataframe' is never stored in `buoy_datas`. Quotes inside the f-strings
# no longer clash with the outer ones (a syntax error before Python 3.12).
cleaned_42058 = buoy_datas["42058"]["Cleaned Marine Dataframe"]
print(f'{cleaned_42058.shape}')
print(f'\n{cleaned_42058.isnull().sum()}')

Clean Dataframes

In [None]:
# Column inventories of the two sources (kept for reference; not used below)
marine_cols = [
    "wind_direction", "wind_speed", "wind_gust", "wave_height",
    "dominant_wave_period", "average_wave_period", "dominant_wave_direction",
    "pressure", "air_temperature", "water_temperature", "dewpoint",
    "visibility", "3hr_pressure_tendency", "water_level_above_mean"
]

meteo_cols = [
    "temperature_2m", "relative_humidity_2m", "dew_point_2m", "precipitation", "rain",
    "showers", "pressure_msl", "surface_pressure", "cloud_cover", "cloud_cover_low",
    "cloud_cover_mid", "cloud_cover_high", "visibility", "wind_speed_10m",
    "soil_temperature_0cm", "soil_moisture_0_to_1cm"
]

# Raw column name -> display name.
# NOTE(review): two targets start with a space (' Sea Level Pressure (hPa)',
# ' Visibility (km)'); left unchanged because other code may rely on them — confirm.
col_to_rename = {
    'temperature_2m': 'T¬∞(C¬∞)',
    'relative_humidity_2m': 'Relative Humidity (%)',
    'dew_point_2m': 'Dew Point (¬∞C)',
    'precipitation': 'Precipitation (mm)',
    'pressure_msl': ' Sea Level Pressure (hPa)',
    'cloud_cover_low': 'Low Clouds (%)',
    'cloud_cover_mid': 'Middle Clouds (%)',
    'cloud_cover_high': 'High Clouds (%)',
    'visibility': ' Visibility (km)',
    'wind_direction': 'Wind Direction (¬∞)',
    'wind_speed': 'Wind Speed (km/h)',
    'wind_gust': 'Wind Gusts (km/h)',
    'wave_height': 'Wave Height (m)',
    'average_wave_period': 'Average Wave Period (s)',
    'dominant_wave_direction': 'Dominant Wave Direction (¬∞)',
    'pressure': 'Pressure (hPA)',
    'air_temperature': 'Air T¬∞',
    'water_temperature': 'Water T¬∞',
}

meteo_cols_to_delete = ['soil_temperature_0cm', 'rain', 'showers', 'is_day',
                        'soil_moisture_0_to_1cm']
# No marine column is dropped for now; list names here to drop them.
marine_cols_to_delete = []

for station_id, tables in buoy_datas.items():
    # Keys match the ones used when the frames were loaded ("... Dataframe")
    marine_df = tables.get("Marine Dataframe")
    meteo_df = tables.get("Meteo Dataframe")
    if marine_df is None or meteo_df is None:
        continue

    marine_df = rename_columns(marine_df, col_to_rename)
    # The helper is now called; the original assigned the function object itself
    marine_df = drop_columns_if_exist(marine_df, marine_cols_to_delete)

    meteo_df = rename_columns(meteo_df, col_to_rename)
    meteo_df = drop_columns_if_exist(meteo_df, meteo_cols_to_delete)

    # Write the results back, otherwise the renaming is lost at the next iteration
    tables["Marine Dataframe"] = marine_df
    tables["Meteo Dataframe"] = meteo_df

HOUR RESAMPLING

In [None]:
# Process the datetime column of both frames of every station and store the result
# back in the station entry.
# Fixes: the dictionary is `buoy_datas` (not `buoys_datas`) and the frame keys are
# "Marine Dataframe" / "Meteo Dataframe", as written when the tables were loaded.
for station_id, tables in buoy_datas.items():
    try:
        print(f"üîÅ Processing and resampling marine data for station {station_id}...")
        tables["Marine Dataframe"] = process_datetime_column(tables["Marine Dataframe"], column='time')
    except Exception as e:
        print(f"Error processing Marine Data for {station_id}: {e}")

    try:
        print(f"üîÅ Processing and resampling weather data for station {station_id}...")
        tables["Meteo Dataframe"] = process_datetime_column(tables["Meteo Dataframe"], column='date')
    except Exception as e:
        print(f"Error processing Meteo Data for {station_id}: {e}")

Test Adding MetaData

Changing Data Types

In [None]:
# Convert the column types of every merged frame and store the result under
# "Converted DataFrame".
# Fixes: the dictionary is `buoy_datas`, and the merged frame is stored under
# "Merged Dataframe" by the merge step.
for station_id, tables in buoy_datas.items():
    try:
        df_merged = tables["Merged Dataframe"]
        print(f"üîó Changing Data Types  for station {station_id}...")
        df_converted = convert_df_columns(df_merged)
        tables["Converted DataFrame"] = df_converted

        print(f"Successfully Changed Data Types for Station {station_id}")

    except Exception as e:
        # Includes stations without a merged frame (KeyError)
        print(f"Error changing data types for station {station_id}: {e}")

Cleaning Null Values

In [None]:
# Apply `clean_dataframe` to every converted frame and store the result under
# "Cleaned DataFrame". Fix: the dictionary is `buoy_datas` (not `buoys_datas`).
for station_id, tables in buoy_datas.items():
    try:
        print(f"üîó Cleaning DataFrame for station {station_id}...")
        df_converted = tables["Converted DataFrame"]

        df_cleaned = clean_dataframe(df_converted)

        tables["Cleaned DataFrame"] = df_cleaned

        print(f"Successfully Cleaned DataFrame for Station {station_id}")

    except Exception as e:
        # Includes stations without a converted frame (KeyError)
        print(f"Error Cleaning DataFrame for station {station_id}: {e}")

Concatenating All in One Final DataFrame

In [None]:
# Final concatenation of all cleaned station frames into one DataFrame.
# Fixes: the dictionary is `buoy_datas`, and stations without a "Cleaned DataFrame"
# are skipped instead of aborting the whole concatenation with a KeyError.
try:
    print("üîÄ Merging all DataFrames into a final DataFrame...")
    dataframes_to_concat = [
        tables["Cleaned DataFrame"]
        for tables in buoy_datas.values()
        if tables.get("Cleaned DataFrame") is not None
    ]

    # pd.concat raises on an empty list; that case is reported by the handler below
    df_final = pd.concat(dataframes_to_concat, ignore_index=True)

except Exception as e:
    print(f"Error during final merge: {e}")
    df_final = None

# Final summary
print("\n‚≠êüèÜ Processing complete!")
print(f"üî¢ Total stations processed: {len(buoy_datas)}")

if df_final is not None and not df_final.empty:
    print(f"üìù Final merged DataFrame size: {df_final.shape}")
else:
    print("The DataFrame is either None or empty.")

In [None]:
# Show the dtype of every column of the concatenated frame
df_final.dtypes

In [None]:
# Normalise every column whose name contains "Station ID" to text.
# When the whole column is numeric it goes through int first, so 42058.0 becomes
# "42058". Otherwise (alphanumeric IDs such as "burl1", or missing values) the
# values are converted to text as they are. The fallback is the expected path for
# mixed IDs, so it is no longer reported as an error, and the column is assigned
# once instead of being left half-converted.
for column in df_final.columns:
    if "Station ID" in column:
        try:
            # errors='raise': a single non-numeric ID sends the column to the fallback
            normalized_ids = pd.to_numeric(df_final[column], errors='raise').astype(int).astype(str)
        except (ValueError, TypeError):
            normalized_ids = df_final[column].astype(str)

        df_final[column] = normalized_ids


show_first_row(df_final)

In [None]:
# Clean the concatenated frame, then keep only the rows without any missing value
df_final = clean_dataframe(df_final)
df_final.isnull().sum()
df_final2 = df_final.dropna()
remaining_nulls = df_final2.isnull().sum()
print(f'{df_final2.shape}\n\n{remaining_nulls}')

In [None]:
# Round the values to two decimals, then display the first row
df_final2 = df_final2.round(2)
show_first_row(df_final2)

In [None]:
# Derive the 'Daytime' and 'Month' columns from each row's 'Datetime' value
day_time_parts = df_final2['Datetime'].apply(lambda moment: get_day_time(moment))
df_final2[['Daytime', 'Month']] = day_time_parts.apply(pd.Series)

In [None]:
# Round again (the columns added since the previous rounding are included), then
# display the first row
df_final2=df_final2.round(2)
show_first_row(df_final2)

Renaming, Dropping Useless Columns

Third API Test

In [None]:
# Display the first row of the final frame
show_first_row(df_final2)

Test Envoi Vers MySQL

In [None]:
# Send the final frame to the 'Silver_Table' table through the notebook's engine,
# passing 'Datetime' as the key column.
# NOTE(review): `schema_silver` is not defined in any visible cell — presumably it
# comes from the star imports; verify.
load_data_in_table(engine=engine, schema = schema_silver, table_name='Silver_Table', df=df_final2, key_column='Datetime')

In [None]:
# Show the dtype of every column of the final frame
df_final2.dtypes

In [None]:
# Keep only the rows of station 42058.
# The "Station ID" column is converted to text earlier in the notebook, so the
# comparison is done on text; comparing with the integer 42058 matched no row.
# `.copy()` gives an independent frame, since columns are added to it later on.
df_42058 = df_final2[df_final2["Station ID"].astype(str) == "42058"].copy()
df_42058.head()

In [None]:
# # Assuming the df_cleaned DataFrame already exists and contains the required data

# # First, load your Visual Crossing Weather Data (example, you may already have it)
# # Assuming vc_meteo_data is the JSON response from Visual Crossing
# # Example of flattening the JSON
# df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# # Convert the datetimeEpoch from Visual Crossing Weather data into Date and Hour columns
# df_vc_meteo["Date"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%Y-%m-%d")
# df_vc_meteo["Hour"] = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s").dt.strftime("%H")

# # Filter data from df_vc_meteo for the last 30 days
# today = datetime.now()
# thirty_days_ago = today - timedelta(days=30)

# today_str = today.strftime("%Y-%m-%d")
# thirty_days_ago_str = thirty_days_ago.strftime("%Y-%m-%d")

# # Filter df_vc_meteo for the last 30 days
# df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
# df_test_last_month = df_test_last_month[(df_test_last_month['Date'] >= thirty_days_ago_str) & 
#                                         (df_test_last_month['Date'] <= today_str)]

# # Prepare df_cleaned for merging (add Date and Hour columns)
# df_cleaned['Date'] = df_cleaned['Datetime'].dt.strftime("%Y-%m-%d")
# df_cleaned['Hour'] = df_cleaned['Datetime'].dt.strftime("%H")

# # Filter df_cleaned for the last 30 days
# df_cleaned_last_month = df_cleaned[(df_cleaned['Date'] >= thirty_days_ago_str) & 
#                                    (df_cleaned['Date'] <= today_str)]

# # Merge df_vc_meteo and df_cleaned based on Date and Hour
# df_merged = df_test_last_month.merge(df_cleaned_last_month[['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']], 
#                                     on=['Date', 'Hour'], 
#                                     how='inner')

# # Display the merged dataframe
# print(df_merged.head(100))


In [None]:
# Raw column name -> display name, applied to `df_cleaned` below.
# NOTE(review): this redefines the `col_to_rename` mapping of the "Clean Dataframes"
# cell with one difference: 'visibility' maps to ' Visibility (%)' here and to
# ' Visibility (km)' there — confirm which unit is right and keep a single mapping.
col_to_rename={'temperature_2m': 'T¬∞(C¬∞)',  'relative_humidity_2m': 'Relative Humidity (%)',
 'dew_point_2m': 'Dew Point (¬∞C)', 'precipitation': 'Precipitation (mm)',  'pressure_msl':' Sea Level Pressure (hPa)', 
 'cloud_cover_low':'Low Clouds (%)', 'cloud_cover_mid' : 'Middle Clouds (%)',	 'cloud_cover_high' : 'High Clouds (%)', 
 'visibility' : ' Visibility (%)',  'wind_direction': 'Wind Direction (¬∞)',
 'wind_speed': 'Wind Speed (km/h)','wind_gust': 'Wind Gusts (km/h)', 'wave_height': 'Wave Height (m)',  'average_wave_period': 'Average Wave Period (s)',
 'dominant_wave_direction': 'Dominant Wave Direction (¬∞)','pressure': 'Pressure (hPA)',
 'air_temperature': 'Air T¬∞','water_temperature': 'Water T¬∞'}

# NOTE(review): the only active assignment of `df_cleaned` is inside the
# null-cleaning loop, so this would be the last station's frame — confirm which
# frame is intended here.
df_cleaned = rename_columns(df_cleaned, col_to_rename)
# Drop the listed columns when present (including the merge artefacts 'id_x'/'id_y')
df_cleaned = drop_columns_if_exist(df_cleaned,['soil_temperature_0cm','rain', 'showers', 'is_day', 'id_x', 'id_y','soil_moisture_0_to_1cm'])
df_cleaned.columns

In [None]:
#  R√©cup√©rer les donn√©es de l'API
# vc_meteo_data = response.json()
# print(vc_meteo_data)  # V√©rifiez les donn√©es r√©cup√©r√©es

In [None]:
# # Normaliser les donn√©es JSON en DataFrame
# df_vc_meteo = pd.json_normalize(vc_meteo_data, record_path=["days", "hours"], meta=["days"])

# # Afficher la premi√®re ligne des donn√©es
# df_vc_meteo.head(1)

In [None]:
# Turn the epoch timestamp (seconds) into separate 'Date' and 'Hour' text columns
vc_timestamps = pd.to_datetime(df_vc_meteo["datetimeEpoch"], unit="s")
df_vc_meteo["Date"] = vc_timestamps.dt.strftime("%Y-%m-%d")
df_vc_meteo["Hour"] = vc_timestamps.dt.strftime("%H")

In [None]:
# Bounds of the filtering window: from 30 days ago up to now
today = datetime.now()
thirty_days_ago = today - timedelta(days=30)

# Same bounds as YYYY-MM-DD text, the format of the 'Date' columns
today_str, thirty_days_ago_str = (
    bound.strftime("%Y-%m-%d") for bound in (today, thirty_days_ago)
)

In [None]:
# Restrict the Visual Crossing data to the last 30 days (bounds included)
df_test_last_month = df_vc_meteo[['Date', 'Hour', 'windspeed']]
vc_in_window = df_test_last_month['Date'].between(thirty_days_ago_str, today_str)
df_test_last_month = df_test_last_month[vc_in_window]

# Add the 'Date' and 'Hour' text columns to df_42058
df_42058.loc[:, 'Date'] = df_42058['Datetime'].dt.strftime("%Y-%m-%d")
df_42058.loc[:, 'Hour'] = df_42058['Datetime'].dt.strftime("%H")

# Restrict df_42058 to the same 30-day window
buoy_in_window = df_42058['Date'].between(thirty_days_ago_str, today_str)
df_42058_last_month = df_42058[buoy_in_window]

# Join both sources on date and hour, keeping only the wind-speed columns
buoy_wind_columns = ['Date', 'Hour', 'Wind Speed (km/h)', 'wind_speed_10m']
df_test_merged = df_test_last_month.merge(
    df_42058_last_month[buoy_wind_columns],
    on=['Date', 'Hour'],
    how='inner',
)

df_test_merged.head()

In [None]:
# import pandas as pd

# def handle_null_values(df: pd.DataFrame) -> pd.DataFrame:
#     row_count = df.shape[0]
    
#     # Initialisation des listes pour suivre les colonnes supprim√©es
#     removed_columns = []
#     non_numeric_columns_to_drop = []
    
#     # Utiliser lambda et apply() pour calculer le nombre de valeurs nulles dans chaque colonne
#     null_counts = df.apply(lambda col: int(col.isnull().sum()))  # Calculer le nombre de NaN par colonne
    
#     # Condition : 1. Colonnes avec toutes les valeurs nulles ou 2. Plus de 50% de valeurs nulles et colonne non num√©rique
#     columns_to_drop = null_counts[
#         (null_counts == row_count) | 
#         ((null_counts > row_count * 0.5) & ~df.apply(lambda col: pd.api.types.is_numeric_dtype(col)))
#     ].index
    
#     # Ajouter les noms des colonnes supprim√©es dans les listes appropri√©es
#     for col in columns_to_drop:
#         if null_counts[col] == row_count:
#             removed_columns.append(col)  # Colonnes enti√®rement vides
#         elif null_counts[col] > row_count * 0.5 and not pd.api.types.is_numeric_dtype(df[col]):
#             non_numeric_columns_to_drop.append(col)  # Colonnes > 50% nulles et non num√©riques
    
#     # Supprimer les colonnes identifi√©es
#     df = df.drop(columns=columns_to_drop)
    
#     # Afficher les r√©sultats
#     print("Colonnes supprim√©es pour avoir toutes les valeurs nulles:")
#     print(removed_columns)
    
#     print("\nColonnes supprim√©es pour avoir plus de 50% de valeurs nulles et √™tre non num√©riques:")
#     print(non_numeric_columns_to_drop)
    
#     return df

# # Exemple d'utilisation
# # df_final = pd.read_csv('ton_fichier.csv') # Assure-toi que df_final est bien un DataFrame valide avant d'appeler la fonction
# df_final = handle_null_values(df_final)


In [None]:
# df_final = df_final.round(2)
# print(df_final.columns)
# df_final.describe()

In [None]:
# def explore_dict_keys(d, parent_key='', sep='_'):
#     """
#     Explore un dictionnaire r√©cursivement pour obtenir toutes les cl√©s, y compris les sous-cl√©s,
#     mais ne retourne pas les valeurs finales.

#     :param d: Le dictionnaire √† explorer
#     :param parent_key: La cl√© parent qui est utilis√©e pour concat√©ner les sous-cl√©s
#     :param sep: Le s√©parateur utilis√© pour concat√©ner les cl√©s (par d√©faut '_')
#     :return: Une liste des cl√©s (et sous-cl√©s)
#     """
#     keys = []
#     for k, v in d.items():
#         new_key = f"{parent_key}{sep}{k}" if parent_key else k
#         if isinstance(v, dict):  # Si la valeur est un dictionnaire, on explore r√©cursivement
#             keys.append(new_key)  # Ajouter la cl√©, mais ne pas inclure la valeur
#             keys.extend(explore_dict_keys(v, new_key, sep=sep))  # Continuer l'exploration
#         else:
#             keys.append(new_key)  # Ajouter la cl√© finale
#     return keys

In [None]:
# def find_key_path(d, target_key, path=[]):
#     """
#     Recherche r√©cursive d'une cl√© dans un dictionnaire et retourne son chemin.
#     :param d: dictionnaire
#     :param target_key: cl√© recherch√©e
#     :param path: liste pour stocker le chemin jusqu'√† la cl√©
#     :return: chemin sous forme de liste
#     """
#     if isinstance(d, dict):  # Si le dictionnaire est encore imbriqu√©
#         for key, value in d.items():
#             new_path = path + [key]
#             if key == target_key:
#                 return new_path
#             elif isinstance(value, dict):
#                 result = find_key_path(value, target_key, new_path)
#                 if result:  # Si la cl√© est trouv√©e, retourner le chemin
#                     return result
#     return None  # Retourne None si la cl√© n'a pas √©t√© trouv√©e



# # Recherche du chemin pour la cl√© 'marine_data'
# path = find_key_path(table_dict, "Marine Dataframe")
# print(path)


Auto_convert Test

In [None]:
# for idx, (buoy_id, tables) in enumerate(table_dict.items()):  # Utilisation de .items() pour obtenir (cl√©, valeur)
#     if isinstance(tables, dict):
#         if idx == 1:  # V√©rifier si l'index est √©gal √† 1

Counting Rows of all Dataframes in total