Import Libraries

In [3]:
import subprocess, os, numpy as np, json
import requests, pandas as pd
from datetime import datetime, timedelta, timezone, time
from pathlib import Path
import meteomatics.api as api
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from siphon.simplewebservice.ndbc import NDBC
from sklearn.impute import SimpleImputer
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from google.oauth2.service_account import Credentials
from folium import plugins

libraries = ["dask", "folium", "meteomatics", "matplotlib", "pytz", "requests", "gspread", "google-api-python-client", "google-auth", "google-auth-oauthlib", "google-auth-httplib2", "gspread"]

def install_and_import_libraries(list_of_libraries):
    for library in list_of_libraries:
        try:
            try:
                # Installer la librairie
                result = subprocess.run(["pip", "install", "--upgrade", library, "--user"], capture_output=True, text=True, check=True)
                for line in result.stdout.splitlines():
                    if not line.startswith("Requirement already satisfied"):
                        print(line)
                # Importer la librairie après installation
                globals()[library] = __import__(library)
            except Exception as e:  # Correction ici pour capturer l'exception interne
                print(f"Error installing {library}: {e}")
        except Exception as e:      # Correction ici pour capturer l'exception externe
            print(f"Error importing {library}: {e}")

install_and_import_libraries(libraries)

Error installing google-api-python-client: No module named 'google-api-python-client'
Error installing google-auth: No module named 'google-auth'
Error installing google-auth-oauthlib: No module named 'google-auth-oauthlib'
Error installing google-auth-httplib2: No module named 'google-auth-httplib2'


Functions

In [4]:
# NORMALIZE TABLES DATETIME

def normalize_datetime(df, datetime_column):
    df[datetime_column] = pd.to_datetime(df[datetime_column]) 
    df['Datetime'] = df[datetime_column].dt.strftime('%d/%m/%Y-%H:%M')
    df.drop(columns=[datetime_column], inplace=True)
    
    return df

# DEAL WITH MISSING VALUES 

def impute_missing_values(df):
    # Supprimer les colonnes complètement vides (contenant uniquement des NaN)
    df = df.dropna(axis=1, how='all')

    # Imputation des colonnes numériques
    for column in df.select_dtypes(include=[np.number]).columns:
        # Remplacer les NaN par la médiane
        median_value = df[column].median()
        df.loc[:, column] = df[column].fillna(median_value)

    # Imputation des colonnes catégorielles
    for column in df.select_dtypes(include=['object', 'category']).columns:
        # Remplacer les NaN par la modalité la plus fréquente (mode)
        mode_value = df[column].mode()[0]
        df.loc[:, column] = df[column].fillna(mode_value)

    return df

# CHECK NON JSON COMPLIANT VALUES

def check_data_json_compliance(df):
    # Identifier les valeurs non conformes
    invalid_values = df.isin([np.nan, np.inf, -np.inf])
    Issues = True
    if invalid_values.any().any():  # Si des valeurs non conformes existent
        invalid_types = {
            'NaN': df.isna().sum().sum(),
            'inf': (df == np.inf).sum().sum(),
            '-inf': (df == -np.inf).sum().sum()
        }
        return {key: count for key, count in invalid_types.items() if count > 0}
    else:
        Issues = False
        return "No non-compliant JSON values found."


# CONNECT TO GOOGLE_SHEETS
def connect_to_sheet(scopes, path_to_cred):

    # Créer les crédentials à partir du fichier de service account
    creds = Credentials.from_service_account_file(path_to_google_cred, scopes=scopes)
    # Autoriser l'accès en utilisant les crédentials
    client = gspread.authorize(creds)
    return client


#APPEND DATA TO GOOGLE SHEETS
def append_data(df, worksheet, first_col_compare):
    df.columns = df.columns.str.strip()  # Nettoyer les noms des colonnes

    # Remplacer les valeurs infinies et NaN par None pour éviter les erreurs JSON
    df.replace([float('inf'), float('-inf')], None, inplace=True)

    # Détection des colonnes de type datetime dans df (au lieu de df_new)
    datetime_cols = df.select_dtypes(include=['datetime64[ns]', 'datetime64[ns, UTC]']).columns

    # Récupérer les données existantes de la feuille
    existing_data = worksheet.get_all_records()

    # Si la feuille est vide
    if not existing_data:
        # Conversion des datetime pour tout df car tout sera ajouté
        for col in datetime_cols:
            df[col] = df[col].apply(lambda x: x.isoformat() if pd.notnull(x) else x)

        # Ajouter les noms de colonnes et toutes les données
        worksheet.append_row(df.columns.tolist(), value_input_option='RAW')
        worksheet.append_rows(df.values.tolist(), value_input_option='RAW')
        print(f"The sheet was empty. Column names and all data have been added.")
    else:
        # Convertir les données existantes en DataFrame
        df_existing = pd.DataFrame(existing_data)
        df_existing.columns = df_existing.columns.str.strip()

        # Comparer les valeurs de first_col_compare
        if first_col_compare in df_existing.columns:
            df_new = df[~df[first_col_compare].isin(df_existing[first_col_compare])]
        else:
            df_new = df

        # Conversion des datetime uniquement pour df_new
        if not df_new.empty:
            for col in datetime_cols:
                if col in df_new.columns:
                    df_new[col] = df_new[col].apply(lambda x: x.isoformat() if pd.notnull(x) else x)

            # Ajouter uniquement les nouvelles lignes
            worksheet.append_rows(df_new.values.tolist(), value_input_option='RAW')
        print(f"Number of rows added: {df_new.shape[0]}")


        # Ajouter ces nouvelles lignes à partir de la première ligne vide
        first_empty_row = len(existing_data) + 2  # +2 pour sauter la ligne des en-têtes
        worksheet.insert_rows(df_new.values.tolist(), row=first_empty_row, value_input_option='RAW')

# CONNECT TO METEOMATICS API

def meteomatics_API_call(file_path, parameters, coordinates, model, startdate, enddate, interval):
    with open(file_path, 'r') as file:
        credentials = json.load(file)
        meteo_username = credentials['username']
        meteo_password = credentials['password']
        
    # Récupérer les données
    meteo_df = api.query_time_series(coordinates, startdate, enddate, interval, parameters, meteo_username, meteo_password, model=model)
    meteo_df.reset_index(inplace=True)
    return meteo_df

# MERGE TABLES

"""
def merge_dataframes_on_column(df1, df2, column):
    # Nettoyer les noms de colonnes
    df1.columns = df1.columns.str.strip()
    df2.columns = df2.columns.str.strip()

    # S'assurer que la colonne Datetime est bien au format datetime
    df1[column] = pd.to_datetime(df1[column], format='%d/%m/%Y-%H:%M', errors='coerce')
    df2[column] = pd.to_datetime(df2[column], format='%d/%m/%Y-%H:%M', errors='coerce')

    # Fusionner les deux DataFrames sur la colonne Datetime en utilisant une jointure interne
    merged_df = pd.merge(df1[['Datetime']], df2[['Datetime']], on=column, how='inner')

    # Trier par Datetime pour un ordre chronologique
    merged_df.sort_values(by='Datetime', inplace=True)
"""

"\ndef merge_dataframes_on_column(df1, df2, column):\n    # Nettoyer les noms de colonnes\n    df1.columns = df1.columns.str.strip()\n    df2.columns = df2.columns.str.strip()\n\n    # S'assurer que la colonne Datetime est bien au format datetime\n    df1[column] = pd.to_datetime(df1[column], format='%d/%m/%Y-%H:%M', errors='coerce')\n    df2[column] = pd.to_datetime(df2[column], format='%d/%m/%Y-%H:%M', errors='coerce')\n\n    # Fusionner les deux DataFrames sur la colonne Datetime en utilisant une jointure interne\n    merged_df = pd.merge(df1[['Datetime']], df2[['Datetime']], on=column, how='inner')\n\n    # Trier par Datetime pour un ordre chronologique\n    merged_df.sort_values(by='Datetime', inplace=True)\n"

Show Localisation

In [5]:
scopes = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive','https://www.googleapis.com/auth/drive.file',
    'https://www.googleapis.com/auth/drive.appdata','https://www.googleapis.com/auth/drive.appfolder'
]

# Chemin vers le fichier de crédentials JSON
path_to_google_cred = r"C:\Users\f.gionnane\Downloads\Projects\google_sheets_api.json"

client = connect_to_sheet(scopes, path_to_google_cred)

# ID de la feuille Google Sheets
sheet_id = "1rXyD4WW5DcrO2iCywfwLbqps3Z-RumxQH9hJofhVM24"


# Ouvrir la feuille Google Sheets en utilisant son ID
spreadsheet = client.open_by_key(sheet_id)
sheet_data_marine = spreadsheet.get_worksheet(0)
sheet_data_meteo = spreadsheet.get_worksheet(1)
df_meteo_and_marine = spreadsheet.get_worksheet(2)

In [6]:
# Coordonnées du point
latitude = 24.359
longitude = -162.081

# Making a variable of it for the API call later in the code
coordinates = [(latitude, longitude)]

# Création de la carte centrée sur le point
buoy_location_map = folium.Map(location=[latitude, longitude], zoom_start=3)

# Ajouter un marqueur pour la bouée
folium.Marker([latitude, longitude], popup="Bouée NDBC").add_to(buoy_location_map)

# Afficher la carte
buoy_location_map

Get Data From Marine API

In [7]:
# MARINE DATA
df_marine_one = NDBC.realtime_observations('51101')
df_marine_one.head()

Unnamed: 0,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,air_temperature,water_temperature,dewpoint,visibility,3hr_pressure_tendency,water_level_above_mean,time
0,250.0,3.0,5.0,,,,,1016.7,22.7,24.2,,,,,2025-02-06 12:30:00+00:00
1,240.0,4.0,6.0,1.9,13.0,9.3,55.0,1016.7,22.6,24.2,,,,,2025-02-06 12:20:00+00:00
2,250.0,3.0,5.0,1.9,,9.3,55.0,1016.7,22.6,24.2,,,,,2025-02-06 12:10:00+00:00
3,250.0,3.0,5.0,,,,,1016.8,22.6,24.2,,,-0.9,,2025-02-06 12:00:00+00:00
4,250.0,4.0,6.0,2.1,11.0,9.6,40.0,1017.0,22.7,24.2,,,,,2025-02-06 11:50:00+00:00


Get Data From Metemoatics API

In [8]:
parameters = [
            'wind_speed_10m:ms',	
            'wind_dir_10m:d',
            'wind_gusts_10m_1h:ms',
            't_2m:C',
            't_max_2m_24h:C',
            't_min_2m_24h:C',
            'msl_pressure:hPa',
            'precip_1h:mm'
        ]
        
coordinates = [(24.359, -162.081)]
model = 'mix'
enddate = datetime.now(timezone.utc).replace(minute=0, second=0, microsecond=0)
startdate = enddate - timedelta(days=1)
interval = timedelta(minutes=10)
meteo_cred__path = r"C:\\Users\\f.gionnane\\Downloads\\Projects\\credentials.json"

df_meteo_one = meteomatics_API_call(meteo_cred__path, parameters, coordinates, model, startdate, enddate, interval)

In [9]:
# Marine 
df_marine = df_marine_one.drop(columns={'dewpoint','visibility','3hr_pressure_tendency','water_level_above_mean'})
df_marine.rename(columns={'wind_direction': 'Wind Dir (°)', 'wind_speed':'Wind Spd (ms)',
                          'wind_gust':'Wind Gusts 1h (ms)','wave_height':'Wave Height (m)','dominant_wave_period':'Dominant Wave Period (s)',
                          'average_wave_period':'Avg Wave Period (s)', 'dominant_wave_direction': 'Dominant Wave Dir',
                          'air_temperature':'Air Temp','water_temperature':'Water Temp','pressure':'Pressure'}, inplace=True)

# Meteo
df_meteo = df_meteo_one.drop(columns=['lat','lon'])
df_meteo.rename(columns={'validdate' : 'time', 'wind_speed_10m:ms':'wind speed (ms)','wind_dir_10m:d':'wind dir (°)',
                          'wind_gusts_10m_1h:ms':'wind gusts 1h (ms)','t_2m:C':'T° (C°)','t_max_2m_24h:C':'T° max 24h (C°)',
                          't_min_2m_24h:C':'T° min 24h (°)', 'msl_pressure:hPa': 'sea level pressure (hPa)','precip_1h:mm':'Precip 1h (mm)'}, inplace=True)

df_meteo['time'] = pd.to_datetime(df_meteo['time']).dt.tz_localize(None)
df_marine['time'] = pd.to_datetime(df_marine['time']).dt.tz_localize(None)

In [12]:
matching_rows_count = len(pd.merge(df_meteo[['time']], df_marine[['time']], on='time', how='inner'))
merged_df = impute_missing_values(pd.merge(df_meteo, df_marine, on='time', how='inner'))
normalize_datetime(merged_df, 'time')
print(matching_rows_count)
merged_df.head(20)

140


Unnamed: 0,wind speed (ms),wind dir (°),wind gusts 1h (ms),T° (C°),T° max 24h (C°),T° min 24h (°),sea level pressure (hPa),Precip 1h (mm),Wind Dir (°),Wind Spd (ms),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,Datetime
0,3.0,85.1,4.1,21.1,21.6,20.8,1017.2,0.0,80.0,3.0,6.0,2.5,13.0,9.5,42.0,1017.6,21.1,24.2,05/02/2025-13:00
1,3.0,86.6,4.1,21.1,21.6,20.8,1017.1,0.0,70.0,3.0,5.0,2.8,13.0,9.4,24.0,1017.2,21.1,24.2,05/02/2025-13:10
2,3.0,88.1,4.2,21.1,21.6,20.8,1017.1,0.0,90.0,3.0,5.0,2.8,13.0,9.4,24.0,1017.0,21.1,24.2,05/02/2025-13:20
3,3.0,89.5,4.2,21.1,21.6,20.8,1017.0,0.0,80.0,3.0,6.0,2.5,13.0,9.5,42.0,1017.1,21.1,24.2,05/02/2025-13:30
4,3.0,90.9,4.2,21.2,21.6,20.8,1016.9,0.0,80.0,3.0,6.0,2.5,13.0,9.5,42.0,1017.3,21.2,24.2,05/02/2025-13:40
5,3.0,92.4,4.2,21.1,21.6,20.8,1016.9,0.0,80.0,3.0,5.0,2.7,14.0,9.5,47.0,1017.2,21.1,24.2,05/02/2025-13:50
6,2.0,93.8,4.2,21.1,21.6,20.8,1016.8,0.0,60.0,2.0,4.0,2.5,13.0,9.5,42.0,1017.2,21.1,24.2,05/02/2025-14:00
7,3.0,95.4,4.2,21.1,21.6,20.8,1016.8,0.0,80.0,3.0,4.0,2.9,13.0,9.9,42.0,1017.0,21.1,24.2,05/02/2025-14:10
8,3.0,97.1,4.2,21.2,21.6,20.8,1016.7,0.0,90.0,3.0,6.0,2.9,13.0,9.9,42.0,1017.0,21.2,24.2,05/02/2025-14:20
9,3.0,98.8,4.2,21.2,21.6,20.8,1016.7,0.0,90.0,3.0,6.0,2.5,13.0,9.5,42.0,1017.0,21.2,24.2,05/02/2025-14:30


Prepa For ML Models

In [14]:
df_for_ML = merged_df
df_for_ML['Wind Speed (m/s)'] = (merged_df['Wind Spd (ms)'] + merged_df['wind speed (ms)'])/2
df_for_ML['Wind Gusts 1h (ms)'] = (df_for_ML['Wind Gusts 1h (ms)'] + df_for_ML['wind gusts 1h (ms)'])/2
df_for_ML['Wind Dir (°)'] = (df_for_ML['wind dir (°)'] + df_for_ML['Wind Dir (°)'])/2
df_for_ML = df_for_ML.drop(columns=['Wind Spd (ms)', 'wind speed (ms)', 'wind dir (°)', 'wind gusts 1h (ms)'])


df_for_ML.head()

Unnamed: 0,T° (C°),T° max 24h (C°),T° min 24h (°),sea level pressure (hPa),Precip 1h (mm),Wind Dir (°),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,Datetime,Wind Speed (m/s)
0,21.1,21.6,20.8,1017.2,0.0,82.55,5.05,2.5,13.0,9.5,42.0,1017.6,21.1,24.2,05/02/2025-13:00,3.0
1,21.1,21.6,20.8,1017.1,0.0,78.3,4.55,2.8,13.0,9.4,24.0,1017.2,21.1,24.2,05/02/2025-13:10,3.0
2,21.1,21.6,20.8,1017.1,0.0,89.05,4.6,2.8,13.0,9.4,24.0,1017.0,21.1,24.2,05/02/2025-13:20,3.0
3,21.1,21.6,20.8,1017.0,0.0,84.75,5.1,2.5,13.0,9.5,42.0,1017.1,21.1,24.2,05/02/2025-13:30,3.0
4,21.2,21.6,20.8,1016.9,0.0,85.45,5.1,2.5,13.0,9.5,42.0,1017.3,21.2,24.2,05/02/2025-13:40,3.0


In [17]:
df_for_ML.columns

Index(['T° (C°)', 'T° max 24h (C°)', 'T° min 24h (°)',
       'sea level pressure (hPa)', 'Precip 1h (mm)', 'Wind Dir (°)',
       'Wind Gusts 1h (ms)', 'Wave Height (m)', 'Dominant Wave Period (s)',
       'Avg Wave Period (s)', 'Dominant Wave Dir', 'Pressure', 'Air Temp',
       'Water Temp', 'Datetime', 'Wind Speed (m/s)'],
      dtype='object')

In [18]:
def process_datetime(df, datetime_col):
    # Conversion de la colonne en datetime
    df[datetime_col] = pd.to_datetime(df[datetime_col], format='%d/%m/%Y-%H:%M', errors='coerce')
    
    # Fonction pour déterminer la période de la journée
    def get_period(hour):
        if 5 <= hour < 12:
            return 'Morning'
        elif 12 <= hour < 17:
            return 'Mid Day'
        elif 17 <= hour < 21:
            return 'Evening'
        else:
            return 'Night'
    
    # Créer la colonne 'Période'
    df['Période'] = df[datetime_col].dt.hour.apply(get_period)
    
    # Créer la colonne 'Mois' et supprimer la colonne d'origine
    df['Mois'] = df[datetime_col].dt.strftime('%B')
    df.drop(columns=[datetime_col], inplace=True)
    
    return df

# Exemple d'utilisation
df_for_ML = process_datetime(df_for_ML, 'Datetime')

df_for_ML.head()

Unnamed: 0,T° (C°),T° max 24h (C°),T° min 24h (°),sea level pressure (hPa),Precip 1h (mm),Wind Dir (°),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,Wind Speed (m/s),Période,Mois
0,21.1,21.6,20.8,1017.2,0.0,82.55,5.05,2.5,13.0,9.5,42.0,1017.6,21.1,24.2,3.0,Milieu de journée,February
1,21.1,21.6,20.8,1017.1,0.0,78.3,4.55,2.8,13.0,9.4,24.0,1017.2,21.1,24.2,3.0,Milieu de journée,February
2,21.1,21.6,20.8,1017.1,0.0,89.05,4.6,2.8,13.0,9.4,24.0,1017.0,21.1,24.2,3.0,Milieu de journée,February
3,21.1,21.6,20.8,1017.0,0.0,84.75,5.1,2.5,13.0,9.5,42.0,1017.1,21.1,24.2,3.0,Milieu de journée,February
4,21.2,21.6,20.8,1016.9,0.0,85.45,5.1,2.5,13.0,9.5,42.0,1017.3,21.2,24.2,3.0,Milieu de journée,February


In [29]:

append_data(df_for_ML, df_meteo_and_marine, 'time')

Number of rows added: 144


In [30]:
merged_df.to_csv('merged_df.csv')

In [31]:
null_values = merged_df.isnull().sum()
print(f"Row Number : {merged_df.shape[0]}")
print(null_values)

Row Number : 144
wind speed (ms)             0
wind dir (°)                0
wind gusts 1h (ms)          0
T° (C°)                     0
T° max 24h (C°)             0
T° min 24h (°)              0
sea level pressure (hPa)    0
Precip 1h (mm)              0
Wind Dir (°)                0
Wind Spd (ms)               0
Wind Gusts 1h (ms)          0
Wave Height (m)             0
Dominant Wave Period (s)    0
Avg Wave Period (s)         0
Dominant Wave Dir           0
Pressure                    0
Air Temp                    0
Water Temp                  0
Datetime                    0
Wind Speed (m/s)            0
dtype: int64


In [32]:
print(df_meteo.columns)
print(df_marine.columns)
# Fusionner sur 'time' en utilisant un 'outer join' pour garder toutes les colonnes des deux DataFrames
merged_df = df_meteo.merge(df_marine, how='outer', on='time', suffixes=('_meteo', '_marine'))

# Afficher les colonnes du DataFrame fusionné
print(merged_df.columns)

Index(['time', 'wind speed (ms)', 'wind dir (°)', 'wind gusts 1h (ms)',
       'T° (C°)', 'T° max 24h (C°)', 'T° min 24h (°)',
       'sea level pressure (hPa)', 'Precip 1h (mm)'],
      dtype='object')
Index(['Wind Dir (°)', 'Wind Spd (ms)', 'Wind Gusts 1h (ms)',
       'Wave Height (m)', 'Dominant Wave Period (s)', 'Avg Wave Period (s)',
       'Dominant Wave Dir', 'Pressure', 'Air Temp', 'Water Temp', 'time'],
      dtype='object')
Index(['time', 'wind speed (ms)', 'wind dir (°)', 'wind gusts 1h (ms)',
       'T° (C°)', 'T° max 24h (C°)', 'T° min 24h (°)',
       'sea level pressure (hPa)', 'Precip 1h (mm)', 'Wind Dir (°)',
       'Wind Spd (ms)', 'Wind Gusts 1h (ms)', 'Wave Height (m)',
       'Dominant Wave Period (s)', 'Avg Wave Period (s)', 'Dominant Wave Dir',
       'Pressure', 'Air Temp', 'Water Temp'],
      dtype='object')


In [33]:
merged_df.head()

Unnamed: 0,time,wind speed (ms),wind dir (°),wind gusts 1h (ms),T° (C°),T° max 24h (C°),T° min 24h (°),sea level pressure (hPa),Precip 1h (mm),Wind Dir (°),Wind Spd (ms),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp
0,2024-12-21 00:00:00,,,,,,,,,270.0,3.0,4.0,,,,,1016.1,24.6,26.0
1,2024-12-21 00:10:00,,,,,,,,,270.0,2.0,4.0,4.1,,10.8,321.0,1016.0,24.7,26.0
2,2024-12-21 00:20:00,,,,,,,,,270.0,3.0,5.0,4.1,16.0,10.8,321.0,1016.0,24.7,26.0
3,2024-12-21 00:30:00,,,,,,,,,260.0,2.0,4.0,,,,,1015.7,24.7,26.0
4,2024-12-21 00:40:00,,,,,,,,,260.0,3.0,5.0,3.8,,10.7,320.0,1015.8,24.7,26.0


Normalize Date and Time

In [34]:
df_marine = normalize_datetime(df_marine_one, "time")
df_meteo = normalize_datetime(df_meteo_one, "validdate")
df_list = [df_marine, df_meteo]

for df in df_list:
    print(check_data_json_compliance(df))
    df.columns = df.columns.str.strip()
    print(df.columns)

{'NaN': np.int64(38240)}
Index(['wind_direction', 'wind_speed', 'wind_gust', 'wave_height',
       'dominant_wave_period', 'average_wave_period',
       'dominant_wave_direction', 'pressure', 'air_temperature',
       'water_temperature', 'dewpoint', 'visibility', '3hr_pressure_tendency',
       'water_level_above_mean', 'Datetime'],
      dtype='object')
No non-compliant JSON values found.
Index(['lat', 'lon', 'wind_speed_10m:ms', 'wind_dir_10m:d',
       'wind_gusts_10m_1h:ms', 't_2m:C', 't_max_2m_24h:C', 't_min_2m_24h:C',
       'msl_pressure:hPa', 'precip_1h:mm', 'Datetime'],
      dtype='object')


In [35]:
df_marine.head()

Unnamed: 0,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,air_temperature,water_temperature,dewpoint,visibility,3hr_pressure_tendency,water_level_above_mean,Datetime
0,20.0,7.0,9.0,,,,,1019.4,20.8,24.2,,,,,04/02/2025-15:10
1,30.0,6.0,9.0,,,,,1019.4,20.9,24.2,,,-0.9,,04/02/2025-15:00
2,30.0,7.0,9.0,3.6,15.0,9.4,16.0,1019.4,20.8,24.2,,,,,04/02/2025-14:50
3,30.0,7.0,9.0,3.6,,9.4,16.0,1019.3,20.9,24.2,,,,,04/02/2025-14:40
4,20.0,7.0,11.0,,,,,1019.2,20.9,24.2,,,,,04/02/2025-14:30


Rename Columns + Delete Useless Ones

In [36]:
# Marine 
df_marine = df_marine.drop(columns={'visibility','3hr_pressure_tendency','water_level_above_mean'})
df_marine.rename(columns={'wind_direction': 'Wind Dir (°)', 'wind_speed':'Wind Spd (ms)',
                          'wind_gust':'Wind Gusts 1h (ms)','wave_height':'Wave Height (m)','dominant_wave_period':'Dominant Wave Period (s)',
                          'average_wave_period':'Avg Wave Period (s)', 'dominant_wave_direction': 'Dominant Wave Dir',
                          'air_temperature':'Air Temp','water_temperature':'Water Temp','pressure':'Pressure'}, inplace=True)

# Meteo
df_meteo = df_meteo.drop(columns=['lat','lon'])
df_meteo.rename(columns={'wind_speed_10m:ms':'wind speed (ms)','wind_dir_10m:d':'wind dir (°)',
                          'wind_gusts_10m_1h:ms':'wind gusts 1h (ms)','t_2m:C':'T° (C°)','t_max_2m_24h:C':'T° max 24h (C°)',
                          't_min_2m_24h:C':'T° min 24h (°)', 'msl_pressure:hPa': 'sea level pressure (hPa)','precip_1h:mm':'Precip 1h (mm)'}, inplace=True)

In [37]:
df_marine.head()

Unnamed: 0,Wind Dir (°),Wind Spd (ms),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,dewpoint,Datetime
0,20.0,7.0,9.0,,,,,1019.4,20.8,24.2,,04/02/2025-15:10
1,30.0,6.0,9.0,,,,,1019.4,20.9,24.2,,04/02/2025-15:00
2,30.0,7.0,9.0,3.6,15.0,9.4,16.0,1019.4,20.8,24.2,,04/02/2025-14:50
3,30.0,7.0,9.0,3.6,,9.4,16.0,1019.3,20.9,24.2,,04/02/2025-14:40
4,20.0,7.0,11.0,,,,,1019.2,20.9,24.2,,04/02/2025-14:30


Deal with Marine Data Missing Values

In [38]:
print(f"Number of rows in Marine Data : {df_marine.shape[0]}")

print('\nNumber of Missing Values Before')
# Vérification avant imputations
for col in df_marine.columns:
    print(f"{col}: {df_marine[col].isnull().sum()}")

# Exemple d'utilisation
df_marine = impute_missing_values(df_marine)

print("\nNumber of Missing Values After")
# Vérification après imputations
for col in df_marine.columns:
    print(f"{col}: {df_marine[col].isnull().sum()}")

Number of rows in Marine Data : 6509

Number of Missing Values Before
Wind Dir (°): 55
Wind Spd (ms): 4
Wind Gusts 1h (ms): 4
Wave Height (m): 2894
Dominant Wave Period (s): 4364
Avg Wave Period (s): 2894
Dominant Wave Dir: 2894
Pressure: 13
Air Temp: 35
Water Temp: 134
dewpoint: 6509
Datetime: 0

Number of Missing Values After
Wind Dir (°): 0
Wind Spd (ms): 0
Wind Gusts 1h (ms): 0
Wave Height (m): 0
Dominant Wave Period (s): 0
Avg Wave Period (s): 0
Dominant Wave Dir: 0
Pressure: 0
Air Temp: 0
Water Temp: 0
Datetime: 0


In [39]:
df_marine.head()

Unnamed: 0,Wind Dir (°),Wind Spd (ms),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,Datetime
0,20.0,7.0,9.0,2.9,13.0,8.3,310.0,1019.4,20.8,24.2,04/02/2025-15:10
1,30.0,6.0,9.0,2.9,13.0,8.3,310.0,1019.4,20.9,24.2,04/02/2025-15:00
2,30.0,7.0,9.0,3.6,15.0,9.4,16.0,1019.4,20.8,24.2,04/02/2025-14:50
3,30.0,7.0,9.0,3.6,13.0,9.4,16.0,1019.3,20.9,24.2,04/02/2025-14:40
4,20.0,7.0,11.0,2.9,13.0,8.3,310.0,1019.2,20.9,24.2,04/02/2025-14:30


Deal with Meteo Data Missing Values

In [40]:
print(f"Number of rows in Meteo Data : {df_meteo.shape[0]}")

print('\nNumber of Missing Values Before')
# Vérification avant imputations
for col in df_meteo.columns:
    print(f"{col}: {df_meteo[col].isnull().sum()}")

# Exemple d'utilisation
df_meteo = impute_missing_values(df_meteo)

print("\nNumber of Missing Values After")
# Vérification après imputations
for col in df_meteo.columns:
    print(f"{col}: {df_meteo[col].isnull().sum()}")

Number of rows in Meteo Data : 145

Number of Missing Values Before
wind speed (ms): 0
wind dir (°): 0
wind gusts 1h (ms): 0
T° (C°): 0
T° max 24h (C°): 0
T° min 24h (°): 0
sea level pressure (hPa): 0
Precip 1h (mm): 0
Datetime: 0

Number of Missing Values After
wind speed (ms): 0
wind dir (°): 0
wind gusts 1h (ms): 0
T° (C°): 0
T° max 24h (C°): 0
T° min 24h (°): 0
sea level pressure (hPa): 0
Precip 1h (mm): 0
Datetime: 0


Check Non Json Compliant Values

In [41]:
df_list = [df_meteo, df_marine]
for df in df_list:
    print(check_data_json_compliance(df))
    print(df.dtypes)
    print('')
    print(df.columns)

No non-compliant JSON values found.
wind speed (ms)             float64
wind dir (°)                float64
wind gusts 1h (ms)          float64
T° (C°)                     float64
T° max 24h (C°)             float64
T° min 24h (°)              float64
sea level pressure (hPa)    float64
Precip 1h (mm)              float64
Datetime                     object
dtype: object

Index(['wind speed (ms)', 'wind dir (°)', 'wind gusts 1h (ms)', 'T° (C°)',
       'T° max 24h (C°)', 'T° min 24h (°)', 'sea level pressure (hPa)',
       'Precip 1h (mm)', 'Datetime'],
      dtype='object')
No non-compliant JSON values found.
Wind Dir (°)                float64
Wind Spd (ms)               float64
Wind Gusts 1h (ms)          float64
Wave Height (m)             float64
Dominant Wave Period (s)    float64
Avg Wave Period (s)         float64
Dominant Wave Dir           float64
Pressure                    float64
Air Temp                    float64
Water Temp                  float64
Datetime              

Connect and Load to Google Sheet

In [42]:
df_marine.head()

Unnamed: 0,Wind Dir (°),Wind Spd (ms),Wind Gusts 1h (ms),Wave Height (m),Dominant Wave Period (s),Avg Wave Period (s),Dominant Wave Dir,Pressure,Air Temp,Water Temp,Datetime
0,20.0,7.0,9.0,2.9,13.0,8.3,310.0,1019.4,20.8,24.2,04/02/2025-15:10
1,30.0,6.0,9.0,2.9,13.0,8.3,310.0,1019.4,20.9,24.2,04/02/2025-15:00
2,30.0,7.0,9.0,3.6,15.0,9.4,16.0,1019.4,20.8,24.2,04/02/2025-14:50
3,30.0,7.0,9.0,3.6,13.0,9.4,16.0,1019.3,20.9,24.2,04/02/2025-14:40
4,20.0,7.0,11.0,2.9,13.0,8.3,310.0,1019.2,20.9,24.2,04/02/2025-14:30


In [43]:
sheet_data_marine_content = pd.DataFrame(sheet_data_marine.get_all_records())
sheet_data_marine_content

In [44]:
print(df_meteo.columns)
print(df_marine.columns)

Index(['wind speed (ms)', 'wind dir (°)', 'wind gusts 1h (ms)', 'T° (C°)',
       'T° max 24h (C°)', 'T° min 24h (°)', 'sea level pressure (hPa)',
       'Precip 1h (mm)', 'Datetime'],
      dtype='object')
Index(['Wind Dir (°)', 'Wind Spd (ms)', 'Wind Gusts 1h (ms)',
       'Wave Height (m)', 'Dominant Wave Period (s)', 'Avg Wave Period (s)',
       'Dominant Wave Dir', 'Pressure', 'Air Temp', 'Water Temp', 'Datetime'],
      dtype='object')


In [45]:
def merge_dataframes_on_datetime(df1, df2):
    # Nettoyer les noms de colonnes
    df1.columns = df1.columns.str.strip()
    df2.columns = df2.columns.str.strip()

    # S'assurer que la colonne Datetime est bien au format datetime
    df1['Datetime'] = pd.to_datetime(df1['Datetime'], format='%d/%m/%Y-%H:%M', errors='coerce')
    df2['Datetime'] = pd.to_datetime(df2['Datetime'], format='%d/%m/%Y-%H:%M', errors='coerce')

    # Fusionner les deux DataFrames sur la colonne Datetime en utilisant une jointure interne
    merged_df = pd.merge(df1[['Datetime']], df2[['Datetime']], on='Datetime', how='inner')

    # Trier par Datetime pour un ordre chronologique
    merged_df.sort_values(by='Datetime', inplace=True)

merged_df = merge_dataframes_on_datetime(df_meteo, df_marine)