In [1]:
from imports import *
from functions import *
from IPython.core.display import *

In [2]:
path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"
with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

db = "MyProjects"
schema = "End_To_End_Oceanography_ML"

# Créer l'engine PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
conn = engine.connect()

Get Data from APIs

Get Cleaned Stations ID List

In [3]:
# get all stations and some metadata as a Pandas DataFrame
stations_df = api.stations()
# parse the response as a dictionary
stations_df = api.stations(as_df=True)
stations_df.head()

Unnamed: 0,Station,Hull No./Config and Location,Location Lat/Long,Wind Speed,Wind Direction,Sea Level Pressure,Wave Height,Dominant Period,Air Temp,Water Temp,Dew Point,Remark
0,41001,East Hatteras,34.70N 72.23W,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,"Buoy recovered 10/26/24, data release stopped."
1,41002,3DV33 (SC) South Hatteras,31.75N 74.93W,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Sensor/system failure.,Buoy adrift 12/30/24.
2,41004,3DV02 (SC) Edisto,32.50N 79.08W,100,100,100,100,100,100,99,100,Dewpoint is intermittent.
3,41008,3D36 (SC) Grays Reef,31.40N 80.85W,9,9,100,99,99,9,99,9,All data is intermittent after 8/18/23.
4,41009,3D65 (SC) Canaveral,28.50N 80.18W,100,100,100,31,31,73,38,73,Wave data ceased 3/6/25. Water temperature cea...


In [4]:
blacklist = ["Failure", "ceased","failed","recovered","stopped", 'adrift']
stations_id_set = set()

print(f'Avant Filtre: {stations_df.shape[0]}')
# Liste pour collecter les indices à supprimer
indices_a_supprimer = []

for idx, row in stations_df.iterrows():
    station_id = row["Station"]
    
    # Vérifier si "Remark" n'est pas NaN et si un des éléments de blacklist est dans "Remark"
    if isinstance(row["Remark"], str) and any(blacklist_word.lower() in row["Remark"].lower() for blacklist_word in blacklist):
        # Ajouter l'index à la liste
        print(f'Station "{station_id}" Failed the Remarks test\n')
        indices_a_supprimer.append(idx)
    else:
        # Si l'URL en temps réel passe, afficher et ajouter le station_id à l'ensemble
        print(f'Buoy {station_id} passed the Remark Test !')
        stations_id_set.add(station_id)
    
# Supprimer les lignes après la boucle
stations_df.drop(index=indices_a_supprimer, inplace=True)

print(f'Après Filtre: {stations_df.shape[0]}')

Avant Filtre: 146
Station "41001" Failed the Remarks test

Station "41002" Failed the Remarks test

Buoy 41004 passed the Remark Test !
Buoy 41008 passed the Remark Test !
Station "41009" Failed the Remarks test

Buoy 41010 passed the Remark Test !
Buoy 41013 passed the Remark Test !
Station "41025" Failed the Remarks test

Buoy 41040 passed the Remark Test !
Station "41041" Failed the Remarks test

Buoy 41043 passed the Remark Test !
Buoy 41044 passed the Remark Test !
Station "41046" Failed the Remarks test

Station "41047" Failed the Remarks test

Station "41048" Failed the Remarks test

Buoy 41049 passed the Remark Test !
Station "42001" Failed the Remarks test

Station "42002" Failed the Remarks test

Buoy 42003 passed the Remark Test !
Buoy 42012 passed the Remark Test !
Station "42019" Failed the Remarks test

Buoy 42020 passed the Remark Test !
Buoy 42035 passed the Remark Test !
Station "42036" Failed the Remarks test

Station "42039" Failed the Remarks test

Station "42040" F

Build Table Names and Collect their Data in a Dict

In [5]:
stations_id_list = list(stations_id_set)
len(stations_id_list)
one_station = stations_id_list[0]
metadata = get_station_metadata(one_station)
metadata

{'Watch circle radius': '212 yards',
 'Water depth': '262.4 m',
 'Sea temp depth': '2 m below water line',
 'Barometer elevation': '2.4 m above mean sea level',
 'Anemometer height': '3.8 m above site elevation',
 'Air temp height': '3.4 m above site elevation',
 'Site elevation': 'sea level',
 'Location': '48.493 N 124.727 W (48°29\'36" N 124°43\'38" W)',
 'Statation Type': 'Funding provided by the United States Coast GuardOwned and maintained by National Data Buoy Center, 3-meter discus buoy, SCOOP payload',
 'Name': 'Station 46087 (LLNR 756) - Neah Bay - 6 NM North of Cape Flattery, WA (Traffic Separation Lighted Buoy)'}

In [6]:
lat_buoy, lon_buoy, station_name, station_id, station_zone, marine_data_table_name = parse_buoy_json(metadata)
print(f" {lat_buoy}, {lon_buoy}\n{station_name}, {station_id}\n{station_zone}, {marine_data_table_name}")

 48.49, 124.73
Station 46087 (LLNR 756), 46087
Neah Bay, marine_data_Station_46087_(LLNR_756)_Neah_Bay_48-49_124-73


In [7]:
for idx, row in stations_df.iterrows():
    stat_id = row["Station"]
    stations_id_list.append(stat_id)

chosen_stations_id_set = set()
station_table_mapping = {}  # Associe chaque station_id à un unique table_name
table_name_set = set()

for _ in range(200):
    random_station = random.choice(stations_id_list)

    if random_station not in chosen_stations_id_set:
        chosen_stations_id_set.add(random_station)

        buoy_metadata = get_station_metadata(random_station)
        lat_buoy, lon_buoy, station_name, station_id,station_zone, table_name = parse_buoy_json(buoy_metadata)

        # Vérifier si la station a déjà été ajoutée avec un nom de table
        if random_station not in station_table_mapping:
            station_table_mapping[random_station] = {
                "Name": station_name,
                "lat": lat_buoy,
                "lon": lon_buoy,
                "zone": station_zone,
                "table name": table_name
            }

            table_name_set.add(table_name)

# Affichage des résultats
for item in table_name_set:
    print(item)

print(f'Longueur du set : {len(table_name_set)})')

marine_data_Station_51000_(LLNR_28005-9)_NORTHERN_HAWAII_ONE_23-53_153-79
marine_data_Station_IOSN3_Isle_of_Shoals,_NH_42-97_70-62
marine_data_Station_PTGC1_Point_Arguello,_CA_34-58_120-65
marine_data_Station_FFIA2_Five_Fingers,_AK_57-27_133-63
marine_data_Station_46086_(LLNR_81)_SAN_CLEMENTE_BASIN_32-5_118-05
marine_data_Station_POTA2_Potato_Point,_AK_61-06_146-7
marine_data_Station_41049_SOUTH_BERMUDA_27-5_62-27
marine_data_Station_SISW1_Smith_Island,_WA_48-32_122-83
marine_data_Station_46087_(LLNR_756)_Neah_Bay_48-49_124-73
marine_data_Station_44013_(LLNR_420)_BOSTON_16_NM_East_of_Boston,_MA_42-35_70-65
marine_data_Station_46022_(LLNR_500)_EEL_RIVER_40-72_124-54
marine_data_Station_46025_(LLNR_181)_Santa_Monica_Basin_33-76_119-05
marine_data_Station_51002_(LLNR_28005-1)_SOUTHWEST_HAWAII_17-07_157-75
marine_data_Station_46072_(LLNR_27510)_CENTRAL_ALEUTIANS_230_NM_SW_Dutch_Harbor_51-65_172-15
marine_data_Station_SANF1_Sand_Key,_FL_24-46_81-88
marine_data_Station_SMKF1_Sombrero_Key,_FL

In [8]:
def print_with_flush(message):
    sys.stdout.write(f'\r{message}  ')  # \r permet de revenir au début de la ligne
    sys.stdout.flush()  # Force l'affichage immédiat

Build Bronze Layer Table Names

In [9]:
bronze_tables_list = []

for item in station_table_mapping.values():
    bronze_table_name = f'bronze_{item["table name"]}'
    bronze_tables_list.append(bronze_table_name)
bronze_tables_list

['bronze_marine_data_Station_51002_(LLNR_28005-1)_SOUTHWEST_HAWAII_17-07_157-75',
 'bronze_marine_data_Station_SBIO1_South_Bass_Island,_OH_41-63_82-84',
 'bronze_marine_data_Station_41049_SOUTH_BERMUDA_27-5_62-27',
 'bronze_marine_data_Station_46053_(LLNR_196)_EAST_SANTA_BARBARA_34-24_119-84',
 'bronze_marine_data_Station_46025_(LLNR_181)_Santa_Monica_Basin_33-76_119-05',
 'bronze_marine_data_Station_IOSN3_Isle_of_Shoals,_NH_42-97_70-62',
 'bronze_marine_data_Station_42012_(LLNR_138)_ORANGE_BEACH_30-06_87-55',
 'bronze_marine_data_Station_42003_(LLNR_1460)_East_GULF_25-93_85-62',
 'bronze_marine_data_Station_SANF1_Sand_Key,_FL_24-46_81-88',
 'bronze_marine_data_Station_BURL1_Southwest_Pass,_LA_28-91_89-43',
 'bronze_marine_data_Station_KTNF1_Keaton_Beach,_FL_29-82_83-59',
 'bronze_marine_data_Station_41044_NE_ST_MARTIN_21-58_58-63',
 'bronze_marine_data_Station_LONF1_Long_Key,_FL_24-84_80-86',
 'bronze_marine_data_Station_46069_(LLNR_181-6)_SOUTH_SANTA_ROSA_33-68_120-21',
 'bronze_mari

In [11]:
first_key = next(iter(station_table_mapping))
print(first_key)

buoy_data = NDBC.realtime_observations(first_key)
df = pd.DataFrame(data=buoy_data)
col =list(df.columns)
col

for table in bronze_tables_list:

    try:
        # Créer la table en appelant la fonction avec les bons paramètres
        create_schema_and_table(conn=conn, schema=schema, table_name=table, col=col)
        
        print(f"Table {table} created successfully!")
    except Exception as e:
        # Capturer l'exception et afficher un message d'erreur avec l'exception
        print(f"Table {table} couldn't be created! \nError: {str(e)}\n")

51002
Schema "End_To_End_Oceanography_ML" already exists.
Table 'bronze_marine_data_Station_51002_(LLNR_28005-1)_SOUTHWEST_HAWAII_17-07_157-75' does not exist. Creating...
Table 'bronze_marine_data_Station_51002_(LLNR_28005-1)_SOUTHWEST_HAWAII_17-07_157-75' created in schema 'End_To_End_Oceanography_ML'.
Table bronze_marine_data_Station_51002_(LLNR_28005-1)_SOUTHWEST_HAWAII_17-07_157-75 created successfully!
Schema "End_To_End_Oceanography_ML" already exists.
Table 'bronze_marine_data_Station_SBIO1_South_Bass_Island,_OH_41-63_82-84' does not exist. Creating...
Table 'bronze_marine_data_Station_SBIO1_South_Bass_Island,_OH_41-63_82-84' created in schema 'End_To_End_Oceanography_ML'.
Table bronze_marine_data_Station_SBIO1_South_Bass_Island,_OH_41-63_82-84 created successfully!
Schema "End_To_End_Oceanography_ML" already exists.
Table 'bronze_marine_data_Station_41049_SOUTH_BERMUDA_27-5_62-27' does not exist. Creating...
Table 'bronze_marine_data_Station_41049_SOUTH_BERMUDA_27-5_62-27' cre

In [None]:
stations_id_list =list(stations_id_set)
buoy_chosen = random.choice(stations_id_list)

In [None]:
buoy_chosen_metadata = get_station_metadata(buoy_chosen)
buoy_chosen_metadata

{'Watch circle radius': '233 yards',
 'Water depth': '131 m',
 'Sea temp depth': '2 m below water line',
 'Barometer elevation': '2.4 m above mean sea level',
 'Anemometer height': '3.8 m above site elevation',
 'Air temp height': '3.4 m above site elevation',
 'Site elevation': 'sea level',
 'Location': '46.163 N 124.487 W (46°9\'48" N 124°29\'12" W)',
 'Statation Type': 'Owned and maintained by National Data Buoy Center, 3-meter discus buoy w/ seal cage, SCOOP payload',
 'Name': 'Station 46029 (LLNR 688) - COLUMBIA RIVER BAR - 20NM West of Columbia River Mouth'}

In [None]:
# Exemple d'utilisation avec un dictionnaire 'buoy_chosen_metadata'
try:
    lat_buoy, lon_buoy, station_name, station_id, station_zone, marine_data_table_name = parse_buoy_json(buoy_chosen_metadata)
    print(lat_buoy, lon_buoy, station_name, station_id, station_zone, marine_data_table_name)
except ValueError as e:
    print(f"Erreur lors du traitement des données: {e}")

46.16 124.49 Station 46029 (LLNR 688) COLUMBIA RIVER BAR COLUMBIA RIVER BAR Station_46029__LLNR_688__COLUMBIA_RIVER_BAR


In [None]:
df_marine = NDBC.realtime_observations(buoy_chosen)
print(type(buoy_chosen_metadata))
df_marine.head()

<class 'dict'>


Unnamed: 0,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,air_temperature,water_temperature,dewpoint,visibility,3hr_pressure_tendency,water_level_above_mean,time
0,280.0,3.0,5.0,,,,,1019.4,2.2,4.6,-0.9,,,,2025-03-15 15:30:00+00:00
1,270.0,3.0,4.0,1.7,8.0,6.3,326.0,1019.5,2.1,4.6,-0.9,,,,2025-03-15 15:20:00+00:00
2,290.0,4.0,5.0,,,,,1019.5,2.1,4.6,-0.9,,,,2025-03-15 15:10:00+00:00
3,290.0,4.0,5.0,,,,,1019.6,2.1,4.6,-0.8,,-0.3,,2025-03-15 15:00:00+00:00
4,280.0,5.0,6.0,1.7,7.0,6.1,303.0,1019.6,2.1,4.6,-0.8,,,,2025-03-15 14:50:00+00:00


Marine API

Get Data From Json

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from IPython.core.display import display, HTML

# Remplacer `{station_id}` par l'identifiant de la station spécifique
url = f"https://www.ndbc.noaa.gov/station_page.php?station={station_id}"

# Faire une requête GET pour obtenir le HTML de la page
response = requests.get(url)

# Vérifier que la requête a réussi
if response.status_code == 200:
    # Parse le HTML avec BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Trouver la division avec l'ID 'stationmetadata'
    station_metadata = soup.find(id="stationmetadata")
    
    # Vérifier si la division existe
    if station_metadata:
        # Chercher les deux images spécifiques
        img_1 = station_metadata.find('img', src='/images/stations/3mfoam_scoop_mini.jpg')
        img_2 = station_metadata.find('img', src='/images/buoycam/W64A_2025_03_15_1510.jpg')

        # Si l'image 1 est trouvée, modifier son lien en absolu
        if img_1:
            img_1['src'] = urljoin(url, img_1['src'])

        # Si l'image 2 est trouvée, modifier son lien en absolu
        if img_2:
            img_2['src'] = urljoin(url, img_2['src'])
        
        # Afficher directement le HTML avec les liens des images mis à jour
        display(HTML(str(station_metadata)))  # Affiche la division en HTML rendu
    else:
        print("La division avec l'ID 'stationmetadata' n'a pas été trouvée.")
else:
    print(f"Erreur lors de la récupération de la page, statut: {response.status_code}")


ImportError: cannot import name 'display' from 'IPython.core.display' (C:\Users\f.gionnane\AppData\Roaming\Python\Python313\site-packages\IPython\core\display.py)

In [None]:
coord_buoy_1 = (lat_buoy, lon_buoy)


map = folium.Map(location=[lat_buoy,lon_buoy], zoom_start=6)

folium.Marker(location=[lat_buoy, lon_buoy], popup=f"Chosen Buoy : {station_name}, lat :{lat_buoy},lon :{lon_buoy}").add_to(map)



In [None]:
#### Prise en charge de plusieurs buoys à la fois
caribbean_df = stations_df[(stations_df['Lat'] >= 9) & 
                           (stations_df['Lat'] <= 25) & 
                           (stations_df['Lon'] >= -85) & 
                           (stations_df['Lon'] <= -60)]
caribbean_df.shape[0]

buoys_ids = []

for index, row in caribbean_df.iterrows():
    buoys_ids.append(row['Station'])  
print(len(buoys_ids))    

# Liste pour stocker les dataframes
all_dataframes = []

# Compteurs pour les bouées réussies et échouées
successful_buoy = 0
failed_buoy = 0

# Parcours de chaque bouée
for buoy_id in buoys_ids:
    try:
        # Récupère les observations en temps réel pour chaque bouée
        df_caribbean_buoy = NDBC.realtime_observations(buoy_id)

        # Ajoute le dataframe à la liste
        all_dataframes.append(df_caribbean_buoy)
        successful_buoy += 1

        # Efface et met à jour la ligne des bouées réussies
        print(f"\r✅ Bouées réussies : {successful_buoy}", end='', flush=True)

    except Exception:
        # Incrémente le compteur des échouées pour toutes les erreurs
        failed_buoy += 1

        # Efface et met à jour la ligne des bouées échouées
        print(f"\r❌ Bouées échouées : {failed_buoy}", end='', flush=True)

# Affichage final propre avec un saut de ligne
print("\n")

# Concaténation des dataframes si disponibles
if all_dataframes:
    final_df = pd.concat(all_dataframes, ignore_index=True)
    print(f"✔️ Concaténation terminée. Nombre de lignes : {final_df.shape[0]} \n Nombre de colonnes : {final_df.shape[1]}")
    print(f"✅ Bouées réussies : {successful_buoy}", end='', flush=True)
    print(f"❌ Bouées échouées : {failed_buoy}", end='', flush=True)

else:
    print("⚠️ Aucune donnée récupérée.")
    

In [None]:
df_marine = NDBC.realtime_observations(nearest)

# Afficher le résultat
print(df_marine.shape)
df_marine.head()

(6481, 15)


Unnamed: 0,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,air_temperature,water_temperature,dewpoint,visibility,3hr_pressure_tendency,water_level_above_mean,time
0,90.0,6.0,7.0,,,,,1012.2,27.0,27.6,22.9,,2.0,,2025-03-15 12:00:00+00:00
1,90.0,7.0,8.0,1.1,6.0,4.3,97.0,1012.2,26.8,27.6,22.5,,,,2025-03-15 11:50:00+00:00
2,90.0,7.0,8.0,,,,,1012.1,26.9,27.6,22.9,,,,2025-03-15 11:40:00+00:00
3,90.0,7.0,8.0,,,,,1011.8,26.9,27.6,22.8,,,,2025-03-15 11:30:00+00:00
4,90.0,7.0,9.0,1.0,6.0,4.2,100.0,1011.7,26.9,27.6,22.9,,,,2025-03-15 11:20:00+00:00


In [None]:
bronze_marine_data_table_name = f"Bronze_marine_data_{station_name.replace(' ', '_')}_{station_zone.replace(' ', '_')}_{str(lat_buoy).replace('.', '-')}_{str(lon_buoy).replace('.', '-')}"
print(bronze_marine_data_table_name)
load_data_in_table(db=db, schema=schema, table_name=bronze_marine_data_table_name, df=df_marine,conn=conn,key_column='time')

Bronze_marine_data_Station_42058_Central_Caribbean_14-51_75-15
Schema "End_To_End_Oceanography_ML" already exists.
Table 'Bronze_marine_data_Station_42058_Central_Caribbean_14-51_75-15' does not exist. Creating...
Table 'Bronze_marine_data_Station_42058_Central_Caribbean_14-51_75-15' created in schema 'End_To_End_Oceanography_ML'.
Data inserted successfully.


In [None]:
coordinates = [lat_buoy, lon_buoy]
df_meteo = meteo_api_request(coordinates=coordinates)
print(df_meteo.shape)
df_meteo.head()

(2376, 18)


Unnamed: 0,date,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,showers,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,visibility,wind_speed_10m,soil_temperature_0cm,soil_moisture_0_to_1cm,is_day
0,2024-12-12 00:00:00+00:00,,,,,,,,,,,,,,,,,0.0
1,2024-12-12 01:00:00+00:00,,,,,,,,,,,,,,,,,0.0
2,2024-12-12 02:00:00+00:00,,,,,,,,,,,,,,,,,1.0
3,2024-12-12 03:00:00+00:00,,,,,,,,,,,,,,,,,1.0
4,2024-12-12 04:00:00+00:00,,,,,,,,,,,,,,,,,1.0


In [None]:
# Création du nom de la table
bronze_meteo_data_table_name = f"bronze_meteo_data_{station_name.replace(' ', '_')}_{station_zone.replace(' ', '_')}_{str(lat_buoy).replace('.', '-')}_{str(lon_buoy).replace('.', '-')}"
bronze_meteo_data_table_name = bronze_meteo_data_table_name.replace('.', '-')
load_data_in_table(db=db, schema=schema, table_name=bronze_meteo_data_table_name, df=df_meteo,conn=conn,key_column='date')

Schema "End_To_End_Oceanography_ML" already exists.
Table 'bronze_meteo_data_Station_42058_Central_Caribbean_14-51_75-15' does not exist. Creating...
Table 'bronze_meteo_data_Station_42058_Central_Caribbean_14-51_75-15' created in schema 'End_To_End_Oceanography_ML'.
Data inserted successfully.


In [None]:
# Ajouter les nouvelles clés au JSON
buoy_near_SM["bronze_marine"] = bronze_marine_data_table_name
buoy_near_SM["bronze_meteo"] = bronze_meteo_data_table_name
buoy_near_SM["db"] = db
buoy_near_SM["schema"] = schema

# Enregistrer en écrasant le fichier s'il existe
with open("buoy_near_SM.json", "w") as f:
    json.dump(buoy_near_SM, f, indent=4)

In [None]:
with open("buoy_near_SM.json", "r") as f:
    buoy_near_SM = json.load(f)


path_postgresql_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\postgresql_creds.json"
with open(path_postgresql_creds, 'r') as file:
    content = json.load(file)
    user = content["user"]
    password = content["password"]
    host = content["host"]
    port = content["port"]

# Créer l'engine PostgreSQL
engine = create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{db}")
conn = engine.connect()

db = buoy_near_SM["db"]
schema = buoy_near_SM["schema"] 

bronze_marine_data_table_name = buoy_near_SM["bronze_marine"] 
bronze_meteo_data_table_name = buoy_near_SM["bronze_meteo"] 

In [None]:
try:
    df_marine_to_clean = fetch_table_data(conn=conn, schema=schema, table_name= bronze_marine_data_table_name)
    df_meteo_to_clean = fetch_table_data(conn=conn, schema=schema, table_name= bronze_meteo_data_table_name)
except Exception as e:
    print(e)

In [None]:
def auto_convert(df):
    for col in df.columns:
        # Essayer de convertir en datetime
        if df[col].dtype == 'object':
            try:
                # Si tu connais le format, tu peux spécifier ici, par exemple '%Y-%m-%d'
                # Exemple de format: '2021-01-01' ou '01/01/2021'
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='raise')  # Converte en datetime
            except Exception as e:
                pass

        # Essayer de convertir en numérique
        if df[col].dtype == 'object':
            try:
                df[col] = pd.to_numeric(df[col], errors='raise')  # Converte en numérique
            except Exception as e:
                pass

    return df

df_marine_to_clean = auto_convert(df_marine_to_clean)
print(df_marine_to_clean.dtypes)
df_marine_to_clean.head()

id                                       int64
wind_direction                         float64
wind_speed                             float64
wind_gust                              float64
wave_height                            float64
average_wave_period                    float64
dominant_wave_direction                float64
pressure                               float64
air_temperature                        float64
water_temperature                      float64
dewpoint                               float64
time                       datetime64[ns, UTC]
dtype: object


Unnamed: 0,id,wind_direction,wind_speed,wind_gust,wave_height,average_wave_period,dominant_wave_direction,pressure,air_temperature,water_temperature,dewpoint,time
0,1,70.0,6.0,7.0,1.7,4.9,81.0,1010.1,27.3,27.8,23.2,2025-03-14 22:10:00+00:00
1,2,70.0,6.0,7.0,1.7,4.9,81.0,1010.1,27.2,27.8,22.9,2025-03-14 22:00:00+00:00
2,3,70.0,6.0,7.0,0.9,4.2,96.0,1010.1,27.2,27.7,22.9,2025-03-14 21:50:00+00:00
3,4,60.0,6.0,7.0,0.9,4.2,96.0,1010.1,27.2,27.8,22.9,2025-03-14 21:40:00+00:00
4,5,60.0,6.0,7.0,1.7,4.9,81.0,1010.0,27.2,27.8,22.9,2025-03-14 21:30:00+00:00


In [None]:
df_marine_to_clean = handle_null_values(df_marine_to_clean)

In [None]:
df_meteo_to_clean = auto_convert(df_meteo_to_clean)
df_meteo_to_clean.dtypes

id                                      int64
date                      datetime64[ns, UTC]
temperature_2m                        float64
relative_humidity_2m                  float64
dew_point_2m                          float64
precipitation                         float64
rain                                  float64
showers                               float64
pressure_msl                          float64
surface_pressure                      float64
cloud_cover                           float64
cloud_cover_low                       float64
cloud_cover_mid                       float64
cloud_cover_high                      float64
visibility                            float64
wind_speed_10m                        float64
soil_temperature_0cm                  float64
soil_moisture_0_to_1cm                float64
is_day                                float64
dtype: object

In [None]:
df_marine_to_clean = handle_null_values(df_marine_to_clean)
df_meteo_to_clean = handle_null_values(df_meteo_to_clean)

Impute la colonne : temperature_2m avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : relative_humidity_2m avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : dew_point_2m avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : precipitation avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : rain avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : showers avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : pressure_msl avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : surface_pressure avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : cloud_cover avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : cloud_cover_low avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : cloud_cover_mid avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : cloud_cover_high avec la médiane (12.88% de valeurs manquantes)
Impute la colonne : visibilit

In [None]:
print(df_marine_to_clean.columns)
print(df_meteo_to_clean.columns)

Index(['id', 'wind_direction', 'wind_speed', 'wind_gust', 'wave_height',
       'average_wave_period', 'dominant_wave_direction', 'pressure',
       'air_temperature', 'water_temperature', 'dewpoint', 'time'],
      dtype='object')
Index(['id', 'date', 'temperature_2m', 'relative_humidity_2m', 'dew_point_2m',
       'precipitation', 'rain', 'showers', 'pressure_msl', 'surface_pressure',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'visibility', 'wind_speed_10m', 'soil_temperature_0cm',
       'soil_moisture_0_to_1cm', 'is_day'],
      dtype='object')


In [None]:
df_meteo_to_clean.head()

Unnamed: 0,Datetime,id,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,rain,showers,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,visibility,wind_speed_10m,soil_temperature_0cm,soil_moisture_0_to_1cm,is_day
0,2024-12-12 00:00:00+00:00,1.0,23.7,57.0,14.34,0.0,0.0,0.0,1012.7,945.48,26.0,0.0,0.0,0.0,24140.0,6.88,22.8,0.07,0.0
1,2024-12-12 01:00:00+00:00,2.0,23.7,57.0,14.34,0.0,0.0,0.0,1012.7,945.48,26.0,0.0,0.0,0.0,24140.0,6.88,22.8,0.07,0.0
2,2024-12-12 02:00:00+00:00,3.0,23.7,57.0,14.34,0.0,0.0,0.0,1012.7,945.48,26.0,0.0,0.0,0.0,24140.0,6.88,22.8,0.07,1.0
3,2024-12-12 03:00:00+00:00,4.0,23.7,57.0,14.34,0.0,0.0,0.0,1012.7,945.48,26.0,0.0,0.0,0.0,24140.0,6.88,22.8,0.07,1.0
4,2024-12-12 04:00:00+00:00,5.0,23.7,57.0,14.34,0.0,0.0,0.0,1012.7,945.48,26.0,0.0,0.0,0.0,24140.0,6.88,22.8,0.07,1.0


open-meteo API

In [None]:
df_meteo_to_clean.isna().sum()

Datetime                  0
id                        0
temperature_2m            0
relative_humidity_2m      0
dew_point_2m              0
precipitation             0
rain                      0
showers                   0
pressure_msl              0
surface_pressure          0
cloud_cover               0
cloud_cover_low           0
cloud_cover_mid           0
cloud_cover_high          0
visibility                0
wind_speed_10m            0
soil_temperature_0cm      0
soil_moisture_0_to_1cm    0
is_day                    0
dtype: int64

In [None]:
df_meteo = drop_columns_if_exist(df_meteo,['rain', 'showers','soil_moisture_0_to_1cm', 'cloud_cover', 'soil_temperature_0cm',	'soil_moisture_0_to_1cm', 'is_day'])
df_meteo.head()

Colonne 'rain' Supprimée
Colonne 'showers' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'cloud_cover' Supprimée
Colonne 'soil_temperature_0cm' Supprimée
Colonne 'soil_moisture_0_to_1cm' Supprimée
Colonne 'is_day' Supprimée


Unnamed: 0,Datetime,temperature_2m,relative_humidity_2m,dew_point_2m,precipitation,pressure_msl,surface_pressure,cloud_cover_low,cloud_cover_mid,cloud_cover_high,visibility,wind_speed_10m
0,2024-12-12 00:00:00+00:00,23.7,57.0,14.335,0.0,1012.7,945.48,0.0,0.0,0.0,24140.0,6.88
1,2024-12-12 01:00:00+00:00,23.7,57.0,14.335,0.0,1012.7,945.48,0.0,0.0,0.0,24140.0,6.88
2,2024-12-12 02:00:00+00:00,23.7,57.0,14.335,0.0,1012.7,945.48,0.0,0.0,0.0,24140.0,6.88
3,2024-12-12 03:00:00+00:00,23.7,57.0,14.335,0.0,1012.7,945.48,0.0,0.0,0.0,24140.0,6.88
4,2024-12-12 04:00:00+00:00,23.7,57.0,14.335,0.0,1012.7,945.48,0.0,0.0,0.0,24140.0,6.88


In [None]:
df_meteo.rename(columns={'temperature_2m': 'T°(C°)', 
                         'relative_humidity_2m': 'Relative Humidity (%)',
                         'dew_point_2m': 'Dew Point (°C)', 
                         'precipitation': 'Precipitation (mm)', 
                         'pressure_msl':' Sea Level Pressure (hPa)', 
                         'cloud_cover_low':'Low Clouds (%)',
                         'cloud_cover_mid' : 'Middle Clouds (%)',	
                         'cloud_cover_high' : 'High Clouds (%)', 
                         'visibility' : ' Visibility (%)', 
                         'wind_speed_10m' : 'Wind Speed (km/h)'}, 
                         inplace=True)
df_marine.rename(columns={
    'wind_direction': 'Wind Direction (°)',
    'wind_speed': 'Wind Speed (km/h)',
    'wind_gust': 'Wind Gusts (km/h)',
    'wave_height': 'Wave Height (m)',
    'average_wave_period': 'Average Wave Period (s)',
    'dominant_wave_direction': 'Dominant Wave Direction (°)',
    'pressure': 'Pressure (hPA)',
    'air_temperature': 'Air T°',
    'water_temperature': 'Water T°'}, 
    inplace=True)

print(df_meteo.columns)
print(df_marine.columns)
print(df_marine.shape)
print(df_meteo.shape)

Index(['Datetime', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', ' Sea Level Pressure (hPa)', 'surface_pressure',
       'Low Clouds (%)', 'Middle Clouds (%)', 'High Clouds (%)',
       ' Visibility (%)', 'Wind Speed (km/h)'],
      dtype='object')
Index(['Wind Direction (°)', 'Wind Speed (km/h)', 'Wind Gusts (km/h)',
       'Wave Height (m)', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint'],
      dtype='object')
(6541, 10)
(2376, 12)


Merging Dataframes

In [None]:
# Effectuer la jointure interne sur la colonne 'time'
df_merged = pd.merge(df_marine, df_meteo, on = 'Datetime', how='inner')

# Afficher le résultat
print(df_merged.shape)
print(df_merged.dtypes)
df_merged.head(20)

(1102, 22)
Datetime                       datetime64[ns, UTC]
Wind Direction (°)                         float64
Wind Speed (km/h)_x                        float64
Wind Gusts (km/h)                          float64
Wave Height (m)                            float64
Average Wave Period (s)                    float64
Dominant Wave Direction (°)                float64
Pressure (hPA)                             float64
Air T°                                     float64
Water T°                                   float64
dewpoint                                   float64
T°(C°)                                     float64
Relative Humidity (%)                      float32
Dew Point (°C)                             float64
Precipitation (mm)                         float64
 Sea Level Pressure (hPa)                  float64
surface_pressure                           float64
Low Clouds (%)                             float32
Middle Clouds (%)                          float32
High Clouds (%)     

Unnamed: 0,Datetime,Wind Direction (°),Wind Speed (km/h)_x,Wind Gusts (km/h),Wave Height (m),Average Wave Period (s),Dominant Wave Direction (°),Pressure (hPA),Air T°,Water T°,...,Relative Humidity (%),Dew Point (°C),Precipitation (mm),Sea Level Pressure (hPa),surface_pressure,Low Clouds (%),Middle Clouds (%),High Clouds (%),Visibility (%),Wind Speed (km/h)_y
0,2025-03-14 22:00:00+00:00,70.0,6.0,7.0,1.7,4.9,81.0,1010.1,27.2,27.8,...,93.0,22.1,0.0,1010.8,943.7,73.0,0.0,0.0,24140.0,7.34
1,2025-03-14 21:00:00+00:00,60.0,6.0,7.0,1.7,4.9,81.0,1009.9,27.2,27.8,...,96.0,22.23,0.0,1010.8,943.61,71.0,0.0,58.0,24140.0,10.16
2,2025-03-14 20:00:00+00:00,70.0,6.0,7.0,1.7,4.9,81.0,1010.0,27.2,27.8,...,98.0,22.67,0.0,1011.2,944.01,72.0,0.0,100.0,24140.0,7.42
3,2025-03-14 19:00:00+00:00,80.0,6.0,7.0,1.7,4.9,81.0,1011.0,27.4,27.8,...,97.0,22.95,0.0,1011.3,944.2,56.0,0.0,100.0,24140.0,6.92
4,2025-03-14 18:00:00+00:00,80.0,6.0,7.0,1.7,4.9,81.0,1011.8,27.2,27.7,...,97.0,23.59,0.0,1012.1,945.08,59.0,49.0,100.0,24140.0,6.83
5,2025-03-14 17:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1012.8,27.2,27.7,...,97.0,23.54,0.0,1011.6,944.61,59.0,8.0,47.0,24140.0,7.57
6,2025-03-14 16:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1013.6,27.3,27.6,...,82.0,22.05,0.0,1011.4,944.7,0.0,0.0,39.0,24140.0,10.11
7,2025-03-14 15:00:00+00:00,90.0,7.0,8.0,1.7,4.9,81.0,1013.8,27.1,27.5,...,89.0,24.05,0.0,1010.7,944.19,48.0,60.0,32.0,24140.0,17.31
8,2025-03-14 14:00:00+00:00,80.0,7.0,7.0,1.7,4.9,81.0,1013.7,27.2,27.5,...,51.0,17.71,0.0,1009.2,943.39,0.0,42.0,24.0,24140.0,10.81
9,2025-03-14 13:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1013.6,27.1,27.5,...,26.0,11.16,0.0,1007.6,942.8,0.0,34.0,2.0,24140.0,3.55


In [None]:
# Exemple d'utilisation
df_merged = add_daytime_and_month_column(df_merged,'Datetime')

print(df_merged.columns)
df_merged.head(10)

Index(['Datetime', 'Wind Direction (°)', 'Wind Speed (km/h)_x',
       'Wind Gusts (km/h)', 'Wave Height (m)', 'Average Wave Period (s)',
       'Dominant Wave Direction (°)', 'Pressure (hPA)', 'Air T°', 'Water T°',
       'dewpoint', 'T°(C°)', 'Relative Humidity (%)', 'Dew Point (°C)',
       'Precipitation (mm)', ' Sea Level Pressure (hPa)', 'surface_pressure',
       'Low Clouds (%)', 'Middle Clouds (%)', 'High Clouds (%)',
       ' Visibility (%)', 'Wind Speed (km/h)_y', 'DayTime', 'Month'],
      dtype='object')


Unnamed: 0,Datetime,Wind Direction (°),Wind Speed (km/h)_x,Wind Gusts (km/h),Wave Height (m),Average Wave Period (s),Dominant Wave Direction (°),Pressure (hPA),Air T°,Water T°,...,Precipitation (mm),Sea Level Pressure (hPa),surface_pressure,Low Clouds (%),Middle Clouds (%),High Clouds (%),Visibility (%),Wind Speed (km/h)_y,DayTime,Month
0,2025-03-14 22:00:00+00:00,70.0,6.0,7.0,1.7,4.9,81.0,1010.1,27.2,27.8,...,0.0,1010.8,943.7,73.0,0.0,0.0,24140.0,7.34,Night,3
1,2025-03-14 21:00:00+00:00,60.0,6.0,7.0,1.7,4.9,81.0,1009.9,27.2,27.8,...,0.0,1010.8,943.61,71.0,0.0,58.0,24140.0,10.16,Night,3
2,2025-03-14 20:00:00+00:00,70.0,6.0,7.0,1.7,4.9,81.0,1010.0,27.2,27.8,...,0.0,1011.2,944.01,72.0,0.0,100.0,24140.0,7.42,Evening,3
3,2025-03-14 19:00:00+00:00,80.0,6.0,7.0,1.7,4.9,81.0,1011.0,27.4,27.8,...,0.0,1011.3,944.2,56.0,0.0,100.0,24140.0,6.92,Evening,3
4,2025-03-14 18:00:00+00:00,80.0,6.0,7.0,1.7,4.9,81.0,1011.8,27.2,27.7,...,0.0,1012.1,945.08,59.0,49.0,100.0,24140.0,6.83,Evening,3
5,2025-03-14 17:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1012.8,27.2,27.7,...,0.0,1011.6,944.61,59.0,8.0,47.0,24140.0,7.57,Afternoon,3
6,2025-03-14 16:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1013.6,27.3,27.6,...,0.0,1011.4,944.7,0.0,0.0,39.0,24140.0,10.11,Afternoon,3
7,2025-03-14 15:00:00+00:00,90.0,7.0,8.0,1.7,4.9,81.0,1013.8,27.1,27.5,...,0.0,1010.7,944.19,48.0,60.0,32.0,24140.0,17.31,Afternoon,3
8,2025-03-14 14:00:00+00:00,80.0,7.0,7.0,1.7,4.9,81.0,1013.7,27.2,27.5,...,0.0,1009.2,943.39,0.0,42.0,24.0,24140.0,10.81,Afternoon,3
9,2025-03-14 13:00:00+00:00,80.0,7.0,8.0,1.7,4.9,81.0,1013.6,27.1,27.5,...,0.0,1007.6,942.8,0.0,34.0,2.0,24140.0,3.55,Afternoon,3


In [None]:
df_merged['Wind Speed (km/h)'] = (df_merged['Wind Speed (km/h)_x']+ df_merged['Wind Speed (km/h)_y'])/2
df_merged = drop_columns_if_exist(df_merged, ['Wind Speed (km/h)_x', 'Wind Speed (km/h)_y', 'Wind Gusts (km/h)'])

print(df_merged.columns)
df_merged.head()

Colonne 'Wind Speed (km/h)_x' Supprimée
Colonne 'Wind Speed (km/h)_y' Supprimée
Colonne 'Wind Gusts (km/h)' Supprimée
Index(['Datetime', 'Wind Direction (°)', 'Wave Height (m)',
       'Average Wave Period (s)', 'Dominant Wave Direction (°)',
       'Pressure (hPA)', 'Air T°', 'Water T°', 'dewpoint', 'T°(C°)',
       'Relative Humidity (%)', 'Dew Point (°C)', 'Precipitation (mm)',
       ' Sea Level Pressure (hPa)', 'surface_pressure', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (%)', 'DayTime',
       'Month', 'Wind Speed (km/h)'],
      dtype='object')


Unnamed: 0,Datetime,Wind Direction (°),Wave Height (m),Average Wave Period (s),Dominant Wave Direction (°),Pressure (hPA),Air T°,Water T°,dewpoint,T°(C°),...,Precipitation (mm),Sea Level Pressure (hPa),surface_pressure,Low Clouds (%),Middle Clouds (%),High Clouds (%),Visibility (%),DayTime,Month,Wind Speed (km/h)
0,2025-03-14 22:00:00+00:00,70.0,1.7,4.9,81.0,1010.1,27.2,27.8,22.9,23.3,...,0.0,1010.8,943.7,73.0,0.0,0.0,24140.0,Night,3,6.67
1,2025-03-14 21:00:00+00:00,60.0,1.7,4.9,81.0,1009.9,27.2,27.8,23.0,22.9,...,0.0,1010.8,943.61,71.0,0.0,58.0,24140.0,Night,3,8.08
2,2025-03-14 20:00:00+00:00,70.0,1.7,4.9,81.0,1010.0,27.2,27.8,23.0,23.0,...,0.0,1011.2,944.01,72.0,0.0,100.0,24140.0,Evening,3,6.71
3,2025-03-14 19:00:00+00:00,80.0,1.7,4.9,81.0,1011.0,27.4,27.8,22.7,23.45,...,0.0,1011.3,944.2,56.0,0.0,100.0,24140.0,Evening,3,6.46
4,2025-03-14 18:00:00+00:00,80.0,1.7,4.9,81.0,1011.8,27.2,27.7,23.2,24.1,...,0.0,1012.1,945.08,59.0,49.0,100.0,24140.0,Evening,3,6.415


Connexion BigQuery

In [None]:
# from google.oauth2 import service_account
# from google.cloud import storage  # Exemple pour Google Cloud Storage
# from google.cloud import bigquery
# from google.cloud.exceptions import NotFound
# import pandas as pd
# import pyarrow

# path_to_google_creds = r"C:\Users\f.gionnane\Documents\Data Engineering\Credentials\google_credentials.json"

# bq_client = bigquery.Client.from_service_account_json(
#     path_to_google_creds)
# project_id = "rare-bloom-419220"
# dataset_End_To_End_Oceanography_ML = "End_To_End_Oceanography_ML"
# bq_client
# # dataset_ref = bq_client.dataset('my_dataset_name', project=project_id)


# # LIST DATASETS AND FIND ONE
# datasets = list(bq_client.list_datasets())  # Make an API request.
# project = client.project
# bq_datasets_list =[]

# if datasets:
#     print("Datasets in project {}:".format(project))
#     for dataset in datasets:
#         print("\t{}".format(dataset.dataset_id))
#         bq_datasets_list.append(dataset.dataset_id)
#     if dataset_End_To_End_Oceanography_ML in bq_datasets_list:
#         dataset =  dataset_End_To_End_Oceanography_ML
#         print("Dataset Found !")
# else:
#     print("{} project does not contain any datasets.".format(project))

# # (developer): Set table_id to the ID of the table to determine existence.
# # table_id = "your-project.your_dataset.your_table"

# try:
#     table_ref = bq_client.dataset(dataset).table(table_name)
#     bq_client.get_table(table_ref)  # Make an API request.
#     print("Table {} already exists.".format(table_name))
# except NotFound:
#     print("Table {} is not found.".format(table_name))


# def clean_column_names(df):

#     cleaned_columns = []
#     for column in df.columns:
#         # Remplacer tous les caractères non alphanumériques (sauf underscores) par un underscore
#         cleaned_column = re.sub(r'[^A-Za-z0-9_]', '_', column)
        
#         # Ajouter le nom de colonne nettoyé à la liste
#         cleaned_columns.append(cleaned_column)
    
#     # Appliquer les nouveaux noms de colonnes au DataFrame
#     df.columns = cleaned_columns
#     return df

# df_merged = clean_column_names(df_merged)

# table_id = f"{project_id}.{dataset}.{table_name}"


# try:
#     bq_client.get_table(table_id)  # Make an API request.
#     print("Table {} already exists.".format(table_id))
# except NotFound:
#     print("Table {} is not found.".format(table_id))
#     bq_client.create_table(table_id)
#     print("Creation of the Table {}.".format(table_id))


# def load_data_to_bigquery(client, dataset: str = None, table: str = None, df: pd.DataFrame = None, key_column: str = 'Datetime', table_id: str = None):
   
    
#     # Fonction pour détecter et convertir les types de données
#     def convert_column_types(df):
#         for column in df.columns:
#             dtype = df[column].dtype

#             if dtype == 'object':  # Chaînes de caractères
#                 df[column] = df[column].astype(str)
#             elif dtype == 'datetime64[ns]':  # Datetime
#                 df[column] = pd.to_datetime(df[column], errors='coerce').dt.tz_localize('UTC', ambiguous='NaT').dt.tz_localize(None)
#             elif dtype == 'float64':  # Float
#                 df[column] = df[column].astype('float32')
#             elif dtype == 'int64':  # Integer
#                 df[column] = df[column].astype('int64')  # On garde int64, car BigQuery supporte ce type
#             else:
#                 # Autres types, on les convertit en string
#                 df[column] = df[column].astype(str)
        
#         return df

#     # Convertir les types des colonnes
#     df = convert_column_types(df)

#     if table_id:
#         # Si table_id est fourni, on l'utilise directement.
#         full_table_id = table_id
#     elif dataset and table:
#         # Si table_id n'est pas fourni, on construit table_id à partir de dataset et table.
#         full_table_id = f"{client.project}.{dataset}.{table}"
#     else:
#         raise ValueError("Il faut fournir soit 'table_id' ou les paramètres 'dataset' et 'table' séparés.")

#     # Vérifier si le dataset existe, sinon le créer
#     try:
#         dataset = full_table_id.split('.')[1]
#         client.get_dataset(dataset)  # Vérifie si le dataset existe
#         print(f"Le dataset {dataset} existe déjà.")
#     except NotFound:
#         print(f"Le dataset {dataset} n'existe pas. Création du dataset...")
#         client.create_dataset(dataset)  # Crée le dataset s'il n'existe pas
#         print(f"Le dataset {dataset} a été créé.")

#     # Vérifier si la table existe, sinon la créer
#     try:
#         client.get_table(full_table_id)  # Vérifie si la table existe
#         print(f"La table {full_table_id} existe déjà.")
#     except NotFound:
#         print(f"La table {full_table_id} n'existe pas. Création de la table...")
#         # Créer la table avec le schéma du DataFrame
#         schema = []
#         for name, dtype in df.dtypes.items():
#             if name == 'Datetime':
#                 schema.append(bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.TIMESTAMP))
#             elif dtype == 'float32' or dtype == 'float64':
#                 schema.append(bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.FLOAT64))
#             elif dtype == 'int64':
#                 schema.append(bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.INTEGER))
#             else:
#                 schema.append(bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.STRING))
        
#         # Créer la table avec le schéma
#         table = bigquery.Table(full_table_id, schema=schema)
#         client.create_table(table)  # Crée la table si elle n'existe pas
#         print(f"La table {full_table_id} a été créée.")

#     # Préparer les données pour l'insertion
#     if key_column in df.columns:
#         # Si la colonne clé est fournie, supprimer les doublons en fonction de cette colonne
#         df = df.drop_duplicates(subset=[key_column])

#     # Charger les données dans la table BigQuery
#     job_config = bigquery.LoadJobConfig(
#         schema=[
#             bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.TIMESTAMP) if name == 'Datetime' else
#             bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.FLOAT64) if dtype == 'float32' or dtype == 'float64' else
#             bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.INTEGER) if dtype == 'int64' else
#             bigquery.SchemaField(name, bigquery.enums.SqlTypeNames.STRING)
#             for name, dtype in df.dtypes.items()
#         ],
#         write_disposition="WRITE_APPEND"  # Ajoute les données sans écraser les anciennes
#     )

#     # Charger le DataFrame dans BigQuery
#     job = client.load_table_from_dataframe(df, full_table_id, job_config=job_config)
#     job.result()  # Attendre la fin de la tâche
#     print(f"Données chargées dans la table {full_table_id}.")

# # Exemple d'appel à la fonction
# load_data_to_bigquery(table_id=table_id, client=bq_client, df=df_merged)


PostgreSQL

In [None]:
df_merged.columns

Index(['Datetime', 'Wind Direction (°)', 'Wave Height (m)',
       'Average Wave Period (s)', 'Dominant Wave Direction (°)',
       'Pressure (hPA)', 'Air T°', 'Water T°', 'dewpoint', 'T°(C°)',
       'Relative Humidity (%)', 'Dew Point (°C)', 'Precipitation (mm)',
       ' Sea Level Pressure (hPa)', 'surface_pressure', 'Low Clouds (%)',
       'Middle Clouds (%)', 'High Clouds (%)', ' Visibility (%)', 'DayTime',
       'Month', 'Wind Speed (km/h)'],
      dtype='object')