In [27]:
import os, sys
import shutil
import pandas as pd
import netCDF4
from datetime import datetime, timedelta
import numpy as np
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt

sys.path.append('../src/')
from Biologging_Toolkit.auxiliary import Api_ERA
from Biologging_Toolkit.auxiliary import Api_GPM
import Biologging_Toolkit.utils.rain_empirical_utils as R_Utils


c:\Users\barsro\Documents\SES_tags\notebooks\../src\Biologging_Toolkit


In [2]:
source_path = "E:/individus_brut"
dest_path = "E:/individus_filtered"

depids = ['ml17_280a', 'ml18_296a', 'ml18_294b', 'ml19_292a', 'ml19_292b', 'ml19_293a', 'ml19_294a', 'ml20_293a',
          'ml20_296b', 'ml20_313a', 'ml21_295a', 'ml21_305b']


---
## Remise à zero des depid_sens.nc & depid_dive.csv

In [None]:
for depid in depids[1:] :
    #Enlever les fichiers erronés
    file1 = os.path.join(dest_path, depid, f"{depid}_dive_clean.csv")
    # file2 = os.path.join(dest_path, depid, f"{depid}_dive.csv")
    # file3 = os.path.join(dest_path, depid, f"{depid}_sens.nc")
    file4 = os.path.join(dest_path, depid, f"{depid}_sens_clean.nc")

    new_folder_path = os.path.join(dest_path, depid, "old_NC_CSV")
    os.makedirs(new_folder_path, exist_ok=True)
    shutil.move(file1, new_folder_path)
    # shutil.move(file2, new_folder_path)
    # shutil.move(file3, new_folder_path)
    shutil.move(file4, new_folder_path)

    # Copier les fichiers bruts origins !
    file1_src = os.path.join(source_path, depid, f"{depid}_dive.csv")
    file2_src = os.path.join(source_path, depid, f"{depid}_sens.nc")
    shutil.copy(file1_src, os.path.join(dest_path, depid))
    shutil.copy(file2_src, os.path.join(dest_path, depid))

---
## Nettoyage colonnes 

In [None]:
for depid in depids :
    print(f"Working on {depid}")
    path = 'E:/individus_filtered/'
    df = pd.read_csv(os.path.join(path, depid, f'{depid}_dive.csv'))
    print(df.columns,"\n")
    # df.drop(['cylinder_gpm_0.036', 'cylinder_gpm_0.075', 'cylinder_gpm_0.1'], axis=1, inplace=True)
    # df.to_csv(os.path.join(path, depid, f'{depid}_dive.csv'), index=False)

---
## Ajout des donées GPM

In [None]:
for depid in depids :
    print(f"Working on {depid}")
    Api_GPM.join_gpm(depid, f"E:/individus_filtered", "precipitation")

In [73]:
for depid in depids[0:1] :
    nc_gpm = netCDF4.Dataset(os.path.join(path, depid, f'{depid}_gpm.nc'))
    lats = nc_gpm['lat'][:]
    lons = nc_gpm['lon'][:]
    precip = nc_gpm['precipitation'][:]  # [time, lat, lon]

    base_time = datetime(1980, 1, 6, 0, 0, 0)
    gpm_times = np.array([base_time + timedelta(seconds=float(s)) for s in nc_gpm['time'][:]])

    df = pd.read_csv(os.path.join(path, depid, f'{depid}_dive.csv'))
    df['custom_time'] = pd.to_datetime(df['begin_time'], unit='s', origin='unix') 

    # Créer la grille des points GPM (lat, lon)
    grid_lons, grid_lats = np.meshgrid(lons, lats)
    gpm_points = np.column_stack([grid_lats.ravel(), grid_lons.ravel()])
    tree = cKDTree(gpm_points)
    for radius in [0.01, 0.018, 0.036] :

        # Initialiser la colonne d'interpolation
        df[f'cyl_gpm_{radius}'] = np.nan

        for i, row in df.iterrows():
            if np.isnan(row['lat']) or np.isnan(row['lon']) or np.isinf(row['lat']) or np.isinf(row['lon']):
                print(f"Ligne {i}: lat/lon invalides, ignorée {row['lat']}, {row['lon']}")
                continue
            time = row['custom_time']
            
            # Trouver les deux dates GPM encadrantes
            idx_after = np.searchsorted(gpm_times, time)
            
            if idx_after == 0 or idx_after == len(gpm_times):
                continue  # hors des bornes GPM

            idx_before = idx_after - 1
            t0, t1 = gpm_times[idx_before], gpm_times[idx_after]
            dt = (t1 - t0).total_seconds()
            w1 = (time - t0).total_seconds() / dt
            w0 = 1 - w1

            # Chercher voisin spatial
            point = [row['lat'], row['lon']]
            dist, idx = tree.query(point, distance_upper_bound=radius)
            
            if dist != np.inf:
                lat_idx, lon_idx = np.unravel_index(idx, grid_lats.shape)
                val0 = precip[idx_before, lat_idx, lon_idx]
                val1 = precip[idx_after, lat_idx, lon_idx]
                
                # Interpolation linéaire temporelle
                df.at[i, f'cyl_gpm_{radius}'] = w0 * val0 + w1 * val1
    df.drop(['custom_time'], axis=1, inplace=True)
    df.to_csv(os.path.join(path, depid, f'{depid}_dive.csv'), index = None)

Ligne 0: lat/lon invalides, ignorée nan, nan
Ligne 1858: lat/lon invalides, ignorée nan, nan
Ligne 0: lat/lon invalides, ignorée nan, nan
Ligne 1858: lat/lon invalides, ignorée nan, nan
Ligne 0: lat/lon invalides, ignorée nan, nan
Ligne 1858: lat/lon invalides, ignorée nan, nan


In [74]:
df = pd.DataFrame({})
for depid in depids :
    df = pd.concat([df, pd.read_csv(os.path.join(path, depid, f'{depid}_dive.csv'))], ignore_index=True)
df.head()

Unnamed: 0,dive,begin_time,end_time,zn1,zn2,zn1_up,zn2_up,zn1_down,zn2_down,threhsold_up,...,lstm_cfosat,begin_wind,precipitation_GPM,tpmaxPool,tp,cyl_gpm_0.01,cyl_gpm_0.018,cyl_gpm_0.036,FOD,corr_mld
0,0.0,1507334000.0,1507335000.0,,,,,,,157.202545,...,15.581713,,,3e-05,,,,,,
1,1.0,1507335000.0,1507337000.0,,,,,,,134.639832,...,15.214185,15.187502,0.014598,0.000199,0.000146,,,,,
2,2.0,1507337000.0,1507338000.0,,,,,,,125.335068,...,14.47338,15.082936,0.02026,0.000199,0.000125,,,,,
3,3.0,1507338000.0,1507339000.0,,,,,,,124.460243,...,14.536754,14.976356,0.025509,0.000199,0.000105,,,,,
4,4.0,1507339000.0,1507341000.0,,,,,,,133.336792,...,13.652518,14.824885,0.043237,0.000199,0.000127,,,,,


---
## Ajout des données ERA5

In [None]:
value = 'tp'
for depid in depids[1:] :
    print(f"Working on {depid} :")
    path = os.path.join(dest_path, depid)
    era_path = os.path.join(path, "era", f"{depid}.nc")
    Api_ERA.join_era_maxPool(depid, path, era_path, value)
    Api_ERA.join_era(depid, path, era_path, value)

In [66]:
df = pd.read_csv(os.path.join(dest_path, depid, f"{depid}_dive.csv"))
print(type(df.iloc[0]["end_time"]))

# nc = nc =netCDF4.Dataset(os.path.join(dest_path, depid, f"{depid}_sens.nc")) 
# for var in nc.variables:
#     print(var)
# nc.close()

<class 'numpy.float64'>


---
## Remettre begin_time au bon format

In [None]:
for depid in depids :
    df = pd.read_csv(os.path.join(dest_path, depid, f"{depid}_dive.csv"))
    print(df.iloc[0]["begin_time"], df.iloc[0]["end_time"])
    df['begin_time'] = pd.to_datetime(df['begin_time'])
    df['begin_time'] = df['begin_time'].astype('int64') / 1e9
    print(type(df.iloc[0]["begin_time"]), type(df.iloc[0]["end_time"]))
    df.to_csv(os.path.join(dest_path, depid, f"{depid}_dive.csv"), index=False)
    # df.to_csv("fichier.csv", index=False)

---
## Backup csv, nc files

In [76]:
for depid in depids[1:] :
    csv_path = os.path.join(dest_path, depid, f"{depid}_dive.csv")
    nc_path = os.path.join(dest_path, depid, f"{depid}_sens.nc")
    shutil.copy(csv_path, "E:/nc_csv_backup")
    shutil.copy(nc_path, "E:/nc_csv_backup")