In [25]:
import os
import numpy as np
from netCDF4 import Dataset

from pathlib import Path
import xarray as xr
from tqdm import tqdm
import pandas as pd
import torch

import pickle

In [26]:
# Input locations used by this notebook.
# Directory holding the yearly ERA5 NetCDF files.
input_directory = "/data_aip05/gsaliou/era5/local/DATA_ERA5/new/test/"
# NetCDF file with the geopotential (orography) field appended as an extra variable.
geopotential_file = (
    "/data_aip05/gsaliou/era5/local/DATA_ERA5/new/oro/geopotential_0.25_local.nc"
)
# Pickle file that unpacks into a (mean, std) pair used for normalisation.
stats = "/data_aip05/gsaliou/daily_local/stats.pkl"

In [23]:
def concat_data_arrays(das):
    """Concatenate several DataArrays along their ``time`` dimension.

    The ``var``, ``lon`` and ``lat`` coordinates are taken from the first
    element of ``das``; the coordinates of the other elements are not checked
    against it. The ``time`` coordinate of the result is the concatenation of
    the ``time`` values of every element, in the order given.

    Parameters
    ----------
    das : sequence of xarray.DataArray
        Non-empty sequence of arrays sharing the same dimension order, each
        carrying ``var``, ``lon``, ``lat`` and ``time`` coordinates.

    Returns
    -------
    xarray.DataArray
        New array using the dimension order of ``das[0]``.
    """
    first = das[0]

    # Concatenate along wherever "time" actually sits in the dimension order
    # rather than assuming it is the leading axis.
    time_axis = first.dims.index("time")
    data = np.concatenate([x.values for x in das], axis=time_axis)

    new_coordinates = {
        "lon": first["lon"],
        "lat": first["lat"],
        "var": first["var"],
        "time": np.concatenate([x["time"].values for x in das]),
    }
    return xr.DataArray(data, coords=new_coordinates, dims=first.dims)

In [4]:
# First and last year to process (both inclusive).
start, end = 2021, 2022

In [7]:
# One NetCDF file per year, named "<year>.nc", for start..end inclusive.
# Built from input_directory so the data location is defined in a single place.
files = [os.path.join(input_directory, f'{i}.nc') for i in range(start, end+1)]

In [8]:
# Load every yearly file as one DataArray whose dimension order is
# (time, variable, lat, lon).
dss = []
for f in tqdm(files):
    # The context manager closes the underlying file; .load() makes sure the
    # values are in memory before that happens.
    with xr.open_dataset(f) as ds:
        dss.append(
            ds.to_dataarray().transpose("time", "variable", "lat", "lon").load()
        )


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [01:48<01:48, 108.16s/it][A
100%|██████████| 2/2 [02:48<00:00, 84.19s/it] [A


In [11]:
# Replace each yearly hourly array in dss by its daily mean.
for i, year in enumerate(range(start, end + 1)):
    # Hourly timestamps for the whole calendar year; [:-1] drops the first
    # instant of the following year.
    hourly_index = pd.date_range(f"{year}-01-01", f"{year+1}-01-01", freq="h")[:-1]

    # Fail early with a clear message if the data does not hold one value per
    # hour of the year (this also happens when the cell is run a second time
    # on data that has already been averaged).
    n_steps = dss[i].sizes["time"]
    if n_steps != len(hourly_index):
        raise ValueError(
            f"Year {year}: expected {len(hourly_index)} hourly time steps, "
            f"found {n_steps}. Has this cell already been run on dss?"
        )

    # Attach the hourly time coordinate, then average over each calendar day.
    dss[i] = dss[i].assign_coords(time=hourly_index).resample(time='1D').mean()

    print(f"Processed daily mean for the year {year}.")
    print(f"the year {dss[i].shape}")

Processed daily mean for the year 2021.
the year (365, 66, 81, 121)
Processed daily mean for the year 2022.
the year (365, 66, 81, 121)


In [13]:
# First year, last year and number of years covered.
start_year = start
end_year = end
n_years = end - start + 1

# Geopotential (orography) field. The file is closed once the values are in
# memory; squeeze() drops every length-1 dimension.
with xr.open_dataset(geopotential_file) as oro_ds:
    oro = oro_ds.to_dataarray().squeeze().load()

# Loop-invariant: orography values with two leading singleton axes, standing
# for the 'time' and 'var' dimensions of the yearly arrays.
oro_4d = oro.values[np.newaxis, np.newaxis, ...]

combined_list = []

# Append the orography as one extra variable to every yearly array in dss.
for da in dss:
    dss_np = da.values

    # Repeat the orography for every time step as a broadcast view (no copy);
    # np.concatenate below copies it into the output.
    oro_np_expanded = np.broadcast_to(
        oro_4d, (dss_np.shape[0],) + oro_4d.shape[1:]
    )

    # Add the orography as a new entry along the variable axis.
    combined_np = np.concatenate((dss_np, oro_np_expanded), axis=1)

    # Wrap the result in a DataArray again; the variable dimension is renamed
    # to 'var' and the appended orography entry is labelled '66'.
    combined = xr.DataArray(
        combined_np,
        dims=('time', 'var', 'lat', 'lon'),
        coords={
            'time': da.coords['time'],
            'var': np.append(da.coords['variable'], '66'),
            'lat': da.coords['lat'],
            'lon': da.coords['lon']
        }
    )

    combined_list.append(combined)

In [21]:
# Sanity check: display the shape of the second combined yearly array.
combined_list[1].shape

(365, 67, 81, 121)

In [24]:
# Merge the per-year arrays (in list order) into a single DataArray.
dac = concat_data_arrays(combined_list)

In [28]:
# Read the normalisation statistics; the pickle unpacks into (mean, std).
# NOTE(review): pickle.load can execute arbitrary code, so only load a
# statistics file that comes from a trusted source.
with open(stats, 'rb') as stats_file:
    mean, std = pickle.load(stats_file)

In [29]:
# Show the loaded mean values.
print(mean)

[ 1.94809166e+01  1.01379502e+05  1.00152629e+05  2.83548739e+02
  2.84909047e+02  1.12781512e+06  2.01837853e+05  1.59422552e+05
  1.16541972e+05  9.03430120e+04  7.07335450e+04  5.47698234e+04
  4.12318357e+04  2.94530479e+04  1.90175672e+04  1.42125695e+04
  7.43862930e+03  1.11140525e+03  2.16025154e+02  2.11927629e+02
  2.20445817e+02  2.31317854e+02  2.43878256e+02  2.54316427e+02
  2.62528674e+02  2.69079829e+02  2.74258212e+02  2.76499588e+02
  2.79909820e+02  2.83778841e+02  2.75850870e-06  2.82500917e-06
  2.33557351e-05  1.71620189e-04  5.06167989e-04  9.86237439e-04
  1.64839933e-03  2.55518880e-03  3.87026599e-03  4.70346449e-03
  6.07602949e-03  7.16506255e-03  7.24939196e+00  2.25144782e+01
  3.46184426e+01  2.87023656e+01  2.20136202e+01  1.69430218e+01
  1.29548936e+01  9.55795694e+00  6.53186423e+00  5.14365867e+00
  3.17013694e+00  1.54883671e+00  1.88596903e+00  1.53126595e+00
  2.30069951e+00  2.08136579e+00  1.16043741e+00  5.48478714e-01
  1.65614323e-01 -8.75465

In [30]:
# Show the loaded standard-deviation values.
print(std)

[1.49161015e+01 7.65691827e+02 2.62088045e+03 1.12454679e+01
 1.11606460e+01 4.36437903e+05 2.51442244e+03 3.20284499e+03
 4.01641972e+03 3.57175390e+03 2.81771832e+03 2.16117817e+03
 1.62617789e+03 1.18521965e+03 8.32373460e+02 7.02715734e+02
 5.95578168e+02 6.00735823e+02 4.35316068e+00 7.60454523e+00
 4.27365742e+00 8.54413662e+00 1.04219135e+01 1.06573855e+01
 1.05973729e+01 1.07790859e+01 1.10001688e+01 1.10771080e+01
 1.09591031e+01 1.07543984e+01 1.78926645e-07 4.91018848e-07
 2.61071904e-05 1.98798786e-04 5.64899517e-04 1.04798915e-03
 1.63880087e-03 2.33983291e-03 3.18365113e-03 3.59310743e-03
 4.23685987e-03 4.94950776e-03 1.08749444e+01 1.45711316e+01
 2.18972361e+01 2.03193926e+01 1.59566872e+01 1.23656141e+01
 9.70763665e+00 7.77905277e+00 6.60673189e+00 6.39086981e+00
 6.21859607e+00 5.07012613e+00 4.32974001e+00 7.13658513e+00
 1.29469957e+01 1.25420285e+01 1.02245623e+01 8.40948733e+00
 7.16798743e+00 6.29923312e+00 5.81619749e+00 5.79809779e+00
 5.93596877e+00 4.981045

In [32]:
# Standardise dac: subtract the mean and divide by the standard deviation,
# with the statistics aligned on the "var" dimension.
# NOTE: both operations modify dac in place, so running this cell twice
# normalises the data twice.
var_labels = dac["var"].values
mean_values = np.asarray(mean)
std_values = np.asarray(std)

# The statistics must provide exactly one value per entry of "var".
expected_shape = (var_labels.size,)
if mean_values.shape != expected_shape or std_values.shape != expected_shape:
    raise ValueError(
        f"mean/std must have shape {expected_shape} to match dac['var'], "
        f"got {mean_values.shape} and {std_values.shape}"
    )

# dims is given explicitly instead of being inferred from the coords dict.
dac -= xr.DataArray(mean_values, dims=("var",), coords={"var": var_labels})
dac /= xr.DataArray(std_values, dims=("var",), coords={"var": var_labels})

In [34]:
# Write the normalised, concatenated array (dac). The previous code saved
# `da`, which is the last un-normalised yearly array left over from the loop
# that builds combined_list.
dac.to_netcdf("/data_aip05/gsaliou/daily_local/test/data_normalisedtest.nc")