In [1]:
import netCDF4 as nc
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import glob
import os

In [2]:
def find_files(directory, string):
    """
    Recursively collect NetCDF files whose names contain a given substring.

    Parameters
    ----------
    directory : str
        Root directory that is walked recursively.
    string : str
        Substring that must appear in the file name.

    Returns
    -------
    list of str
        Path (containing folder joined with file name) of every file whose
        name contains ``string`` and ends with ``.nc``, in the order in which
        ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if string in name and name.endswith('.nc')
    ]

def concatenate_stack_datasets(center_ds, before_ds, after_ds):
    """
    Stack three datasets onto a single time axis and join them.

    Every input is first passed through ``stack_dataset``; the stacked
    results are then concatenated along ``time`` in the order
    before, center, after.

    Parameters
    ----------
    center_ds : xarray.Dataset
        The main dataset.
    before_ds : xarray.Dataset
        Dataset holding the time steps that precede ``center_ds``.
    after_ds : xarray.Dataset
        Dataset holding the time steps that follow ``center_ds``.

    Returns
    -------
    xarray.Dataset
        The concatenated dataset.
    """
    # Note the argument order (center first) differs from the join order.
    stacked_parts = [stack_dataset(part) for part in (before_ds, center_ds, after_ds)]
    return xr.concat(stacked_parts, dim='time')

def stack_dataset(ds):
    """
    Collapse the forecast dimensions of a dataset into one ``time`` dimension.

    ``forecast_initial_time`` and ``forecast_hour`` are stacked into ``time``;
    each stacked entry is then replaced by the initial time plus the forecast
    hour expressed as a timedelta.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with ``forecast_initial_time`` and ``forecast_hour`` dimensions.

    Returns
    -------
    xarray.Dataset
        The stacked dataset whose ``time`` coordinate holds the computed
        datetimes.
    """
    stacked = ds.stack(time=('forecast_initial_time', 'forecast_hour'))

    # Each stacked entry is an (initial time, forecast hour) pair.
    valid_times = []
    for init_time, lead_hour in stacked.time.data:
        valid_times.append(init_time + pd.Timedelta(lead_hour, unit='hour'))
    valid_times = pd.to_datetime(valid_times)

    # Drop the stacked index and its levels before attaching the plain datetime axis.
    stacked = stacked.drop_vars({'time', 'forecast_hour', 'forecast_initial_time'})
    return stacked.assign_coords(time=valid_times)

def cut_and_select(ds, var_name, lat_range, lon_range):
    """
    Extract one variable from a dataset, restricted to a lat/lon rectangle.

    Parameters
    ----------
    ds : xarray.Dataset
        Input dataset holding the variable and ``latitude`` / ``longitude``
        coordinates.
    var_name : str
        Name of the variable to keep.
    lat_range : tuple of two floats
        Latitude bounds as (min_lat, max_lat).
    lon_range : tuple of two floats
        Longitude bounds as (min_lon, max_lon).

    Returns
    -------
    xarray.Dataset
        A dataset containing only ``var_name``, cut to the requested region.
    """
    field = ds[var_name]

    # The latitude slice runs from max_lat to min_lat, i.e. it is written for a
    # latitude coordinate stored north-to-south.
    # NOTE(review): presumably an ascending latitude axis would select nothing - confirm.
    lat_window = slice(lat_range[1], lat_range[0])
    lon_window = slice(lon_range[0], lon_range[1])
    region = field.sel(latitude=lat_window, longitude=lon_window)

    # Wrap the cut variable back into a Dataset under its original name.
    return xr.Dataset({var_name: region})


In [154]:
# Root directory of the ERA5 archive that is searched for input files.
path_data_era5 = '/gpfs/fs1/collections/rda/data/ds633.0/'

# One row per atmospheric variable; the columns are split into the parallel
# lists used by the processing cells (index i refers to the same variable in
# every list).
_atmosphere_table = [
    # (label, archive subfolder, file-name code, level, variable name in file)
    ('olr',  'e5.oper.fc.sfc.meanflux', '.235_040_mtnlwrf.ll025sc.', None, 'MTNLWRF'),
    ('u10',  'e5.oper.an.pl',           '.128_131_u.ll025uv.',       10,   'U'),
    ('u200', 'e5.oper.an.pl',           '.128_131_u.ll025uv.',       200,  'U'),
    ('z500', 'e5.oper.an.pl',           '.128_129_z.ll025sc.',       500,  'Z'),
]
list_variables_atmosphere, subfolders, codename, level, var_name = (
    list(column) for column in zip(*_atmosphere_table)
)

In [155]:
# Root directory under which the per-variable folders of daily NetCDF files are written.
path_outputs_daily = '/glade/work/jhayron/Data4Predictability/ERA5/Daily/'

# OLR

In [5]:
# Index 0 of the parallel configuration lists (the 'olr' entry).
ivar = 0

# Bounding box as (min, max) latitude and (min, max) longitude.
lat_range, lon_range = (-30, 90), (0, 360)

# Sorted list of matching input files; the first 957 entries of the listing are skipped.
list_files = np.sort(
    find_files(f'{path_data_era5}{subfolders[ivar]}/', codename[ivar])
)[957:]

In [None]:
# This loop has already been run, so it is left commented out.

# for ifile in range(1,len(list_files)-1):
#     print(list_files[ifile])
#     center_ds = xr.open_dataset(list_files[ifile])
#     before_ds = xr.open_dataset(list_files[ifile-1]).isel(forecast_initial_time=[-1])
#     after_ds = xr.open_dataset(list_files[ifile+1]).isel(forecast_initial_time=[0])

#     concatenated = concatenate_stack_datasets(center_ds,before_ds,after_ds)
#     del(center_ds,before_ds,after_ds)
#     daily_mean = concatenated.resample(time='D').mean(dim='time')#.isel(time=[:-1])
#     daily_count = concatenated.resample(time='D').count(dim='time')
#     # Create a mask of valid values based on the count requirement
#     valid_mask = daily_count >= 24
#     # Compute the mean using the valid mask
#     daily_mean = daily_mean.where(valid_mask,drop=True)
#     del(concatenated)

#     if os.path.exists(path_outputs_daily+f'OLR/') == False:
#         os.mkdir(path_outputs_daily+f'OLR/')

#     for i in range(len(daily_mean.time)):
#         temp_daily = daily_mean.sel(time = daily_mean.time[i])
#         temp_daily = cut_and_select(temp_daily,var_name[0],lat_range,lon_range)
#         str_date = daily_mean.time.data[i].astype(str)[:10]    
#         output_path =  path_outputs_daily+f'OLR/OLR_Daily_'+str_date+'.nc'
#         temp_daily.to_netcdf(output_path)

/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/197912/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1979120106_1979121606.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/197912/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1979121606_1980010106.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/198001/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1980010106_1980011606.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/198001/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1980011606_1980020106.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/198002/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1980020106_1980021606.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/198002/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1980021606_1980030106.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.fc.sfc.meanflux/198003/e5.oper.fc.sfc.meanflux.235_040_mtnlwrf.ll025sc.1980030106_1980031606.nc

# Other atmospheric variables

In [156]:
list_variables_atmosphere

['olr', 'u10', 'u200', 'z500']

In [158]:
# Daily means for the pressure-level variables (every entry after 'olr').
# Each input file is opened, cut to the bounding box, reduced to one level,
# averaged over its `time` dimension and written as `<VAR><LEVEL>_Daily_<date>.nc`.

# Bounding box as (min, max) latitude and (min, max) longitude.
lat_range = (-30,90)
lon_range = (0,360)

# Number of files processed per variable (a small value keeps test runs short).
# Set to None to process every file in the listing.
max_files = 2

for ivar in range(1, len(list_variables_atmosphere)):
    list_files = np.sort(find_files(f'{path_data_era5}{subfolders[ivar]}/',codename[ivar]))
    # Skip the first 10958 entries; in the listing this notebook was run
    # against, the first file kept is the one for 1970-01-01.
    list_files = list_files[10958:]

    # Folder / file prefix, e.g. 'U200'.
    label = f'{var_name[ivar]}{int(level[ivar])}'
    output_dir = path_outputs_daily + f'{label}/'
    # makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
    os.makedirs(output_dir, exist_ok=True)

    # Slicing (rather than indexing up to a fixed count) cannot run past the
    # end of a short listing.
    for file_path in list_files[:max_files]:
        print(file_path)
        # The context manager closes the file handle once the daily mean has
        # been written, so a long run does not accumulate open files.
        with xr.open_dataset(file_path) as ds_source:
            # The date tag comes from the first timestamp in the file.
            str_date = ds_source.time.data[0].astype(str)[:10]

            ds_temp = cut_and_select(ds_source,var_name[ivar],lat_range,lon_range)
            ds_temp = ds_temp.sel(level = level[ivar])
            ds_temp = ds_temp.mean(dim='time')

            output_path = output_dir + f'{label}_Daily_' + str_date + '.nc'
            ds_temp.to_netcdf(output_path)

/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_131_u.ll025uv.1970010100_1970010123.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_131_u.ll025uv.1970010200_1970010223.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_131_u.ll025uv.1970010100_1970010123.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_131_u.ll025uv.1970010200_1970010223.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_129_z.ll025sc.1970010100_1970010123.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.pl/197001/e5.oper.an.pl.128_129_z.ll025sc.1970010200_1970010223.nc


# Land variables

## Land moisture

In [139]:
# Root directory of the ERA5 archive that is searched for input files.
path_data_era5 = '/gpfs/fs1/collections/rda/data/ds633.0/'

# One row per soil-moisture layer; the columns are split into the parallel
# lists used by the cells below. `depth` is the weight each layer receives in
# the weighted averages (presumably the layer thickness in metres - confirm).
_moisture_table = [
    # (label, file-name code, depth weight, variable name in file)
    ('swvl1', '.128_039_swvl1.ll025sc.', 0.07, 'SWVL1'),
    ('swvl2', '.128_040_swvl2.ll025sc.', 0.21, 'SWVL2'),
    ('swvl3', '.128_041_swvl3.ll025sc.', 0.72, 'SWVL3'),
    ('swvl4', '.128_042_swvl4.ll025sc.', 1.89, 'SWVL4'),
]
list_variables_moisture, codename, depth, var_name = (
    list(column) for column in zip(*_moisture_table)
)
# Every layer is read from the same archive subfolder.
subfolders = ['e5.oper.an.sfc'] * len(_moisture_table)

In [140]:
# Start at the first entry of the per-variable lists.
ivar = 0

# Bounding box as (min, max) latitude and (min, max) longitude.
lat_range, lon_range = (-30, 90), (0, 360)


In [141]:
# Position in each variable's file list to load (0 = first file kept after slicing).
ifile = 0

# Open the selected file of every soil-moisture layer, then merge the layers
# into a single dataset.
moisture_layers = []
for ivar, _label in enumerate(list_variables_moisture):
    # Sorted matches for this layer, skipping the first 480 entries of the listing.
    list_files = np.sort(
        find_files(f'{path_data_era5}{subfolders[ivar]}/', codename[ivar])
    )[480:]
    ds_temp = xr.open_dataset(list_files[ifile])
    moisture_layers.append(ds_temp)

ds_moisture = xr.merge(moisture_layers)

In [142]:
# Group the merged soil-moisture dataset by calendar day and average each group over `time`.
ds_moisture_daily = ds_moisture.resample(time='D').mean(dim='time')

In [143]:
# Depth-weighted soil-moisture averages, written as one NetCDF file per day:
#   SWVL_full - all four layers, SWVL_1m - the three upper layers only.

# Output folders are created once, up front; makedirs(exist_ok=True) also
# creates missing parents and is safe to re-run.
output_dir_full = path_outputs_daily + 'SWVL_full/'
output_dir_1m = path_outputs_daily + 'SWVL_1m/'
os.makedirs(output_dir_full, exist_ok=True)
os.makedirs(output_dir_1m, exist_ok=True)

# Normalisation constants do not change between days.
total_depth_full = np.sum(depth)
total_depth_1m = np.sum(depth[:-1])

for itime in range(len(ds_moisture_daily.time)):
    ds_temp_daily = ds_moisture_daily.isel(time = itime)
    # `time` is a scalar coordinate after isel, hence the [()] indexing.
    str_date = ds_temp_daily.time.data[()].astype(str)[:10]

    # Weighted sum of the three upper layers, shared by both averages
    # (same left-to-right evaluation order as summing all terms in one expression).
    weighted_upper = (ds_temp_daily[var_name[0]] * depth[0] +
                      ds_temp_daily[var_name[1]] * depth[1] +
                      ds_temp_daily[var_name[2]] * depth[2])

    average_moisture_full = ((weighted_upper +
                              ds_temp_daily[var_name[3]] * depth[3]) / total_depth_full).to_dataset(name='SWVL_full')
    average_moisture_full['SWVL_full'].attrs['units'] = 'm**3 m**-3'

    average_moisture_1m = (weighted_upper / total_depth_1m).to_dataset(name='SWVL_1m')
    average_moisture_1m['SWVL_1m'].attrs['units'] = 'm**3 m**-3'

    output_path_full = output_dir_full + 'SWVL_full_Daily_' + str_date + '.nc'
    average_moisture_full = cut_and_select(average_moisture_full,'SWVL_full',lat_range,lon_range)
    average_moisture_full.to_netcdf(output_path_full)

    output_path_1m = output_dir_1m + 'SWVL_1m_Daily_' + str_date + '.nc'
    average_moisture_1m = cut_and_select(average_moisture_1m,'SWVL_1m',lat_range,lon_range)
    average_moisture_1m.to_netcdf(output_path_1m)

## Land temperature

In [145]:
# Root directory of the ERA5 archive that is searched for input files.
path_data_era5 = '/gpfs/fs1/collections/rda/data/ds633.0/'

# One row per soil-temperature layer; the columns are split into the parallel
# lists used by the cells below. `depth` is the weight each layer receives in
# the weighted averages (presumably the layer thickness in metres - confirm).
_temperature_table = [
    # (label, file-name code, depth weight, variable name in file)
    ('stl1', '.128_139_stl1.ll025sc.', 0.07, 'STL1'),
    ('stl2', '.128_170_stl2.ll025sc.', 0.21, 'STL2'),
    ('stl3', '.128_183_stl3.ll025sc.', 0.72, 'STL3'),
    ('stl4', '.128_236_stl4.ll025sc.', 1.89, 'STL4'),
]
list_variables_temperature, codename, depth, var_name = (
    list(column) for column in zip(*_temperature_table)
)
# Every layer is read from the same archive subfolder.
subfolders = ['e5.oper.an.sfc'] * len(_temperature_table)

In [146]:
# Start at the first entry of the per-variable lists.
ivar = 0

# Bounding box as (min, max) latitude and (min, max) longitude.
lat_range, lon_range = (-30, 90), (0, 360)


In [147]:
# Position in each variable's file list to load (0 = first file kept after slicing).
ifile = 0

# Open the selected file of every soil-temperature layer, then merge the
# layers into a single dataset.
temperature_layers = []
for ivar, _label in enumerate(list_variables_temperature):
    # Sorted matches for this layer, skipping the first 480 entries of the listing.
    list_files = np.sort(
        find_files(f'{path_data_era5}{subfolders[ivar]}/', codename[ivar])
    )[480:]
    ds_temp = xr.open_dataset(list_files[ifile])
    temperature_layers.append(ds_temp)

ds_temperature = xr.merge(temperature_layers)

In [148]:
# Group the merged soil-temperature dataset by calendar day and average each group over `time`.
ds_temperature_daily = ds_temperature.resample(time='D').mean(dim='time')

In [149]:
# Depth-weighted soil-temperature averages, written as one NetCDF file per day:
#   STL_full - all four layers, STL_1m - the three upper layers only.

# Output folders are created once, up front; makedirs(exist_ok=True) also
# creates missing parents and is safe to re-run.
output_dir_full = path_outputs_daily + 'STL_full/'
output_dir_1m = path_outputs_daily + 'STL_1m/'
os.makedirs(output_dir_full, exist_ok=True)
os.makedirs(output_dir_1m, exist_ok=True)

# Normalisation constants do not change between days.
total_depth_full = np.sum(depth)
total_depth_1m = np.sum(depth[:-1])

for itime in range(len(ds_temperature_daily.time)):
    ds_temp_daily = ds_temperature_daily.isel(time = itime)
    # `time` is a scalar coordinate after isel, hence the [()] indexing.
    str_date = ds_temp_daily.time.data[()].astype(str)[:10]

    # Weighted sum of the three upper layers, shared by both averages
    # (same left-to-right evaluation order as summing all terms in one expression).
    weighted_upper = (ds_temp_daily[var_name[0]] * depth[0] +
                      ds_temp_daily[var_name[1]] * depth[1] +
                      ds_temp_daily[var_name[2]] * depth[2])

    average_temperature_full = ((weighted_upper +
                                 ds_temp_daily[var_name[3]] * depth[3]) / total_depth_full).to_dataset(name='STL_full')
    average_temperature_full['STL_full'].attrs['units'] = 'K'

    average_temperature_1m = (weighted_upper / total_depth_1m).to_dataset(name='STL_1m')
    average_temperature_1m['STL_1m'].attrs['units'] = 'K'

    output_path_full = output_dir_full + 'STL_full_Daily_' + str_date + '.nc'
    average_temperature_full = cut_and_select(average_temperature_full,'STL_full',lat_range,lon_range)
    average_temperature_full.to_netcdf(output_path_full)

    output_path_1m = output_dir_1m + 'STL_1m_Daily_' + str_date + '.nc'
    average_temperature_1m = cut_and_select(average_temperature_1m,'STL_1m',lat_range,lon_range)
    average_temperature_1m.to_netcdf(output_path_1m)

# Snow depth and cover

In [169]:
# Root directory of the ERA5 archive that is searched for input files.
path_data_era5 = '/gpfs/fs1/collections/rda/data/ds633.0/'

# One row per land variable; the columns are split into the parallel lists
# used by the processing cell. No `level` list is defined for these variables.
_land_table = [
    # (label, archive subfolder, file-name code, variable name in file)
    ('snow_depth', 'e5.oper.an.sfc', '.128_141_sd.ll025sc.', 'SD'),
]
list_variables_land, subfolders, codename, var_name = (
    list(column) for column in zip(*_land_table)
)

In [None]:
# Daily means of the land variables (currently snow depth only).
#
# Each source file covers a whole month (file names look like
# ..._1980010100_1980013123.nc), so averaging over the full `time` dimension
# would produce a monthly mean stored under a "_Daily_" file name. Each file is
# therefore resampled to calendar days and written as one NetCDF file per day.
# NOTE(review): daily files written here keep their scalar `time` coordinate.

# Bounding box as (min, max) latitude and (min, max) longitude.
lat_range = (-30,90)
lon_range = (0,360)

# Number of monthly files processed per variable (a small value keeps test
# runs short). Set to None to process every file in the listing.
max_files = 5

for ivar in range(len(list_variables_land)):
    list_files = np.sort(find_files(f'{path_data_era5}{subfolders[ivar]}/',codename[ivar]))
    # Skip the first 480 entries; in the listing this notebook was run
    # against, the first file kept is the one for 1980-01.
    list_files = list_files[480:]

    output_dir = path_outputs_daily + f'{var_name[ivar]}/'
    # makedirs(exist_ok=True) also creates missing parents and is safe to re-run.
    os.makedirs(output_dir, exist_ok=True)

    # Slicing cannot run past the end of a short listing.
    for file_path in list_files[:max_files]:
        print(file_path)
        # The context manager closes the file once all of its days are written.
        with xr.open_dataset(file_path) as ds_source:
            # Cut first so the daily averaging only touches the region of interest.
            ds_temp = cut_and_select(ds_source,var_name[ivar],lat_range,lon_range)
            ds_daily = ds_temp.resample(time='D').mean(dim='time')

            for itime in range(len(ds_daily.time)):
                ds_day = ds_daily.isel(time = itime)
                # `time` is a scalar coordinate after isel, hence the [()] indexing.
                str_date = ds_day.time.data[()].astype(str)[:10]

                output_path = output_dir + f'{var_name[ivar]}_Daily_' + str_date + '.nc'
                ds_day.to_netcdf(output_path)

/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198001/e5.oper.an.sfc.128_141_sd.ll025sc.1980010100_1980013123.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198002/e5.oper.an.sfc.128_141_sd.ll025sc.1980020100_1980022923.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198003/e5.oper.an.sfc.128_141_sd.ll025sc.1980030100_1980033123.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198004/e5.oper.an.sfc.128_141_sd.ll025sc.1980040100_1980043023.nc
/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198005/e5.oper.an.sfc.128_141_sd.ll025sc.1980050100_1980053123.nc


In [177]:
codename[ivar]

'.128_141_sd.ll025sc.'

In [176]:
subfolders[ivar]

'e5.oper.an.sfc'

In [181]:
list_files

array(['/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198001/e5.oper.an.sfc.128_141_sd.ll025sc.1980010100_1980013123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198002/e5.oper.an.sfc.128_141_sd.ll025sc.1980020100_1980022923.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198003/e5.oper.an.sfc.128_141_sd.ll025sc.1980030100_1980033123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198004/e5.oper.an.sfc.128_141_sd.ll025sc.1980040100_1980043023.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198005/e5.oper.an.sfc.128_141_sd.ll025sc.1980050100_1980053123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198006/e5.oper.an.sfc.128_141_sd.ll025sc.1980060100_1980063023.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198007/e5.oper.an.sfc.128_141_sd.ll025sc.1980070100_1980073123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198008/e5.oper.an.sfc.128_14

In [172]:
f'{path_data_era5}{subfolders[ivar]}/'

IndexError: list index out of range

In [164]:
ds_temp

In [153]:
list_files

array(['/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198001/e5.oper.an.sfc.128_236_stl4.ll025sc.1980010100_1980013123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198002/e5.oper.an.sfc.128_236_stl4.ll025sc.1980020100_1980022923.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198003/e5.oper.an.sfc.128_236_stl4.ll025sc.1980030100_1980033123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198004/e5.oper.an.sfc.128_236_stl4.ll025sc.1980040100_1980043023.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198005/e5.oper.an.sfc.128_236_stl4.ll025sc.1980050100_1980053123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198006/e5.oper.an.sfc.128_236_stl4.ll025sc.1980060100_1980063023.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198007/e5.oper.an.sfc.128_236_stl4.ll025sc.1980070100_1980073123.nc',
       '/gpfs/fs1/collections/rda/data/ds633.0/e5.oper.an.sfc/198008/e5.oper

In [162]:
ds_temp