In [1]:
import os
import re
import glob
import xarray as xr
import numpy as np
from pathlib import Path
from typing import Union, Optional, Callable
from tqdm import tqdm
import dask
import dask.array as da
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress

In [2]:
def find_years_and_variables(list_files: list[str]) -> tuple[list[str], list[str]]:
    """Collect the distinct years and variable names encoded in file paths.

    A path contributes only when its tail matches
    ``/<YYYY>-<MM>_[<anything>]_<variable>.nc``; every other path is ignored.

    Parameters
    ----------
    list_files : list[str]
        File paths to inspect.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(years, variables)``. Both lists are de-duplicated and sorted in
        ascending (string) order; years are kept as 4-digit strings.
    """
    # Group 1 is the 4-digit year, group 2 is everything between the
    # bracketed part and the ".nc" extension.
    pattern = re.compile(r'/(\d{4})-\d{2}_\[.*?\]_(.*)\.nc$')

    # Keep only the paths that actually follow the naming scheme.
    hits = [m for m in map(pattern.search, list_files) if m]

    years = sorted({m.group(1) for m in hits})
    variables = sorted({m.group(2) for m in hits})

    return years, variables


def select_data_common_year_variable(year: str, variable: str, list_files: list[str]) -> list[str]:
    """Return the files that belong to exactly one year and one variable.

    File names are expected to follow the layout that
    ``find_years_and_variables`` parses:
    ``<YYYY>-<MM>_[<anything>]_<variable>.nc``, optionally preceded by a
    directory.

    The variable must match the *whole* variable part of the name, so asking
    for ``total_precipitation`` does not also return files of a variable whose
    name merely ends with (or contains) that text. ``year`` and ``variable``
    are escaped before being inserted into the pattern, so regex
    metacharacters in them are taken literally.

    Parameters
    ----------
    year : str
        4-digit year to select.
    variable : str
        Variable name exactly as it appears in the file name.
    list_files : list[str]
        Candidate file paths.

    Returns
    -------
    list[str]
        The matching paths, in their original order. Only months ``01``-``12``
        are accepted.
    """
    pattern = re.compile(
        # Start of string or a path separator, so a year embedded in a longer
        # number or in a parent directory name cannot match.
        rf"(?:^|[\\/]){re.escape(str(year))}-(?:0[1-9]|1[0-2])"
        # Bracketed part, then the full variable name up to the extension.
        rf"_\[.*?\]_{re.escape(variable)}\.nc$"
    )

    return [file for file in list_files if pattern.search(file)]


def preprocess_drop_coord(ds: xr.Dataset, coord_to_drop: str|list[str] = ["expver", "surface"]):
    """Return ``ds`` with the named variables/coordinates removed.

    Used in this notebook as the ``preprocess`` hook of
    ``xr.open_mfdataset``.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset to clean up.
    coord_to_drop : str or list[str]
        Name(s) to drop. Names that are absent from ``ds`` are skipped
        instead of raising, because ``errors='ignore'`` is passed through.
        The default list is only read here, never mutated.

    Returns
    -------
    The result of ``ds.drop_vars(...)``.
    """
    trimmed = ds.drop_vars(coord_to_drop, errors='ignore')
    return trimmed


def convert_nc_to_zarr(file_list, output_zarr_path, chunk_dict: Union[dict, str] = "auto", preprocess: Optional[Callable]=None):
    """Combine several NetCDF files into one dataset and write it as a Zarr store.

    Parameters
    ----------
    file_list
        NetCDF files to open together; they are combined by coordinates.
    output_zarr_path
        Destination of the Zarr store. It is opened with ``mode='w'``, so an
        existing store at that location is overwritten.
    chunk_dict : dict or str
        Chunking applied to the combined dataset before writing
        (passed to ``Dataset.chunk``).
    preprocess : callable, optional
        Applied to each file's dataset before combining
        (passed to ``xr.open_mfdataset``).

    Returns
    -------
    None
    """
    # Open inside a ``with`` block so the underlying NetCDF files are closed
    # once the store has been written, even if the write fails. Without it
    # every call leaves its input files open.
    with xr.open_mfdataset(
        file_list,
        combine='by_coords',
        engine="h5netcdf",
        parallel=True,
        preprocess=preprocess,
        coords="minimal",
    ) as ds:
        rechunked = ds.chunk(chunk_dict)

        # The write must happen while the source files are still open.
        with ProgressBar():
            rechunked.to_zarr(output_zarr_path, mode='w', consolidated=True)

@dask.delayed
def save_zarr_dask(list_files: list[str], output_file: str, chunk_dict: Union[dict, str] = "auto", preprocess: Optional[Callable]=None):
    """Lazily convert a group of NetCDF files to a Zarr store, skipping existing stores.

    Because of the ``dask.delayed`` decorator, calling this function only
    builds a task; the work happens when the task is computed.

    Parameters
    ----------
    list_files : list[str]
        NetCDF files to combine.
    output_file : str
        Path of the Zarr store to create. If it already exists nothing is
        written.
    chunk_dict : dict or str
        Forwarded to ``convert_nc_to_zarr``.
    preprocess : callable, optional
        Forwarded to ``convert_nc_to_zarr``.

    Returns
    -------
    None
        Failures are reported with ``print`` rather than raised, so one
        failing conversion does not abort the other tasks.
    """
    try:
        if os.path.exists(output_file):
            # Report this task's own target. The message used to interpolate
            # ``output_zarr_path``, which is not a parameter of this function.
            print(f"path already exists, skipping: {output_file}")
        else:
            convert_nc_to_zarr(list_files, output_file, chunk_dict, preprocess=preprocess)
    except Exception as e:
        print(f"error!: \n{e}")
        

In [3]:
# Directory that holds the ERA5 NetCDF input files.
data_dir = "/scratch/opodriscoll/data/ERA5"

# Chunk sizes (number of elements per dimension) requested for the Zarr output.
dict_chunk = dict(valid_time=200, latitude=200, longitude=200)

# Every NetCDF file directly inside data_dir (order is whatever glob returns).
list_files = glob.glob(str(data_dir) + "/*.nc")

# Distinct years and variable names found in those file names.
years, variables = find_years_and_variables(list_files)

In [5]:
# Build one delayed conversion task per (year, variable) pair and run them on a
# local dask cluster that is shut down when the block exits.
with LocalCluster() as cluster, Client(cluster) as client:

    outputs = []

    for year in years:
        for variable in variables:

            # Input files for this year/variable combination.
            data_str_filt = select_data_common_year_variable(year=year, variable=variable, list_files=list_files)

            # Store the result next to the inputs. The path is derived from
            # data_dir so that pointing data_dir elsewhere moves both the
            # inputs and the outputs (and the later "*.zarr" listing) together.
            output_zarr_path = f"{data_dir}/{year}_{variable}.zarr"

            output = save_zarr_dask(
                list_files=data_str_filt,
                output_file=output_zarr_path,
                chunk_dict=dict_chunk,
                preprocess=preprocess_drop_coord
            )
            outputs.append(output)

    # NOTE(review): only the first four tasks are computed; the remaining
    # entries of `outputs` are never executed by this cell. Confirm whether
    # this batching is intentional.
    dask.compute(outputs[:4])
            

## Confirm successful storage 

In [1]:
# NOTE(review): this cell duplicates the following one; its recorded output is
# a NameError because it was executed before the import cell.
# Sorted paths of all Zarr stores in data_dir (rebinds `list_files`).
list_files = sorted(glob.glob(str(data_dir) + "/*.zarr"))
list_files

NameError: name 'glob' is not defined

In [4]:
# Sorted paths of all Zarr stores in data_dir. This rebinds `list_files`, which
# previously held the NetCDF inputs.
list_files = sorted(glob.glob(str(data_dir) + "/*.zarr"))
list_files

['/scratch/opodriscoll/data/ERA5/2020_10m_u_component_of_wind.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_10m_v_component_of_wind.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_2m_dewpoint_temperature.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_2m_temperature.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_boundary_layer_heighttotal_precipitation.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_sea_level_pressure.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_surface_downward_long_wave_radiation_flux.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_surface_downward_short_wave_radiation_flux.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_surface_latent_heat_flux.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_surface_sensible_heat_flux.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_mean_wave_period.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_ocean_surface_stress_equivalent_10m_neutral_wind_direction.zarr',
 '/scratch/opodriscoll/data/ERA5/2020_ocean_surface_stress_equivalent_10m_n

In [6]:
# Open every Zarr store listed above as one combined dataset, applying the same
# drop-coordinates preprocessing that was used during conversion.
t = xr.open_mfdataset(
    list_files,
    engine="zarr",
    preprocess=preprocess_drop_coord,
)

In [7]:
# Display the combined dataset's repr (shapes, chunking and dask graph sizes).
t

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 19.36 MiB Shape (17544, 561, 1440) (200, 141, 180) Dask graph 2816 chunks in 17 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 19.36 MiB Shape (17544, 561, 1440) (200, 141, 180) Dask graph 2816 chunks in 17 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 19.36 MiB Shape (17544, 561, 1440) (200, 141, 180) Dask graph 2816 chunks in 17 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 19.36 MiB Shape (17544, 561, 1440) (200, 141, 180) Dask graph 2816 chunks in 17 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 28.53 MiB Shape (17544, 561, 1440) (200, 187, 200) Dask graph 2112 chunks in 6 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,28.53 MiB
Shape,"(17544, 561, 1440)","(200, 187, 200)"
Dask graph,2112 chunks in 6 graph layers,2112 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 52.80 GiB 19.36 MiB Shape (17544, 561, 1440) (200, 141, 180) Dask graph 2816 chunks in 17 graph layers Data type float32 numpy.ndarray",1440  561  17544,

Unnamed: 0,Array,Chunk
Bytes,52.80 GiB,19.36 MiB
Shape,"(17544, 561, 1440)","(200, 141, 180)"
Dask graph,2816 chunks in 17 graph layers,2816 chunks in 17 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [23]:
# Sizes of the three dimensions that are sampled.
n_time = t.sizes['valid_time']
n_lat = t.sizes['latitude']
n_lon = t.sizes['longitude']

# Seeded generator so the same sample points are drawn on every run
# (Restart & Run All reproduces the result).
SEED = 42
rng = np.random.default_rng(SEED)

# Generate random indices to sample from (upper bound is exclusive)
n_samples = 100000
random_time_idx = rng.integers(0, n_time, n_samples)
random_lat_idx = rng.integers(0, n_lat, n_samples)
random_lon_idx = rng.integers(0, n_lon, n_samples)

# Get the corresponding coordinate values for the random samples
random_valid_times = t['valid_time'].values[random_time_idx]
random_latitudes = t['latitude'].values[random_lat_idx]
random_longitudes = t['longitude'].values[random_lon_idx]

In [25]:
# One indexer per dimension, all sharing the new 'points' dimension, so the
# selection is done point by point rather than as an outer product.
point_indexers = {
    'valid_time': xr.DataArray(random_valid_times, dims='points'),
    'latitude': xr.DataArray(random_latitudes, dims='points'),
    'longitude': xr.DataArray(random_longitudes, dims='points'),
}

# Look the points up by coordinate value and load the result into memory.
p = t.sel(point_indexers, method='nearest').compute()
p