In [1]:
import xarray as xr
import numpy as np
import zarr
from datetime import datetime, timezone
import BuildZarrStore as bzs
import pandas as pd
import os

In [None]:
# Variable to ingest. It selects the input folder here and is reused by the
# following cells as the zarr group / array name.
var = "RR"

folder_path = f'INCA_data/{var}'

# Collect every regular file directly inside the folder (sub-directories are
# skipped). os.listdir returns entries in arbitrary order, so the result is
# sorted to make the processing order and progress log reproducible.
filepaths = sorted(
    candidate
    for candidate in (os.path.join(folder_path, name) for name in os.listdir(folder_path))
    if os.path.isfile(candidate)
)

In [None]:
# Open the target group once and read everything the loop needs from it:
# the array that is filled, its dtype / fill value, and the spatial axes.
store = zarr.storage.LocalStore("INCA.zarr")
group = zarr.group(store=store)[var]
arr = group[var]
dtype = arr.dtype
fill_value = arr.attrs.get('_FillValue', None)

x_extent = group["x"][:]
y_extent = group["y"][:]

# Hour that corresponds to index 0 on the first axis of the target array;
# time offsets below are whole hours counted from here.
origin = np.datetime64("2011-03-15T00:00:00").astype("datetime64[h]")
one_hour = np.timedelta64(1, "h")

for i, file in enumerate(filepaths, start=1):
    # load_dataset reads the file fully into memory and closes the handle.
    # mask_and_scale=False keeps the raw stored values.
    data = xr.load_dataset(file, mask_and_scale=False)

    # Index window of this file's grid inside the store's x / y axes.
    x_min, x_max = bzs.get_idx(x_extent, data["x"].values)
    y_min, y_max = bzs.get_idx(y_extent, data["y"].values)

    # Half-open hourly window [time_min, time_max) spanned by the file.
    # Assumes the time coordinate is sorted ascending - TODO confirm.
    time_min = data.time.values[0].astype("datetime64[h]")
    time_max = data.time.values[-1].astype("datetime64[h]") + 1
    time_delta_min = (time_min - origin).astype("int64")
    time_delta_max = (time_max - origin).astype("int64")

    # Every hour the file should contain; its length equals the length of the
    # time slice written below.
    full_range = np.arange(time_min, time_max, one_hour).astype("datetime64[ns]")

    values = data[var]
    if not np.array_equal(data.time.values.astype("datetime64[ns]"), full_range):
        # Some expected hours are missing: insert them, filled with the
        # array's fill value, so the block matches the target slice.
        # Assumes the existing timestamps fall exactly on the hour - TODO confirm.
        print(f"{file} Data incomplete")
        reindex_kwargs = {} if fill_value is None else {"fill_value": fill_value}
        values = values.reindex(time=full_range, **reindex_kwargs)
        print(f"{file} Data gaps filled with no data values")

    # Assumes data[var] is laid out as (time, y, x), matching the slice order.
    group[var][time_delta_min:time_delta_max, y_min:y_max, x_min:x_max] = values.values

    print(f"{file} written to zarr store. {i}/{len(filepaths)} complete💌")
    

INCA_RR_data/INCAL_HOURLY_RR_202107.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201502.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201304.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202308.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201111.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201808.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202503.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202101.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201308.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202103.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202008.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202001.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201108.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201305.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_202004.nc written to zarr store.
INCA_RR_data/INCAL_HOURLY_RR_201802.nc written to zarr store.
INCA_RR_

### Inspecting the zarr store

In [3]:
# Open the "RR" group of the store through its consolidated metadata;
# chunks={} keeps the variables as lazy dask arrays.
ds = xr.open_zarr(
    "INCA.zarr",
    group="RR",
    consolidated=True,
    chunks={},
)
ds

Unnamed: 0,Array,Chunk
Bytes,418.87 GiB,54.93 MiB
Shape,"(200000, 401, 701)","(720, 100, 100)"
Dask graph,11120 chunks in 2 graph layers,11120 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 418.87 GiB 54.93 MiB Shape (200000, 401, 701) (720, 100, 100) Dask graph 11120 chunks in 2 graph layers Data type float64 numpy.ndarray",701  401  200000,

Unnamed: 0,Array,Chunk
Bytes,418.87 GiB,54.93 MiB
Shape,"(200000, 401, 701)","(720, 100, 100)"
Dask graph,11120 chunks in 2 graph layers,11120 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [7]:
# Label-based selection of June 2019 along the time axis.
filtered = ds.sel(
    {"time": slice("2019-06-01T00:00:00.000000000", "2019-06-30T23:00:00.000000000")}
)

In [8]:
# Pull the selected subset into memory, then show it as the cell output.
loaded = filtered.load()
loaded

In [2]:
# Load the June 2019 source file directly and display it.
# NOTE(review): the file-discovery cell reads from INCA_data/<var>/, while this
# path has no variable subfolder - confirm which location is current. This cell
# also carries an earlier execution count than the cells above it.
xr.open_dataset("INCA_data/INCAL_HOURLY_RR_201906.nc").load()