In [1]:
import xarray as xr
import numpy as np
import glob
import os.path

from dask.distributed import Client
from dask.diagnostics import ProgressBar

from aggfly import dataset, regions, grid_weights
from aggfly.aggregate import TemporalAggregator, SpatialAggregator, get_time_dim

# Register a dask ProgressBar so dask computations report their progress
# in the cell output.
ProgressBar().register()
# The distributed client is left disabled; uncomment to use it.
# client = Client()

In [2]:
# Output configuration.
# Directory that receives the aggregated results.
output_path = "/home3/dth2133/data/aggregated/counties/"
# Base file name without extension; '.zarr' and '.csv' are appended on write.
output_name = "usa_counties_corn_monthly"
# Name given to the data variable inside the output dataset.
output_varn = "dd29"
# When True, the saved Zarr store is additionally exported as a CSV file.
csv = True

In [3]:
# Open shapefile containing region features.
georegions = regions.from_name('counties')

# Open one example year of the climate data; in this cell it is only used
# to calculate the grid weights.
clim = dataset.from_path(
    "/home3/dth2133/data/usa/usa-t2m_tempPrecLand2019.zarr",
    't2m',
    'zarr',
    preprocess=dataset.timefix_era5l)

# Clip climate data to the US (raw data are global)
clim.clip_data_to_georegions_extent(georegions)

# Rechunk dataset to optimize multithreading.
# NOTE(review): this tuple has six entries, while `aggregate_era5l_t2m`
# in this notebook passes five - confirm which matches the data's dimensions.
clim.rechunk((5, 578, -1, -1, -1, -1))

# Calculate area and crop layer weights.
weights = grid_weights.from_objects(clim, georegions, crop='corn')

# Temporal aggregation is done in two steps: hourly -> daily, then
# daily -> monthly.

# Step 1: collapse the hourly values of each day with the 'dd' method.
# (presumably degree days parameterised by `ddargs` - TODO confirm the
# meaning of the three values against the aggfly documentation)
daily = TemporalAggregator('dd', agg_from='hour', agg_to='day', ddargs=[29, 999, 0])

# Step 2: sum the daily values within each month.
# NOTE: despite its name, `annual` aggregates to the month level.
annual = TemporalAggregator('sum', agg_from='day', agg_to='month')

# Spatial step: average the monthly cell values within each region. The
# average is weighted by `weights`, which in this case combine the area of
# the cell and the share of the cell with corn crops.
spatial = SpatialAggregator('avg', agg_from='month')


In [4]:
# Calculate the grid weights once, up front; the resulting `w` is reused
# by the spatial aggregation step for every input file.
w = weights.weights()

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed | 16.3s
[########################################] | 100% Completed | 15.5s
[########################################] | 100% Completed | 29.3s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [5]:
def aggregate_era5l_t2m(path):
    """Aggregate one 't2m' Zarr store to region-level monthly values.

    Relies on objects defined earlier in the notebook: `georegions`,
    the temporal aggregators `daily` and `annual`, the spatial aggregator
    `spatial`, and the precomputed grid weights `w`.

    Parameters
    ----------
    path
        Location of the Zarr store to open.

    Returns
    -------
    The `.da` attribute of the spatially aggregated dataset object.
    """
    # NOTE(review): the original comment described the preprocess hook as
    # "Kelvin to Celsius", but it is named `timefix_era5l` - confirm what
    # it actually does.
    hourly = dataset.from_path(
        path, 't2m', 'zarr', preprocess=dataset.timefix_era5l)

    # Clip climate data to the US (raw data are global), then rechunk to
    # optimize multithreading.
    hourly.clip_data_to_georegions_extent(georegions)
    hourly.rechunk((5, 578, -1, -1, -1))

    # Temporal collapse: hour -> day (`daily`), then day -> month (`annual`).
    by_day = daily.map_execute(hourly)
    by_month = annual.map_execute(by_day)
    by_month.rechunk(-1)

    # Weighted spatial collapse of grid cells over the regions.
    by_region = spatial.map_execute(by_month, w)

    # Hand back the aggregated climate data.
    return by_region.da


In [6]:
# Collect the raw climate stores available in the input directory (one per
# year, e.g. ...1970.zarr, ...1971.zarr), sorted so years run in order.
import numpy as np
import glob
from os.path import basename
# glob returns paths in arbitrary order, so sort them to process the
# years chronologically.
# NOTE(review): the recorded output of this cell lists no 1959 store
# (1958 is followed by 1960) - confirm that gap in the input is expected.
files = sorted(glob.glob('/home3/dth2133/data/usa/*t2m_*'))
# Loop over the yearly files and aggregate each one.
output = []
for f in files:
    print(f)
    output.append(aggregate_era5l_t2m(f))

/home3/dth2133/data/usa/usa-t2m_tempPrecLand1951.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1952.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1953.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1954.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1955.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1956.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1957.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1958.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1960.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1961.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1962.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1963.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1964.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1965.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1966.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1967.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1968.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1969.zarr
/home3/dth2133/data/usa/usa-

In [7]:
# Join the per-file results along the 'year' dimension and run the
# computation so the combined array is held in memory.
da = xr.concat(output, dim='year').compute()

[########################################] | 100% Completed | 29min  4.3s


In [9]:
# client = Client()

In [10]:
# out = client.persist(da)

In [None]:
# client

In [8]:
# Wrap the array in a dataset under the configured variable name and write
# it to '<output_name>.zarr' inside `output_path`.
# NOTE(review): mode='a' writes into an existing store instead of replacing
# it - presumably so several variables can share one store; confirm.
ds = da.to_dataset(name=output_varn)
ds.to_zarr(os.path.join(output_path, output_name+'.zarr'), mode='a') 

<xarray.backends.zarr.ZarrStore at 0x7f18ae0d67b0>

In [9]:
# Optional CSV export: re-read the saved Zarr store and dump it as a
# flat table next to it.
if csv:
    zarr_store = os.path.join(output_path, output_name+'.zarr')
    csv_file = os.path.join(output_path, output_name+'.csv')
    ds = xr.open_zarr(zarr_store)
    ds.to_dataframe().to_csv(csv_file)

[########################################] | 100% Completed |  0.2s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.2s
[########################################] | 100% Completed |  0.1s


In [10]:
# Re-open the saved store to inspect what was written.
xr.open_zarr(os.path.join(output_path, output_name+'.zarr'))

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 19.35 MiB 619.17 kiB Shape (3108, 68, 12) (777, 17, 6) Count 33 Tasks 32 Chunks Type float64 numpy.ndarray",12  68  3108,

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 19.35 MiB 619.17 kiB Shape (3108, 68, 12) (777, 17, 6) Count 33 Tasks 32 Chunks Type float64 numpy.ndarray",12  68  3108,

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 19.35 MiB 619.17 kiB Shape (3108, 68, 12) (777, 17, 6) Count 33 Tasks 32 Chunks Type float64 numpy.ndarray",12  68  3108,

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 19.35 MiB 619.17 kiB Shape (3108, 68, 12) (777, 17, 6) Count 33 Tasks 32 Chunks Type float64 numpy.ndarray",12  68  3108,

Unnamed: 0,Array,Chunk
Bytes,19.35 MiB,619.17 kiB
Shape,"(3108, 68, 12)","(777, 17, 6)"
Count,33 Tasks,32 Chunks
Type,float64,numpy.ndarray
