In [1]:
# Work from the project checkout. Presumably this is what makes the local
# `aggregate` package imported in the next cell resolvable — TODO confirm.
%cd "~/repositories/clim-agg"

/home3/dth2133/repositories/clim-agg


In [2]:
import xarray as xr
from aggregate import ClimateDataset, GridWeights, GeoRegions
from aggregate import SpatialAggregator, TemporalAggregator
from aggregate.gridfuncs import *
from aggregate.utils import *
from dask.distributed import Client
from dask.diagnostics import ProgressBar

# Standard-library modules used further down. They are imported explicitly
# (instead of relying on whatever the wildcard imports happen to expose) and
# deliberately *after* the wildcard imports so those cannot rebind the names.
import glob
import os
import re
# Register a dask progress bar globally, so that later dask computations
# print the "[####] | 100% Completed" lines seen in the cell outputs.
ProgressBar().register()

In [107]:
# Output settings for the aggregated result.
# `csv` switches on the additional CSV export performed in the last cell.
csv = True
# Name under which the aggregated variable is stored in the output dataset.
output_varn = "gdd8_30"
# Directory and base file name (extension is appended when writing).
output_name = "usa_counties_corn"
output_path = "/home3/dth2133/data/aggregated/counties/"

In [103]:
# Region features (shapefile), looked up by name.
georegions = GeoRegions.from_name('counties')

# One example climate store is enough to derive the grid weights.
example_store = "/home3/dth2133/data/usa/usa-t2m_tempPrecLand2019.zarr"
clim = ClimateDataset.from_path(
    example_store,
    't2m',
    'zarr',
    preprocess=lambda x: x - 273.15,
)

# The raw data are global, so restrict them to the extent of the regions.
clim.clip_data_to_georegions_extent(georegions)

# Area and crop-layer weights for the clipped grid.
weights = GridWeights.from_objects(clim, georegions, crop='corn')

# Temporal collapse of hourly and daily data down to the yearly level.
# NOTE(review): ('dd', 1, [8,30,0]) presumably means degree days between 8
# and 30 (cf. output_varn "gdd8_30") — confirm against TemporalAggregator.
# Alternative spec that averages instead:
#     {'daily':('avg', 1), 'yearly':('avg', 1)}
temporal_calc = {
    'daily': ('dd', 1, [8,30,0]),
    'yearly': ('sum', 1),
}
temporal = TemporalAggregator.from_name(name='era5l', calc=temporal_calc)

# Spatial collapse: cells inside a region are averaged using `weights`,
# which here combine the cell area and the share of the cell grown with corn.
spatial = SpatialAggregator.from_name('era5l')

In [5]:
# Calculate the grid weights. The result `w` is read as a global inside
# `aggregate_era5l_t2m` (through `w.data` and `w.region`), so this cell has
# to run before that function is called.
w = weights.weights()

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed | 24.4s
[########################################] | 100% Completed | 23.8s
[########################################] | 100% Completed | 43.3s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [104]:
def aggregate_era5l_t2m(year):
    """Collapse one year of `t2m` data first over time, then over regions.

    Relies on objects created in earlier cells and read here as globals:
    `georegions`, `temporal`, `spatial` and the grid weights `w`.

    Parameters
    ----------
    year
        Interpolated into the store name
        ``usa-t2m_tempPrecLand{year}.zarr``.

    Returns
    -------
    The ``da`` attribute of the updated ``ClimateDataset``. The caller
    concatenates these per-year results and calls ``.compute()`` on them.
    """
    store = f"/home3/dth2133/data/usa/usa-t2m_tempPrecLand{year}.zarr"
    yearly = ClimateDataset.from_path(
        store, 't2m', 'zarr',
        preprocess=lambda x: x - 273.15)  # Kelvin to Celsius

    # The raw data are global: keep only the extent of the regions, then
    # rechunk to optimize multithreading.
    yearly.clip_data_to_georegions_extent(georegions)
    yearly.rechunk((5, 578, -1, -1, -1))

    # Step 1 - temporal collapse as defined by `temporal`; axes 3 and 4
    # (the 'day' and 'hour' dimensions) disappear from every block.
    time_collapsed = yearly.da.data.map_blocks(
        temporal.execute,
        dtype=float,
        drop_axis=(3,4))
    yearly.update(time_collapsed, drop_dims=['day', 'hour'])
    yearly.rechunk(-1)

    # Step 2 - (weighted) spatial collapse over regions; the latitude and
    # longitude axes are replaced by a single new 'region' axis.
    region_collapsed = w.data.map_blocks(
        spatial.execute,
        yearly.da.data,
        dtype=float,
        drop_axis=[1,2],
        new_axis=1)
    yearly.update(
        region_collapsed,
        drop_dims=['latitude', 'longitude'],
        new_dims={'region': w.region.values})

    # Hand back the aggregated climate data.
    return yearly.da


In [105]:
# Discover the years for which raw climate data are available by parsing the
# 4-digit year out of each store name, giving e.g. ['1951', '1952', ...].
# A regular expression is used instead of fixed character offsets so that the
# result does not silently break when the directory or file prefix changes;
# entries that do not follow the expected naming scheme are ignored.
year_pattern = re.compile(r'usa-t2m_tempPrecLand(\d{4})\.zarr$')
year_matches = (
    year_pattern.search(path)
    for path in glob.glob('/home3/dth2133/data/usa/*t2m*')
)
years = sorted(match.group(1) for match in year_matches if match)

# Fail early with a clear message rather than later inside `xr.concat`.
if not years:
    raise FileNotFoundError(
        "No 'usa-t2m_tempPrecLand<year>.zarr' stores found in "
        "/home3/dth2133/data/usa/")

# Flag gaps in the year sequence: a missing store would otherwise go
# unnoticed and simply be absent from the concatenated result.
missing_years = [
    str(y) for y in range(int(years[0]), int(years[-1]) + 1)
    if str(y) not in years
]
if missing_years:
    print(f"WARNING: no raw data found for year(s): {', '.join(missing_years)}")

# Loop over years and aggregate.
output = list()
for year in years:
    print(year)
    output.append(aggregate_era5l_t2m(year))


1951
1952
1953
1954
1955
1956
1957
1958
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019


In [106]:
# Stack the per-year results along a 'year' dimension, then trigger the
# computation of the combined array and display it.
stacked = xr.concat(output, dim='year')
da = stacked.compute()
da

[########################################] | 100% Completed | 10min 41.2s


In [108]:
# Wrap the aggregated array in a Dataset under its variable name and write
# it to `<output_path>/<output_name>.zarr` (store opened with mode='a').
zarr_store = os.path.join(output_path, output_name+'.zarr')
ds = da.to_dataset(name=output_varn)
ds.to_zarr(zarr_store, mode='a')

<xarray.backends.zarr.ZarrStore at 0x7f4492442350>

In [109]:
# Optional CSV export: re-open the zarr store that was just written and dump
# its contents as a flat table next to it.
if csv:
    zarr_in = os.path.join(output_path, output_name+'.zarr')
    csv_out = os.path.join(output_path, output_name+'.csv')
    ds = xr.open_zarr(zarr_in)
    ds.to_dataframe().to_csv(csv_out)

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
