#### Setup

In [1]:
import ctypes
import os
import time

import dask
import geopandas as gpd
import numpy as np
import xarray as xr
from dask.cache import Cache
from dask.diagnostics import ProgressBar
from dask.distributed import Client

import aggfly
from aggfly import regions, grid_weights, dataset
from aggfly.aggregate import TemporalAggregator, SpatialAggregator

# Optional dask cache, currently disabled. Note that 10e9 bytes is ten
# gigabytes, not two.
# cache = Cache(10e9)  # Leverage ten gigabytes of memory
# cache.register()    # Turn cache on globally

# Register dask's diagnostic progress bar globally so that computations
# triggered later in the notebook report their progress.
ProgressBar().register()



In [2]:
# Directory holding the yearly climate stores read by the cells below.
input_path = "/projects/OPPENHEIMER/tb/climate/"

# Directory and base file name for the aggregated output.
output_path = "/projects/OPPENHEIMER/tb/climate/aggregated/"
output_name = "precip_monthly_polys"

# NOTE(review): presumably a local aggfly checkout; no visible cell reads it.
code = "/home/bearpark/Documents/aggfly/"

# Prefix used when naming the output variables.
varname = "prec"

# Whether CSV output is wanted.
csv = True

#### Define aggregation parameters

These objects describe the spatial and temporal aggregation operations.

In [3]:
# Region features to aggregate over, looked up by name.
georegions = regions.from_name('uk')

# Years to aggregate: 1994 through 2019 inclusive.
years = np.arange(1994, 2020)

# Polynomial orders; one output variable is produced per order.
# NOTE(review): the order is never passed to any aggregator below, so all
# eight pipelines are configured identically -- confirm whether the
# polynomial transform is meant to be applied somewhere.
polys = list(range(1, 9))

# Spatial step: combines the grid cells of each region using the 'avg'
# calculation. The grid weights are supplied when the aggregator is executed.
spatial = SpatialAggregator('avg')

# Temporal step 1: hourly values summed to daily values, one aggregator
# per polynomial order.
daily = [
    TemporalAggregator('sum', agg_from='hour', agg_to='day')
    for _ in polys
]

# Temporal step 2: daily values summed to monthly values.
monthly = TemporalAggregator('sum', agg_from='day', agg_to='month')

monthly

<aggfly.aggregate.temporal.TemporalAggregator at 0x14ccd9f38220>

#### Calculate Weights

In [4]:
# Open one year of the climate data (1951) as an example grid on which the
# grid weights are calculated.
# NOTE(review): this cell reads variable 'tp' with dataset.preprocess_era5l,
# whereas run_era5l_agg reads 'prec' with dataset.preprocess_era5l_tp --
# confirm that both yield the same grid.
clim = dataset.from_path(
    f"{input_path}tempPrecLand1951.zarr", 
    'tp', 
    'zarr', 
    preprocess=dataset.preprocess_era5l)

# Clip climate data to the extent of `georegions` (the 'uk' regions loaded
# earlier, not the US). The raw data are presumably global -- TODO confirm.
clim.clip_data_to_georegions_extent(georegions)

# Rechunk dataset to optimize multithreading
clim.rechunk(-1)

# Build the grid-weight object from the example grid and the regions.
# No crop layer is used here (crop=None).
weights = grid_weights.from_objects(clim, georegions, crop=None)

In [None]:
# Calculate the grid weights from the weight object built above. The result
# `w` is what the spatial aggregation step receives as its weights.
w = weights.weights()


[########################################] | 100% Completed |  0.1s




In [None]:
# Rechunk the weights with a chunk size of -1 (for dask-backed arrays this
# means one chunk spanning each dimension).
w = w.chunk(-1)

In [None]:
def preprocess_era5l_tp(array):
    """Reshape an object with `longitude` and `time` coordinates for aggregation.

    Longitudes are wrapped into [-180, 180) and the data sorted by longitude;
    the `time` axis is then replaced by separate `year`, `month`, `day` and
    `hour` dimensions. No unit conversion is applied to the values.

    Note that the `longitude` coordinate of the object passed in is
    reassigned in place before sorting.

    Parameters
    ----------
    array
        Object exposing `coords['longitude']` and a datetime-like `time`
        coordinate (presumably an xarray Dataset/DataArray -- TODO confirm).

    Returns
    -------
    The sorted object with `time` unstacked into (year, month, day, hour).
    """
    time_parts = ("year", "month", "day", "hour")

    # Keep dask from splitting large chunks while slicing/reordering.
    with dask.config.set(**{'array.slicing.split_large_chunks': False}):
        wrapped_lon = (array.coords['longitude'] + 180) % 360 - 180
        array.coords['longitude'] = wrapped_lon
        array = array.sortby(array.longitude)

        # Expose each calendar component of `time` as its own variable so
        # that it can serve as a level of the multi-index built below.
        for part in time_parts:
            array[part] = getattr(array.time.dt, part)

        # Index `time` by its components, then pivot them into dimensions.
        array = array.set_index(time=time_parts).unstack('time')
    return array

#### Run aggregation

In [None]:
def run_era5l_agg(year, input_path):
    """Build the aggregation pipeline for a single year.

    Opens the store for `year`, clips it to the extent of the notebook-level
    `georegions`, and pushes it through the notebook-level `daily`,
    `monthly` and `spatial` aggregators (the last one weighted by `w`).
    The function is intended to be lazy: the caller hands the returned
    objects to `dask.compute`.

    NOTE(review): a `preprocess_era5l_tp` function is also defined in this
    notebook, but the one used here is `dataset.preprocess_era5l_tp` --
    confirm which is intended.

    Parameters
    ----------
    year
        Year inserted into the store name `tempPrecLand{year}.zarr`.
    input_path
        Directory prefix of the store; it is concatenated directly with the
        store name, so it must end with a path separator.

    Returns
    -------
    list
        One entry per aggregator in `daily`: the `.da.data` payload of the
        corresponding spatially aggregated result.
    """
    store = f"{input_path}tempPrecLand{year}.zarr"
    clim = dataset.from_path(
        store,
        var='prec',
        engine='zarr',
        preprocess=dataset.preprocess_era5l_tp,
    )
    clim.clip_data_to_georegions_extent(georegions)
    clim.rechunk(-1)

    # Stage 1: hourly -> daily, once per configured daily aggregator.
    by_day = [aggregator.map_execute(clim) for aggregator in daily]

    # Stage 2: daily -> monthly.
    by_month = [monthly.map_execute(day_result) for day_result in by_day]

    # Stage 3: grid cells -> regions, using the precomputed weights.
    by_region = [spatial.map_execute(month_result, w) for month_result in by_month]

    # Hand back only the underlying array payloads, not the wrapper objects.
    return [region_result.da.data for region_result in by_region]


In [None]:
print('AGGREGATING YEARS')
output_list = []  # computed results, one entry per year
time_list = []    # the years processed, in order
for year in years:
    print(year)

    # Start the clock for this year and record which year is running.
    start = time.time()
    time_list.append(year)

    # Build the pipeline for this year and compute it. dask.compute returns
    # a tuple, so [0] unwraps the list of per-order results.
    year_result = dask.compute(run_era5l_agg(year, input_path))[0]
    output_list.append(year_result)

    # Report the elapsed time in minutes.
    stop = time.time()
    duration = stop - start
    print(round(duration / 60, 2))

In [None]:
print('COMBINING OUTPUT')
# Assemble one DataArray per polynomial order, spanning all years.
# output_list is indexed [year position][order position]; each entry is
# labelled with (region, year, month) coordinates, where the year axis has
# length one and the month axis covers months 1-12.

d_list = []
for t in range(len(polys)):
    per_year = [
        xr.DataArray(
            data=output_list[y][t],
            dims=['region', 'year', 'month'],
            coords=dict(
                region=('region', georegions.regions),
                year=('year', [yr]),
                month=('month', np.arange(1, 13)),
            ),
            name=f'{varname}{polys[t]}',
        )
        for y, yr in enumerate(years)
    ]
    # Stitch the single-year arrays together along the year axis.
    d_list.append(xr.concat(per_year, dim='year'))

In [None]:
# Merge the per-order arrays into one dataset and flatten it to a table
# with one row per (region, year, month); the region column is named `id`.
ds = (
    xr.combine_by_coords(d_list)
    .to_dataframe()
    .reset_index(level=['region', 'year', 'month'])
    .rename(columns={'region': 'id'})
)
ds

In [None]:
# Write the combined table to <output_path>/<output_name>.csv, but only when
# the `csv` switch from the configuration cell is on.
if csv:
    ds.to_csv(os.path.join(output_path, output_name + '.csv'))