In [1]:
import xarray as xr
import numpy as np
import glob
import os.path

from dask.distributed import Client
from dask.diagnostics import ProgressBar

from aggfly import dataset, regions, grid_weights
from aggfly.aggregate import TemporalAggregator, SpatialAggregator, get_time_dim

# Register dask's ProgressBar globally so that dask computations triggered in
# later cells print a progress bar.
ProgressBar().register()
# The distributed Client is left disabled; uncomment to start one.
# client = Client()

In [3]:
# Output configuration.
# Directory that receives the aggregated outputs (the .zarr store and, when
# enabled, the .csv copy written by later cells).
output_path = "/home3/dth2133/data/aggregated/counties/"
# Crop whose grid weights are used; must be a key of `weight_dict` (defined
# in the setup cell).
crop = 'cotton'
# Base file name (without extension) of the outputs.
output_name = f"usa_counties_{crop}_monthly"
# Name given to the aggregated variable in the output dataset.
output_varn = "prec"
# When True, a CSV copy of the combined dataset is written as well.
csv = True

In [4]:
# Open shapefile containing region features.
georegions = regions.from_name('counties')

# Open one example year of the climate data; this object is only used to
# calculate the grid weights.
clim = dataset.from_path(
    f"/home3/dth2133/data/usa/usa-tp_tempPrecLand2019.zarr", 
    'tp', 
    'zarr', 
    preprocess=dataset.timefix_era5l)

# Clip climate data to the US (raw data are global)
clim.clip_data_to_georegions_extent(georegions)

# Rechunk dataset to optimize multithreading
# NOTE(review): six chunk sizes are given here but only five in
# `aggregate_era5l_tp`; presumably the two preprocess functions produce a
# different number of dimensions -- confirm.
clim.rechunk((5, 578, -1, -1, -1, -1))

# Calculate area and crop layer weights. `weight_dict` maps the crop names
# used in this notebook to the crop codes passed to aggfly.
weight_dict = dict(
    corn='corn',
    soy='soyb',
    cotton='cott')
weights = grid_weights.from_objects(clim, georegions, crop=weight_dict[crop])

# Disabled alternative: hourly -> daily degree-day aggregators.
# daily = [TemporalAggregator(
#     'dd',
#     agg_from='hour',
#     agg_to='day', 
#     ddargs=[x,999,0]) for x in range(-3,36)]

# Temporal aggregation: sums hourly values to monthly totals.
# NOTE: despite its name, `annual` aggregates to months (agg_to='month'),
# not years. The name is kept because `aggregate_era5l_tp` refers to it.
annual = TemporalAggregator(
    'sum', 
    agg_from='hour',
    agg_to='month')

# This object aggregates cells within a region to the average across 
# cells, weighted by `weights`, which in this case are the area of the
# cell and the share of the cell covered by the selected `crop`.
spatial = SpatialAggregator('avg')


In [5]:
# Calculate the grid weights. The result `w` is passed to the spatial
# aggregator inside `aggregate_era5l_tp`.
w = weights.weights()

[########################################] | 100% Completed |  0.2s
[########################################] | 100% Completed | 22.4s
[########################################] | 100% Completed | 22.8s
[########################################] | 100% Completed | 44.7s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [26]:
# import importlib
# import aggfly
# from aggfly.aggregate.temporal import TemporalAggregator
# importlib.reload(aggfly.aggregate.temporal)
# from aggfly.aggregate.temporal import TemporalAggregator

In [6]:
import dask
def timefix_era5l_tp(array):
    """Split the ``time`` axis of ``array`` into year/month/day/hour.

    The four calendar components of ``array.time`` are attached to ``array``
    itself (the input object is modified), combined into a multi-index on
    ``time`` and that index is then unstacked. The data values are passed
    through untouched; no unit conversion is applied.

    Parameters
    ----------
    array
        Object exposing a ``time`` coordinate with a ``.dt`` accessor,
        item assignment, ``set_index`` and ``unstack`` (presumably an xarray
        DataArray or Dataset -- confirm against aggfly's ``preprocess`` hook).

    Returns
    -------
    The result of ``set_index(...).unstack('time')``.
    """
    calendar_parts = ("year", "month", "day", "hour")
    chunk_policy = {'array.slicing.split_large_chunks': False}
    # The dask option is only active for the duration of this block.
    with dask.config.set(**chunk_policy):
        for part in calendar_parts:
            array[part] = getattr(array.time.dt, part)
        reshaped = array.set_index(time=calendar_parts).unstack('time')
    return reshaped

In [7]:
def aggregate_era5l_tp(path):
    """Aggregate one 'tp' zarr store temporally and then spatially.

    Relies on notebook-level globals defined in earlier cells:
    ``georegions`` (regions to clip to), ``annual`` (temporal aggregator),
    ``spatial`` (spatial aggregator) and ``w`` (grid weights).

    Parameters
    ----------
    path
        Path of the zarr store to open.

    Returns
    -------
    The ``da`` attribute of the aggfly dataset after both aggregation steps.
    """
    # Example input: "/home3/dth2133/data/usa/usa-tp_tempPrecLand1969.zarr"
    # Open climate dataset.
    clim = dataset.from_path(
        path, 
        'tp', 
        'zarr', 
        preprocess=timefix_era5l_tp) # splits time into year/month/day/hour; no unit conversion
    # print(clim)
    # Clip climate data to the US (raw data are global)
    clim.clip_data_to_georegions_extent(georegions)
    # Rechunk dataset to optimize multithreading
    clim.rechunk((5, 578, -1, -1, -1))

    # Collapse the hourly data using the temporal aggregator `annual`
    # (defined with agg_to='month' despite its name). `update=True`
    # presumably stores the result back into `clim`, since the return value
    # is not used -- confirm against aggfly.
    # clim = daily.map_execute(clim)
    annual.map_execute(clim, update=True)
    clim.rechunk(-1)

    # Again update, but with (weighted) spatial collapse over regions.
    # NOTE(review): the return value is discarded and `update=` is not passed
    # here, unlike the temporal step above; confirm that
    # SpatialAggregator.map_execute modifies `clim` in place by default.
    spatial.map_execute(clim, w)

    # Send back the aggregated climate data.
    return clim.da


In [8]:
# Collect the input stores: a sorted collection of the per-year 'tp' zarr
# paths for which raw climate data are available (e.g. ...1970.zarr, ...1971.zarr).
import numpy as np
import glob
from os.path import basename
# Sort the matching stores by name so that they are processed, and later
# concatenated, in a deterministic order (the names appear to differ only in
# the trailing year, which would make this chronological).
files = sorted(glob.glob('/home3/dth2133/data/usa/*tp_*'))
if not files:
    # Fail here with a clear message rather than with an opaque error when
    # the empty result list is concatenated in a later cell.
    raise FileNotFoundError(
        "No input stores match '/home3/dth2133/data/usa/*tp_*'")
# Loop over years and aggregate.
output = []
for f in files:
    # Print the path as a simple progress indicator.
    print(f)
    output.append(aggregate_era5l_tp(f))

/home3/dth2133/data/usa/usa-tp_tempPrecLand1951.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1952.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1953.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1954.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1955.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1956.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1957.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1958.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1960.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1961.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1962.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1963.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1964.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1965.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1966.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1967.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1968.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand1969.zarr
/home3/dth2133/data/usa/usa-tp_tempPrecLand197

In [9]:
# Concatenate the per-file results along 'year' and evaluate the result
# eagerly with .compute().
da = xr.concat(output, dim='year').compute()

[########################################] | 100% Completed | 36min 15.5s


In [10]:
# Wrap the aggregated array in a Dataset, naming the variable `output_varn`.
outds = da.to_dataset(name=output_varn)

In [12]:
# Merge the newly aggregated variable into the existing output store.
zarr_path = os.path.join(output_path, output_name + '.zarr')
if os.path.exists(zarr_path):
    ds = xr.open_zarr(zarr_path)
    # Presumably needed when the store already holds a stale copy of
    # `output_varn`; uncomment to replace it instead of combining with it.
    # ds = ds.drop(output_varn)
    ds = xr.combine_by_coords([ds, outds])
else:
    # First run for this output name: there is no store to merge with yet,
    # so the new variable alone becomes the dataset.
    ds = outds

In [13]:
# Load the combined dataset into memory. This happens before a later cell
# overwrites (mode='w') the store at the same output path.
ds = ds.compute()
# Display the combined dataset.
ds

[########################################] | 100% Completed | 21.2s


In [14]:
# Cast object-dtype coordinates and variables to unicode before writing, as
# a workaround for https://github.com/pydata/xarray/issues/3476
for coord_name in list(ds.coords):
    coord = ds.coords[coord_name]
    if coord.dtype == object:
        ds.coords[coord_name] = coord.astype("unicode")

for var_name in list(ds.variables):
    if ds[var_name].dtype == object:
        ds[var_name] = ds[var_name].astype("unicode")

# Overwrite the output store with the combined dataset.
ds.to_zarr(os.path.join(output_path, output_name + '.zarr'), mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f87de2033c0>

In [15]:
# Optionally export the combined dataset as a flat CSV table next to the
# zarr store.
if csv:
    csv_file = os.path.join(output_path, output_name + '.csv')
    ds.to_dataframe().to_csv(csv_file)

In [17]:
# Export the aggregated store as a Stata file with "dday<N>C" / "time<N>C"
# style column names.
stata = True
# NOTE: `crop` is deliberately not reassigned here. `output_name` (the store
# read below) was built from the `crop` set in the configuration cell, so
# hard-coding the crop again in this cell could label the .dta file with a
# different crop than the data it contains.
if stata:
    # From here on `ds` is a pandas DataFrame (one row per index combination).
    ds = xr.open_zarr(os.path.join(output_path, output_name+'.zarr')).to_dataframe()
    # Build the column rename map. The slices strip the source prefix:
    #   'dd<N>'    -> 'dday<N>C'        (x[2:] drops 'dd')
    #   'ddM<N>'   -> 'ddayMinus<N>C'   (x[3:] drops 'ddM')
    #   'time<N>'  -> 'time<N>C'        (x[4:] drops 'time')
    #   'timeM<N>' -> 'timeMinus<N>C'   (x[5:] drops 'timeM')
    # An 'M' in the column name is taken to mark a negative threshold.
    # The four updates are applied in this order; a later one wins if a
    # column name matches more than one rule.
    ddays = {x:f'dday{x[2:]}C' for x in ds.columns if 'dd' in x and 'M' not in x}
    ddays.update({x:f'ddayMinus{x[3:]}C' for x in ds.columns if 'dd' in x and 'M' in x})
    ddays.update({x:f'time{x[4:]}C' for x in ds.columns if 'time' in x and 'M' not in x})
    ddays.update({x:f'timeMinus{x[5:]}C' for x in ds.columns if 'time' in x and 'M' in x})
    # The 'region' index level holds the county identifier; call it 'fips'.
    ds.index = ds.index.rename(dict(region='fips'))
    ds = ds.rename(columns=ddays)
    ds = ds.reset_index()
    # Store the identifier as a 64-bit integer.
    ds['fips'] = np.int64(ds.fips)
    ds.to_stata(f'/home3/dth2133/data/SR09/dataSTATA/weather_{crop}_era5.dta')

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

In [18]:
# Display `ds`, which at this point is the DataFrame built in the
# Stata-export cell (the rendering of the frame is truncated).
ds

Unnamed: 0,fips,year,month,dday0C,dday1C,dday10C,dday11C,dday12C,dday13C,dday14C,...,time4C,time40C,time5C,time6C,time7C,time8C,time9C,timeMinus1C,timeMinus2C,timeMinus3C
0,21007,1951,1,141.525569,121.945592,17.359621,12.945633,9.363856,6.273063,3.873843,...,14.410623,0.0,12.923142,10.976443,8.525469,6.721316,5.909275,22.404050,23.883708,25.236002
1,21007,1951,2,176.003419,157.258662,36.884487,28.269789,21.144072,15.539050,10.839093,...,14.787889,0.0,13.787856,12.516663,11.668015,10.673142,9.980090,20.024022,20.722698,21.733802
2,21007,1951,3,273.188512,244.673226,59.703868,47.643429,36.927227,27.713504,19.780192,...,23.721251,0.0,21.737514,19.427384,17.428235,15.933329,14.289401,30.532828,30.782828,30.986100
3,21007,1951,4,378.195241,348.198932,120.080978,102.848380,87.143776,72.879130,60.145894,...,28.680501,0.0,27.079555,25.251316,23.367575,20.781911,19.298109,30.000000,30.000000,30.000000
4,21007,1951,5,631.648199,600.648199,322.291599,291.860426,261.989393,232.836710,204.917603,...,31.000000,0.0,31.000000,31.000000,30.972200,30.888867,30.694434,31.000000,31.000000,31.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2536123,26139,2019,8,,,,,,,,...,,,,,,,,,,
2536124,26139,2019,9,,,,,,,,...,,,,,,,,,,
2536125,26139,2019,10,,,,,,,,...,,,,,,,,,,
2536126,26139,2019,11,,,,,,,,...,,,,,,,,,,
