In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import glob
import os.path

from dask.distributed import Client
from dask.diagnostics import ProgressBar

from aggfly import dataset, regions, grid_weights
from aggfly.aggregate import TemporalAggregator, SpatialAggregator, get_time_dim

ProgressBar().register()
# client = Client()

In [2]:
# Output configuration: destination directory, the crop whose weights are
# used, and the base name shared by the .zarr / .csv outputs.
output_path = "/home3/dth2133/data/aggregated/counties/"
crop = "corn"
output_name = "usa_counties_{}_monthly".format(crop)
# Not referenced by the cells shown here; presumably the prefix of the
# output variable names -- TODO confirm.
output_varn = "time"
# When truthy, the export cell also writes a CSV copy of the output store.
csv = True

In [3]:
# Region features (US counties) used for clipping and for the weights.
georegions = regions.from_name('counties')

# A single year of the climate data, opened only as the example grid for
# the weight calculation.
clim = dataset.from_path(
    "/home3/dth2133/data/usa/usa-t2m_tempPrecLand2019.zarr",
    't2m',
    'zarr',
    preprocess=dataset.timefix_era5l,
)

# Clip climate data to the US (raw data are global).
clim.clip_data_to_georegions_extent(georegions)

# Rechunk to optimize multithreading; one chunk size per dimension of the
# six-dimensional array.
clim.rechunk((5, 578, -1, -1, -1, -1))

# Translate the configured crop name into the key passed to the weights
# constructor, then build the area / crop-layer weights object.
weight_dict = {
    'corn': 'corn',
    'soy': 'soyb',
    'cotton': 'cott',
}
weights = grid_weights.from_objects(clim, georegions, crop=weight_dict[crop])

# Hour -> day aggregators: one 'time' aggregator per integer value in
# [-3, 40], passed as the first entry of `ddargs`.
# NOTE(review): the meaning of the remaining ddargs entries (999, 0) is
# defined by aggfly and is not visible here -- confirm against its docs.
daily = []
for threshold in range(-3, 41):
    daily.append(
        TemporalAggregator(
            'time',
            agg_from='hour',
            agg_to='day',
            ddargs=[threshold, 999, 0],
        )
    )

# Day -> month aggregator that sums the daily values. The variable keeps the
# name `annual` because other cells refer to it, but it aggregates to the
# monthly level.
annual = TemporalAggregator('sum', agg_from='day', agg_to='month')

# Aggregates the cells within each region to their average, weighted by the
# grid weights (cell area and the share of the cell covered by the crop).
spatial = SpatialAggregator('avg')


In [4]:
# list(range(-3,35))
clim.da

Unnamed: 0,Array,Chunk
Bytes,4.81 GiB,98.43 MiB
Shape,"(250, 578, 1, 12, 31, 24)","(5, 578, 1, 12, 31, 24)"
Count,3276 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.81 GiB 98.43 MiB Shape (250, 578, 1, 12, 31, 24) (5, 578, 1, 12, 31, 24) Count 3276 Tasks 50 Chunks Type float32 numpy.ndarray",1  578  250  24  31  12,

Unnamed: 0,Array,Chunk
Bytes,4.81 GiB,98.43 MiB
Shape,"(250, 578, 1, 12, 31, 24)","(5, 578, 1, 12, 31, 24)"
Count,3276 Tasks,50 Chunks
Type,float32,numpy.ndarray


In [None]:
# Calculate the grid weights from the weights object built in the setup cell.
# Per the logged output this rescales the crop layer to the climate grid and
# may take a few minutes and a lot of memory. `w` is later passed to the
# spatial aggregation step.
w = weights.weights()

Rescaling corn weights to grid.
This might take a few minutes and use a lot of memory...
[########################################] | 100% Completed | 102.64 ms
[########################################] | 100% Completed | 8.77 ss


In [11]:
def aggregate_era5l_t2m_multi(path):
    """Run the full aggregation chain on one ERA5-Land 't2m' zarr store.

    Depends on notebook-level objects created in earlier cells:
    ``georegions``, ``daily``, ``annual``, ``spatial`` and ``w``.

    Parameters
    ----------
    path : str
        Location of the zarr store to open.

    Returns
    -------
    list
        One spatially aggregated result per aggregator in ``daily``, in the
        same order. Presumably lazy, since the caller hands the results to
        ``dask.compute`` -- TODO confirm.
    """
    # NOTE(review): an earlier comment labelled the preprocess hook
    # "Kelvin to Celsius"; its name suggests a time-axis fix -- confirm.
    clim = dataset.from_path(
        path,
        't2m',
        'zarr',
        preprocess=dataset.timefix_era5l,
    )

    # Clip climate data to the US (raw data are global).
    clim.clip_data_to_georegions_extent(georegions)

    # Rechunk to optimize multithreading.
    # NOTE(review): five chunk sizes here versus six in the setup cell --
    # confirm which matches the array's dimensionality.
    clim.rechunk((5, 578, -1, -1, -1))

    # Stage 1: hourly -> daily, once per aggregator in `daily`.
    daily_list = []
    for aggregator in daily:
        daily_list.append(aggregator.map_execute(clim.da.data))

    # Stage 2: daily -> monthly sums.
    monthly_list = []
    for day_result in daily_list:
        monthly_list.append(annual.map_execute(day_result))

    # Stage 3: collapse each result into a single chunk before the spatial step.
    rc_list = []
    for month_result in monthly_list:
        rc_list.append(month_result.rechunk(-1))

    # Stage 4: weighted average over the cells of each region.
    spatial_list = []
    for single_chunk in rc_list:
        spatial_list.append(spatial.map_execute(single_chunk, w))

    return spatial_list
    

In [12]:
import numpy as np
import glob
from os.path import basename

# Find every t2m store in the input directory and sort the paths; with the
# year embedded in the file name this orders them chronologically.
files = np.sort(glob.glob('/home3/dth2133/data/usa/*t2m_*'))

# Build the aggregation for each store, echoing the path as a progress log.
# `output` ends up with one list of results per file.
output = []
for store_path in files:
    print(store_path)
    output.append(aggregate_era5l_t2m_multi(store_path))

/home3/dth2133/data/usa/usa-t2m_tempPrecLand1951.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1952.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1953.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1954.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1955.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1956.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1957.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1958.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1960.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1961.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1962.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1963.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1964.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1965.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1966.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1967.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1968.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1969.zarr
/home3/dth2133/data/usa/usa-

In [13]:
import dask
# Evaluate everything queued in `output` in a single pass. dask.compute
# returns a tuple with one entry per positional argument, so the per-file
# results are found in result[0].
result = dask.compute(output)

[########################################] | 100% Completed |  1hr 24min  2.3s


In [17]:
# result

In [11]:

yrange

array([1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1960, 1961, 1962,
       1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973,
       1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984,
       1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
       1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019])

In [8]:
temp.year

In [7]:
# Open the '_old' copy of the output store; its `year` and `month`
# coordinates are reused when the new DataArrays are assembled.
temp = xr.open_zarr(os.path.join(output_path, output_name+'_old.zarr'))


In [25]:
# Years 1951-2019 inclusive, skipping 1959 (the listing of input stores
# printed by the aggregation loop has no file for that year).
yrange = np.array([yr for yr in range(1951, 2020) if yr != 1959])

In [27]:
np.arange(1,13)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [26]:
yrange

array([1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1960, 1961, 1962,
       1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973,
       1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984,
       1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
       1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
       2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017,
       2018, 2019])

In [5]:
[1,2,3,4] + [5,3,1]

[1, 2, 3, 4, 5, 3, 1]

In [18]:
# Build one single-variable Dataset per aggregated quantity.
# result[0] holds one list per input file; position d within each of those
# lists belongs to the d-th aggregator.
# NOTE(review): `lr` (the list of variable names) is not defined in any cell
# shown here, and the year/month coordinates are taken from `temp` rather
# than from the files that were processed -- confirm both match `result`.
per_file = result[0]
dlist = []
for d in range(len(per_file[0])):
    # Gather variable d from every file and join along the year axis.
    ylist = [file_results[d] for file_results in per_file]
    stacked = np.concatenate(ylist, axis=1)
    temp_array = xr.DataArray(
        data=stacked,
        dims=['region', 'year', 'month'],
        coords={
            'region': ('region', georegions.regions),
            'year': ('year', temp.year.values),
            'month': ('month', temp.month.values),
        },
    ).to_dataset(name=lr[d])
    dlist.append(temp_array)

In [19]:
# Merge the single-variable Datasets in `dlist` into one Dataset, aligned on
# their shared coordinates.
outds = xr.combine_by_coords(dlist)

In [30]:
np.bool_

numpy.bool_

In [22]:
# Merge the freshly aggregated variables into the existing output store.
ds = xr.open_zarr(os.path.join(output_path, output_name+'.zarr'))

# Names already present in the store that are about to be replaced.
# NOTE(review): `lr` is not defined in any cell shown here -- confirm it is
# the list of variable names used when building `outds`.
ld = [x for x in ds.keys() if x in lr]
if ld:
    # Dataset.drop is deprecated for removing variables; drop_vars is the
    # supported replacement and takes the same list of names.
    ds = ds.drop_vars(ld)

# Combine old and new variables and load the result into memory, so that it
# no longer depends on the on-disk store.
ds = xr.combine_by_coords([ds,outds]).compute()

[########################################] | 100% Completed |  9.1s


In [3]:
# Re-open the output store. NOTE(review): this rebinds `ds` to the on-disk
# contents, replacing any in-memory dataset of the same name -- confirm that
# is intended before running cells that write `ds` back to this path.
ds = xr.open_zarr(os.path.join(output_path, output_name+'.zarr'))

In [16]:
import numpy as np

# Scratch check: for a sine curve oscillating between fmin and fmax, compute
# the fraction of one full cycle spent above the threshold ddargs[0].
fmax = 35
fmin = 20
ddargs = [20.01]

M = (fmax + fmin) / 2  # midpoint of the curve
W = (fmax - fmin) / 2  # half-amplitude of the curve

res = 0
if fmax <= ddargs[0]:
    # Threshold at or above the maximum: never exceeded.
    res = 0
elif ddargs[0] <= fmin:
    # Threshold at or below the minimum: always exceeded. The equality case
    # previously matched no branch and left res at 0, although the formula
    # below tends to 1 as the threshold approaches fmin.
    res = 1
elif fmin < ddargs[0] and ddargs[0] < fmax:
    # The curve crosses the threshold at phase xmin and again at pi - xmin.
    xmin = np.arcsin((ddargs[0] - M) / W)
    res = (np.pi - 2*xmin) / (2*np.pi)

In [17]:
res

0.9835607212266884

In [5]:
xmin

0.3398369094541219

In [25]:
# `ds` may be a lazy view of the very store that is overwritten below
# (mode='w' replaces the store), so pull all data into memory first. This is
# a no-op when the dataset is already loaded.
ds = ds.load()

# Workaround for https://github.com/pydata/xarray/issues/3476:
# cast object-dtype coordinates and variables to unicode before writing.
for v in list(ds.coords.keys()):
    if ds.coords[v].dtype == object:
        ds.coords[v] = ds.coords[v].astype("unicode")

for v in list(ds.variables.keys()):
    if ds[v].dtype == object:
        ds[v] = ds[v].astype("unicode")

# Replace the output store with the merged dataset.
ds.to_zarr(os.path.join(output_path, output_name+'.zarr'), mode='w')

<xarray.backends.zarr.ZarrStore at 0x7f7f213b4ba0>

In [26]:
# Optional CSV export: read the output store back and write it out as a flat
# table next to it. (`csv` here is the notebook's boolean flag, not the
# standard-library module.)
if csv:
    zarr_store = os.path.join(output_path, output_name+'.zarr')
    csv_file = os.path.join(output_path, output_name+'.csv')
    ds = xr.open_zarr(zarr_store)
    table = ds.to_dataframe()
    table.to_csv(csv_file)

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

In [27]:
# Optional Stata export of the aggregated output.
#
# `crop` is deliberately NOT reassigned here. The data are read from the
# store named by `output_name`, which was built from the configured crop, so
# overriding `crop` only in this cell would write one crop's data into a
# file labelled with another crop. To export a different crop, change `crop`
# in the configuration cell and re-run the notebook.
stata = True
if stata:
    ds = xr.open_zarr(os.path.join(output_path, output_name+'.zarr')).to_dataframe()

    # Map the store's column names to the Stata naming scheme:
    #   dd<N>    -> dday<N>C        ddM<N>   -> ddayMinus<N>C
    #   time<N>  -> time<N>C        timeM<N> -> timeMinus<N>C
    # NOTE(review): matching is by substring ('dd', 'time', 'M'), so any
    # other column containing those substrings is renamed too.
    ddays = {x:f'dday{x[2:]}C' for x in ds.columns if 'dd' in x and 'M' not in x}
    ddays.update({x:f'ddayMinus{x[3:]}C' for x in ds.columns if 'dd' in x and 'M' in x})
    ddays.update({x:f'time{x[4:]}C' for x in ds.columns if 'time' in x and 'M' not in x})
    ddays.update({x:f'timeMinus{x[5:]}C' for x in ds.columns if 'time' in x and 'M' in x})

    # The region identifier is exported as an integer 'fips' column.
    ds.index = ds.index.rename(dict(region='fips'))
    ds = ds.rename(columns=ddays)
    ds = ds.reset_index()
    ds['fips'] = ds['fips'].astype(np.int64)

    # NOTE(review): to_stata also writes the frame's index as a column by
    # default -- pass write_index=False if that column is unwanted.
    ds.to_stata(f'/home3/dth2133/data/SR09/dataSTATA/weather_{crop}_era5.dta')

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100