In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import glob
import os.path

from dask.distributed import Client
from dask.diagnostics import ProgressBar

from aggfly import dataset, regions, grid_weights
from aggfly.aggregate import TemporalAggregator, SpatialAggregator, get_time_dim

# Register a global dask progress bar; this is what prints the
# "[####] | 100% Completed" lines under cells that trigger a dask computation.
ProgressBar().register()
# Distributed client intentionally left disabled.
# client = Client()

In [2]:
# Output configuration: directory and base file name (without extension) of the
# aggregated store. Later cells append '.zarr' / '.csv' to output_name.
output_path = "/home3/dth2133/data/aggregated/counties/"
output_name = "usa_counties_cotton_monthly"
# NOTE(review): output_varn is not referenced by any cell visible in this notebook.
output_varn = "dd"
# When True, the merged store is also exported as a CSV next to the zarr store.
csv = True

In [3]:
# [x for x in range(37,41)]

In [4]:
# Region geometries (US counties) that define the aggregation units.
georegions = regions.from_name('counties')

# Open one example year of climate data; it is used to calculate the grid weights.
example_store = "/home3/dth2133/data/usa/usa-t2m_tempPrecLand2019.zarr"
clim = dataset.from_path(
    example_store,
    't2m',
    'zarr',
    preprocess=dataset.timefix_era5l,
)

# Restrict the data to the extent of the regions (the raw data are global).
clim.clip_data_to_georegions_extent(georegions)

# Rechunk so the work can be spread across threads.
clim.rechunk((5, 578, -1, -1, -1, -1))

# Grid weights built from the example dataset, the regions and the 'cott'
# crop layer.
weights = grid_weights.from_objects(clim, georegions, crop='cott')

# One hourly -> daily degree-day ('dd') aggregator per threshold in -3..40.
# NOTE(review): ddargs is presumably [threshold, upper bound, flag] -- confirm
# against the aggfly documentation.
daily = [
    TemporalAggregator('dd', agg_from='hour', agg_to='day', ddargs=[threshold, 999, 0])
    for threshold in range(-3, 41)
]

# Sums the daily values within each month. NOTE: despite its name, `annual`
# aggregates to 'month', not to year.
annual = TemporalAggregator('sum', agg_from='day', agg_to='month')

# Averages the cells inside each region; the weights are supplied when the
# aggregator is executed. With crop='cott' above, those weights refer to the
# cotton layer (an earlier version of this comment said corn).
spatial = SpatialAggregator('avg')


In [6]:
# Inspect the example array (shape and chunk layout) after clipping and rechunking.
clim.da

Unnamed: 0,Array,Chunk
Bytes,4.81 GiB,98.43 MiB
Shape,"(250, 578, 1, 12, 31, 24)","(5, 578, 1, 12, 31, 24)"
Count,3640 Tasks,50 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 4.81 GiB 98.43 MiB Shape (250, 578, 1, 12, 31, 24) (5, 578, 1, 12, 31, 24) Count 3640 Tasks 50 Chunks Type float32 numpy.ndarray",1  578  250  24  31  12,

Unnamed: 0,Array,Chunk
Bytes,4.81 GiB,98.43 MiB
Shape,"(250, 578, 1, 12, 31, 24)","(5, 578, 1, 12, 31, 24)"
Count,3640 Tasks,50 Chunks
Type,float32,numpy.ndarray


In [7]:
# Calculate the grid weights once up front; `w` is read by
# aggregate_era5l_t2m_multi for every input file.
w = weights.weights()

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed | 22.9s
[########################################] | 100% Completed | 24.2s
[########################################] | 100% Completed | 37.2s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s


In [8]:
def aggregate_era5l_t2m_multi(path):
    """Build the degree-day aggregation for one ERA5-Land 't2m' zarr store.

    Relies on notebook-level objects defined in earlier cells: `georegions`,
    `daily`, `annual`, `spatial` and the grid weights `w`.

    Parameters
    ----------
    path : str
        Location of the zarr store to open.

    Returns
    -------
    list
        One spatially aggregated result per aggregator in `daily`, in the
        same order.
    """
    # NOTE(review): an earlier comment described this preprocess hook as
    # "Kelvin to Celsius"; the hook is named `timefix_era5l` -- confirm what
    # it actually does.
    year_clim = dataset.from_path(
        path,
        't2m',
        'zarr',
        preprocess=dataset.timefix_era5l,
    )

    # Restrict the data to the extent of the regions (the raw data are global).
    year_clim.clip_data_to_georegions_extent(georegions)

    # Rechunk so the work can be spread across threads.
    # NOTE(review): the setup cell rechunks its example dataset with a
    # 6-element tuple while this one has 5 elements -- confirm that is intended.
    year_clim.rechunk((5, 578, -1, -1, -1))

    spatial_list = []
    for daily_agg in daily:
        by_day = daily_agg.map_execute(year_clim.da.data)    # hour -> day
        by_month = annual.map_execute(by_day)                # day -> month
        single_chunk = by_month.rechunk(-1)
        spatial_list.append(spatial.map_execute(single_chunk, w))

    return spatial_list
    

In [10]:
# Collect every t2m store under the input directory, sorted by path (the file
# names differ only in the year they carry), then build the aggregation for
# each store in turn. `output` ends up with one list of results per file.
import numpy as np
import glob
from os.path import basename
files = np.sort(glob.glob('/home3/dth2133/data/usa/*t2m_*'))
output = []
for f in files:
    print(f)
    output.append(aggregate_era5l_t2m_multi(f))

/home3/dth2133/data/usa/usa-t2m_tempPrecLand1951.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1952.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1953.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1954.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1955.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1956.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1957.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1958.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1960.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1961.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1962.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1963.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1964.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1965.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1966.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1967.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1968.zarr
/home3/dth2133/data/usa/usa-t2m_tempPrecLand1969.zarr
/home3/dth2133/data/usa/usa-

In [11]:
import dask
# Evaluate everything collected in `output` with a single dask call.
# dask.compute returns a tuple with one entry per positional argument, so the
# nested results (indexed as [file][threshold] below) live at result[0].
result = dask.compute(output)

[##########################              ] | 67% Completed | 54min 40.9s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[####################################    ] | 92% Completed |  1hr 15min 28.5s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [13]:
# result

In [15]:
# Open the previously written corn output as a template: only its `year` and
# `month` coordinate values are reused when assembling the new arrays.
# NOTE(review): this hard-codes the corn store regardless of output_name and
# assumes its years line up with the input files processed above -- confirm.
temp = xr.open_zarr(os.path.join(output_path, 'usa_counties_corn_monthly.zarr'))


In [16]:
# Output variable names, one per degree-day threshold in -3..40. A minus sign
# is not kept in the name, so negative thresholds are spelled with 'M'
# (e.g. -3 -> 'ddM3', 0 -> 'dd0').
lr = [f"dd{threshold}".replace('-', 'M') for threshold in range(-3, 41)]

In [31]:
# georegions.regions

In [26]:
# 

In [17]:
# result[0] holds one list per input file, each with one array per threshold.
# Regroup by threshold: gather that threshold's array from every file, join
# them along axis 1 (labelled 'year' below) and wrap the result in a
# one-variable dataset named after the threshold.
yearly_results = result[0]
n_thresholds = len(yearly_results[0])

dlist = []
for d in range(n_thresholds):
    per_year = [year_result[d] for year_result in yearly_results]
    stacked = xr.DataArray(
        data=np.concatenate(per_year, axis=1),
        dims=['region', 'year', 'month'],
        coords={
            'region': ('region', georegions.regions),
            # year/month labels are borrowed from the template store `temp`
            'year': ('year', temp.year.values),
            'month': ('month', temp.month.values),
        },
    )
    dlist.append(stacked.to_dataset(name=lr[d]))

In [18]:
# Combine the one-variable datasets (one per threshold) into a single dataset;
# they all carry the same region/year/month coordinates.
outds = xr.combine_by_coords(dlist)

In [21]:
# outds

In [22]:
# Use only the freshly computed variables as the dataset to write.
# NOTE(review): a later cell reassigns `ds` after opening the existing store,
# so on a top-to-bottom run this assignment is overwritten -- confirm which
# of the two paths is intended.
ds = outds

In [40]:
# Display the active output name before touching the store on disk.
# NOTE(review): the saved output of this cell ('usa_counties_soy_monthly') does
# not match the value set in the configuration cell, and the execution counts
# are out of order -- re-run from a fresh kernel before trusting saved outputs.
output_name

'usa_counties_soy_monthly'

In [19]:
# Merge the new degree-day variables into the output store if it already
# exists; on a first run (no store yet) fall back to the new variables alone.
zarr_path = os.path.join(output_path, output_name+'.zarr')
if os.path.exists(zarr_path):
    existing = xr.open_zarr(zarr_path)
    # Drop any variables about to be replaced so the merge cannot conflict.
    # (`Dataset.drop` is deprecated; `drop_vars` is the supported spelling.)
    stale = [x for x in existing.keys() if x in lr]
    if stale:
        existing = existing.drop_vars(stale)
    # .compute() loads everything into memory: the store is rewritten with
    # mode='w' in a later cell, so the data must not stay lazily tied to it.
    ds = xr.combine_by_coords([existing, outds]).compute()
else:
    ds = outds

[########################################] | 100% Completed |  6.9s


In [23]:
# Workaround for https://github.com/pydata/xarray/issues/3476:
# convert every object-dtype coordinate and variable to unicode before
# writing to zarr. The names are snapshotted with list() because the dataset
# is modified inside the loops.
for coord_name in list(ds.coords):
    coord = ds.coords[coord_name]
    if coord.dtype == object:
        ds.coords[coord_name] = coord.astype("unicode")

for var_name in list(ds.variables):
    variable = ds[var_name]
    if variable.dtype == object:
        ds[var_name] = variable.astype("unicode")

# mode='w' replaces whatever store already exists at this path.
ds.to_zarr(os.path.join(output_path, output_name+'.zarr'), mode='w')

<xarray.backends.zarr.ZarrStore at 0x7fc645bb46d0>

In [24]:
# Display the output name the store above was written under.
output_name

'usa_counties_cotton_monthly'

In [25]:
# Optional CSV export: reopen the store that was just written and dump it as
# a flat table next to it. Note that this rebinds `ds` to the reopened store.
if csv:
    zarr_store = os.path.join(output_path, output_name+'.zarr')
    csv_file = os.path.join(output_path, output_name+'.csv')
    ds = xr.open_zarr(zarr_store)
    ds.to_dataframe().to_csv(csv_file)

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

In [26]:
# Toggle for the Stata export. When enabled, `ds` is rebound from an xarray
# dataset to a pandas DataFrame built from the written store.
# NOTE(review): the following cells read `ds.columns` unconditionally, so
# they only work when this flag is True.
stata = True
if stata:
    zarr_store = os.path.join(output_path, output_name+'.zarr')
    ds = xr.open_zarr(zarr_store).to_dataframe()

[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100% Completed |  0.1s
[########################################] | 100

In [45]:
# ds

In [27]:
# Build the column rename map for the Stata export, e.g. 'dd10' -> 'dday10C'
# and 'ddM3' -> 'ddayMinus3C' ('M' marks a negative threshold).
# Each rule is (substring to look for, whether the name carries 'M',
# new prefix, number of leading characters to strip). Rules run in order and
# a later rule overrides an earlier one for the same column.
rename_rules = [
    ('dd', False, 'dday', 2),
    ('dd', True, 'ddayMinus', 3),
    ('time', False, 'time', 4),
    ('time', True, 'timeMinus', 5),
]

ddays = {}
for token, is_negative, prefix, skip in rename_rules:
    for col in ds.columns:
        if token in col and ('M' in col) == is_negative:
            ddays[col] = f'{prefix}{col[skip:]}C'

ddays

{'dd0': 'dday0C',
 'dd1': 'dday1C',
 'dd10': 'dday10C',
 'dd11': 'dday11C',
 'dd12': 'dday12C',
 'dd13': 'dday13C',
 'dd14': 'dday14C',
 'dd15': 'dday15C',
 'dd16': 'dday16C',
 'dd17': 'dday17C',
 'dd18': 'dday18C',
 'dd19': 'dday19C',
 'dd2': 'dday2C',
 'dd20': 'dday20C',
 'dd21': 'dday21C',
 'dd22': 'dday22C',
 'dd23': 'dday23C',
 'dd24': 'dday24C',
 'dd25': 'dday25C',
 'dd26': 'dday26C',
 'dd27': 'dday27C',
 'dd28': 'dday28C',
 'dd29': 'dday29C',
 'dd3': 'dday3C',
 'dd30': 'dday30C',
 'dd31': 'dday31C',
 'dd32': 'dday32C',
 'dd33': 'dday33C',
 'dd34': 'dday34C',
 'dd35': 'dday35C',
 'dd36': 'dday36C',
 'dd37': 'dday37C',
 'dd38': 'dday38C',
 'dd39': 'dday39C',
 'dd4': 'dday4C',
 'dd40': 'dday40C',
 'dd5': 'dday5C',
 'dd6': 'dday6C',
 'dd7': 'dday7C',
 'dd8': 'dday8C',
 'dd9': 'dday9C',
 'ddM1': 'ddayMinus1C',
 'ddM2': 'ddayMinus2C',
 'ddM3': 'ddayMinus3C'}

In [28]:
# Rename the 'region' index level to 'fips', then apply the column renames
# collected in `ddays`.
ds.index = ds.index.rename(dict(region='fips'))
ds = ds.rename(columns=ddays)


In [29]:
# Turn the index levels into ordinary columns so they can be edited and exported.
ds = ds.reset_index()

In [30]:
# Store the fips codes as 64-bit integers.
# NOTE(review): presumably the codes arrive as strings, in which case leading
# zeros are lost by this conversion -- confirm that is what the Stata side expects.
ds['fips'] = np.int64(ds.fips)

In [32]:
# Write the renamed table as a Stata file named after the crop.
# NOTE(review): `crop` is set by hand here and must be kept in step with
# output_name from the configuration cell.
# NOTE(review): DataFrame.to_stata writes the index as a column by default;
# after reset_index() that is only a row counter -- consider
# write_index=False if the extra column is not wanted.
crop = 'cotton'
stata_file = f'/home3/dth2133/data/SR09/dataSTATA/weather_{crop}_era5.dta'
ds.to_stata(stata_file)