In [1]:
import pandas as pd
import numpy as np
import time
import aggfly as af

# Project directory for this data request; it is passed to aggfly as
# `project_dir` when the weights object is built.
project_dir = '~/data/clim_data_requests/maya'

In [2]:
# Years to aggregate. np.arange excludes its stop value, so this is
# currently the single year 2018.
first_year = 2018
years = np.arange(first_year, first_year + 1)

# Read the county shapefile; each region is identified by its GEOID column.
county_shapefile = "~/data/shapefiles/county/cb_2018_us_county_500k.shp"
georegions = af.georegions_from_path(county_shapefile, regionid="GEOID")

Converting shapefile CRS to WGS84


In [3]:
# Open an example dataset (the 2017 file) that is used to construct the
# weights. The path contains no placeholders, so it is a plain string rather
# than an f-string.
# The preprocess hook subtracts 273.15 from every value of the 't2m' variable
# (presumably converting Kelvin to degrees Celsius -- confirm source units).
dataset = af.dataset_from_path(
    "~/data/annual/tempPrecLand2017.zarr",
    var='t2m',
    name='era5',
    georegions=georegions,
    preprocess=lambda x: (x - 273.15),
)
# Last expression of the cell: show the underlying data array.
dataset.da

Unnamed: 0,Array,Chunk
Bytes,84.70 GiB,237.62 MiB
Shape,"(860, 1509, 8760)","(860, 1509, 24)"
Dask graph,365 chunks in 7 graph layers,365 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 84.70 GiB 237.62 MiB Shape (860, 1509, 8760) (860, 1509, 24) Dask graph 365 chunks in 7 graph layers Data type float64 numpy.ndarray",8760  1509  860,

Unnamed: 0,Array,Chunk
Bytes,84.70 GiB,237.62 MiB
Shape,"(860, 1509, 8760)","(860, 1509, 24)"
Dask graph,365 chunks in 7 graph layers,365 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [7]:
# Load the LandScan 2016 raster as population weights; these are supplied to
# the weights object as its secondary weights.
population_raster_path = "~/data/population/landscan-global-2016.tif"
secondary_weights = af.pop_weights_from_path(population_raster_path)

# Build the weights object from the example dataset and the regions, then
# compute the weights. `project_dir` is forwarded to aggfly unchanged.
weights = af.weights_from_objects(
    dataset,
    georegions,
    secondary_weights=secondary_weights,
    project_dir=project_dir,
)
weights.calculate_weights()

/home3/dth2133/data/clim_data_requests/maya/tmp/GridWeights/mod-bdaaeae361a219e
Creating new cache (mod-bdaaeae361a219e) in /home3/dth2133/data/clim_data_requests/maya/tmp/GridWeights/mod-bdaaeae361a219e
Rescaling pop weights to grid.
This might take a few minutes and use a lot of memory...
[########################################] | 100% Completed | 2.58 ss
Cache file /home3/dth2133/data/clim_data_requests/maya/tmp/GridWeights/mod-bdaaeae361a219e/d92efaea42c1b40.feather not found.
{'func': 'weights',
 'raster_weights': {'name': None,
                    'path': '/home3/dth2133/data/population/landscan-global-2016.tif',
                    'raster': '<xarray.DataArray (latitude: 860, longitude: '
                              '1509)> Size: 10MB\n'
                              'array([[0., 0., 0., ..., 0., 0., 0.],\n'
                              '       [0., 0., 0., ..., 0., 0., 0.],\n'
                              '       [0., 0., 0., ..., 0., 0., 0.],\n'
                         

In [8]:
# Aggregate one year at a time and collect the per-year frames in a list.
# Concatenating once after the loop avoids re-copying the accumulated frame on
# every iteration and avoids concatenating with an empty DataFrame.
yearly_frames = []
for year in years:
    print(f"Aggregating {year}")

    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()

    # Same settings as the example dataset, but pointing at this year's file.
    dataset = af.dataset_from_path(
        f"~/data/annual/tempPrecLand{year}.zarr",
        var='t2m',
        name='era5',
        georegions=georegions,
        preprocess=lambda x: (x - 273.15),
    )

    # Each keyword names an output variable and lists its processing steps in
    # order.
    # NOTE(review): the degree-day variables apply 'dd' grouped by date and
    # then sum by year; if the input is sub-daily the result may be in
    # degree-hours rather than degree-days -- confirm units with aggfly docs.
    output_df = af.aggregate_dataset(
        dataset=dataset,
        weights=weights,
        # Mean by date -> power transform (exponents np.arange(1, 2), i.e.
        # only 1) -> sum by year.
        tavg=[
            ('aggregate', {'calc': 'mean', 'groupby': 'date'}),
            ('transform', {'transform': 'power', 'exp': np.arange(1, 2)}),
            ('aggregate', {'calc': 'sum', 'groupby': 'year'}),
        ],
        # Mean by date -> 'bins' calculation by year for the two ddargs ranges.
        bins=[
            ('aggregate', {'calc': 'mean', 'groupby': 'date'}),
            ('aggregate', {'calc': 'bins', 'groupby': 'year', 'ddargs': [[25, 99, 0], [30, 99, 0]]}),
        ],
        # 'dd' calculation by date with ddargs [10, 30, 0] -> sum by year.
        growing_dday=[
            ('aggregate', {'calc': 'dd', 'groupby': 'date', 'ddargs': [10, 30, 0]}),
            ('aggregate', {'calc': 'sum', 'groupby': 'year'}),
        ],
        # 'dd' calculation by date with ddargs [-99, 20, 1] -> sum by year.
        heating_dday=[
            ('aggregate', {'calc': 'dd', 'groupby': 'date', 'ddargs': [-99, 20, 1]}),
            ('aggregate', {'calc': 'sum', 'groupby': 'year'}),
        ],
    )

    yearly_frames.append(output_df)
    end = time.perf_counter()
    print(f"Year {year} took {end-start} seconds.")

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when `years` is empty (matching what the cell produced before).
df = pd.concat(yearly_frames, axis=0) if yearly_frames else pd.DataFrame()



Aggregating 2018


    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  return self.array[key]


Computing...
[########################################] | 100% Completed | 297.18 s
Combining datasets...
Stacking...
Merging...
Grouping...
Merging again...
Creating Dask DataFrame...
Aggregating...
[########################################] | 100% Completed | 383.74 ms
Year 2018 took 306.7785325050354 seconds.


In [9]:
# Display the combined aggregation output (pandas truncates long frames).
df

Unnamed: 0,GEOID,time,tavg_1,bins_25_99,bins_30_99,growing_dday,heating_dday
1899,21007,2018-12-31,5568.090175,110.792177,2.000000,53846.230709,64250.100234
1620,21017,2018-12-31,4867.142048,37.964429,0.000000,51807.581907,72966.800032
1864,21031,2018-12-31,5438.734724,87.642718,1.000000,54758.001220,64598.304697
1744,21065,2018-12-31,5027.393584,42.198636,0.000000,53257.174835,69213.583327
1600,21069,2018-12-31,4803.519224,32.031935,0.000000,51778.197098,73536.104671
...,...,...,...,...,...,...,...
1062,31073,2018-12-31,3800.270297,41.522010,1.921511,41466.590675,97136.606738
1078,39075,2018-12-31,4002.577457,20.902673,0.000000,45174.634584,89134.756197
3000,48171,2018-12-31,6888.435441,98.724340,11.932970,64562.616722,37005.916179
611,55079,2018-12-31,3056.739930,6.554803,0.000000,34300.208557,107242.729992
