In [None]:
## import libraries
import os, sys
import yaml
import xarray as xr
import pandas as pd
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units
import dask
from datetime import timedelta
%matplotlib inline

## add the sibling modules directory to the import path so the local import
## below can resolve (presumably ar_funcs lives in ../modules -- TODO confirm)
sys.path.append('../modules')
import ar_funcs
## set dask's 'array.slicing.split_large_chunks' option to True for this session
dask.config.set(**{'array.slicing.split_large_chunks': True})

In [2]:
## locations used throughout the notebook
## path_to_data : project data root
## path_to_out  : output files (numerical results, intermediate datafiles)
## path_to_figs : figures
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'
path_to_out, path_to_figs = '../out/', '../figs/'

In [3]:
## build a 91-day window (center date +/- 45 days) around Nov 20
## for each year from 2000 through 2011
half_window = timedelta(days=45)
date_lst = []
for yr in range(2000, 2012):
    center_date = pd.to_datetime('{0}-11-20'.format(yr))

    ## window bounds: 45 days before and 45 days after the center date
    start_date = center_date - half_window
    end_date = center_date + half_window

    ## daily dates from start_date through end_date (both ends included)
    dates = pd.date_range(start_date, end_date, freq='1D')
    date_lst.append(dates)

## concatenate all years together into a single array of dates
final_lst = np.concatenate(date_lst)

In [4]:
## show the window bounds left over from the last loop iteration
print(" ".join(map(str, (start_date, end_date))))

2011-10-06 00:00:00 2012-01-04 00:00:00


In [4]:
## load all days from the new subset
## create list of fnames -- one file per date in final_lst
## (a previous assignment of '/data/projects/Comet/cwp140/' was immediately
## overwritten by the path below, so only the effective value is kept)
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'
varname = 'ivt'

## format every date as YYYYMMDD in one vectorized call
date_strs = pd.to_datetime(final_lst).strftime("%Y%m%d")

## files are laid out as <varname>/<YYYYMMDD>_<varname>.nc
fname_lst = [
    os.path.join(path_to_data, 'preprocessed/GEFSv12_reforecast/{0}/{1}_{0}.nc'.format(varname, d))
    for d in date_strs
]


Select only the 24-hour lead time from each file.

In [5]:
%%time
def preprocess(ds):
    """Trim a single file's dataset before it is combined with the others.

    Removes the "ivtu" and "ivtv" variables, then keeps only the last
    position along the ``step`` dimension (the 24 hr lead, per the note
    in this notebook).

    Returns the trimmed dataset.
    """
    without_components = ds.drop_vars(["ivtu", "ivtv"])
    return without_components.isel(step=-1)

## open every file in fname_lst as one dataset, stacked along valid_time;
## each file is passed through preprocess() before being combined
open_kwargs = dict(
    concat_dim="valid_time",
    combine="nested",
    engine='netcdf4',
    chunks={"lat": 100, "lon": 100},
    preprocess=preprocess,
)
ds = xr.open_mfdataset(fname_lst, **open_kwargs)
ds

CPU times: user 20 s, sys: 6.66 s, total: 26.6 s
Wall time: 29.3 s


Unnamed: 0,Array,Chunk
Bytes,5.48 GiB,390.62 kiB
Shape,"(1092, 5, 281, 479)","(1, 5, 100, 100)"
Dask graph,16380 chunks in 4369 graph layers,16380 chunks in 4369 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 5.48 GiB 390.62 kiB Shape (1092, 5, 281, 479) (1, 5, 100, 100) Dask graph 16380 chunks in 4369 graph layers Data type float64 numpy.ndarray",1092  1  479  281  5,

Unnamed: 0,Array,Chunk
Bytes,5.48 GiB,390.62 kiB
Shape,"(1092, 5, 281, 479)","(1, 5, 100, 100)"
Dask graph,16380 chunks in 4369 graph layers,16380 chunks in 4369 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [6]:
## collapse valid_time into one chunk (-1 means "the whole dimension")
ds = ds.chunk({"valid_time": -1})

In [7]:
## display the dataset to inspect its chunk layout
ds

Unnamed: 0,Array,Chunk
Bytes,5.48 GiB,416.56 MiB
Shape,"(1092, 5, 281, 479)","(1092, 5, 100, 100)"
Dask graph,15 chunks in 4370 graph layers,15 chunks in 4370 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 5.48 GiB 416.56 MiB Shape (1092, 5, 281, 479) (1092, 5, 100, 100) Dask graph 15 chunks in 4370 graph layers Data type float64 numpy.ndarray",1092  1  479  281  5,

Unnamed: 0,Array,Chunk
Bytes,5.48 GiB,416.56 MiB
Shape,"(1092, 5, 281, 479)","(1092, 5, 100, 100)"
Dask graph,15 chunks in 4370 graph layers,15 chunks in 4370 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [8]:
# Quantiles for the model climate: 0 (minimum), 0.75, 0.90,
# then every 0.01 from 0.91 up to and including 1.0 (maximum)
a = np.array([0, .75, .9])
# Build the upper tail from integers: each value is then the float nearest
# to the intended two-decimal quantile and the last one is exactly 1.0.
# (A float-step np.arange accumulates rounding error and needs a fudged stop.)
b = np.arange(91, 101) / 100
quantile_arr = np.concatenate((a, b), axis=0)
quantile_arr


array([0.  , 0.75, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [9]:
%%time
## quantiles of the dataset pooled over the valid_time and number dimensions
## (number is presumably the ensemble member -- TODO confirm), skipping NaNs
pooled_dims = ['valid_time', 'number']
ivt_mclimate = ds.quantile(
    quantile_arr,
    dim=pooled_dims,
    skipna=True,
)
ivt_mclimate

CPU times: user 13.1 ms, sys: 2 µs, total: 13.1 ms
Wall time: 12.8 ms


Unnamed: 0,Array,Chunk
Bytes,13.35 MiB,0.99 MiB
Shape,"(13, 281, 479)","(13, 100, 100)"
Dask graph,15 chunks in 4375 graph layers,15 chunks in 4375 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 13.35 MiB 0.99 MiB Shape (13, 281, 479) (13, 100, 100) Dask graph 15 chunks in 4375 graph layers Data type float64 numpy.ndarray",479  281  13,

Unnamed: 0,Array,Chunk
Bytes,13.35 MiB,0.99 MiB
Shape,"(13, 281, 479)","(13, 100, 100)"
Dask graph,15 chunks in 4375 graph layers,15 chunks in 4375 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [10]:
%%time
# ivt_mclimate.compute()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.82 µs


In [11]:
%%time
# write the quantile result to a netCDF file beneath path_to_data
out_name = 'preprocessed/GEFSv12_reforecast_mclimate_ivt_Nov20_24hr-lead.nc'
fname = os.path.join(path_to_data, out_name)
# load the values into memory, then write (mode 'w' overwrites an existing file)
ivt_mclimate_loaded = ivt_mclimate.load()
ivt_mclimate_loaded.to_netcdf(path=fname, mode='w', format='NETCDF4')

CPU times: user 2min 9s, sys: 47.5 s, total: 2min 56s
Wall time: 2min 9s
