In [1]:
## import libraries
import os, sys
import yaml
import xarray as xr
import pandas as pd
import numpy as np
import metpy.calc as mpcalc
from metpy.units import units
import dask

%matplotlib inline

# make the project's local helper modules importable before loading them
sys.path.append('../modules')
import ar_funcs

# dask option 'array.slicing.split_large_chunks': enabled for the whole session
dask.config.set({'array.slicing.split_large_chunks': True})

<dask.config.set at 0x7f47f80f5250>

In [2]:
# Root directories used by the rest of the notebook.
# NOTE(review): path_to_data is reassigned in the cell that builds fname_lst, so the
# value set here is not the one in effect for the later cells.
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'      # project data -- read only
path_to_out  = '../out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'      # figures

In [3]:
## read AR duration file (lives in the notebook's output directory)
duration_df = pd.read_csv(os.path.join(path_to_out, 'AR_track_duration_SEAK.csv'))

# start_date is derived from the track ID via ar_funcs.get_new_start; the value read from
# the CSV is overwritten, so it is not parsed separately.
# NOTE(review): get_new_start presumably returns a timestamp -- pd.to_datetime guarantees
# the datetime index that the month-based selection below relies on.
duration_df['start_date'] = pd.to_datetime(duration_df['trackID'].map(ar_funcs.get_new_start))
duration_df['end_date'] = pd.to_datetime(duration_df['end_date'])
duration_df.index = duration_df['start_date']

# Tracks excluded from the analysis, keyed by track ID with the reason for exclusion.
# Keeping ID and reason together prevents the two lists from drifting out of sync.
known_issues = {
    200411121210: 'IVT nan',
    200411191202: 'IVT nan',
    200610151213: 'prec wrong dates',
    200610201812: 'prec wrong dates',
    201205201201: 'prec time unsorted',
    201209010004: 'freeze level not same datetime as ivt',
}
ARID_issues = list(known_issues)
error_desc = list(known_issues.values())
duration_df = duration_df[~duration_df['trackID'].isin(ARID_issues)]

# track IDs of the remaining ARs (the index holds start dates, not IDs)
ARID_lst = duration_df['trackID'].values

In [4]:
## subset to each season (SSN)
# ASO - August 1 to October 31
# NDJ - November 1 to January 31
# FMA - February 1 to April 30
# MJJ - May 1 to July 31

# start with NDJ

def select_months_df(df, mon_s, mon_e):
    """Return the rows of ``df`` whose index month lies in the given range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes a ``.month`` attribute (e.g. a DatetimeIndex).
    mon_s : int
        First month of the range (inclusive).
    mon_e : int
        Last month of the range (inclusive).

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order.
    """
    months = df.index.month
    on_or_after_start = months >= mon_s
    on_or_before_end = months <= mon_e

    # a start month later than the end month means the range wraps over the new year
    if mon_s > mon_e:
        keep = on_or_after_start | on_or_before_end
    else:
        keep = on_or_after_start & on_or_before_end

    return df.loc[keep]

In [5]:
# NDJ season: November through January (the range wraps across the new year)
NDJ = select_months_df(duration_df, mon_s=11, mon_e=1)
NDJ

Unnamed: 0_level_0,Unnamed: 0,trackID,start_date,end_date,duration
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-06 00:00:00,0,2.000011e+11,2000-01-06 00:00:00,2000-01-06 18:00:00,18.0
2000-01-05 00:00:00,1,2.000011e+11,2000-01-05 00:00:00,2000-01-08 06:00:00,6.0
2000-01-19 06:00:00,2,2.000012e+11,2000-01-19 06:00:00,2000-01-19 12:00:00,6.0
2000-01-24 06:00:00,3,2.000012e+11,2000-01-24 06:00:00,2000-01-31 00:00:00,90.0
2000-01-28 12:00:00,4,2.000013e+11,2000-01-28 12:00:00,2000-01-29 00:00:00,6.0
...,...,...,...,...,...
2019-01-15 12:00:00,1457,2.019012e+11,2019-01-15 12:00:00,2019-01-22 06:00:00,6.0
2019-01-24 12:00:00,1458,2.019012e+11,2019-01-24 12:00:00,2019-01-25 00:00:00,12.0
2019-01-22 00:00:00,1459,2.019012e+11,2019-01-22 00:00:00,2019-01-26 12:00:00,30.0
2019-01-28 00:00:00,1460,2.019013e+11,2019-01-28 00:00:00,2019-01-29 06:00:00,6.0


In [6]:
## build the list of per-AR netCDF file names for every track in the NDJ subset
path_to_data = '/data/projects/Comet/cwp140/'
varname = 'ivt'
trackID_lst = NDJ['trackID'].values

# each track ID is passed through int() before being formatted into the file name
fname_template = path_to_data + 'preprocessed/GEFSv12_reforecast/{0}/{1}_{0}.nc'
fname_lst = [fname_template.format(varname, int(trackID)) for trackID in trackID_lst]


Try selecting only the 21:00 UTC time step from each file.

In [7]:
%%time
def preprocess(ds):
    """Trim one dataset before it is combined with the others.

    Drops the ``ivtu`` and ``ivtv`` variables, then keeps only the time steps
    whose hour is 21 (the 21 UTC hour).

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with ``ivtu`` / ``ivtv`` variables and a ``time`` coordinate.

    Returns
    -------
    xarray.Dataset
        The trimmed dataset.
    """
    trimmed = ds.drop_vars(["ivtu", "ivtv"])
    is_21utc = trimmed.time.dt.hour == 21
    return trimmed.sel(time=is_21utc)

## open every file of this season's climatology as a single dataset
## NOTE(review): combine="nested" is given without concat_dim; per the xarray docs the
## files are then merged rather than concatenated along a dimension -- confirm intended
ds = xr.open_mfdataset(
    fname_lst,
    preprocess=preprocess,
    combine="nested",
    engine='netcdf4',
    parallel=False,
)
ds

## TODO: read "../out/SEAK_ardates_hourly.csv" and create a list of 3h times where AR=1,
##       then select only those dates/times from the ds
## TODO: create a dimension called "lead" based on H of each valid time
## TODO: calculate percentiles based on those groups
# percentile_lst = [0. 0.75, 0.9] + np.arange(.91, 100.1, 0.1)

Struct() takes at most 1 argument (3 given)


CPU times: user 43min 54s, sys: 19min 33s, total: 1h 3min 27s
Wall time: 41min 45s


Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,125.51 MiB
Shape,"(1597, 281, 401)","(146, 281, 401)"
Dask graph,11 chunks in 4920 graph layers,11 chunks in 4920 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 125.51 MiB Shape (1597, 281, 401) (146, 281, 401) Dask graph 11 chunks in 4920 graph layers Data type float64 numpy.ndarray",401  281  1597,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,125.51 MiB
Shape,"(1597, 281, 401)","(146, 281, 401)"
Dask graph,11 chunks in 4920 graph layers,11 chunks in 4920 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [13]:
# put the whole time axis into a single chunk ahead of the quantile along time
ds = ds.chunk({"time": -1})

In [14]:
# display the dataset after rechunking
ds

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,1.34 GiB
Shape,"(1597, 281, 401)","(1597, 281, 401)"
Dask graph,1 chunks in 4921 graph layers,1 chunks in 4921 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.34 GiB 1.34 GiB Shape (1597, 281, 401) (1597, 281, 401) Dask graph 1 chunks in 4921 graph layers Data type float64 numpy.ndarray",401  281  1597,

Unnamed: 0,Array,Chunk
Bytes,1.34 GiB,1.34 GiB
Shape,"(1597, 281, 401)","(1597, 281, 401)"
Dask graph,1 chunks in 4921 graph layers,1 chunks in 4921 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [15]:
# Quantile levels for the model climate: the minimum (0), the 75th and 90th percentiles,
# then every percentile (step 0.01) from the 91st up to the 100th / MAX.
# I might add 75th-90th, and < 75th
coarse_quantiles = np.array([0, .75, .9])
# linspace + round instead of a fractional arange step: avoids accumulated floating-point
# error, so each level equals the two-decimal value it is labelled with
fine_quantiles = np.round(np.linspace(.91, 1.0, 10), 2)
quantile_arr = np.concatenate((coarse_quantiles, fine_quantiles), axis=0)
quantile_arr


array([0.  , 0.75, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [16]:
%%time
ivt_mclimate = ds.quantile(quantile_arr, dim='time', skipna=True)
ivt_mclimate

CPU times: user 17.4 ms, sys: 716 µs, total: 18.1 ms
Wall time: 17.3 ms


Unnamed: 0,Array,Chunk
Bytes,11.18 MiB,11.18 MiB
Shape,"(13, 281, 401)","(13, 281, 401)"
Dask graph,1 chunks in 4926 graph layers,1 chunks in 4926 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 11.18 MiB 11.18 MiB Shape (13, 281, 401) (13, 281, 401) Dask graph 1 chunks in 4926 graph layers Data type float64 numpy.ndarray",401  281  13,

Unnamed: 0,Array,Chunk
Bytes,11.18 MiB,11.18 MiB
Shape,"(13, 281, 401)","(13, 281, 401)"
Dask graph,1 chunks in 4926 graph layers,1 chunks in 4926 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [17]:
# drop the name for the full dataset; only the quantile result is used from here on
del ds

In [None]:
# write the model-climate quantiles to a netCDF4 file (mode 'w')
fname = os.path.join(path_to_data, 'preprocessed/GEFSv12_reforecast_mclimate_ivt_NDJ_21hr-lead.nc')
ivt_mclimate.to_netcdf(fname, mode='w', format='NETCDF4')

In [None]:
# open a single file from fname_lst
ds = xr.open_dataset(fname_lst[2])

# NOTE(review): x and y are not defined anywhere in the visible notebook, so this line
# raises NameError on a fresh kernel -- confirm what they were meant to be
ind = pd.MultiIndex.from_product((x,y),names=('segment','new_time'))

In [104]:
def preprocess(fname):
    """Open one netCDF file and re-index its time axis by calendar day and hour.

    NOTE(review): this redefines the earlier ``preprocess(ds)`` helper with a
    different signature; the name is kept because the cells below call it.

    Parameters
    ----------
    fname : str
        Path of the netCDF file to open.

    Returns
    -------
    xarray.Dataset
        The dataset after the ``init_date`` / ``lead`` coordinates have been
        combined into a temporary ``ct`` index and unstacked.
    """
    ds = xr.open_dataset(fname)

    # derive two coordinates from every time stamp: its calendar day and its hour of day
    # NOTE(review): "lead" is the hour of day here, not hours since initialization -- confirm
    ds = ds.assign_coords({"init_date": ds.indexes['time'].normalize()})
    ds = ds.assign_coords({"lead": ds.indexes['time'].hour})

    # combine both coordinates into a temporary multi-index "ct", then unstack it
    ds = ds.set_index(ct=("init_date", "lead")).unstack('ct')
    return ds

In [105]:
# run preprocess on two sample files
ds1, ds2 = preprocess(fname_lst[1]), preprocess(fname_lst[2])

In [107]:
# merge the two sample datasets; compat="override" skips comparing shared variables
# and takes them from the first dataset
ds = xr.merge(objects=[ds1, ds2], compat="override")
ds

In [99]:
# NOTE(review): init_date and lead are only assigned inside preprocess() in the visible
# notebook, so at module level this cell relies on leftover kernel state -- confirm
ind = pd.MultiIndex.from_product((init_date,lead),names=('init_date','lead'))
# keep entries 1..64 of the product (the first (init_date, lead) pair is dropped)
ind = ind[1:65]
ind

MultiIndex([('2000-01-12',  3),
            ('2000-01-12',  6),
            ('2000-01-12',  9),
            ('2000-01-12', 12),
            ('2000-01-12', 15),
            ('2000-01-12', 18),
            ('2000-01-12', 21),
            ('2000-01-13',  0),
            ('2000-01-13',  3),
            ('2000-01-13',  6),
            ('2000-01-13',  9),
            ('2000-01-13', 12),
            ('2000-01-13', 15),
            ('2000-01-13', 18),
            ('2000-01-13', 21),
            ('2000-01-14',  0),
            ('2000-01-14',  3),
            ('2000-01-14',  6),
            ('2000-01-14',  9),
            ('2000-01-14', 12),
            ('2000-01-14', 15),
            ('2000-01-14', 18),
            ('2000-01-14', 21),
            ('2000-01-15',  0),
            ('2000-01-15',  3),
            ('2000-01-15',  6),
            ('2000-01-15',  9),
            ('2000-01-15', 12),
            ('2000-01-15', 15),
            ('2000-01-15', 18),
            ('2000-01-15', 21),
        

In [100]:
# assign the (init_date, lead) MultiIndex as "time", then unstack "time"
dsr = ds.assign(time=ind).unstack('time')
# dsr = dsr.rename({'new_time':'time'})

  dsr = ds.assign(time=ind).unstack('time')


In [101]:
# display the unstacked dataset
dsr

In [27]:
# hour of day of every time stamp in ds
ds.indexes['time'].hour

Index([ 3,  6,  9, 12, 15, 18, 21,  0,  3,  6,  9, 12, 15, 18, 21,  0,  3,  6,
        9, 12, 15, 18, 21,  0,  3,  6,  9, 12, 15, 18, 21,  0,  3,  6,  9, 12,
       15, 18, 21,  0,  3,  6,  9, 12, 15, 18, 21,  0,  3,  6,  9, 12, 15, 18,
       21,  0,  3,  6,  9, 12, 15, 18, 21,  0],
      dtype='int32', name='time')

In [11]:
# NOTE(review): the execution count (In [11]) and the 4-D output below do not match the
# cells above -- presumably stale output from an earlier run; confirm
ds

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.02 TiB 10.73 GiB Shape (384, 12776, 281, 401) (1, 12776, 281, 401) Dask graph 384 chunks in 3780 graph layers Data type float64 numpy.ndarray",384  1  401  281  12776,

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.02 TiB 10.73 GiB Shape (384, 12776, 281, 401) (1, 12776, 281, 401) Dask graph 384 chunks in 3780 graph layers Data type float64 numpy.ndarray",384  1  401  281  12776,

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.02 TiB 10.73 GiB Shape (384, 12776, 281, 401) (1, 12776, 281, 401) Dask graph 384 chunks in 3780 graph layers Data type float64 numpy.ndarray",384  1  401  281  12776,

Unnamed: 0,Array,Chunk
Bytes,4.02 TiB,10.73 GiB
Shape,"(384, 12776, 281, 401)","(1, 12776, 281, 401)"
Dask graph,384 chunks in 3780 graph layers,384 chunks in 3780 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
