In [1]:
# Standard Python modules
import os, sys
import numpy as np
import pandas as pd
import xarray as xr
import datetime as dt

# plot styles/formatting
import seaborn as sns
import cmocean.cm as cmo
import cmocean

# matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid
import matplotlib.ticker as mticker
import matplotlib.patches as mpatches
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1.inset_locator import mark_inset

# cartopy
import cartopy.crs as ccrs
from cartopy.mpl.geoaxes import GeoAxes
import cartopy.feature as cfeature

# extras
%matplotlib inline
import metpy.calc as mpcalc
from metpy.units import units

# Project-local helper modules live in ../modules (a path relative to the
# working directory), so that directory is appended to the import search
# path before the modules are imported.
sys.path.append('../modules')
import nclcmaps as nclc
from plotter import draw_basemap
from preprocess_dataframes import combine_ivt_ar_prec_df

# Show floats in pandas tables with two decimals and a thousands separator
pd.options.display.float_format = "{:,.2f}".format

In [2]:
# Directory layout used by this notebook

# inputs
path_to_data = '/cw3e/mead/projects/cwp140/scratch/dnash/data/'  # project data -- read only
# preprocessed ERA5 IVT products
path_to_work = '/work/dnash/SEAK_clim_data/preprocessed/ERA5-IVT/'

# outputs
path_to_figs = '../figs/'  # figures
path_to_out = '../out/'  # output files (numerical results, intermediate datafiles) -- read & write

In [3]:
## Open the precipitation and IVT dataframes; precipitation is appended
## to each community's IVT dataframe.
temporal_res = 'daily'
option = 'a'
lag_lst = [0]
community_lst = [
    'Hoonah', 'Skagway', 'Klukwan',
    'Yakutat', 'Craig', 'Kasaan',
]

# the helper returns the combined dataframes as a list of dfs
df_lst = combine_ivt_ar_prec_df(option, temporal_res, community_lst)

In [4]:
## get list of dates that are:
### (middle) AR days with precip above the 95th percentile (no IVT threshold)

ardate_lst = []
for df in df_lst:
    prec_thres = df['prec'].quantile(0.95) # 95th percentile precipitation threshold
    # hack to get rid of the leap day (not in WRF data)
    # NOTE(review): only 2008-02-29 is excluded; any other Feb 29 in the record is kept -- confirm this is intended.
    # NOTE(review): the comparison uses df.index while the dates are read from the `time` column;
    # presumably both hold the same timestamps -- verify.
    not_leap_day = df.index != '2008-02-29 00:00:00'
    idx = (df.AR == 1) & (df.prec > prec_thres) & not_leap_day
    ardate_lst.append(df.loc[idx, 'time'].values)

## merge ardate_lst into a single array and remove duplicates (np.unique also sorts)
new_data = np.unique(np.concatenate(ardate_lst, axis=0))

In [13]:
# number of unique dates in the pooled (middle) date set
print(new_data.size)

1290


In [10]:
## get list of dates that are:
### (left) AR days with IVT above the 95th percentile and precip below the 5th percentile

ardate_lst = []
for df in df_lst:
    prec_thres = df['prec'].quantile(0.05) # 5th percentile precipitation threshold
    ivt_thres = df['IVT'].quantile(0.95) # 95th percentile IVT threshold
    # hack to get rid of the leap day (not in WRF data)
    # NOTE(review): only 2008-02-29 is excluded; any other Feb 29 in the record is kept -- confirm this is intended.
    not_leap_day = df.index != '2008-02-29 00:00:00'
    idx = (df.AR == 1) & (df.prec < prec_thres) & (df.IVT > ivt_thres) & not_leap_day
    ardate_lst.append(df.loc[idx, 'time'].values)

## merge ardate_lst into a single array and remove duplicates (np.unique also sorts)
new_data2 = np.unique(np.concatenate(ardate_lst, axis=0))
print(len(new_data2))

56


In [11]:
## get list of dates that are:
### (right) AR days with both IVT and precip above their 95th percentiles

ardate_lst = []
for df in df_lst:
    prec_thres = df['prec'].quantile(0.95) # 95th percentile precipitation threshold
    ivt_thres = df['IVT'].quantile(0.95) # 95th percentile IVT threshold
    # hack to get rid of the leap day (not in WRF data)
    # NOTE(review): only 2008-02-29 is excluded; any other Feb 29 in the record is kept -- confirm this is intended.
    not_leap_day = df.index != '2008-02-29 00:00:00'
    idx = (df.AR == 1) & (df.prec > prec_thres) & (df.IVT > ivt_thres) & not_leap_day
    ardate_lst.append(df.loc[idx, 'time'].values)

## merge ardate_lst into a single array and remove duplicates (np.unique also sorts)
new_data3 = np.unique(np.concatenate(ardate_lst, axis=0))
print(len(new_data3))

563


In [12]:
# Collect the three pooled date sets in panel order.
# NOTE: this rebinds ardate_lst, which until here held one array per dataframe
# in df_lst; from now on it holds exactly three arrays.
ardate_lst = [
    new_data,   # (middle)
    new_data2,  # (left)
    new_data3,  # (right)
]

## Load IVT Data

In [12]:
%%time
### set the extent of the data for processing
ext1 = [-180., -110., 19, 80] # extent of CIMSS Plots
lonmin, lonmax, latmin, latmax = ext1

## This is a lot of data so pulling only dates in ardate_lst
def preprocess(ds):
    '''keep only selected lats and lons'''
    return ds.sel(lat=slice(latmin, latmax), lon=slice(lonmin, lonmax))

ds_lst = []
for i, community in enumerate(community_lst):
    print('Processing', community)
    ardates = ardate_lst[i]
    ## make dataframe of dates and add year, month, day info
    d = {'date': ardates}
    df = pd.DataFrame(data=d)
    df['year'] = pd.DatetimeIndex(df['date']).year
    df['month'] = pd.DatetimeIndex(df['date']).month.map("{:02d}".format)
    df['day'] = pd.DatetimeIndex(df['date']).day.map("{:02d}".format)
    
    ## now iterate through the dates and only open the files for the dates we want
    filenames = []
    for index, row in df.iterrows():
        yr = row['year']
        mon = row['month']
        day = row['day']
        filenames.append('/data/downloaded/Reanalysis/ERA5/IVT/{0}/ERA5_IVT_{0}{1}{2}.nc'.format(yr, mon, day))
    
    ## now open all those files and combine into one ds    
    era = xr.open_mfdataset(filenames, combine='by_coords', preprocess=preprocess, parallel=False)
    if temporal_res == 'hourly':
        era = era
    elif temporal_res == 'daily':
        era = era.resample(time="1D").mean('time')
    ds_lst.append(era)


Processing Hoonah
Processing Skagway
Processing Klukwan
Processing Yakutat
Processing Craig
Processing Kasaan
CPU times: user 56.4 s, sys: 10.1 s, total: 1min 6s
Wall time: 5min 35s


In [18]:
%%time
## make a dataset for each community subset to its AR dates
ds_lst_comp = []
for i, ds in enumerate(ds_lst):
    print('Processing {0}'.format(community_lst[i]))
    tmp = ds.mean('time')
    tmp = tmp.load()
    ds_lst_comp.append(tmp)
    
ds_lst_comp[0]

Processing Hoonah
Processing Skagway
Processing Klukwan
Processing Yakutat
Processing Craig
Processing Kasaan
CPU times: user 8min 51s, sys: 10min 54s, total: 19min 46s
Wall time: 2h 46min 6s


In [24]:
%%time
for i, ds in enumerate(ds_lst_comp):
    community = community_lst[i]
    # write to netCDF
    fname = os.path.join(path_to_work, 'ERA5_IVT_daily_{0}.nc'.format(community))
    ds.to_netcdf(path=fname, mode = 'w', format='NETCDF4')

CPU times: user 164 ms, sys: 77.6 ms, total: 242 ms
Wall time: 1.65 s
