This preprocessing notebook takes 6-hourly AR data from tARget v3, and selects only the times where an AR crosses the 1 km threshold in HMA.

Using the trackID from 

### Import

In [1]:
# Standard Python modules
import os, sys
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta

# import personal modules

# Path to modules
sys.path.append('../modules')

# Import my modules
from ar_funcs import get_topo_mask
from timeseries import select_months_ds, select_months_df

In [18]:
# Set up paths
server = 'great'
path_to_data = '/home/nash/DATA/data/'                                      # project data -- read only
path_to_out  = '../out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'      # figures

### Get list of AR dates and trackIDs when an AR crosses 1000 m elevation threshold in HMA

In [3]:
# identify ARs using single bound box with elevation mask during DJF
bbox = [20, 40, 65, 97] # HMA region
start_date = '1979-12-01 0:00'
end_date = '2019-05-31 18:00'
elev_thres = 1000.
start_mon = 12
end_mon = 2

# open ds
filename =  path_to_data + 'ar_catalog/globalARcatalog_ERA-Interim_1979-2019_v3.0.nc'
ds = xr.open_dataset(filename, chunks={'time': 1460}, engine='netcdf4')
ds = ds.squeeze()
# remove lev and ens coords
ds = ds.reset_coords(names=['lev', 'ens'], drop=True)

# select lats, lons, and dates within start_date, end_date and months
lat1, lat2, lon1, lon2 = bbox
ds = ds.sel(time=slice(start_date, end_date), lat=slice(lat1,lat2), lon=slice(lon1,lon2))
ds = select_months_ds(ds, start_mon, end_mon)

# add topo mask
mask = get_topo_mask(ds.lat, ds.lon) # create a elevation dataset with same grid spacing as ds
ds = ds.where(mask.bedrock >= elev_thres) # mask ds where elevation is less than 1000 m

# convert dataset to dataframe
df = ds.kidmap.to_dataframe(dim_order=['time', 'lat', 'lon'])
df = df.dropna(axis='rows')
# keep only rows that have trackID
trackID = df.groupby('time').kidmap.unique()
# trackID # this is all trackIDs that crossed the 1000 m threshold


time
1979-12-01 00:00:00                  [2864.0, 2861.0]
1979-12-01 06:00:00                          [2864.0]
1979-12-01 12:00:00                  [2864.0, 2861.0]
1979-12-01 18:00:00                  [2864.0, 2861.0]
1979-12-02 00:00:00                          [2861.0]
                                    ...              
2019-02-27 18:00:00                        [128055.0]
2019-02-28 00:00:00                        [128055.0]
2019-02-28 06:00:00    [128055.0, 128064.0, 128066.0]
2019-02-28 12:00:00              [128055.0, 128064.0]
2019-02-28 18:00:00              [128055.0, 128064.0]
Name: kidmap, Length: 3442, dtype: object

In [4]:
id_df = trackID.to_frame() # converts to a pandas dataframe
id_df = id_df.reset_index() # reset the index
id_df = id_df.rename(columns={'time': 'date'}) # rename time column into date
id_df = id_df.set_index(pd.to_datetime(id_df['date'])) # reset the index as "date"
id_df.index = id_df.index.strftime("%Y-%m-%d") # make it so the index date is normalized to daily
id_df = id_df.rename(columns={'date': 'time'}) # rename the date column back to time
id_df = id_df.reset_index() # remove the index
id_df = id_df.explode('kidmap') # explode the dataframe based on trackID
# id_df

Unnamed: 0,date,time,kidmap
0,1979-12-01,1979-12-01 00:00:00,2864.0
0,1979-12-01,1979-12-01 00:00:00,2861.0
1,1979-12-01,1979-12-01 06:00:00,2864.0
2,1979-12-01,1979-12-01 12:00:00,2864.0
2,1979-12-01,1979-12-01 12:00:00,2861.0
...,...,...,...
3439,2019-02-28,2019-02-28 06:00:00,128066.0
3440,2019-02-28,2019-02-28 12:00:00,128055.0
3440,2019-02-28,2019-02-28 12:00:00,128064.0
3441,2019-02-28,2019-02-28 18:00:00,128055.0


In [5]:
# load AR CAT (from Nash et al. 2021)
filepath = path_to_out + 'AR-types_ALLDAYS.csv'
ar_cat = pd.read_csv(filepath)
ar_cat = ar_cat.rename(columns={'Unnamed: 0': 'date'})
ar_cat = ar_cat.set_index(pd.to_datetime(ar_cat['date']))
ar_cat = select_months_df(ar_cat, start_mon, end_mon)
ar_cat.index = ar_cat.index.strftime("%Y-%m-%d")
ar_cat = ar_cat.drop(columns=['date'])
ar_cat = ar_cat.reset_index()
idx = ar_cat['AR_CAT'] > 0
ar_cat = ar_cat.loc[idx]

# ar_cat

Unnamed: 0,date,AR_CAT
0,1979-12-01,2
1,1979-12-02,1
7,1979-12-08,2
8,1979-12-09,1
15,1979-12-16,1
...,...,...
3604,2019-02-23,3
3606,2019-02-25,3
3607,2019-02-26,3
3608,2019-02-27,3


In [6]:
# merge id_df with ar_cat
merge_ar = pd.merge(id_df, ar_cat, how='outer', on='date')
track_ids = merge_ar.kidmap.unique() # get unique list of AR track IDs
# merge_ar

Unnamed: 0,date,time,kidmap,AR_CAT
0,1979-12-01,1979-12-01 00:00:00,2864.0,2
1,1979-12-01,1979-12-01 00:00:00,2861.0,2
2,1979-12-01,1979-12-01 06:00:00,2864.0,2
3,1979-12-01,1979-12-01 12:00:00,2864.0,2
4,1979-12-01,1979-12-01 12:00:00,2861.0,2
...,...,...,...,...
3521,2019-02-28,2019-02-28 06:00:00,128066.0,2
3522,2019-02-28,2019-02-28 12:00:00,128055.0,2
3523,2019-02-28,2019-02-28 12:00:00,128064.0,2
3524,2019-02-28,2019-02-28 18:00:00,128055.0,2


In [7]:
# create df with trackID, ar_cat, start date, end date, and duration of AR (how long it is within HMA region)
ar = []
data = []
for i in [1, 2, 3]:
    idx = (merge_ar.AR_CAT == i)
    ar = merge_ar.loc[idx]

    for j, ids in enumerate(track_ids):
        idx = (ar.kidmap == ids)
        tmp = ar.loc[idx]
        start = pd.to_datetime(tmp.time.min())
        stop = pd.to_datetime(tmp.time.max()) + timedelta(hours=6)
        tmp = (stop - start)
        duration = tmp.total_seconds()/(3600) # convert to number of hours

        data.append([ids, i, start, stop, duration])
    
duration_df = pd.DataFrame(data, columns=['trackID', 'ar_cat', 'start_date', 'end_date', 'duration'])
duration_df = duration_df.dropna()
duration_df

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration
1,2861.0,1,1979-12-02 00:00:00,1979-12-02 18:00:00,18.0
2,2871.0,1,1979-12-09 06:00:00,1979-12-09 18:00:00,12.0
3,2975.0,1,1979-12-16 12:00:00,1979-12-17 00:00:00,12.0
4,2988.0,1,1979-12-21 00:00:00,1979-12-22 12:00:00,36.0
5,3026.0,1,1979-12-24 06:00:00,1979-12-24 12:00:00,6.0
...,...,...,...,...,...
2672,127991.0,3,2019-02-21 00:00:00,2019-02-21 18:00:00,18.0
2673,128016.0,3,2019-02-23 06:00:00,2019-02-23 12:00:00,6.0
2674,128039.0,3,2019-02-25 06:00:00,2019-02-25 18:00:00,12.0
2675,128050.0,3,2019-02-26 06:00:00,2019-02-26 12:00:00,6.0


### Landslide DF

In [8]:
def expand_grid(lat,lon):
    '''list all combinations of lats and lons using expand_grid(lat,lon)'''
    test = [(A,B) for A in lat for B in lon]
    test = np.array(test)
    test_lat = test[:,0]
    test_lon = test[:,1]
    full_grid = pd.DataFrame({'lat': test_lat, 'lon': test_lon})
    full_grid = full_grid.sort_values(by=['lat','lon'])
    full_grid = full_grid.reset_index(drop=True)
    return full_grid

In [9]:
fname = path_to_data + 'CH2_generated_data/Global_Landslide_Catalog_Export.csv' #TODO check this - is it the raw downloaded data?
landslide = pd.read_csv(fname)

# Select lat/lon grid
lonmin = 65
lonmax = 100
latmin = 20
latmax = 42

## Select Landslides within Southern Asia region
idx = (landslide.latitude >= latmin) & (landslide.latitude <= latmax) & (landslide.longitude >= lonmin) & (landslide.longitude <= lonmax)
landslide = landslide.loc[idx]
# set event time as index
landslide = landslide.set_index(pd.to_datetime(landslide.event_date))
# landslide.index = landslide.index.normalize()

# select only landslide dates that are between december and may
idx = (landslide.index.month >= 12) | (landslide.index.month <= 5)
landslide = landslide[idx]

# rename and reindex
landslide = landslide.rename(columns={"latitude": "lat", "longitude": "lon", "event_date": "event_time"})
landslide = landslide.reset_index()

# round event time to the nearest 6 hours
landslide['time'] = landslide['event_date'].dt.round('6H')
landslide = landslide.set_index(pd.to_datetime(landslide.time))

# select only landslide dates that are between december and may
idx = (landslide.index.month >= 12) | (landslide.index.month <= 5)
landslide = landslide[idx]

# landslide

Unnamed: 0_level_0,event_date,source_name,source_link,event_id,event_time,event_time,event_title,event_description,location_description,location_accuracy,...,admin_division_name,admin_division_population,gazeteer_closest_point,gazeteer_distance,submitted_date,created_date,last_edited_date,lon,lat,time
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2007-03-19 00:00:00,2007-03-19 00:00:00,Tribune India.com,,37,03/19/2007 12:00:00 AM,,Doba,"2 killed, 5 injured",Doba,25km,...,Kashmir,14711.0,Kupwāra,18.10887,04/01/2014 12:00:00 AM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,74.333300,33.877800,2007-03-19 00:00:00
2017-05-28 12:00:00,2017-05-28 13:34:00,AGU Blogs (Petley),http://blogs.agu.org/landslideblog/2017/05/30/...,9774,05/28/2017 01:34:00 PM,,Mudflow in Northern Tajikistan,"Meltwater causes large mudflow, affects cows",Rasht Valley of Tajikistan (exact coordinates ...,exact,...,,,,,06/16/2017 01:34:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,71.275479,39.197800,2017-05-28 12:00:00
2017-05-29 12:00:00,2017-05-29 13:34:00,Eleven Myanmar,http://www.elevenmyanmar.com/local/9759,9771,05/29/2017 01:34:00 PM,,"Hakhha Landslide, Zayhuang Ward","Heavy rains cause landslides, affect houses mi...",Hakha in Zayhuang Ward,5km,...,,,,,06/16/2017 01:34:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,93.615899,22.656202,2017-05-29 12:00:00
2017-05-20 18:00:00,2017-05-20 20:14:00,World Hindu News,http://www.worldhindunews.com/2017/05/23/58870...,9802,05/20/2017 08:14:00 PM,,Landslide in Chanoli district,"Boulders roll down mountainside, blocking highway","Rishikesh-Badrinath highway, near Vishnuprayag...",5km,...,,,,,06/20/2017 08:14:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,79.561632,30.565047,2017-05-20 18:00:00
2017-05-20 18:00:00,2017-05-20 20:14:00,Etemaa Daily,http://www.en.etemaaddaily.com/World/National/...,9809,05/20/2017 08:14:00 PM,,Badrinath highway landslide,Landslide on highway to Hindu temple Badrinath...,"Highway to Badrinath, Uttarakhand",25km,...,,,,,06/20/2017 08:14:00 PM,11/20/2017 03:17:00 PM,02/15/2018 03:51:00 PM,79.492316,30.733156,2017-05-20 18:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-04-23 18:00:00,2017-04-23 17:58:00,Chandigarh Metro,https://chandigarhmetro.com/manali-rohtang-hig...,10853,04/23/2017 05:58:00 PM,,Landslide near Rani Nullah,"Landslide blocks Malai-Rohtang HWY. Article: ""...","Manali Sarchu Road, Manali Sub-District, Kullu...",1km,...,,,,,07/17/2017 05:58:00 PM,12/05/2017 06:57:00 PM,02/15/2018 03:51:00 PM,77.233987,32.361492,2017-04-23 18:00:00
2017-04-17 06:00:00,2017-04-17 04:00:00,AKI Press,http://akipress.com/news:591338/,10897,04/17/2017 04:00:00 AM,,Landslide in Chyrmash,Landslide hits 3 houses,"Chyrmash (??), Ozgon, Osh, Kyrgyzstan",50km,...,,,,,07/28/2017 01:34:00 PM,12/07/2017 09:19:00 PM,02/15/2018 03:51:00 PM,73.604732,40.804379,2017-04-17 06:00:00
2017-03-25 18:00:00,2017-03-25 17:32:00,Greater Kashmir,http://www.greaterkashmir.com/news/jammu/lands...,10845,03/25/2017 05:32:00 PM,,Barnari Sigdi Landslide,Two teenage girls died after they were buried ...,"Barnari Sigdi area, Tehsil Mughalmaidan, Kisht...",5km,...,,,,,09/21/2017 05:32:00 PM,12/05/2017 06:45:00 PM,02/15/2018 03:51:00 PM,75.680611,33.403080,2017-03-25 18:00:00
2016-12-15 06:00:00,2016-12-15 05:00:00,NBC Daily,http://www.nbcdaily.com/separate-landslides-ki...,10973,12/15/2016 05:00:00 AM,,Landslide at Pub Sarania Hill,An octogenarian was killed when a sudden lands...,"Pub Sarania Hill, Guwahati, Assam, India",1km,...,,,,,07/26/2017 01:22:00 PM,12/08/2017 08:37:00 PM,02/15/2018 03:51:00 PM,91.772042,26.181606,2016-12-15 06:00:00


In [10]:
# now we want to see if there is an AR present at the same time and location as the landslides
# open the trackID for ARs
filename =  path_to_data + 'ar_catalog/globalARcatalog_ERA-Interim_1979-2019_v3.0.nc'
ar = xr.open_dataset(filename, engine='netcdf4')
ar = ar.squeeze()

# Select months
idx = (ar.time.dt.month >= 12) | (ar.time.dt.month <= 5)
kid = ar.kidmap.sel(time=idx) # trackID for indexing

# slice the dates so both ds match
kid = kid.sel(time=slice('1979-12-01 00', '2019-05-31 00:00'))
# kid

In [11]:
## for each landslide_id, if the lat/lon falls within an AR, keep that AR ID and landslide ID
landslideID = []
arID = []
landslide_lat = []
landslide_lon = []
for i, row in landslide.T.iteritems():
    t = kid.sel(lat=row['lat'], lon=row['lon'], time=row['time'], method='nearest').values
    # print(t)
    if t > 0:
        landslideID.append(row['event_id'])
        arID.append(t)
        landslide_lat.append(row['lat'])
        landslide_lon.append(row['lon'])
        
d = {'landslideID': landslideID, 'trackID': arID, 
     'landslide_lat': landslide_lat, 'landslide_lon': landslide_lon}
landslide_df = pd.DataFrame(data=d)
# convert the dtype for the trackID column
landslide_df = landslide_df.astype({'trackID': 'float64'})

# landslide_df

Unnamed: 0,landslideID,trackID,landslide_lat,landslide_lon
0,37,88266.0,33.8778,74.3333
1,3116,101168.0,33.33876,75.192106
2,6020,111907.0,24.5784,91.7227
3,3099,101129.0,34.2256,73.673
4,9405,118455.0,26.7223,95.0243
5,1611,98243.0,25.1553,93.028
6,6019,111907.0,24.4136,91.7561
7,419,90945.0,33.237381,75.245282
8,1526,97918.0,33.0097,74.9402
9,9681,98633.0,27.218988,89.518792


In [14]:
# merge AR duration df and landslide DF
merged_data = pd.merge(duration_df, landslide_df, how='outer', on='trackID')
# merged_data 
# note the rows that do not have a date or time 
# are landslides that are associated with a specific AR that was not considered a "HMA AR"

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration,landslideID,landslide_lat,landslide_lon
0,2861.0,1.0,1979-12-02 00:00:00,1979-12-02 18:00:00,18.0,,,
1,2861.0,2.0,1979-12-01 00:00:00,1979-12-02 00:00:00,24.0,,,
2,2871.0,1.0,1979-12-09 06:00:00,1979-12-09 18:00:00,12.0,,,
3,2871.0,2.0,1979-12-08 06:00:00,1979-12-09 00:00:00,18.0,,,
4,2975.0,1.0,1979-12-16 12:00:00,1979-12-17 00:00:00,12.0,,,
...,...,...,...,...,...,...,...,...
1153,114676.0,,NaT,NaT,,6842.0,33.329600,75.203000
1154,114816.0,,NaT,NaT,,6908.0,33.097700,75.578700
1155,121814.0,,NaT,NaT,,10068.0,30.813502,78.617921
1156,121924.0,,NaT,NaT,,10896.0,40.883841,72.906907


In [15]:
## test to make sure merged correctly
# idx = merged_data.landslideID > 0
# test = merged_data[idx]
# test

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration,landslideID,landslide_lat,landslide_lon
373,90945.0,1.0,2008-01-06 12:00:00,2008-01-09 18:00:00,78.0,419.0,33.237381,75.245282
387,94008.0,1.0,2008-12-19 06:00:00,2008-12-20 00:00:00,18.0,940.0,33.245000,75.247000
388,94008.0,3.0,2008-12-20 00:00:00,2008-12-20 18:00:00,18.0,940.0,33.245000,75.247000
406,97756.0,1.0,2010-02-06 00:00:00,2010-02-08 18:00:00,66.0,1488.0,34.004900,73.752600
407,97756.0,1.0,2010-02-06 00:00:00,2010-02-08 18:00:00,66.0,1489.0,34.873300,72.662500
...,...,...,...,...,...,...,...,...
1153,114676.0,,NaT,NaT,,6842.0,33.329600,75.203000
1154,114816.0,,NaT,NaT,,6908.0,33.097700,75.578700
1155,121814.0,,NaT,NaT,,10068.0,30.813502,78.617921
1156,121924.0,,NaT,NaT,,10896.0,40.883841,72.906907


In [17]:
# drop the rows that are not a HMA AR
idx = merged_data['ar_cat'] > 0
merged_data = merged_data.loc[idx]
merged_data

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration,landslideID,landslide_lat,landslide_lon
0,2861.0,1.0,1979-12-02 00:00:00,1979-12-02 18:00:00,18.0,,,
1,2861.0,2.0,1979-12-01 00:00:00,1979-12-02 00:00:00,24.0,,,
2,2871.0,1.0,1979-12-09 06:00:00,1979-12-09 18:00:00,12.0,,,
3,2871.0,2.0,1979-12-08 06:00:00,1979-12-09 00:00:00,18.0,,,
4,2975.0,1.0,1979-12-16 12:00:00,1979-12-17 00:00:00,12.0,,,
...,...,...,...,...,...,...,...,...
1120,127863.0,3.0,2019-02-01 06:00:00,2019-02-01 12:00:00,6.0,,,
1121,127886.0,3.0,2019-02-06 12:00:00,2019-02-10 00:00:00,84.0,,,
1122,128016.0,3.0,2019-02-23 06:00:00,2019-02-23 12:00:00,6.0,,,
1123,128039.0,3.0,2019-02-25 06:00:00,2019-02-25 18:00:00,12.0,,,


## load 2D WRF data

In [57]:
## pull wrflats and wrflons from first file
fname = path_to_data + 'wrf_hasia/d01/ivt/3hr/tmp_1979.nc'.format(dom, varname)
tmp = xr.open_dataset(fname)

## assign those lats to the other ds when you loop
wrflats = tmp.lat.values
wrflons = tmp.lon.values

fname = path_to_data + 'wrf_hasia/d02/prec/3hr/tmp_1979.nc'.format(dom, varname)
tmp = xr.open_dataset(fname)

## assign those lats to the other ds when you loop
wrflats2 = tmp.lat.values
wrflons2 = tmp.lon.values

In [59]:
%%time
def preprocess_ivt(ds):
    '''keep only the current year'''
    year = ds.time.dt.year.max().values
    ds = ds.assign_coords({"lon": wrflons, "lat": wrflats})
    return ds.sel(time=slice('{0}-01-01 00:00'.format(year), '{0}-12-31 21:00'.format(year)))

def preprocess_prec(ds):
    '''keep only the current year'''
    year = ds.time.dt.year.max().values
    ds = ds.assign_coords({"lon": wrflons2, "lat": wrflats2})
    return ds.sel(time=slice('{0}-01-01 00:00'.format(year), '{0}-12-31 21:00'.format(year)))

domains = ['d01', 'd02']
varname_lst = ['ivt', 'prec']

## loop through each ds
ds_lst = []
for i, (dom, varname) in enumerate(zip(domains, varname_lst)):
    if server == 'great':
        data_path = path_to_data + 'wrf_hasia/'
    else:
        data_path = path_to_data + 'wrf_preprocessed_data/wrf_6km/'
        
    filename_pattern = '{0}/{1}/3hr/tmp_*.nc'.format(dom, varname)
    fname = data_path + filename_pattern
    
    if varname == 'ivt':
        ds = xr.open_mfdataset(fname, preprocess=preprocess_ivt)
    elif varname == 'prec':
        ds = xr.open_mfdataset(fname, preprocess=preprocess_prec)
        ## TODO ADD in shift subtraction to get mm per hour 
    
    # subset to just ar days
    ds = ds.sel(time = slice(start_date, end_date))
    
    if varname == 'ivt':
        ds = ds.assign(ivt=lambda ds: np.sqrt(ds.ivtu**2 + ds.ivtv**2))
    if varname == 'geopotential':
        ds = ds.sel(lev=250.)
    
    ds_lst.append(ds)
    
ivt = ds_lst[0]
prec = ds_lst[1]


CPU times: user 1.76 s, sys: 159 ms, total: 1.92 s
Wall time: 1.92 s


In [60]:
# latmin, latmax, lonmin, lonmax
ext1 = [69, 74, 37, 40] # Northwestern precip anomalies
ext2 = [71, 79, 32, 37] # Western precip anomalies
ext3 = [90, 99, 24, 30] # Eastern precip anomalies

region_name = ['northwestern', 'western', 'eastern']
domains = [ext1, ext2, ext3]

In [62]:
%%time
# make a ds for each subregion
ds_lst = []
for i, bnds in enumerate(domains):
    tmp = ivt.sel(lat=slice(bnds[2], bnds[3]), lon=slice(bnds[0], bnds[1]))
    ## TO DO add in calculate IVT direction 
    ds_lst.append(tmp)
ds_lst

CPU times: user 13.5 ms, sys: 1.09 ms, total: 14.6 ms
Wall time: 14.3 ms


[<xarray.Dataset>
 Dimensions:  (time: 103232, lat: 19, lon: 24)
 Coordinates:
   * time     (time) datetime64[ns] 1979-12-01 ... 2015-03-31T21:00:00
   * lat      (lat) float32 37.04 37.2 37.37 37.53 ... 39.48 39.64 39.8 39.96
   * lon      (lon) float32 69.14 69.34 69.55 69.76 ... 73.29 73.5 73.71 73.91
 Data variables:
     ivtu     (time, lat, lon) float64 dask.array<chunksize=(241, 19, 24), meta=np.ndarray>
     ivtv     (time, lat, lon) float64 dask.array<chunksize=(241, 19, 24), meta=np.ndarray>
     iwv      (time, lat, lon) float64 dask.array<chunksize=(241, 19, 24), meta=np.ndarray>
     ivt      (time, lat, lon) float64 dask.array<chunksize=(241, 19, 24), meta=np.ndarray>,
 <xarray.Dataset>
 Dimensions:  (time: 103232, lat: 29, lon: 39)
 Coordinates:
   * time     (time) datetime64[ns] 1979-12-01 ... 2015-03-31T21:00:00
   * lat      (lat) float32 32.08 32.25 32.43 32.6 ... 36.37 36.54 36.7 36.87
   * lon      (lon) float32 71.01 71.21 71.42 71.63 ... 78.28 78.48 78.69 78.9


In [66]:
def ar_ivt(df, ds_lst):
    '''Calculate maximum IVT for a subregion in a ds and append to dataframe.
     For each range of AR event dates, we find the maximum IVT for the duration of the AR for every grid cell. 
    '''
    # the final IVT statistic to retain
    final1 = []

    for k, ds1 in enumerate(ds_lst):
        print('loop', k+1, 'of', len(ds_lst))
        m2_vals = []
        for i, track in enumerate(df.trackID.values):
            start = df.start_date.values[i]
            end = df.end_date.values[i]

            idx = slice(start, end)
            tmp = ds1.sel(time=idx)     
                
            ### localized IVT maxima during event
            event_max = tmp.where(tmp.ivt==tmp.ivt.max(), drop=True).squeeze().load()
            ## TODO pull IVT and IVTDIR
            m2_vals.append(event_max.values)


        final1.append(m2_vals)

        
    return final1

In [67]:
%%time
## For each row, calculate the maximum IVT within the region between start and end
ivt_final = ar_ivt(merged_data, ds_lst)
print(len(ivt_final))

loop 0 of 3
loop 1 of 3
loop 2 of 3
3
CPU times: user 47 s, sys: 1.35 s, total: 48.4 s
Wall time: 48.3 s


In [87]:
start = merged_data.start_date.values[0]
end = merged_data.end_date.values[0]
ds1 = ds_lst[0]

idx = slice(start, end)
tmp = ds1.sel(time=idx)
tmp.where(tmp.ivt==tmp.ivt.max(), drop=True).squeeze().load()

In [None]:
def ar_precip(df, ds_lst):
    '''Calculate precipitation statistics for a subregion in a ds and append to dataframe.
     Mode is chosen based on calculation. For each range of AR event dates, we calculate the total accumulated precip for every grid cell. 
     Then we remove all gridcells that had less than 1 mm of rain per event (these are not included in any calc)
     Then we weight the gridcells by the cosine of the latitude.
     Then based on mode selected, different statistics are retained:
         'mean-total' averages all viable gridcells within the subregion and retains this number
         'max-total' selects the maximum gridcell value to append
         'percentile-total' calcuates the 95th percentile and then averages all the grid cells that exceed this threshold
    '''
    # the final precip statistic to retain
    final1 = []
    final2 = []
    final3 = []

    for k, ds1 in enumerate(ds_lst):
        print('loop', k, 'of', len(ds_lst))
        m1_vals = []
        m2_vals = []
        m3_vals = []
        for i, track in enumerate(df.trackID.values):
            start = df.start_date.values[i]
            end = df.end_date.values[i]

            idx = slice(start, end)
            tmp = ds1.sel(time=idx)

            ### event-total precipitation per event for every grid cell
            tmp = tmp.sum('time')
            ### mask out grid cells with less than 1 mm per event
            tmp2 = xr.where(cond=(tmp.prec > 1), x=tmp.prec, y=np.nan)
            
            ### area weighted
            tmp = tmp2.weighted(tmp.weights)
            
            ## mode 1: mean-total
            # average over gridcells in subregion
            mean_tot = tmp.mean(['lat', 'lon'], skipna=True)
            # append to list
            m1_vals.append(mean_tot.values.tolist())
                
            ## mode 2: max-total
            ### localized precip maxima during event
            event_max = tmp2.max(['lat', 'lon'])
            m2_vals.append(event_max.values.tolist())
                
            ## mode 3: percentile-total
            ###  get 95th percentile thres
            q_thres = tmp2.quantile(0.95, dim=['lat', 'lon'], interpolation='linear')
            ## mask out grid cells below threshold
            perc_prec = xr.where(cond=(tmp2 > q_thres), x=tmp2, y=np.nan)
            # average over all grid cells skipping nans
            mean = perc_prec.mean(['lat', 'lon'], skipna=True)
            m3_vals.append(mean.values.tolist())

        final1.append(m1_vals)
        final2.append(m2_vals)
        final3.append(m3_vals)
    final = [final1, final2, final3]
        
    return final