This preprocessing notebook takes 6-hourly AR data from tARget v3, and selects only the times where an AR crosses the 1 km threshold in HMA.

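Using the trackID from the AR catalog, each AR event is then matched against landslide reports and WRF IVT, precipitation, and freezing-level data.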

### Import

In [1]:
# Standard Python modules
import os, sys
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime, timedelta
import metpy.calc as mpcalc
from metpy.units import units

# import personal modules

# Path to modules
sys.path.append('../modules')

# Import my modules
from ar_funcs import get_topo_mask
from timeseries import select_months_ds, select_months_df

In [2]:
# Set up paths
# `server` selects which WRF directory layout is used when the 3-hourly WRF files are loaded
server = 'great'
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
path_to_data = '/home/nash/DATA/data/'                                      # project data -- read only
path_to_out  = '../out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'      # figures

### Get list of AR dates and trackIDs when an AR crosses 1000 m elevation threshold in HMA

In [3]:
# Identify ARs inside a single bounding box over HMA, keeping only grid cells at or
# above the elevation threshold, for the season named by `ssn`.
bbox = [20, 40, 65, 97] # HMA region, unpacked below as lat1, lat2, lon1, lon2
elev_thres = 1000.
ssn = 'MAM'

# Per-season (start_date, end_date, start_mon, end_mon).
# An unsupported `ssn` fails here instead of leaving these four names undefined
# (or silently stale from an earlier run of this cell).
season_bounds = {
    'DJF': ('1979-12-01 0:00', '2015-02-28 18:00', 12, 2),
    'MAM': ('1980-03-01 0:00', '2014-05-31 18:00', 3, 5),
}
if ssn not in season_bounds:
    raise ValueError("ssn must be one of {0}, got {1!r}".format(sorted(season_bounds), ssn))
start_date, end_date, start_mon, end_mon = season_bounds[ssn]

# open ds
filename =  path_to_data + 'ar_catalog/globalARcatalog_ERA-Interim_1979-2019_v3.0.nc'
ds = xr.open_dataset(filename, chunks={'time': 1460}, engine='netcdf4')
ds = ds.squeeze()
# remove lev and ens coords
ds = ds.reset_coords(names=['lev', 'ens'], drop=True)

# select lats, lons, and dates within start_date, end_date and months
lat1, lat2, lon1, lon2 = bbox
ds = ds.sel(time=slice(start_date, end_date), lat=slice(lat1,lat2), lon=slice(lon1,lon2))
ds = select_months_ds(ds, start_mon, end_mon)

# add topo mask
mask = get_topo_mask(ds.lat, ds.lon) # create a elevation dataset with same grid spacing as ds
ds = ds.where(mask.bedrock >= elev_thres) # mask ds where elevation is less than elev_thres

# convert dataset to dataframe
df = ds.kidmap.to_dataframe(dim_order=['time', 'lat', 'lon'])
df = df.dropna(axis='rows')
# keep only rows that have trackID: one array of unique track IDs per time step
trackID = df.groupby('time').kidmap.unique()
# trackID # this is all trackIDs that crossed the elevation threshold


In [4]:
# One row per time step, with the array of track IDs in 'kidmap'
id_df = trackID.to_frame().reset_index()
# Add the calendar day ("YYYY-MM-DD" string) as the leading 'date' column;
# 'time' keeps the full 6-hourly timestamp
id_df.insert(0, 'date', pd.to_datetime(id_df['time']).dt.strftime("%Y-%m-%d"))
# One row per (time step, track ID)
id_df = id_df.explode('kidmap')
# id_df

In [5]:
# Daily AR-type table (from Nash et al. 2021), trimmed to the analysis months
filepath = path_to_out + 'AR-types_ALLDAYS.csv'
ar_cat = pd.read_csv(filepath).rename(columns={'Unnamed: 0': 'date'})
ar_cat = select_months_df(ar_cat.set_index(pd.to_datetime(ar_cat['date'])), start_mon, end_mon)
# swap the datetime index for "YYYY-MM-DD" strings, then turn it back into the 'date' column
ar_cat.index = ar_cat.index.strftime("%Y-%m-%d")
ar_cat = ar_cat.drop(columns=['date']).reset_index()
# keep only the days with AR_CAT > 0
ar_cat = ar_cat[ar_cat['AR_CAT'] > 0]

# ar_cat

In [6]:
# Outer-join the per-time-step track IDs with the daily AR types on the day string
merge_ar = id_df.merge(ar_cat, how='outer', on='date')
track_ids = merge_ar['kidmap'].unique() # unique AR track IDs
ar_dates = merge_ar['time'].unique() # unique AR date/times (for later)
# merge_ar

In [8]:
# Build one row per (AR type, track ID): first and last time step inside the HMA region
# and the time spent there in hours
data = []
for ar_type in (1, 2, 3):
    type_rows = merge_ar.loc[merge_ar.AR_CAT == ar_type]

    for ids in track_ids:
        track_times = type_rows.loc[type_rows.kidmap == ids, 'time']
        start = pd.to_datetime(track_times.min())
        # the last 6-hourly step is counted in full, hence the extra 6 h
        stop = pd.to_datetime(track_times.max()) + timedelta(hours=6)
        duration = (stop - start).total_seconds()/(3600) # convert to number of hours

        data.append([ids, ar_type, start, stop, duration])

# tracks that never occur with a given AR type produce all-missing rows, removed here
duration_df = pd.DataFrame(data, columns=['trackID', 'ar_cat', 'start_date', 'end_date', 'duration'])
duration_df = duration_df.dropna()
duration_df

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration
0,3579.0,1,1980-03-03 00:00:00,1980-03-04 06:00:00,30.0
2,3603.0,1,1980-03-05 06:00:00,1980-03-06 00:00:00,18.0
9,3719.0,1,1980-03-20 00:00:00,1980-03-21 00:00:00,24.0
34,6760.0,1,1981-03-04 00:00:00,1981-03-04 18:00:00,18.0
35,6783.0,1,1981-03-04 18:00:00,1981-03-05 06:00:00,12.0
...,...,...,...,...,...
3605,111955.0,3,2014-05-13 06:00:00,2014-05-13 12:00:00,6.0
3610,112035.0,3,2014-05-23 06:00:00,2014-05-23 12:00:00,6.0
3611,112080.0,3,2014-05-27 06:00:00,2014-05-28 00:00:00,18.0
3612,112083.0,3,2014-05-28 06:00:00,2014-05-28 12:00:00,6.0


### Landslide DF

In [9]:
def expand_grid(lat,lon):
    '''Return every (lat, lon) combination as a two-column DataFrame.

    Parameters
    ----------
    lat, lon : iterables of coordinate values.

    Returns
    -------
    pandas.DataFrame
        Columns 'lat' and 'lon', one row per pair, sorted by lat then lon,
        with a fresh 0-based index. The pairs pass through a single NumPy
        array, so both columns share one dtype. If either input is empty the
        result is an empty frame with the same two columns.
    '''
    pairs = [(A, B) for A in lat for B in lon]
    # reshape keeps the array two-column even when there are no pairs at all
    pairs = np.array(pairs).reshape(-1, 2)
    full_grid = pd.DataFrame({'lat': pairs[:, 0], 'lon': pairs[:, 1]})
    full_grid = full_grid.sort_values(by=['lat', 'lon']).reset_index(drop=True)
    return full_grid

In [10]:
fname = path_to_data + 'CH2_generated_data/Global_Landslide_Catalog_Export.csv' #TODO check this - is it the raw downloaded data?
landslide = pd.read_csv(fname)

# Bounding box used to subset the catalog
lonmin = 65
lonmax = 100
latmin = 20
latmax = 42

## Keep landslides inside the Southern Asia box (bounds inclusive)
in_box = landslide.latitude.between(latmin, latmax) & landslide.longitude.between(lonmin, lonmax)
landslide = landslide.loc[in_box]
# index by the parsed event time
landslide = landslide.set_index(pd.to_datetime(landslide.event_date))
# landslide.index = landslide.index.normalize()

# keep only events from December through May
months = landslide.index.month
idx = (months >= 12) | (months <= 5)
landslide = landslide[idx]

# short coordinate names; the raw date column becomes 'event_time' and the
# parsed datetime index comes back as the 'event_date' column
landslide = (
    landslide
    .rename(columns={"latitude": "lat", "longitude": "lon", "event_date": "event_time"})
    .reset_index()
)

# round event time to the nearest 6 hours and index by it
landslide['time'] = landslide['event_date'].dt.round('6H')
landslide = landslide.set_index(pd.to_datetime(landslide.time))

# re-apply the December-May filter on the rounded times
months = landslide.index.month
idx = (months >= 12) | (months <= 5)
landslide = landslide[idx]

# landslide

In [11]:
# Reopen the AR catalog to check whether an AR is present at the time and place of each landslide
filename =  path_to_data + 'ar_catalog/globalARcatalog_ERA-Interim_1979-2019_v3.0.nc'
ar = xr.open_dataset(filename, engine='netcdf4').squeeze()

# December through May only
month = ar.time.dt.month
idx = (month >= 12) | (month <= 5)
# trackID field for those months, then trimmed to the date range shared by both datasets
kid = ar.kidmap.sel(time=idx).sel(time=slice('1979-12-01 00', '2019-05-31 00:00'))
# kid

In [12]:
## for each landslide, look up the AR trackID at the nearest catalog grid cell and time step;
## when a track is present there (trackID > 0), keep that AR ID together with the landslide ID
landslideID = []
arID = []
landslide_lat = []
landslide_lon = []
# iterrows() yields the same (index, row) pairs as the former .T.iteritems(),
# and still exists in pandas >= 2.0 where DataFrame.iteritems was removed
for _, row in landslide.iterrows():
    t = kid.sel(lat=row['lat'], lon=row['lon'], time=row['time'], method='nearest').values
    # print(t)
    # missing values compare False here, so landslides outside any AR are skipped
    if t > 0:
        landslideID.append(row['event_id'])
        arID.append(float(t)) # plain float rather than a 0-d array
        landslide_lat.append(row['lat'])
        landslide_lon.append(row['lon'])
        
d = {'landslideID': landslideID, 'trackID': arID, 
     'landslide_lat': landslide_lat, 'landslide_lon': landslide_lon}
landslide_df = pd.DataFrame(data=d)
# convert the dtype for the trackID column (float, to match duration_df.trackID in the merge)
landslide_df = landslide_df.astype({'trackID': 'float64'})

# landslide_df

In [13]:
# Outer-join the AR duration table with the landslide/AR matches on trackID
merged_data = duration_df.merge(landslide_df, how='outer', on='trackID')
# merged_data 
# note the rows that do not have a date or time 
# are landslides that are associated with a specific AR that was not considered a "HMA AR"

In [14]:
## test to make sure merged correctly
# idx = merged_data.landslideID > 0
# test = merged_data[idx]
# test

In [15]:
# drop the rows that are not a HMA AR (rows without an ar_cat compare False and are removed)
idx = merged_data['ar_cat'] > 0
# .copy() makes this an independent frame: later cells add columns to merged_data,
# which would otherwise raise SettingWithCopyWarning on the filtered subset
merged_data = merged_data.loc[idx].copy()
merged_data

Unnamed: 0,trackID,ar_cat,start_date,end_date,duration,landslideID,landslide_lat,landslide_lon
0,3579.0,1.0,1980-03-03 00:00:00,1980-03-04 06:00:00,30.0,,,
1,3579.0,2.0,1980-03-01 00:00:00,1980-03-03 00:00:00,48.0,,,
2,3603.0,1.0,1980-03-05 06:00:00,1980-03-06 00:00:00,18.0,,,
3,3719.0,1.0,1980-03-20 00:00:00,1980-03-21 00:00:00,24.0,,,
4,3719.0,2.0,1980-03-19 06:00:00,1980-03-20 00:00:00,18.0,,,
...,...,...,...,...,...,...,...,...
1381,111955.0,3.0,2014-05-13 06:00:00,2014-05-13 12:00:00,6.0,,,
1382,112035.0,3.0,2014-05-23 06:00:00,2014-05-23 12:00:00,6.0,,,
1383,112080.0,3.0,2014-05-27 06:00:00,2014-05-28 00:00:00,18.0,,,
1384,112083.0,3.0,2014-05-28 06:00:00,2014-05-28 12:00:00,6.0,,,


## load 2D WRF data

In [16]:
## pull wrflats and wrflons from one d01 IVT file
## (these are assigned to every IVT file in preprocess_ivt when the files are looped over)
fname = path_to_data + 'wrf_hasia/d01/ivt/3hr/tmp_2015.nc'
# the context manager closes the file once the coordinate values are in memory
with xr.open_dataset(fname) as tmp:
    # print(tmp.time[:100])
    # print(tmp.time[-100:])
    wrflats = tmp.lat.values
    wrflons = tmp.lon.values

## same for the d02 precipitation grid (used by preprocess_prec)
fname = path_to_data + 'wrf_hasia/d02/prec/3hr/tmp_2014.nc'
with xr.open_dataset(fname) as tmp:
    # print(tmp.time[:100])
    # print(tmp.time[-100:])
    wrflats2 = tmp.lat.values
    wrflons2 = tmp.lon.values


In [20]:
%%time
def preprocess_ivt(ds):
    '''Put the d01 coordinates on ``ds`` and trim it to its latest calendar year.

    The lat/lon coordinates are replaced by the module-level ``wrflats`` and
    ``wrflons``. Unless the latest year in ``ds`` is 1980 (returned untrimmed),
    only time steps from Jan 1 00:00 through Dec 31 21:00 of that year are kept.
    '''
    latest_year = ds.time.dt.year.max().values
    ds = ds.assign_coords({"lon": wrflons, "lat": wrflats})
    if latest_year == 1980:
        return ds
    first_step = '{0}-01-01 00:00'.format(latest_year)
    last_step = '{0}-12-31 21:00'.format(latest_year)
    return ds.sel(time=slice(first_step, last_step))

def preprocess_prec(ds):
    '''Put the d02 coordinates on ``ds`` and trim it to its latest calendar year.

    The lat/lon coordinates are replaced by the module-level ``wrflats2`` and
    ``wrflons2``. Unless the latest year in ``ds`` is 1980 (returned untrimmed),
    only time steps from Jan 1 00:00 through Dec 31 21:00 of that year are kept.
    '''
    latest_year = ds.time.dt.year.max().values
    ds = ds.assign_coords({"lon": wrflons2, "lat": wrflats2})
    if latest_year == 1980:
        return ds
    first_step = '{0}-01-01 00:00'.format(latest_year)
    last_step = '{0}-12-31 21:00'.format(latest_year)
    return ds.sel(time=slice(first_step, last_step))

# WRF domain for each variable. Kept under its own name because `domains` is
# reused further down for the subregion bounding boxes.
wrf_domains = ['d01', 'd02']
varname_lst = ['ivt', 'prec']

# AR time steps to keep. The outer merge that produced `ar_dates` can leave a
# missing timestamp (NaT) in the list; drop it explicitly instead of assuming
# it is the last element.
ar_times = ar_dates[~pd.isnull(ar_dates)]

## loop through each ds
ds_lst = []
for dom, varname in zip(wrf_domains, varname_lst):
    print(varname)
    if server == 'great':
        data_path = path_to_data + 'wrf_hasia/'
    else:
        data_path = path_to_data + 'wrf_preprocessed_data/wrf_6km/'
        
    filename_pattern = '{0}/{1}/3hr/tmp_*.nc'.format(dom, varname)
    fname = data_path + filename_pattern
    
    if varname == 'ivt':
        ds = xr.open_mfdataset(fname, preprocess=preprocess_ivt)
        # IVT magnitude from its u and v components
        ds = ds.assign(ivt=lambda ds: np.sqrt(ds.ivtu**2 + ds.ivtv**2))
    elif varname == 'prec':
        ds = xr.open_mfdataset(fname, preprocess=preprocess_prec)
        ## value at the next time step minus value at the current time step;
        ## the final time step has no successor and becomes NaN
        # NOTE(review): presumably the amount per 3-hourly step, since the files
        # live under 3hr/ (an earlier note here said "mm per hour") -- confirm
        ds = ds.shift(time=-1) - ds # if in xarray
    else:
        # fail loudly rather than subsetting a dataset left over from a previous iteration
        raise ValueError('no loader defined for variable {0!r}'.format(varname))
    
    # subset to just ar days
    # ds = ds.sel(time = slice(start_date, end_date))
    # ds = select_months_ds(ds, start_mon, end_mon)
    ds = ds.sel(time = ar_times)
    
    ds_lst.append(ds)
    
ivt = ds_lst[0]
prec = ds_lst[1]


ivt
prec
CPU times: user 1.76 s, sys: 50.2 ms, total: 1.81 s
Wall time: 1.8 s


In [23]:
# ## Having trouble with wrf ds not having all the dates in the ar_dates list 
# ## use this to find out which dates are having a problem
# # make a pandas dataframe of AR Dates
# d = {'dates': ar_dates[:-1]}
# df_A = pd.DataFrame(data=d)
# df_A = df_A.set_index(pd.to_datetime(df_A['dates'])) # reset the index as "dates"


# # make a pandas dataframe of WRF dates
# d = {'dates': ivt.time}
# df_B = pd.DataFrame(data=d)
# df_B = df_B.set_index(pd.to_datetime(df_B['dates'])) # reset the index as "dates"

# # test = df_A.isin(df_B)

# x = df_A.index
# y = df_B.index
# test = x.isin(y)

# idx = (test== False)
# df_A.loc[idx]


In [24]:
# each box is [lonmin, lonmax, latmin, latmax]:
# elements 0-1 are passed to lon=slice(...) and elements 2-3 to lat=slice(...) wherever `domains` is consumed
ext1 = [71, 79, 32, 37] # Western precip anomalies
ext2 = [69, 74, 37, 40] # Northwestern precip anomalies
ext3 = [90, 99, 24, 30] # Eastern precip anomalies

# same order in both lists; ar_ivt / ar_precip pick a box with domains[int(ar_cat) - 1]
region_name = ['western', 'northwestern', 'eastern']
domains = [ext1, ext2, ext3]

In [25]:
# %%time
# # make a ds for each subregion
# ds_lst = []
# for i, bnds in enumerate(domains):
#     tmp = ivt.sel(lat=slice(bnds[2], bnds[3]), lon=slice(bnds[0], bnds[1])) 
#     ds_lst.append(tmp)
# ds_lst

In [26]:
def ar_ivt(df, ds, domains):
    '''Find the peak IVT, and the IVT direction at that peak, for each AR event.

    For every row of ``df`` the dataset is cut to the event's time range and to
    the bounding box belonging to the row's AR type. The single largest ``ivt``
    value over all time steps and grid cells in that cut is located, and the
    direction is computed from ``ivtu``/``ivtv`` at that same point.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event, with columns ``ar_cat``, ``start_date`` and ``end_date``.
    ds : xarray.Dataset
        Holds ``ivt``, ``ivtu`` and ``ivtv`` on ``time``, ``lat`` and ``lon``.
    domains : sequence
        Bounding boxes ``[lonmin, lonmax, latmin, latmax]``; the box for a row
        is ``domains[int(ar_cat) - 1]``.

    Returns
    -------
    list
        ``[ivtdir_vals, ivt_vals]``: two lists with one entry per row of ``df``.
    '''
    # the final IVT statistics to retain
    ivtdir_vals = []
    ivt_vals = []
    # loop through each AR event
    events = zip(df.ar_cat.values, df.start_date.values, df.end_date.values)
    for i, (arcat, start, end) in enumerate(events):
        # print('Getting maximum between', start, end)
        print(i)
        # get bbox based on ar_cat
        bnds = domains[int(arcat)-1]
        # select only the time steps for AR event and specified domain
        tmp = ds.sel(time=slice(start, end), lat=slice(bnds[2], bnds[3]), lon=slice(bnds[0], bnds[1]))
        # pull this small subset into memory once, so the max and the masking
        # below do not each re-read it from the lazily loaded files
        tmp = tmp.load()

        ### localized IVT maxima during event
        event_max = tmp.where(tmp.ivt==tmp.ivt.max(), drop=True).squeeze()
        ## pull IVT and IVTDIR where ivt is max
        # NOTE(review): the "m/s" label is only attached so that metpy accepts the
        # components; the direction does not depend on the unit chosen
        uvec = units.Quantity(event_max.ivtu.values, "m/s")
        vvec = units.Quantity(event_max.ivtv.values, "m/s")
        ivtdir = mpcalc.wind_direction(uvec, vvec)
        ivtdir_vals.append(ivtdir.item())
        ivt_vals.append(event_max.ivt.values.tolist())

    return [ivtdir_vals, ivt_vals]

In [None]:
%%time
## For each row of merged_data, calculate the maximum IVT within the row's subregion between start and end
## ivt_final is [ivtdir_vals, ivt_vals], as returned by ar_ivt
ivt_final = ar_ivt(merged_data, ivt, domains)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
## attach data to existing df
ivtdir_vals, ivt_vals = ivt_final # ar_ivt returns [directions, magnitudes]
merged_data['ivt'] = ivt_vals
merged_data['ivtdir'] = ivtdir_vals

# # Export dataframe as csv
# outfile = path_to_out + 'IVTDIR_IVT_new.csv'     
# merged_data.to_csv(outfile)

In [None]:
def ar_precip(df, ds, domains, mode):
    '''Calculate one precipitation statistic per AR event.

    For every row of ``df`` the dataset is cut to the event's time range and to
    the bounding box belonging to the row's AR type, and ``prec`` is summed over
    time to give an event total per grid cell. Grid cells whose event total is
    not above 1 are masked out and take no part in any statistic. Grid cells
    are not area-weighted.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event, with columns ``ar_cat``, ``start_date`` and ``end_date``.
    ds : xarray.Dataset
        Holds ``prec`` on ``time``, ``lat`` and ``lon``.
    domains : sequence
        Bounding boxes ``[lonmin, lonmax, latmin, latmax]``; the box for a row
        is ``domains[int(ar_cat) - 1]``.
    mode : str
        'mean-total' averages the event totals of all unmasked grid cells
        'max-total' keeps the largest event total in the subregion
        'percentile-total' averages the grid cells above the 95th percentile of event totals

    Returns
    -------
    list
        One value per row of ``df``.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three names above.
    '''
    valid_modes = ('mean-total', 'max-total', 'percentile-total')
    if mode not in valid_modes:
        # an unknown mode used to fall through every branch and return an empty list
        raise ValueError('mode must be one of {0}, got {1!r}'.format(valid_modes, mode))

    # the final precip statistic to retain
    m1_vals = []

    events = zip(df.ar_cat.values, df.start_date.values, df.end_date.values)
    for i, (arcat, start, end) in enumerate(events):
        # print('Getting maximum between', start, end)
        print(i)
        # get bbox based on ar_cat
        bnds = domains[int(arcat)-1]
        # select only the time steps for AR event and specified domain
        tmp = ds.sel(time=slice(start, end), lat=slice(bnds[2], bnds[3]), lon=slice(bnds[0], bnds[1]))

        ### event-total precipitation per event for every grid cell
        tmp = tmp.sum('time')
        ### mask out grid cells with an event total of 1 or less
        tmp2 = xr.where(cond=(tmp.prec > 1), x=tmp.prec, y=np.nan)

        if mode == 'mean-total':
            ## mode 1: mean-total
            # average the masked event totals; averaging the whole Dataset here
            # skipped the mask and could not be converted to a list
            stat = tmp2.mean(['lat', 'lon'], skipna=True)
        elif mode == 'max-total':
            ## mode 2: max-total
            ### localized precip maxima during event
            stat = tmp2.max(['lat', 'lon'])
        else:
            ## mode 3: percentile-total
            ### get 95th percentile thres (linear interpolation is the default)
            q_thres = tmp2.quantile(0.95, dim=['lat', 'lon'])
            ## mask out grid cells at or below the threshold
            perc_prec = xr.where(cond=(tmp2 > q_thres), x=tmp2, y=np.nan)
            # average over all grid cells skipping nans
            stat = perc_prec.mean(['lat', 'lon'], skipna=True)

        m1_vals.append(stat.values.tolist())

    return m1_vals

In [None]:
%%time
## For each row, calculate the precipitation statistic ('max-total' mode) within the region between start and end
prec_final = ar_precip(merged_data, prec, domains, 'max-total')



In [None]:
# attach the per-event precipitation statistic (one value per row of merged_data)
merged_data['prec'] = prec_final

# # Export dataframe as csv
# outfile = path_to_out + 'IVTDIR_IVT_prec.csv'     
# merged_data.to_csv(outfile)

In [None]:
%%time
# Daily freezing-level files for the d01 domain
varname = 'zerodegisotherm'
domain = 'd01'

filename_pattern = path_to_data + 'wrf_hasia/{0}/{1}/daily/out.wrf6km.{1}.daily_*.nc'.format(domain, varname)
print(filename_pattern)

# Open every matching file, trim to the analysis date range,
# then keep only the months we are interested in
ds = select_months_ds(
    xr.open_mfdataset(filename_pattern).sel(time=slice(start_date, end_date)),
    start_mon,
    end_mon,
)

ds

In [None]:
## load filtered annual climatology and std
# file names indicate a daily standard-deviation climatology and a filtered daily-mean climatology
clim_std = xr.open_dataset(path_to_data + 'wrf_hasia/d01/zerodegisotherm/daily_std_clim_zerodegisotherm.nc')
clim_mean = xr.open_dataset(path_to_data + 'wrf_hasia/d01/zerodegisotherm/filtered_daily_mean_clim_zerodegisotherm.nc')

## Calculate Anomalies
# each time step is grouped by its day of year before clim_mean is subtracted
# (assumes clim_mean is indexed by dayofyear -- TODO confirm)
anomalies = ds.groupby('time.dayofyear') - clim_mean

In [None]:
# normalize AR dates
## give every event row its start day as a "YYYY-MM-DD" string in a leading 'date' column;
## this is needed to create the subset list of freezing level days
df = merged_data.rename(columns={'start_date': 'date'})
df = select_months_df(df.set_index(pd.to_datetime(df['date'])), start_mon, end_mon)
df.index = df.index.strftime("%Y-%m-%d")
# the full timestamp goes back to 'start_date'; the day-string index becomes the 'date' column
df = df.rename(columns={'date': 'start_date'}).reset_index()
df

In [None]:
# get list of dates that ar is present
# (one entry per row of df, parsed from its day-string 'date' column, so repeated days stay repeated;
#  this overwrites the ar_dates list built earlier from the AR catalog)
ar_dates = pd.to_datetime(df['date']).values
# subset freezing level to just ar days
# NOTE: anomalies is overwritten in place, so re-running this cell selects from the already-subset data
anomalies = anomalies.sel(time = ar_dates)
anomalies

In [None]:
%%time
## Calculate low freezing (x - mean < -1*std); values that do not meet the condition are masked
## NOTE(review): this comment used to say 1.5*std, but the factor in the code is 1 -- confirm which is intended
low_freezing = anomalies.where(anomalies.z.groupby('time.dayofyear') < clim_std.z*-1.)
## Calculate high freezing (x - mean > 1*std); values that do not meet the condition are masked
high_freezing = anomalies.where(anomalies.z.groupby('time.dayofyear') > clim_std.z*1.)

In [None]:
%%time
# cut the low- and high-freezing fields to each subregion and load them into memory
ds_low = []
ds_high = []
for dom in domains:
    box = dict(lon=slice(dom[0], dom[1]), lat=slice(dom[2], dom[3]))
    ds_low.append(low_freezing.sel(**box).load())
    ds_high.append(high_freezing.sel(**box).load())
ds_low

In [None]:
%%time
## this version takes the average value in the subregion
for i, region in enumerate(region_name):
    # same calculation for the low- and the high-freezing field of this region
    for suffix, subsets in (('_low', ds_low), ('_high', ds_high)):
        x = subsets[i].z.values
        # flatten array to 2D so it is ntimes, nlat*nlon
        ntimes, nlats, nlons = x.shape
        x = x.reshape(ntimes, nlats*nlons)

        # mean over the region for each time step, skipping nans
        df[region + suffix] = np.nanmean(x, axis=1)


df

In [None]:
%%time
for i, region in enumerate(region_name):
    # number of unmasked grid cells per time step, first in the low- and then in the high-freezing field
    counts = []
    for subset in (ds_low[i], ds_high[i]):
        x = subset.z.values
        # flatten array to 2D so it is ntimes, nlat*nlon
        ntimes, nlats, nlons = x.shape
        x = x.reshape(ntimes, nlats*nlons)
        # count the values that are not nan for each time step
        counts.append(np.count_nonzero(~np.isnan(x), axis=1))
    low, high = counts

    # -1 where more cells are low than high, 1 where more are high than low, 0 when tied
    df[region + '_freeze'] = np.sign(high - low)


df

In [None]:
# Export dataframes as csv
# (the file name is prefixed with the season code held in `ssn`)
df.to_csv(path_to_out + '{0}_ivt_ar_types_freezing_level_max_prec_new.csv'.format(ssn))