In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr

# plot styles/formatting
import seaborn as sns
import cmocean.cm as cmo
import cmocean

# matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid
import matplotlib.ticker as mticker
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec
from matplotlib.colorbar import Colorbar # different way to handle colorbar
from matplotlib.colorbar import ColorbarBase
import matplotlib.colors as mcolors
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
from matplotlib.projections import get_projection_class

sys.path.append('../modules')
from timeseries import select_months

In [2]:
# Directory configuration used by the cells below
path_to_data = '/expanse/nfs/cw3e/cwp140/'  # root of the input data
path_to_out = '../out/'    # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '../figs/'  # saved figures

In [6]:
## Build one row per (landslide impact date, forecast lead time) pair.
## The CSV's 'init_date' column is parsed as YYYYMMDD and is used below as the impact date.
landslides = pd.read_csv('../out/landslide_dates.csv')
landslides = landslides.set_index(pd.to_datetime(landslides['init_date'], format='%Y%m%d'))
final_dates_lst = landslides.index

## skip 2 events - the data from GEFS was too hard to download for these dates
skipped_events = ('20200227', '20200817')

columns = {'impact_date': [], 'init_date': [], 'model_name': [], 'F': []}
for impact_date in final_dates_lst:
    if impact_date.strftime("%Y%m%d") in skipped_events:
        continue
    ## one forecast initialization per day, from 1 to 7 days before the impact date
    for init_lead in np.arange(1, 8):
        init_date = impact_date - pd.to_timedelta(init_lead, unit='D')
        ## initializations before 2020 are labelled as reforecast, later ones as archive
        model_name = 'GEFSv12_reforecast' if init_date.year < 2020 else 'GEFS_archive'

        columns['impact_date'].append(impact_date)
        columns['init_date'].append(init_date)
        columns['model_name'].append(model_name)
        columns['F'].append(init_lead*24)  # lead in hours

df = pd.DataFrame(columns)

## keep only rows whose init date is on or after 2000-01-01
df = df.loc[df['init_date'] >= '2000-01-01']
df

Unnamed: 0,impact_date,init_date,model_name,F
0,2002-07-09,2002-07-08,GEFSv12_reforecast,24
1,2002-07-09,2002-07-07,GEFSv12_reforecast,48
2,2002-07-09,2002-07-06,GEFSv12_reforecast,72
3,2002-07-09,2002-07-05,GEFSv12_reforecast,96
4,2002-07-09,2002-07-04,GEFSv12_reforecast,120
...,...,...,...,...
4797,2024-09-16,2024-09-13,GEFS_archive,72
4798,2024-09-16,2024-09-12,GEFS_archive,96
4799,2024-09-16,2024-09-11,GEFS_archive,120
4800,2024-09-16,2024-09-10,GEFS_archive,144


In [2]:
## For every (impact date, init date) row in df, read the forecast CSV of that
## initialization and record the maximum of each variable on the impact date.
## Rows whose CSV is missing, or whose impact date is not covered by the
## forecast's valid times, get NaN for every variable.
csv_dir = path_to_data + 'csv_non-landslide_historical/'

## new column in df -> column name in the forecast CSV
csv_colnames = {'IVT': 'IVT', 'Z0': 'Freezing Level', 'UV': 'UV',
                'QPF': 'QPF', 'AR_index': 'AR_index'}
max_vals = {varname: [] for varname in csv_colnames}

for index, row in df.iterrows():
    fdate = row['init_date'].strftime("%Y%m%d")
    impact_date = row['impact_date'].strftime("%Y-%m-%d")

    subset = None  # stays None when no data is available for this row
    try:
        ## for each row, open the file using the init date
        fcst = pd.read_csv(csv_dir + 'mclimate_init{0}.csv'.format(fdate))
        ## CSV rows are treated as consecutive 6-hourly steps,
        ## the first one valid 6 h after initialization
        valid_time = row['init_date'] + pd.to_timedelta((fcst.index + 1) * 6, unit="h")
        fcst = fcst.set_index(valid_time)
        ## then subset to the impact date (partial-string match on the day)
        subset = fcst.loc[impact_date]
    except FileNotFoundError:
        print('Skipping {0}, data not available...'.format(fdate))
    except KeyError:
        ## .loc raises KeyError when the day lies outside the forecast's valid times
        print('Skipping {0}, impact date {1} not in forecast valid times...'.format(fdate, impact_date))

    ## pull the maximum value of each variable (NaN when the row was skipped);
    ## kept outside the try so a missing CSV column is still reported as an error
    for varname, colname in csv_colnames.items():
        max_vals[varname].append(np.nan if subset is None else subset[colname].max())

## assign() returns a new frame, so no values are written into a slice of an earlier frame
df = df.assign(**max_vals)
df

NameError: name 'df' is not defined

In [1]:
## Season used to subset the impact dates: 'DJF', 'cool-season',
## or anything else (e.g. 'all') to keep every row.
ssn = 'all'

## Alternative input (disabled): load the pre-computed tables and rescale
# fname = path_to_out + 'box_whisker_2000-2019.csv'
# df1 = pd.read_csv(fname)
# fname = path_to_out + 'box_whisker_2020-2024.csv'
# df2 = pd.read_csv(fname)
# df = pd.concat([df1, df2])
# df['IVT'] = df['IVT']*100
# df['Z0'] = df['Z0']*100
# df['UV'] = df['UV']*100

## (start month, end month) passed to select_months for each named season
season_months = {'DJF': (12, 2), 'cool-season': (9, 2)}

## index by impact date for the month selection, then drop that index again
df = df.set_index(pd.to_datetime(df['impact_date']))
if ssn in season_months:
    df = select_months(df, *season_months[ssn])
df = df.reset_index(drop=True)

NameError: name 'df' is not defined

In [None]:
## Print summary statistics of the variables separately for each lead time
F_lst = np.arange(24, 168+24, 24)  # leads of 24 h to 168 h in 24 h steps
varlst = ['IVT', 'Z0', 'UV', 'QPF', 'AR_index']
non_stat_cols = ["impact_date", "init_date", "F"]
for F in F_lst:
    print('F = {0}'.format(F))
    at_lead = df.loc[df.F == F].drop(columns=non_stat_cols)
    print(at_lead.describe())

In [None]:
# Box-and-whisker figure: one panel per variable, one box per lead time.
fname = path_to_figs + 'box_whisker_{0}'.format(ssn)
fmt = 'png'

fig = plt.figure(figsize=(8.5, 11))
fig.dpi = 300

## 3 x 2 grid of equally sized panels; the last cell (row 2, col 1) stays empty
nrows, ncols = 3, 2
gs = GridSpec(nrows, ncols, height_ratios=[1, 1, 1], width_ratios=[1, 1], wspace=0.2, hspace=0.2)

## (variable, box colour, grid row, grid column) for each panel
panels = [('IVT',      '#54B36D', 0, 0),
          ('Z0',       '#FC5F3F', 0, 1),
          ('UV',       '#9956AD', 1, 0),
          ('QPF',      '#2171b5', 1, 1),
          ('AR_index', '#DF65B0', 2, 0)]
lead_order = np.arange(24, 8*24, 24)  # 24 h ... 168 h

for varname, box_color, grid_row, grid_col in panels:
    ax = fig.add_subplot(gs[grid_row, grid_col])

    ## black outlines everywhere, coloured box face, 'x' markers for fliers
    box_style = dict(boxprops=dict(facecolor=box_color, edgecolor='k'),
                     medianprops=dict(color='k'),
                     whiskerprops=dict(color='k'),
                     capprops=dict(color='k'),
                     flierprops=dict(marker='x'),
                     meanprops=dict(c='k', lw=1),
                     bootstrap=5000)

    ## whis=[0, 100] extends the whiskers to the minimum and maximum
    bplot = sns.boxplot(data=df, x="F", y=varname,
                        order=lead_order,
                        whis=[0, 100],
                        meanline=True, showmeans=True,
                        notch=False, showcaps=True,
                        linewidth=0.75,
                        ax=ax,
                        **box_style)

    ## y-range, major ticks and label depend on the kind of variable
    if varname == 'AR_index':
        ax.set_ylim(-1., 6)
        bplot.set_yticks(np.arange(0., 6, 1))
        ylabel = 'AR Impact Index'
    else:
        ax.set_ylim(-1., 101)
        bplot.set_yticks(np.arange(0., 101, 10))
        ylabel = '{0} percentile rank (xth)'.format(varname)

    ax.minorticks_on()
    ax.tick_params(axis='y', which='minor', bottom=True)
    ax.tick_params(axis='y', which='major')

    ax.set_ylabel(ylabel)
    ax.set_xlabel('Lead (hours)')

fig.savefig('%s.%s' %(fname, fmt), bbox_inches='tight', dpi=fig.dpi, transparent=True)

# Show
plt.show()

In [None]:
# If a notched box plot has this odd "flipped" appearance, it simply means that the
# confidence interval of the median extends beyond the box: its lower bound falls below
# the 1st quartile and/or its upper bound rises above the 3rd quartile.
# Although it looks ugly, it's actually useful information about the (un)confidence of the median.

In [None]:
def calc_fraction_percentiles(df, varname, F, perc_rank):
    """Percentage of rows at lead time ``F`` whose ``varname`` equals ``perc_rank``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``'F'`` column and a ``varname`` column.
    varname : str
        Name of the column to test.
    F : number
        Lead-time value; rows are selected where ``df['F'] == F``.
    perc_rank : number
        Value that ``df[varname]`` must equal exactly to be counted.

    Returns
    -------
    float
        ``100 * (matching rows) / (rows at lead F)``. The denominator counts
        every row at lead ``F``, including rows where ``varname`` is NaN.
        NaN is returned when there are no rows at lead ``F`` (instead of
        raising ``ZeroDivisionError``).
    """
    at_lead = df['F'] == F
    denom = int(at_lead.sum())  # how many rows with this lead time
    if denom == 0:
        # nothing to normalise by; report "no data" rather than dividing by zero
        return float('nan')

    # how many rows with this lead time that have the requested rank value
    numer = int((at_lead & (df[varname] == perc_rank)).sum())

    return (numer/denom)*100
## Tabulate, for every variable / rank value / lead time, the percentage of
## rows at that lead whose value equals the rank value.
varlst = ['IVT', 'Z0', 'UV', 'AR_index']
F_lst = np.arange(24, 168+24, 24)

## rank values checked for the AR index and for the other variables
ar_index_ranks = [0, 1, 2, 3, 4, 5]
percentile_ranks = [0., 75., 90., 94., 95., 96., 97., 98., 99., 100.]

## one (variable, rank, lead, fraction) tuple per combination,
## ordered by variable, then rank, then lead
data_lst = [
    (varname, perc_rank, F, calc_fraction_percentiles(df, varname, F, perc_rank))
    for varname in varlst
    for perc_rank in (ar_index_ranks if varname == 'AR_index' else percentile_ranks)
    for F in F_lst
]

# create DataFrame using data
df2 = pd.DataFrame(data_lst, columns =['Variable', 'Percentile Rank', 'F', 'Fraction'])
df2

In [None]:
# Heatmap figure: one panel per variable showing the 'Fraction' values of df2
# by rank value (rows) and lead time F (columns).
fname = path_to_figs + 'heatmaps_{0}'.format(ssn)
fmt = 'png'

fig = plt.figure(figsize=(10, 10))
fig.dpi = 300

## 2 x 2 grid of equally sized panels
nrows, ncols = 2, 2
gs = GridSpec(nrows, ncols, height_ratios=[1, 1], width_ratios=[1, 1], wspace=0.3, hspace=0.2)

################
### HEATMAPS ###
################
## (variable, panel title, grid row, grid column)
panels = [('IVT',      '(a) IVT',             0, 0),
          ('Z0',       '(b) Freezing Level',  0, 1),
          ('UV',       '(c) 1000-hPa wind',   1, 0),
          ('AR_index', '(d) AR Impact Index', 1, 1)]

## y tick labels, one per row of the pivoted table
ar_index_labels = [0, 1, 2, 3, 4, 5]
percentile_labels = ['< 75th', '75th', '90th', '94th', '95th', '96th', '97th', '98th', '99th', 'MAX']

for varname, title, grid_row, grid_col in panels:
    ax = fig.add_subplot(gs[grid_row, grid_col])

    ## rows = rank value, columns = lead time, cells = fraction
    is_var = df2['Variable'] == varname
    tmp = df2.loc[is_var].pivot(index="Percentile Rank", columns="F", values="Fraction")

    tck_lbly = ar_index_labels if varname == 'AR_index' else percentile_labels
    g = sns.heatmap(tmp, annot=True, fmt=".0f", cmap="crest", cbar=False, yticklabels=tck_lbly, ax=ax)
    g.invert_yaxis()  # flip the y axis so the first table row is drawn at the bottom

    ax.set_title(title, loc='left')

fig.savefig('%s.%s' %(fname, fmt), bbox_inches='tight', dpi=fig.dpi, transparent=True)

# Show
plt.show()