# EOF Analysis of AR days

* Multivariate EOF analysis - testing different configurations to see which cEOF captures the most varaince of precipitation related to AR events

  1. cEOF 250 hPa H, U, V in extratropics, 850 hPa Q, U, V in tropics
  2. cEOF 500 hPa H, U, V in extratropics, 500 hPa Q, U, V in tropics
  3. cEOF 500 hPa H, U, V in extratropics, 500 hPa H, U, V in tropics
  4. cEOF 500 hPa H, U, V in full domain [0-120 E, 0-50N]
  5. cEOF IVTu and IVTv in full domain
  6. cEOF 500 hPa H, U, V and 850 hPa Q in full domain?


In [1]:
# Import Python modules
import os, sys
import yaml
from pathlib import Path
import numpy as np
import numpy.ma as ma
import pandas as  pd
import xarray as xr
from sklearn.cluster import KMeans
from itertools import cycle
# matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import AxesGrid
from matplotlib.colors import ListedColormap
from matplotlib import rcParams
import matplotlib.ticker as mticker
# cartopy
import cartopy.crs as ccrs
from cartopy.mpl.geoaxes import GeoAxes
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import cartopy.feature as cfeature
# plot styles/formatting
import seaborn as sns
import cmocean.cm as cmo
import cmocean

from IPython.display import Image, display
from scipy import stats

# Path to modules
sys.path.append('../modules')

# Import my modules
from plotter import draw_basemap
from timeseries import persistence, select_months
from eofs import *
from ar_funcs import preprocess_ar_area_subregions
from kmeans import *
import nclcmaps as ncl

In [2]:
# Set up paths

path_to_data = '/home/nash/DATA/data/'                            # project data -- read only
path_to_out  = '/home/nash/DATA/repositories/AR_types/out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '/home/nash/DATA/repositories/AR_types/figs/test/k3/'      # figures


In [3]:
# Set a default font for all matplotlib text (can only set this ONCE; must restart kernel to change it)

rcParams['font.family'] = 'sans-serif'   # set the default font family to 'sans-serif'
rcParams['font.sans-serif'] = 'Arial'    # set the default sans-serif font to 'Arial'

## Data

### AR time series

In [4]:
## Select MERRA2 or ERA5
reanalysis = 'era5'

if reanalysis == 'era5':
    start_date = '1979-01-01'
    end_date = '2018-12-31'
    filename = 'ar_catalog_v3_ERAI_fraction_HASIAsubregions.nc'
## if MERRA2
else:
    start_date = '1980-01-01'
    end_date = '2017-12-31'
    filename = 'ar_catalog_fraction_HASIAsubregions.nc'
    
f1 = path_to_data + 'CH1_generated_data/' + filename
ds = xr.open_dataset(f1)
# Set dates
ds = ds.sel(time=slice(start_date, end_date))
## Preprocess AR subregions - get dataframe of AR days based on area threshold
df = preprocess_ar_area_subregions(df=ds.to_dataframe(), thres=0.3)
# Show table
df.head()

Unnamed: 0_level_0,R01,R02,R03,track_id,ar,location
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1979-01-01,0.0,0.0,0.0,,0,
1979-01-02,0.0,0.0,0.0,,0,
1979-01-03,0.005102,0.0,0.0,14.0,0,
1979-01-04,0.0,0.0,0.0,,0,
1979-01-05,0.0,0.0,0.0,,0,


### Configuration Dictionary

In [5]:
# import configuration file for season dictionary choice
yaml_doc = '../data/config.yml'
config1 = yaml.load(open(yaml_doc), Loader=yaml.SafeLoader)
ssn = 'djf'
ar_dict = config1[ssn]

# import configuration file for ceof dictionary choice
## test number - NOTE as of 7/1/2020 sticking with test 4
yaml_doc = '../data/ceof_config.yml'
config2 = yaml.load(open(yaml_doc), Loader=yaml.SafeLoader)
config_dict = config2['test4']


### Import reanalysis

#### ERA5

In [6]:
## Set variable names (for saving data/figs)
testname = config_dict['name']
eofmode = 't' # s or t
dispmat = 'cor' # dispersion matrix type correlation/covariance
pathvar = config_dict['pathvar']

# Select lat/lon grid 
# Tropics/Extratropics Domain
lonmin = config_dict['latlon'][0]
lonmax = config_dict['latlon'][1]
latmin = config_dict['latlon'][2]
latmax = config_dict['latlon'][3]

lev = config_dict['levs']

# for figure names for testing different configurations
fname_id = config_dict['fnameID'] + eofmode + str(lonmin) + str(lonmax) + str(latmin) + str(latmax) + dispmat
print(fname_id)

if pathvar == 'huvq':
    def preprocess(ds):
        '''keep only selected lats and lons'''
        return ds.sel(latitude=slice(latmax, latmin), longitude=slice(lonmin, lonmax), level=lev)
if pathvar == 'ivt':
        def preprocess(ds):
            '''keep only selected lats and lons'''
            return ds.sel(latitude=slice(latmax, latmin), longitude=slice(lonmin, lonmax))

# open anomaly data
filepath_pattern = path_to_data + 'ERA5/{0}/anomalies/daily_filtered_anomalies_{0}_*.nc'.format(pathvar)
    
    
f2 = xr.open_mfdataset(filepath_pattern, preprocess=preprocess, combine='by_coords')

f2

HUV500t0120050cor


Unnamed: 0,Array,Chunk
Bytes,116.88 kB,2.93 kB
Shape,"(14610,)","(366,)"
Count,120 Tasks,40 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 116.88 kB 2.93 kB Shape (14610,) (366,) Count 120 Tasks 40 Chunks Type int64 numpy.ndarray",14610  1,

Unnamed: 0,Array,Chunk
Bytes,116.88 kB,2.93 kB
Shape,"(14610,)","(366,)"
Count,120 Tasks,40 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.84 GB 71.27 MB Shape (14610, 1, 101, 241) (366, 1, 101, 241) Count 200 Tasks 40 Chunks Type float64 numpy.ndarray",14610  1  241  101  1,

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.84 GB 71.27 MB Shape (14610, 1, 101, 241) (366, 1, 101, 241) Count 200 Tasks 40 Chunks Type float64 numpy.ndarray",14610  1  241  101  1,

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.84 GB 71.27 MB Shape (14610, 1, 101, 241) (366, 1, 101, 241) Count 200 Tasks 40 Chunks Type float64 numpy.ndarray",14610  1  241  101  1,

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 2.84 GB 71.27 MB Shape (14610, 1, 101, 241) (366, 1, 101, 241) Count 200 Tasks 40 Chunks Type float64 numpy.ndarray",14610  1  241  101  1,

Unnamed: 0,Array,Chunk
Bytes,2.84 GB,71.27 MB
Shape,"(14610, 1, 101, 241)","(366, 1, 101, 241)"
Count,200 Tasks,40 Chunks
Type,float64,numpy.ndarray


In [7]:
if pathvar == 'huvq':

    # Create new dataset to rename lat lon
    ds = xr.Dataset({'H': (['time', 'level', 'lat', 'lon'], f2['z'].values),
                     'U': (['time', 'level',  'lat', 'lon'], f2['u'].values),
                     'V': (['time', 'level',  'lat', 'lon'], f2['v'].values),
                     'QV': (['time', 'level',  'lat', 'lon'], f2['q'].values)},
                          coords={'time': (['time'], f2.time.values),
                                  'level': (['level'], f2.level.values),
                                  'lat': (['lat'], f2.latitude.values),
                                  'lon': (['lon'], f2.longitude.values)})

if pathvar == 'ivt':
    ds = xr.Dataset({'ivte': (['time', 'lat', 'lon'], f2['p71.162'].values),
                     'ivtn': (['time', 'lat', 'lon'], f2['p72.162'].values)},
                          coords={'time': (['time'], f2.time.values),
                                  'lat': (['lat'], f2.latitude.values),
                                  'lon': (['lon'], f2.longitude.values)})


ds
print('ds size in GB {:0.2f}\n'.format(ds.nbytes / 1e9))

ds size in GB 11.38



In [8]:
# Add AR time series to merra; set as coordinate variables
ds['ar'] = ('time', df.ar)
ds = ds.set_coords('ar')

ds['location'] = ('time', df.location)
ds = ds.set_coords('location')

# print dataset
print(ds)

<xarray.Dataset>
Dimensions:   (lat: 101, level: 1, lon: 241, time: 14610)
Coordinates:
  * time      (time) datetime64[ns] 1979-01-01T09:00:00 ... 2018-12-31T09:00:00
  * level     (level) float64 500.0
  * lat       (lat) float32 50.0 49.5 49.0 48.5 48.0 ... 2.0 1.5 1.0 0.5 0.0
  * lon       (lon) float32 0.0 0.5 1.0 1.5 2.0 ... 118.5 119.0 119.5 120.0
    ar        (time) int64 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0
    location  (time) object nan nan nan nan nan nan ... nan nan nan nan nan nan
Data variables:
    H         (time, level, lat, lon) float64 -2.615e+03 -2.655e+03 ... 222.9
    U         (time, level, lat, lon) float64 1.489 1.765 1.789 ... 5.374 5.342
    V         (time, level, lat, lon) float64 -9.859 -9.301 ... -1.137 -1.631
    QV        (time, level, lat, lon) float64 -0.0004586 -0.000463 ... 0.0003623


### Data Subset Selection

In [9]:
idx = slice(ar_dict['start_date'], ar_dict['end_date'])
ds = ds.sel(time=idx)

# Select months
if ar_dict['mon_s'] > ar_dict['mon_e']:
    idx = (ds.time.dt.month >= ar_dict['mon_s']) | (ds.time.dt.month <= ar_dict['mon_e'])
else:
    idx = (ds.time.dt.month >= ar_dict['mon_s']) & (ds.time.dt.month <= ar_dict['mon_e'])

ds = ds.sel(time=idx)

# # Select AR days JUST IN R01
# idx = (ds.ar >= 1) & (ds.location == 'R01')
# Select AR days in all subregions
idx = (ds.ar >= 1)
ds_ar = ds.sel(time=idx)
# ds_ar = ds

# print results
print(ds_ar)

<xarray.Dataset>
Dimensions:   (lat: 101, level: 1, lon: 241, time: 355)
Coordinates:
  * time      (time) datetime64[ns] 1979-12-01T09:00:00 ... 2018-02-26T09:00:00
  * level     (level) float64 500.0
  * lat       (lat) float32 50.0 49.5 49.0 48.5 48.0 ... 2.0 1.5 1.0 0.5 0.0
  * lon       (lon) float32 0.0 0.5 1.0 1.5 2.0 ... 118.5 119.0 119.5 120.0
    ar        (time) int64 1 1 1 1 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1 1 1
    location  (time) object 'R03' 'R01' 'R01' 'R01' ... 'R03' 'R02' 'R01' 'R01'
Data variables:
    H         (time, level, lat, lon) float64 1.439e+03 1.405e+03 ... 0.2929
    U         (time, level, lat, lon) float64 12.56 13.75 14.45 ... 1.248 0.8793
    V         (time, level, lat, lon) float64 -5.224 -5.615 ... -0.2258 0.272
    QV        (time, level, lat, lon) float64 -0.0002335 ... -0.0002357


In [10]:
# Count number of independent AR events and their duration in days
x = ds.ar
event_id, total_events, duration = persistence(x)
print('Total number of AR days in season: ', x.sum())
print('Total number of independent AR events: ', total_events)

Total number of AR days in season:  <xarray.DataArray 'ar' ()>
array(355)
Total number of independent AR events:  194


## Preprocessing

### Reshape, center, and standardize data matrix

In [11]:
%%time
# Load merra_ar dataset into memory
ds_ar = ds_ar.load()


CPU times: user 43 µs, sys: 227 µs, total: 270 µs
Wall time: 300 µs


In [12]:
## Create list of variable arrays
    
var_list = []
for i in range(len(config_dict['varlist'])):
    var1 = config_dict['varlist'][i]
    domain = config_dict['domain'][i]
    domain_bnds = config2['eof_domain'][domain]
    lonslice = slice(domain_bnds[0], domain_bnds[1])
    latslice = slice(domain_bnds[3], domain_bnds[2])
    if pathvar == 'huvq':
        lev = config_dict['varlev'][i]
        var_list.append(ds_ar[var1].sel(lon=lonslice, lat=latslice, level=lev))
    if pathvar == 'ivt':
        var_list.append(ds_ar[var1].sel(lon=lonslice, lat=latslice))

# Check that sizes of arrays match
for i, in_array in enumerate(var_list):
    # Extract variable as numpy array
    var1 = in_array.values
    print(var1.shape)


(355, 101, 241)
(355, 101, 241)
(355, 101, 241)


In [13]:
%%time
# Weight the data by the square root of the cosine of the lat
var_list = spatial_weights(var_list)

## Flatten data to [time x space]
var_list = flatten_array(var_list)

## Center data
var_list = center_data(var_list)

## Standardize Arrays by removing the mean and dividing by the standard deviation of the columns
## For multivariate, place into single flattened array
Xs = standardize_arrays(var_list, mode=eofmode, dispersion_matrix=dispmat)


EOF mode:  t
Dispersion Matrix:  cor
(73023, 355)
Column means:  -0.0
Column std:  1.0
CPU times: user 1.75 s, sys: 944 ms, total: 2.69 s
Wall time: 2.67 s


## EOF Analysis

In [14]:
%%time

# Compute eigenvalues & eigenvectors
R, evals, evecs = calc_eigs(z=Xs, mode=eofmode)

print('Eigenvalues: ', evals.shape)
print(evals, '\n')

print('Eigenvectors: ', evecs.shape)
print(np.round(evecs, 3), '\n')

# save eigenvalues and eigenvectors
outfile = path_to_out + 'eigenvalues_'+ fname_id + ssn + '.txt'
np.savetxt(outfile, evals, fmt='%.5f')

outfile = path_to_out + 'eigenvectors_'+ fname_id + ssn + '.txt'
np.savetxt(outfile, evecs, fmt='%.5f', delimiter=',')

Eigenvalues:  (355,)
[5.76026992e+01 3.73756339e+01 2.63657457e+01 2.35681889e+01
 1.82084667e+01 1.71870915e+01 1.40157281e+01 1.20958335e+01
 1.09201135e+01 1.04492924e+01 9.07836626e+00 7.59117463e+00
 6.58855992e+00 6.14252212e+00 6.22236968e+00 4.83134872e+00
 4.27963130e+00 4.00545618e+00 3.77448554e+00 3.43413575e+00
 3.31255460e+00 2.84872316e+00 2.55352602e+00 2.45213981e+00
 2.27035157e+00 1.98712693e+00 1.91705808e+00 1.74802388e+00
 1.65507619e+00 1.62851938e+00 1.53565514e+00 1.41451719e+00
 1.35766917e+00 1.22395482e+00 1.19784734e+00 1.16459986e+00
 1.09956140e+00 1.04638261e+00 9.94160937e-01 9.90020776e-01
 9.53236067e-01 9.28225830e-01 8.08643534e-01 8.00131217e-01
 7.79518688e-01 7.39165863e-01 7.22529901e-01 6.85612674e-01
 6.45805237e-01 6.33207074e-01 6.18349777e-01 6.07800091e-01
 5.84849859e-01 5.60731515e-01 5.41861431e-01 5.35915142e-01
 4.89550687e-01 4.82019835e-01 4.64119179e-01 4.53461993e-01
 4.44192309e-01 4.34305583e-01 4.27119386e-01 4.18952405e-01
 4.

### Explained Variance

In [15]:
# Calculate the percent explained var by each eigenvector
pctvar = pct_variance(evals)

# Number of EOFs that explain more than 1% of the total variance
idx = pctvar[pctvar >= 1.0]
neofs = len(idx)

# print exp var >= 1.0
cumvar = np.sum(pctvar[0:neofs-1])
print(f'Cumulative variance explained by the first {neofs} EOFs:')
print(f'{cumvar:.2f}% \n')

# print exp var: neofs = 4
cumvar = np.sum(pctvar[0:3])
print(f'Cumulative variance explained by the first 4 EOFs:')
print(f'{cumvar:.2f}% \n')

# print exp var for 4 eofs
for k in range(4):
    print(f'{k+1} \t {pctvar[k]:.2f}%')

Cumulative variance explained by the first 19 EOFs:
77.89% 

Cumulative variance explained by the first 4 EOFs:
34.18% 

1 	 16.23%
2 	 10.53%
3 	 7.43%
4 	 6.64%


### North Test

In [16]:
err = north_test(evals, total_events)
upper = pctvar + err
lower = pctvar - err

print(np.round(upper[0:6],3))
print(np.round(pctvar[0:6],3))
print(np.round(lower[0:6],3))

[17.873 11.597  8.181  7.313  5.65   5.333]
[16.226 10.528  7.427  6.639  5.129  4.841]
[14.578  9.459  6.673  5.965  4.608  4.35 ]


### EOFs and PCs

In [17]:
# choose neofs and npcs based on North Test
# to save, plot, etc.
neofs = 2
npcs = neofs

# Calculate EOFS (spatial modes)
eofs = calc_eofs(Xs, evecs, evals, neofs, mode=eofmode)

# Split eofs into separate arrays for each variable
ntim, nlat, nlon = var1.shape
npts = nlat*nlon
nvar = len(var_list)
# Reshape spatial dim back to 2D map
eofmodes = var_list
for i in np.arange(len(var_list)):
    tmp = eofs[:,i*npts:(i+1)*npts]
    eofmodes[i] = np.reshape(tmp, (neofs,nlat,nlon))
    
# Calculate PCs (time coefficients)
pcs = calc_pcs(Xs, evecs, evals, npcs, mode=eofmode)
# results in [ntim, npcs] to plot in PC plot

## loadings*
## in the case of t-mode these are our "pcs" or time-coefficients
## in the case of s-mode, these are our "eofs" or spatial loadings
loads = pcs

# Save loadings
outfile = path_to_out + 'loadings_'+ fname_id + ssn + '.txt'
np.savetxt(outfile, loads[:,0:neofs], fmt='%.4f', delimiter=',')

### Spatial Modes

In [None]:
# Data for plotting 
lons = ds_ar.lon.data
lats = ds_ar.lat.data
udat = eofmodes[1][0:neofs,:,:]
vdat = eofmodes[2][0:neofs,:,:]
data = eofmodes[0][0:neofs,:,:]

print(np.nanmin(data), np.nanmax(data))

# Set up projection
mapcrs = ccrs.PlateCarree()
datacrs = ccrs.PlateCarree()

# Set tick/grid locations
dx = np.arange(lonmin,lonmax+20,20)
dy = np.arange(latmin,latmax+20,20)

# subtitles
eof_label = [ ]
pc_label = [ ]
var_label = [ ]
for k in range(neofs):
    eof_label.append("EOF{:1d}".format(k+1,))
    pc_label.append("PC{:1d}".format(k+1,))
    var_label.append("{:.2f}%".format(pctvar[k]))

In [None]:
# Create figure
fig = plt.figure(figsize=(10,11))
filepath = path_to_figs + testname + ssn + '_spatial' + '.png'
nrows = neofs
ncols = 1

# Set up Axes Grid
axes_class = (GeoAxes,dict(map_projection=mapcrs))
axgr = AxesGrid(fig, 
                111, 
                axes_class=axes_class,
                nrows_ncols=(nrows, ncols), 
                axes_pad = 0.55,
                cbar_location='bottom', 
                cbar_mode='single',
                cbar_pad=0.0, 
                cbar_size='5%',
                label_mode='')

# Loop for drawing each plot
for k, ax in enumerate(axgr):
    ax = draw_basemap(ax, extent=[lonmin,lonmax,latmin,latmax], xticks=dx, yticks=dy)
    
    # Contour Filled
    clevs = np.arange(-30,31,5)
    cf = ax.contourf(lons, lats, data[k,:,:], transform=datacrs,
                     levels=clevs,
                     cmap="bwr", extend='both')
    # Wind barbs / vectors
    ax.quiver(lons, lats, udat[k,:,:], vdat[k,:,:], transform=datacrs,
              color='black', pivot='middle', regrid_shape=20) 
    
    # subtitles
    ax.set_title(eof_label[k], loc='left', fontsize=12)
    ax.set_title(var_label[k], loc='right', fontsize=12)
    
# single colorbar
cb = fig.colorbar(cf, axgr.cbar_axes[0], orientation='horizontal', drawedges=True)
cb.set_label('m', fontsize=11)
cb.ax.tick_params(labelsize=10)
    
# Display figure
plt.savefig(filepath, dpi=200, bbox_inches='tight')
plt.show()

### K-means clustering

In [None]:
# Determine optimal K

# maximum number of clusters (number of iterations)
kmax =15
# input data
xdata = loads[:,0:neofs]

# Elbow plot
outfile = path_to_figs + testname + ssn + '_elbow'
plot_optimal_k(xdata, kmax, create_plot=True, filename=outfile)


In [None]:
# determine optimal k by examining the kde of the eofs
# make a dict of the first n eofs with column labels for df
values = []
for i in range(neofs):
    values.append(loads[:,i])
    
keys = eof_label
dicts = dict(zip(keys, values))
# print(dicts)

# Create new dataframe
dates_allDays = ds_ar.time.values
dates_arDays = ds_ar.time.values
df_out = pd.DataFrame(dicts, index=dates_arDays)

In [None]:
# sns.kdeplot(df_test.EOF1, df_test.EOF2)

g = sns.PairGrid(df_out, diag_sharey=True, corner=True)
# g.map_upper(sns.kdeplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw = 3)
g = g.add_legend(fontsize=14)

filepath = path_to_figs + fname_id + ssn + 'neof_' + str(neofs) + '_hist_kde'+ '.png'
g.savefig(filepath)

In [None]:
## K-means cluster analysis

# Number of clusters
nk = 4

# Input data
xdata = loads[:,0:neofs]

# Compute k means and assign each point to a cluster
kmeans = KMeans(n_clusters=nk)
kmeans.fit(xdata)
cluster = kmeans.predict(xdata)

# LLJ category labels (llj days only)
ar_cat = cluster + 1


In [None]:
# Count number of days in each cluster
klabels, counts = np.unique(ar_cat, return_counts=True)

# Save counts to txt file
res = np.column_stack((klabels,counts))
headstr = 'AR_TYPE, COUNT'
outfile = path_to_out + fname_id + ssn + 'k_counts.txt'
print(outfile)
np.savetxt(outfile, res, delimiter=',', fmt='%d', header=headstr)



In [None]:
# Cluster centroids (nclust x neofs)
centroids = kmeans.cluster_centers_

# Save centroids to txt file
res = np.column_stack((klabels,centroids))
keys[:0] = ['AR_TYPE']
headstr = ', '.join(keys)
# headstr = "AR_TYPE, EOF1, EOF2, EOF3, EOF4"
outfile = path_to_out + fname_id + ssn + 'centroids.txt'
print(outfile)
np.savetxt(outfile, res, delimiter=',', fmt='%s', header=headstr)


In [None]:
## Save AR location, loadings (EOF1-n), and category label (AR days only)
df_out['LOC'] = ds_ar.location.values
df_out['AR_CAT'] = ar_cat

# Export dataframe as csv
outfile = path_to_out + fname_id + ssn + 'neof_' + str(neofs) + '_nk' + str(nk) + '_hma_AR-types-loadings.csv'
df_out.to_csv(outfile)
print(outfile)

In [None]:
g = sns.PairGrid(df_out,  hue="AR_CAT", diag_sharey=False)
g.map_upper(sns.kdeplot)
g.map_lower(sns.kdeplot)
g.map_diag(sns.kdeplot, lw = 3, legend = True)
g = g.add_legend(fontsize=14)

filepath = path_to_figs + fname_id + ssn + 'neof_' + str(neofs) + '_hist_kde_arcat'+ '.png'
g.savefig(filepath)

In [None]:
## Save time series of all DJF days with AR types

# Arrays with ALL DJF days
dates_allDays = ds.time.values
ar_cat_allDays = np.zeros(len(dates_allDays), dtype=int)

# Loop over ar days and match to ar_full 
for i, date in enumerate(dates_arDays):
    idx = np.where(dates_allDays == date)
    ar_cat_allDays[idx] = ar_cat[i]

# Create dataframe
data = {'AR_CAT':ar_cat_allDays}
df_out = pd.DataFrame(data, index=dates_allDays)
print(df_out)

outfile = path_to_out + fname_id + 'hma_AR-types-' + ssn + 'neof_' + str(neofs) + '_nk' + str(nk) + '.csv'
df_out.to_csv(outfile)


### Plot Histogram of Difference in days between Types

In [None]:
## Separate the AR Types into multiple columns
# create a new df that breaks up the AR_CAT col so each category has its own column
keys = []
for k in range(nk):
    keys.append("AR_CAT{:1d}".format(k+1,))

values = np.zeros((len(dates_arDays)))
dicts = dict(zip(keys, values))

df_cat = pd.DataFrame(dicts, index=dates_arDays)

for k in range(nk):
    idx = (df_out['AR_CAT'] == k+1)
    col = "AR_CAT{:1d}".format(k+1,)
    df_cat.loc[idx, col] = 1

df_cat

In [None]:
def nearest(items, pivot):
    '''Find the nearest date in a list compared to a single date'''
    nearest=min(items, key=lambda x: abs(x - pivot))
    timedelta = nearest-pivot
    return timedelta.days

In [None]:
# List of dates that match each AR type conditions
x = []
ns = []
for k in range(nk):
    col = "AR_CAT{:1d}".format(k+1,)
    idx = (df_cat[col] > 0)
    mask = df_cat.index[idx]
    x.append(mask)
    ns.append(len(mask))
    
print(len(x[0]), len(x))

In [None]:
# calculate the number of days between Type 1 and the other types
near = []
data = []
for i in range(nk-1):
    for j in range(len(x[0])):
        t1 = x[0][j]
        t2 = pd.to_datetime(x[i+1])
        near.append(nearest(items=t2, pivot=t1))
    s = pd.Series(np.asarray(near))
    data.append(s)
        

In [None]:
# # List of dates that match AR1 conditions
# idx = (df_cat.AR_CAT1 > 0)
# x = df_cat.index[idx]

# # list of dates that match AR2 conditions
# idx = (df_cat.AR_CAT2 > 0)
# y = df_cat.index[idx]

# # # list of dates that match AR3 conditions
# # idx = (df_cat.AR_CAT3 > 0)
# # y2 = df_cat.index[idx]

# ns = [len(x), len(y), len(y2)]
# timedel = []
# timedel2 = []
# timedel3 = []
# for i in range(len(x)):
# #     t1 = pd.to_datetime(x.iloc[i])
#     t1 = x[i]
#     t2 = pd.to_datetime(y)
# #     t3 = pd.to_datetime(y2)
#     timedel.append(nearest(items=t2, pivot=t1))
# #     timedel2.append(nearest(items=t3, pivot=t1))
# # for i in range(len(y)):
# #     timedel3.append(nearest(items=t3, pivot=y[i]))

# # plot histogram
# s = pd.Series(np.asarray(timedel))
# # s2 = pd.Series(np.asarray(timedel2))
# # s3 = pd.Series(np.asarray(timedel3))

# # # truncate to a reasonable range
# # s = s[(s > -50) & (s < 50)]
# # s2 = s2[(s2 > -50) & (s2 < 50)]
# # s3 = s3[(s3 > -50) & (s3 < 50)]

# # data = [s, s2, s3]
# data = [s]

In [None]:
# plot labels with each number of AR days in each cluster
plt_labels = []
for k in range(nk-1):
    plt_labels.append("Type {0} (n={1}) and Type {2} (n={3})".format(k+1, ns[k], k+2, ns[k+1]))
    
print(plt_labels)

In [None]:
nrows = 1
ncols = nk-1
nplots = nrows*ncols
# Create figure
fig = plt.figure(figsize=(16,3))
for i in np.arange(nplots):
    ax = plt.subplot(nrows, ncols, i+1)
    n, bins, patches = plt.hist(x=data[i], bins=np.arange(-48, 50, 2), color='#0504aa',
                                alpha=0.7, rwidth=0.85, weights=np.ones(len(data[i])) / ns[i])
#     print(len(data[i]))
#     print(ns[i])
    plt.gca().yaxis.set_major_formatter(mticker.PercentFormatter(xmax=1, decimals=None))
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Difference (days)')
    plt.xticks(np.arange(-48, 50, 8)) 
    plt.ylabel('Frequency (% events)')
    plt.title(plt_labels[i])
    plt.ylim([0, .20])

# Save figure
filepath = path_to_figs + testname + ssn + '_histogram_perc'+ 'neof_' + str(neofs) + '_nk' + str(nk) + '.png'
plt.savefig(filepath, dpi=150, bbox_inches='tight')