# EOF Analysis of AR days

* Multivariate EOF analysis of H, U and V

In [1]:
# Import Python modules
import os, sys
from pathlib import Path
import numpy as np
import numpy.ma as ma
import pandas as  pd
import xarray as xr
from sklearn.cluster import KMeans
# matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import AxesGrid
from matplotlib.colors import ListedColormap
from matplotlib import rcParams
import matplotlib.ticker as mticker
# cartopy
import cartopy.crs as ccrs
from cartopy.mpl.geoaxes import GeoAxes
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import cartopy.feature as cfeature
# plot styles/formatting
import seaborn as sns
import cmocean.cm as cmo
import cmocean

from IPython.display import Image, display

# Path to modules
sys.path.append('../modules')

# Import my modules
from plotter import draw_basemap
from timeseries import persistence
from eofs import *
from ar_funcs import preprocess_ar_area_subregions
from kmeans import *

In [2]:
# Set up paths

path_to_data = '/home/nash/DATA/data/'                            # project data -- read only
path_to_out  = '/home/nash/DATA/repositories/AR_types/out/'       # output files (numerical results, intermediate datafiles) -- read & write
path_to_figs = '/home/nash/DATA/repositories/AR_types/figs/cEOF_hma/'      # figures


In [3]:
# Set a default font for all matplotlib text (can only set this ONCE; must restart kernel to change it)

rcParams['font.family'] = 'sans-serif'   # set the default font family to 'sans-serif'
rcParams['font.sans-serif'] = 'Arial'    # set the default sans-serif font to 'Arial'

## Data

### AR time series

In [None]:
## Select MERRA2 or ERA5
reanalysis = 'era5'
if reanalysis == 'era5':
    start_date = '1979-01-01'
    end_date = '2018-12-31'
    filename = 'ar_catalog_ERAI_fraction_HASIAsubregions.nc'
## if MERRA2
else:
    start_date = '1980-01-01'
    end_date = '2017-12-31'
    filename = 'ar_catalog_fraction_HASIAsubregions.nc'
    
f1 = path_to_data + 'CH1_generated_data/' + filename
ds = xr.open_dataset(f1)
# Set dates
ds = ds.sel(time=slice(start_date, end_date))
## Preprocess AR subregions - get dataframe of AR days based on area threshold
df = preprocess_ar_area_subregions(df=ds.to_dataframe(), thres=0.3)
# Show table
df.head()

### MERRA2 reanalysis

In [5]:
## Set variable names (for saving data/figs)
var_names = 'HUV500'
eofmode = 't' # s or t
dispmat = 'cor' # dispersion matrix type correlation/covariance

# Select lat/lon grid 
# HASIA Domain
lonmin = 0
lonmax = 120
latmin = 0
latmax =  50


# for figure names for testing different configurations
fname_id = var_names + eofmode + str(lonmin) + str(lonmax) + str(latmin) + str(latmax) + dispmat

### MERRA2 DATA ###
def preprocess(ds):
    '''keep only selected lats and lons'''
    return ds.sel(lat=slice(latmin, latmax), lon=slice(lonmin, lonmax))

# open H data
filepath_pattern = path_to_data + 'MERRA2/anomalies/H500/daily_*.nc'

ds_h = xr.open_mfdataset(filepath_pattern, preprocess=preprocess, concat_dim='time', combine='by_coords')
print('ds size in GB {:0.2f}\n'.format(ds_h.nbytes / 1e9))

## open UV data
filepath_pattern = path_to_data + 'MERRA2/anomalies/UV500/daily_*.nc'
ds_uv = xr.open_mfdataset(filepath_pattern, preprocess=preprocess, combine='by_coords')
print('ds size in GB {:0.2f}\n'.format(ds_uv.nbytes / 1e9))

## combine H and UV data into 1 ds object
merra = xr.merge([ds_h, ds_uv.U, ds_uv.V])
# merra

ds size in GB 2.16

ds size in GB 4.33



In [6]:
# Add AR time series to merra; set as coordinate variables
merra['ar'] = ('time', df.ar)
merra = merra.set_coords('ar')

merra['location'] = ('time', df.location)
merra = merra.set_coords('location')

# print dataset
print(merra)

<xarray.Dataset>
Dimensions:    (lat: 101, lon: 193, time: 13880)
Coordinates:
    lev        float64 500.0
  * lon        (lon) float64 0.0 0.625 1.25 1.875 ... 118.1 118.8 119.4 120.0
  * lat        (lat) float64 0.0 0.5 1.0 1.5 2.0 ... 48.0 48.5 49.0 49.5 50.0
  * time       (time) datetime64[ns] 1980-01-01T09:00:00 ... 2017-12-31T09:00:00
    dayofyear  (time) int64 dask.array<chunksize=(366,), meta=np.ndarray>
    ar         (time) int64 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0
    location   (time) object nan nan nan nan nan nan ... nan nan nan nan nan nan
Data variables:
    H          (time, lat, lon) float64 dask.array<chunksize=(366, 101, 193), meta=np.ndarray>
    U          (time, lat, lon) float64 dask.array<chunksize=(366, 101, 193), meta=np.ndarray>
    V          (time, lat, lon) float64 dask.array<chunksize=(366, 101, 193), meta=np.ndarray>


### Data Subset Selection

In [7]:
# Trim date range
start_date = '1980-12-01'
end_date = '2017-02-28'
idx = slice(start_date, end_date)
merra = merra.sel(time=idx)

# Select DJF months
idx = (merra.time.dt.month >= 12) | (merra.time.dt.month <= 2)
merra = merra.sel(time=idx)

# # Select AR days JUST IN R01
# idx = (merra.ar >= 1) & (merra.location == 'R01')
# Select AR days in all subregions
idx = (merra.ar >= 1)
merra_ar = merra.sel(time=idx)

# print results
print(merra_ar)

<xarray.Dataset>
Dimensions:    (lat: 101, lon: 193, time: 374)
Coordinates:
    lev        float64 500.0
  * lon        (lon) float64 0.0 0.625 1.25 1.875 ... 118.1 118.8 119.4 120.0
  * lat        (lat) float64 0.0 0.5 1.0 1.5 2.0 ... 48.0 48.5 49.0 49.5 50.0
  * time       (time) datetime64[ns] 1980-12-12T09:00:00 ... 2017-02-22T09:00:00
    dayofyear  (time) int64 dask.array<chunksize=(4,), meta=np.ndarray>
    ar         (time) int64 1 1 1 1 1 1 1 1 1 1 1 1 ... 1 1 1 1 1 1 1 1 1 1 1 1
    location   (time) object 'R01' 'R01' 'R02' 'R02' ... 'R01' 'R01' 'R03' 'R03'
Data variables:
    H          (time, lat, lon) float64 dask.array<chunksize=(4, 101, 193), meta=np.ndarray>
    U          (time, lat, lon) float64 dask.array<chunksize=(4, 101, 193), meta=np.ndarray>
    V          (time, lat, lon) float64 dask.array<chunksize=(4, 101, 193), meta=np.ndarray>


In [8]:
# Count number of independent AR events

years = np.arange(1980, 2018) 
nyrs = len(years)
total_events = 0
for k in range(nyrs-1):    
    # Extract single DJF season
    date1 = "{}-12-01".format(years[k])
    date2 = "{}-02-28".format(years[k+1])
    x = merra.ar.sel(time=slice(date1,date2)).values
    # Count AR events in that season
    tags, tmp = persistence(x)
    # Add to running event count
    total_events += tmp

print("Number of independent AR events: ", total_events)

Number of independent AR events:  215


## Preprocessing

### Reshape, center, and standardize data matrix

In [9]:
%%time
# Load merra_ar dataset into memory
merra_ar = merra_ar.load()


CPU times: user 3.23 s, sys: 2min 21s, total: 2min 24s
Wall time: 3min 47s


In [35]:
## Create list of variable arrays
# Extratropic variables
var1 = merra_ar.U
var2 = merra_ar.V
var3 = merra_ar.H

var_list = [var1, var2, var3]

# Check that sizes of arrays match
for i, in_array in enumerate(var_list):
    # Extract variable as numpy array
    var1 = in_array.values
    print(var1.shape)


(374, 101, 193)
(374, 101, 193)
(374, 101, 193)


In [36]:
%%time
# Weight the data by the square root of the cosine of the lat
var_list = spatial_weights(var_list)

## Flatten data 
var_list = flatten_array(var_list)

## Remove nans
## outputs two lists of arrays
## one without nans removed, and one with nans removed
var_list, var_list_nan = remove_nans(var_list)

print('Size of array with missing data removed: ', var_list_nan[0].shape)
print('Size of array without missing data removed: ', var_list[0].shape)

Nans removed success is  True
Nans removed success is  True
Nans removed success is  True
Size of array with missing data removed:  (374, 19490)
Size of array without missing data removed:  (374, 19493)
CPU times: user 659 ms, sys: 452 ms, total: 1.11 s
Wall time: 1.1 s


In [39]:
print('Size of array with missing data removed: ', var_list_nan[1].shape)
print('Size of array without missing data removed: ', var_list[1].shape)
print('Size of array with missing data removed: ', var_list_nan[2].shape)
print('Size of array without missing data removed: ', var_list[2].shape)

Size of array with missing data removed:  (374, 19490)
Size of array without missing data removed:  (374, 19493)
Size of array with missing data removed:  (374, 19493)
Size of array without missing data removed:  (374, 19493)


In [27]:
nvar = 3
ntim, npts = var_list[0].shape

tmp1 = var_list[0]
tmp2 = var_list[1]
tmp3 = var_list[2]

# Transpose arrays to get [space x time]
X1 = tmp1.T
X2 = tmp2.T
X3 = tmp3.T

# Center and standardize by columns
x1mean = np.mean(X1, axis=0)
x1std = np.std(X1, axis=0)
X1s = (X1-x1mean) / x1std

x2mean = np.mean(X2, axis=0)
x2std = np.std(X2, axis=0)
X2s = (X2-x2mean) / x2std

x3mean = np.mean(X3, axis=0)
x3std = np.std(X3, axis=0)
X3s = (X3-x3mean) / x3std

# Combine variables into single data matrix Xs
Xs = np.empty((nvar*npts,ntim))
Xs[0*npts:npts+0,:] = X1s
Xs[1*npts:npts*(1+1),:] = X2s
Xs[2*npts:(2+1)*npts,:] = X3s
print(Xs.shape)

# Check that column means=0 and std dev=1
test = np.mean(np.mean(Xs, axis=0))
print("Column means: ", np.round(test,2))
test = np.mean(np.std(Xs, axis=0))
print("Column std: ", np.round(test,2))

(58479, 374)
Column means:  nan
Column std:  nan


In [32]:
nvar = 3
ntim, npts = var_list[0].shape

tmp1 = var_list[0]
tmp2 = var_list[1]
tmp3 = var_list[2]

# Transpose arrays to get [space x time]
X1 = tmp1.T
X2 = tmp2.T
X3 = tmp3.T

# Combine variables into single data matrix Xs
Xs = np.empty((nvar*npts,ntim))
Xs[0*npts:npts+0,:] = X1
Xs[1*npts:npts*(1+1),:] = X2
Xs[2*npts:(2+1)*npts,:] = X3
print(Xs.shape)

# Center and standardize by columns
x1mean = np.mean(Xs, axis=0)
x1std = np.std(Xs, axis=0)
X1s = (Xs-x1mean) / x1std

# Check that column means=0 and std dev=1
test = np.mean(np.mean(Xs, axis=0))
print("Column means: ", np.round(test,2))
test = np.mean(np.std(Xs, axis=0))
print("Column std: ", np.round(test,2))

(58479, 374)
Column means:  nan
Column std:  nan


In [37]:
def standardize_arrays_new(arr_list, mode='t', dispersion_matrix='cor'):
    '''standardize variables then put in single flattened array
     
     Parameters
     ----------
     arr_list : list
        list of variable arrays
     
     mode : str
         mode of EOF - t or s
     
     dispersion_matrix : str
         type of dispersion matrix - cor or cov
         
     Returns
     -------
     X : single data matrix with all variables stacked
        arrays are standardized by the mode and dispersion matrix types
     
     ''' 
    print('EOF mode: ', mode)
    print('Dispersion Matrix: ', dispersion_matrix)
    nvar = len(arr_list)
    ntim, npts = arr_list[0].shape
    
    # empty flat array to put variables in
    if mode == 't':
        Xs = np.empty((nvar*npts,ntim))
    else: # mode is s
        Xs = np.empty((ntim, nvar*npts))
    
    for i, var1 in enumerate(arr_list):
        # if t-mode
        if mode == 't':
            # transpose to [space x time]
            X1 = var1.T
            # Standardize by columns and remove column mean for ALL variables
            x1mean = np.mean(X1, axis=0)
            x1std = np.std(X1, axis=0)
            if dispersion_matrix == 'cor':
                # Standardize by columns (if correlation)
                # remove column mean
                X = (X1 - x1mean) / x1std
            else: ## dispersion matrix == cov (covariance)
                # remove column mean
                X = (X1 - x1mean)
            
            # Combine variables into single data matrix Xs
            Xs[i*npts:(i+1)*npts,:] = X
        
        # if s-mode
        else:
            # keep array as [time x space]
            X1 = var1
            # Standardize by columns and remove column mean for ALL variables
            x1mean = np.mean(X1, axis=0)
            x1std = np.std(X1, axis=0)
            if dispersion_matrix == 'cor':
                # Standardize by columns (if correlation)
                # remove column mean
                X = (X1 - x1mean) / x1std
            else: ## dispersion matrix == cov (covariance)
                # remove column mean
                X = (X1 - x1mean)
        
            # Combine variables into single data matrix Xs
            Xs[:, i*npts:(i+1)*npts] = X
                       
    print(Xs.shape)

    # Check that column means=0 and std dev=1
    test = np.mean(np.mean(Xs, axis=0))
    print("Column means: ", np.round(test,2))
    test = np.mean(np.std(Xs, axis=0))
    print("Column std: ", np.round(test,2))
    
    return Xs

In [38]:
Xs = standardize_arrays_new(var_list_nan, mode=eofmode, dispersion_matrix='cov')

# Xs = standardize_arrays(var_list, mode=eofmode, dispersion_matrix=dispmat)

EOF mode:  t
Dispersion Matrix:  cov


ValueError: could not broadcast input array from shape (19493,374) into shape (19490,374)

## EOF Analysis

In [29]:
%%time

# Compute eigenvalues & eigenvectors
R, evals, evecs = calc_eofs(z=Xs, mode=eofmode)

print('Eigenvalues: ', evals.shape)
print(evals, '\n')

print('Eigenvectors: ', evecs.shape)
print(np.round(evecs, 3), '\n')

LinAlgError: Array must not contain infs or NaNs

In [None]:
plt.contourf(R)
plt.colorbar()
title = 'Dispersion Matrix (' + dispmat + ')'
plt.title(title)

### Explained Variance

In [None]:
# Calculate the percent explained var by each eigenvector
pctvar = pct_variance(evals)

# Number of EOFs that explain more than 1% of the total variance
idx = pctvar[pctvar >= 1.0]
neofs = len(idx)

# print exp var >= 1.0
cumvar = np.sum(pctvar[0:neofs-1])
print(f'Cumulative variance explained by the first {neofs} EOFs:')
print(f'{cumvar:.2f}% \n')

# print exp var: neofs = 4
cumvar = np.sum(pctvar[0:3])
print(f'Cumulative variance explained by the first 4 EOFs:')
print(f'{cumvar:.2f}% \n')

# print exp var for 4 eofs
for k in range(4):
    print(f'{k+1} \t {pctvar[k]:.2f}%')

### North Test

In [None]:
err = north_test(evals, total_events)
upper = pctvar + err
lower = pctvar - err

print(np.round(upper[0:6],3))
print(np.round(pctvar[0:6],3))
print(np.round(lower[0:6],3))

### Fig 2: Variance

In [None]:
# set seaborn style
sns.set()
sns.set_style("ticks", {'patch.force_edgecolor':False})

# create figure
fig, ax = plt.subplots(figsize=(6,4))

# plot data
xvals = np.arange(neofs) + 1
ax.bar(xvals, pctvar[0:neofs], yerr=err[0:neofs], 
       color='tab:blue', alpha=0.8)

# x-axis
ax.set_xlabel('EOF')
ax.set_xticks(xvals)

# y-axis
ax.set_ylabel('Explained Variance (%)')
yticks = np.arange(0,16,3)
ax.set_yticks(yticks)
ax.set_yticklabels(yticks) 

# save fig
filepath = path_to_figs + 'exp_variance_' + fname_id + '.png'
plt.savefig(filepath, dpi=300)

# show
plt.show()

### Loadings

In [None]:
neofs = 19
loads = loadings(evals, evecs, neofs)

print(loads.shape)
print(np.round(loads,3))

### Save EOFs

In [None]:
# Save eigenvalues, eigenvectors, and loadings

neofs = 4   # number of EOFs to save (evecs, loadings3)

outfile = path_to_out + 'eigenvalues_'+ fname_id + '.txt'
np.savetxt(outfile, evals, fmt='%.5f')

outfile = path_to_out + 'eigenvectors_'+ fname_id + '.txt'
np.savetxt(outfile, evecs[:,0:neofs], fmt='%.5f', delimiter=',')

outfile = path_to_out + 'loadings_'+ fname_id + '.txt'
np.savetxt(outfile, loads[:,0:neofs], fmt='%.4f', delimiter=',')


### PCs

In [None]:
# Calculate principal components (spatial modes)
neofs = 19
pcs = calc_pcs(Xs, evecs, neofs, mode=eofmode)

In [None]:
# Split pcs into separate arrays for each variable
ntim, nlat, nlon = var2.shape
npts = nlat*nlon
nvar = len(var_list)
# Reshape spatial dim back to 2D map
pcmodes = var_list
for i in np.arange(len(var_list)):
    tmp = pcs[:,i*npts:(i+1)*npts]
    pcmodes[i] = np.reshape(tmp, (neofs,nlat,nlon))

### Fig 3: Spatial Modes

In [None]:
# Panel Plot of Spatial Modes

# number of eofs to plot
neofs = 4

# Data for plotting extratropics
lons = var2.lon.data
lats = var2.lat.data
udat = pcmodes[0][0:neofs,:,:]
vdat = pcmodes[1][0:neofs,:,:]
data = pcmodes[2][0:neofs,:,:]

print(np.nanmin(data), np.nanmax(data))

# Set up projection
mapcrs = ccrs.PlateCarree()
datacrs = ccrs.PlateCarree()

# Set tick/grid locations
dx = np.arange(lonmin,lonmax+20,20)
dy = np.arange(latmin,latmax+20,20)

# subtitles
eof_label = [ ]
pc_label = [ ]
var_label = [ ]
for k in range(neofs):
    eof_label.append("EOF{:1d}".format(k+1,))
    pc_label.append("PC{:1d}".format(k+1,))
    var_label.append("{:.2f}%".format(pctvar[k]))

In [None]:
# Create figure
fig = plt.figure(figsize=(10,11))
filepath = path_to_figs + 'eofs_'+ fname_id + '.png'
nrows = 2
ncols = 2

# Set up Axes Grid
axes_class = (GeoAxes,dict(map_projection=mapcrs))
axgr = AxesGrid(fig, 
                111, 
                axes_class=axes_class,
                nrows_ncols=(nrows, ncols), 
                axes_pad = 0.55,
                cbar_location='bottom', 
                cbar_mode='single',
                cbar_pad=0.0, 
                cbar_size='2.5%',
                label_mode='')

#newcmap = cmocean.tools.crop_by_percent(cmo.matter, 15, which='max', N=None)

# Loop for drawing each plot
for k, ax in enumerate(axgr):
    ax = draw_basemap(ax, extent=[lonmin,lonmax,latmin,latmax], xticks=dx, yticks=dy)
#     ax = draw_basemap(ax, extent=None, xticks=dx, yticks=dy)
    
    # Add contour fill plot for extratropics
    clevs = np.arange(-55,60,5)
    cf = ax.contourf(lons, lats, data[k,:,:], transform=datacrs,
                     levels=clevs,
                     cmap="bwr", extend='both')
    # add vectors for extratropics
    ax.quiver(lons, lats, udat[k,:,:], vdat[k,:,:], transform=datacrs,
              color='black', pivot='middle', regrid_shape=20) 
    
    # subtitles
    ax.set_title(eof_label[k], loc='left', fontsize=12)
    ax.set_title(var_label[k], loc='right', fontsize=12)
    
# single colorbar
cb = fig.colorbar(cf, axgr.cbar_axes[0], orientation='horizontal', drawedges=True)
cb.set_label('m', fontsize=11)
cb.ax.tick_params(labelsize=10)
    
# Display figure
plt.savefig(filepath, dpi=200, bbox_inches='tight')
plt.show()

In [None]:
fig = plt.figure(figsize=(9,11))
fig.dpi = 200
fname = path_to_figs + 'pc_'+ fname_id
fmt = 'png'
results = evecs[:,:4]

X, nplots = results.shape
x = np.arange(len(results))

for i in np.arange(nplots):
    ax = plt.subplot(4, 1, i+1)
    ax.plot(x, results[:,i], '-')
    ax.axhline(0, color='k')
    ax.set_ylim(-0.2, 0.2)
    ax.set_ylabel('Normalized Units')
    # subtitles
    ax.set_title(pc_label[i], loc='left', fontsize=12)
    ax.set_title(var_label[i], loc='right', fontsize=12)

plt.subplots_adjust(hspace=0.35, wspace=0.003)

fig.savefig('%s.%s' %(fname, fmt), bbox_inches='tight', dpi=fig.dpi)
fig.clf()


plotFile = fname + '.png'
print(plotFile)
display(Image(plotFile))

### K-means clustering

In [None]:
# Determine optimal K

# maximum number of clusters (number of iterations)
kmax =15
# number of eofs
neofs = 4
# input data
xdata = loads[:,0:neofs]

# Elbow plot
outfile = path_to_figs + 'elbow'+ fname_id
plot_optimal_k(xdata, kmax, filename=outfile)


In [None]:
# Count number of days in each cluster
klabels, counts = np.unique(ar_cat, return_counts=True)

# Save counts to txt file
res = np.column_stack((klabels,counts))
headstr = 'AR_TYPE, COUNT'
outfile = path_to_out + fname_id + 'k_counts.txt'
np.savetxt(outfile, res, delimiter=',', fmt='%d', header=headstr)



In [None]:
# Cluster centroids (nclust x neofs)
centroids = kmeans.cluster_centers_

# Save centroids to txt file
res = np.column_stack((klabels,centroids))
headstr = "AR_TYPE, EOF1, EOF2, EOF3, EOF4"
outfile = path_to_out + fname_id + 'centroids.txt'
np.savetxt(outfile, res, delimiter=',', fmt='%s', header=headstr)


### Save AR Category Labels

In [None]:
## Save AR location, loadings (EOF1-4), and category label (AR days only)

# Vector of AR dates
dates_arDays = era_ar.time.values

# Create new dataframe
data = {'LOC':era_ar.location.values,
        'EOF1':loads[:,0],
        'EOF2':loads[:,1],
        'EOF3':loads[:,2],
        'EOF4':loads[:,3],
        'AR_CAT':ar_cat}
df_out = pd.DataFrame(data, index=dates_arDays)
print(df_out)

# Export dataframe as csv
outfile = path_to_out + fname_id + 'hma_AR-types-loadings.csv'
df_out.to_csv(outfile)


In [None]:
## Save time series of all DJF days with AR types

# Arrays with ALL DJF days
dates_allDays = era.time.values
ar_cat_allDays = np.zeros(len(dates_allDays), dtype=int)

# Loop over ar days and match to ar_full 
for i, date in enumerate(dates_arDays):
    idx = np.where(dates_allDays == date)
    ar_cat_allDays[idx] = ar_cat[i]  

# Create dataframe
data = {'AR_CAT':ar_cat_allDays}
df_out = pd.DataFrame(data, index=dates_allDays)
print(df_out)

outfile = path_to_out + fname_id + 'hma_AR-types-djf.csv'
df_out.to_csv(outfile)
