# EOF Analysis of SALLJ days

* Multivariate EOF analysis in T-mode
* K-means clustering

In [1]:
# Import Python modules
import os, sys
from pathlib import Path
import numpy as np
import pandas as  pd
import xarray as xr
from sklearn.cluster import KMeans
# matplotlib
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from mpl_toolkits.axes_grid1 import AxesGrid
from matplotlib.colors import ListedColormap
from matplotlib import rcParams
# cartopy
import cartopy.crs as ccrs
from cartopy.mpl.geoaxes import GeoAxes
# plot styles/formatting
import seaborn as sns
import cmocean.cm as cmo
import cmocean

In [None]:
# Set path to module directory
# NOTE(review): this is an absolute, user-specific path; the notebook only
# runs as-is on a machine where this directory exists.
sys.path.append('/home/sbarc/students/montini/repos/sallj-types/modules/')

# Import my modules
from plotter import draw_basemap
from timeseries import persistence
# Star import: the helpers called later without any other visible definition
# (calc_eofs, pct_variance, north_test, loadings, calc_pcs) presumably come
# from here -- TODO confirm and replace with explicit names.
from eofs import *
# NOTE(review): plot_optimal_k is called in the K-means section; if it lives
# in `kmeans` rather than `eofs`, the import below must be re-enabled.
#from kmeans import *


In [None]:
# Project directory layout, anchored at the current user's home directory

home = Path.home()
root = home.joinpath('repos', 'sallj-types')   # project root directory
path_to_data = root.joinpath('data')   # project data -- read only
path_to_out = root.joinpath('out')     # numerical results, intermediate datafiles -- read & write
path_to_figs = root.joinpath('figs')   # figures

# Sanity check: evaluates to True when the figures directory exists
path_to_figs.exists()

In [None]:
# Default font for all matplotlib text.
# (Original author's note: set this once per kernel session; restart the
# kernel to change it.)
rcParams.update({
    'font.family': 'sans-serif',   # default font family
    'font.sans-serif': 'Arial',    # default sans-serif font
})

## Data

### SALLJ time series

In [None]:
# Read the daily SALLJ time series (CSV) into a pandas DataFrame
filepath = path_to_data / 'erai.llj.day.1979-2016.csv'
df = pd.read_csv(filepath)

# Index the rows by calendar day. The assignment raises if the CSV does not
# contain exactly one row per day of 1979-01-01 .. 2016-12-31.
df['dates'] = pd.date_range(start='1979-01-01', end='2016-12-31', freq='D')
df = df.set_index('dates')   # no inplace mutation, so the cell is safe to re-run

# Column of LLJ days (no LLJ day eq 0; LLJ day eq 1): a day counts as an LLJ
# day when either the `llj_sc` or the `llj_ma` column is positive.
df['llj'] = 0
idx = (df['llj_sc'] > 0) | (df['llj_ma'] > 0)
df.loc[idx, 'llj'] = 1

# Column of LLJ locations ('SC', 'MA', 'SC/MA', nan).
# Create it with object dtype up front: filling a float NaN column with
# strings relies on implicit dtype upcasting, which newer pandas deprecates.
df['location'] = pd.Series(np.nan, index=df.index, dtype=object)

idx = (df['llj_sc'] == 1) & (df['llj_ma'] == 0)
df.loc[idx, 'location'] = 'SC'

idx = (df['llj_sc'] == 0) & (df['llj_ma'] == 1)
df.loc[idx, 'location'] = 'MA'

idx = (df['llj_sc'] == 1) & (df['llj_ma'] == 1)
df.loc[idx, 'location'] = 'SC/MA'

# Show table
df.head()

### ERA5 reanalysis

In [None]:
# Open the two IVT component files as xarray datasets
ivt_files = [
    path_to_data/'era5.sam.05dg.ivte.1979-2016.nc',
    path_to_data/'era5.sam.05dg.ivtn.1979-2016.nc',
]
f1, f2 = (xr.open_dataset(ncfile) for ncfile in ivt_files)

# Combine both variables in a single dataset
era = xr.merge([f1, f2])

# Attach the LLJ flag and LLJ location along `time` and promote each to a
# coordinate variable. The values are attached by position, so `df` must
# have one row per time step of the dataset.
for name in ('llj', 'location'):
    era[name] = ('time', df[name])
    era = era.set_coords(name)

# Summary of the merged dataset
print(era)

### Data Subset Selection

In [None]:
# Restrict to the span between the first and last NDJFM season
start_date, end_date = '1979-11-01', '2016-03-31'
era = era.sel(time=slice(start_date, end_date))

# Keep only November-March (NDJFM) time steps
idx = era.time.dt.month.isin([11, 12, 1, 2, 3])
era = era.sel(time=idx)

# Spatial window; the latitude slice runs from latmax down to latmin
lonmin, lonmax = -83, -32
latmin, latmax = -47, 10
era = era.sel(longitude=slice(lonmin, lonmax),
              latitude=slice(latmax, latmin))

# Subset containing LLJ days only
idx = (era.llj >= 1)
era_llj = era.sel(time=idx)

# Summary of the LLJ-day subset
print(era_llj)

In [None]:
# Tally independent LLJ events, one NDJFM season at a time

years = np.arange(1979, 2017)
nyrs = len(years)
total_events = 0
for yr_start, yr_end in zip(years[:-1], years[1:]):
    # One season runs from 1 Nov of yr_start to 31 Mar of yr_end
    season = slice(f"{yr_start}-11-01", f"{yr_end}-03-31")
    x = era.llj.sel(time=season).values
    # persistence() returns a pair; only the second item (used here as the
    # season's event count) is needed
    tags, tmp = persistence(x)
    total_events += tmp

print("Number of independent LLJ events: ", total_events)

### Climatology and Anomalies

In [None]:
# Time mean over the LLJ-day subset (LLJ days in NDJFM)
era_llj_clim = era_llj.mean(dim='time')

# Anomalies of each IVT component relative to that mean,
# stored as `ivte_anom` and `ivtn_anom`
for comp in ('ivte', 'ivtn'):
    era_llj[f'{comp}_anom'] = era_llj[comp] - era_llj_clim[comp]

## Preprocessing

### Reshape, center, and standardize data matrix

In [None]:
# Pull the two anomaly fields out of xarray as plain numpy arrays
var1 = era_llj.ivte_anom.values
var2 = era_llj.ivtn_anom.values

# Array dimensions (leading axis is treated as time, then lat, then lon)
ntim, nlat, nlon = var1.shape[:3]
npts = nlat * nlon
nvar = 2

# Collapse the two spatial axes into one: [time x space]
tmp1 = var1.reshape(ntim, npts)
tmp2 = var2.reshape(ntim, npts)

# Flip to [space x time] so that each column is one time step
X1 = tmp1.T
X2 = tmp2.T

# Standardize every column (zero mean, unit standard deviation)
x1mean = X1.mean(axis=0)
x1std = X1.std(axis=0)
X1s = (X1 - x1mean) / x1std

x2mean = X2.mean(axis=0)
x2std = X2.std(axis=0)
X2s = (X2 - x2mean) / x2std

# Stack both variables along the space axis into one float64 matrix Xs
Xs = np.vstack((X1s, X2s)).astype(np.float64)
print(Xs.shape)

# Sanity check: average column mean should be 0 and average column std 1
for label, stat in (("Column means: ", np.mean), ("Column std: ", np.std)):
    test = np.mean(stat(Xs, axis=0))
    print(label, np.round(test, 2))

## EOF Analysis

In [None]:
# Eigen-decomposition of the standardized data matrix
evals, evecs = calc_eofs(Xs)

# Report the shape and the values of both results
# (eigenvectors are rounded to 3 decimals for display)
summaries = (
    ('Eigenvalues: ', evals, evals),
    ('Eigenvectors: ', evecs, np.round(evecs, 3)),
)
for title, arr, shown in summaries:
    print(title, arr.shape)
    print(shown, '\n')

### Explained Variance

In [None]:
# Percent of the total variance explained by each eigenvector
pctvar = pct_variance(evals)

# Number of EOFs that each explain at least 1% of the total variance
neofs = int(np.count_nonzero(pctvar >= 1.0))

# Cumulative variance of the first `neofs` EOFs.
# The slice must end at `neofs` (not `neofs-1`), otherwise the last EOF
# named in the message is left out of the sum.
# NOTE(review): assumes pctvar is sorted in descending order, so the EOFs
# above the 1% threshold are the leading ones -- confirm in pct_variance.
cumvar = np.sum(pctvar[0:neofs])
print(f'Cumulative variance explained by the first {neofs} EOFs:')
print(f'{cumvar:.2f}% \n')

# Cumulative variance of the first 4 EOFs (slice 0:4, not 0:3)
nlead = 4
cumvar = np.sum(pctvar[0:nlead])
print(f'Cumulative variance explained by the first {nlead} EOFs:')
print(f'{cumvar:.2f}% \n')

# Variance explained by each of the first 4 EOFs
for k in range(nlead):
    print(f'{k+1} \t {pctvar[k]:.2f}%')

### North Test

In [None]:
# Error estimate from north_test, using the number of independent LLJ
# events as its second argument
err = north_test(evals, total_events)
upper, lower = pctvar + err, pctvar - err

# Upper bound, explained variance, and lower bound for the first 6 EOFs
for bound in (upper, pctvar, lower):
    print(np.round(bound[0:6], 3))

### Fig 2: Variance

In [None]:
# Seaborn look: default theme, then the "ticks" style without forced patch edges
sns.set()
sns.set_style("ticks", {'patch.force_edgecolor':False})

# Bar chart of explained variance with error bars, one bar per EOF
fig, ax = plt.subplots(figsize=(6,4))
xvals = np.arange(1, neofs + 1)
ax.bar(xvals, pctvar[:neofs], yerr=err[:neofs],
       color='tab:blue', alpha=0.8)

# x-axis: one tick per EOF
ax.set_xlabel('EOF')
ax.set_xticks(xvals)

# y-axis: ticks every 3% from 0 to 15
yticks = np.arange(0, 16, 3)
ax.set_ylabel('Explained Variance (%)')
ax.set_yticks(yticks)
ax.set_yticklabels(yticks)

# Write the figure to the figures directory, then display it
filepath = path_to_figs / 'fig2.png'
fig.savefig(filepath, dpi=300)
plt.show()

### Loadings

In [None]:
# Loadings for the leading 19 EOFs
# NOTE(review): 19 is hard-coded here and overrides the `neofs` computed in
# the explained-variance cell -- confirm the two are meant to agree.
neofs = 19
loads = loadings(evals, evecs, neofs)

# Shape, then the values rounded to 3 decimals
print(loads.shape, np.round(loads, 3), sep='\n')

### Save EOFs

In [None]:
# Write eigenvalues, eigenvectors, and loadings to text files

neofs = 4   # number of EOFs to save (eigenvectors and loadings)

# (file name, array, np.savetxt options); all eigenvalues are written,
# but only the first `neofs` columns of the eigenvectors and loadings
outputs = (
    ('eigenvalues.txt', evals, {'fmt': '%.5f'}),
    ('eigenvectors.txt', evecs[:, 0:neofs], {'fmt': '%.5f', 'delimiter': ','}),
    ('loadings.txt', loads[:, 0:neofs], {'fmt': '%.4f', 'delimiter': ','}),
)
for fname, arr, opts in outputs:
    outfile = path_to_out / fname
    np.savetxt(outfile, arr, **opts)


### PCs

In [None]:
# Calculate principal components (spatial modes)
# The third argument (19) is presumably the number of modes to compute; the
# following cell reshapes `pcs` to (19, nlat, nlon) per variable, so the two
# hard-coded values must stay in sync -- TODO confirm against calc_pcs.
pcs = calc_pcs(Xs, evecs, 19)

In [None]:
# Columns [0, npts) of `pcs` belong to the first variable,
# columns [npts, end) to the second
tmp1, tmp2 = pcs[:, :npts], pcs[:, npts:]

# Unflatten the spatial axis: one (nlat, nlon) map per mode and variable
neofs = 19
map_shape = (neofs, nlat, nlon)
pcmodes_var1 = tmp1.reshape(map_shape)
pcmodes_var2 = tmp2.reshape(map_shape)

### Fig 3: Spatial Modes

In [None]:
# Inputs for the panel plot of spatial modes

# number of eofs to plot
neofs = 4

# Grid coordinates, the two vector components of the first `neofs` modes,
# and the vector magnitude
lons = era_llj.longitude.data
lats = era_llj.latitude.data
udat = pcmodes_var1[:neofs]
vdat = pcmodes_var2[:neofs]
data = np.sqrt(udat*udat + vdat*vdat)

# Map projection and data coordinate system (both plate carree)
mapcrs = ccrs.PlateCarree()
datacrs = ccrs.PlateCarree()

# Tick/grid locations every 20 degrees
dx = np.arange(-80, lonmax, 20)
dy = np.arange(-40, latmax, 20)

# Panel subtitles: EOF number (left title) and explained variance (right title)
eof_label = [f"EOF{k+1:1d}" for k in range(neofs)]
var_label = [f"{pctvar[k]:.2f}%" for k in range(neofs)]

In [None]:
# 2 x 2 panel figure of the spatial modes with one shared colorbar
fig = plt.figure(figsize=(10,11))
nrows, ncols = 2, 2

sns.set_style('ticks')

# Grid of cartopy GeoAxes with a single colorbar axis along the bottom
axes_class = (GeoAxes, dict(map_projection=mapcrs))
axgr = AxesGrid(fig, 111, axes_class=axes_class,
                nrows_ncols=(nrows, ncols), axes_pad=0.55,
                cbar_location='bottom', cbar_mode='single',
                cbar_pad=0.0, cbar_size='2.5%', label_mode='')

# Settings shared by every panel
clevs = np.arange(0, 71, 5)                     # contour levels
map_extent = [lonmin, lonmax, latmin, latmax]   # map bounds

for k, ax in enumerate(axgr):
    ax = draw_basemap(ax, extent=map_extent, xticks=dx, yticks=dy)

    # Filled contours of the vector magnitude
    cf = ax.contourf(lons, lats, data[k], transform=datacrs,
                     levels=clevs, cmap="Blues")
    # Vectors on top of the contours
    ax.quiver(lons, lats, udat[k], vdat[k], transform=datacrs,
              color='black', pivot='middle', regrid_shape=20)
    # EOF number on the left, explained variance on the right
    ax.set_title(eof_label[k], loc='left', fontsize=12)
    ax.set_title(var_label[k], loc='right', fontsize=12)

# Shared colorbar, built from the last panel's contour set
cb = fig.colorbar(cf, cax=axgr.cbar_axes[0], orientation='horizontal',
                  drawedges=True)
cb.set_label('kg m$^{-1}$ s$^{-1}$', fontsize=11)
cb.ax.tick_params(labelsize=10)

# Save to the user's Desktop, then display
filepath = home/'Desktop' / 'eofs.png'
fig.savefig(filepath, dpi=200, bbox_inches='tight')
plt.show()

## K means clustering

In [None]:
# Elbow plot for choosing the number of clusters K

kmax = 15    # maximum number of clusters (number of iterations)
neofs = 4    # number of eofs used as clustering input

# Clustering input: loadings of the first `neofs` EOFs
xdata = loads[:, :neofs]

# Draw the elbow plot and write it to the user's Desktop
# NOTE(review): plot_optimal_k is not defined or explicitly imported in this
# notebook -- confirm which star-imported module provides it.
outfile = home.joinpath('Desktop', 'xfig1.png')
plot_optimal_k(xdata, kmax, filename=outfile)


In [None]:
## K-means cluster analysis

# Number of clusters
nk = 4

# Input data: loadings of the first `neofs` EOFs
# (`neofs` is the value set in the preceding elbow-plot cell)
xdata = loads[:, 0:neofs]

# Compute k means and assign each point to a cluster.
# random_state pins the centroid initialisation so that the cluster numbering
# written to the output files is identical on every run; n_init is set
# explicitly because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=nk, n_init=10, random_state=0)
kmeans.fit(xdata)
cluster = kmeans.predict(xdata)

# LLJ category labels (llj days only), shifted from 0-based to 1-based
llj_cat = cluster + 1


In [None]:
# Distinct cluster labels and how many days fall in each
klabels, counts = np.unique(llj_cat, return_counts=True)

# Two-column table (label, count) written as comma-separated integers
res = np.stack((klabels, counts), axis=1)
headstr = 'LLJ_TYPE, COUNT'
outfile = path_to_out.joinpath('k_counts.txt')
np.savetxt(outfile, res, fmt='%d', delimiter=',', header=headstr)


In [None]:
# Centroid coordinates of the fitted model (nclust x neofs)
centroids = kmeans.cluster_centers_

# Prepend the cluster label as the first column and write to a text file.
# `klabels` comes from the cluster-count cell and must have one entry per
# centroid row.
res = np.hstack((klabels[:, None], centroids))
headstr = "LLJ_TYPE, EOF1, EOF2, EOF3, EOF4"
outfile = path_to_out.joinpath('centroids.txt')
np.savetxt(outfile, res, fmt='%s', delimiter=',', header=headstr)


### Save LLJ category labels

In [None]:
## Save LLJ location, loadings (EOF1-4), and category label (LLJ days only)

# Dates of the LLJ days; used as the table index
dates_lljDays = era_llj.time.values

# Build the columns in output order: LOC, EOF1..EOF4, LLJ_CAT
data = {'LOC': era_llj.location.values}
data.update({f'EOF{j+1}': loads[:, j] for j in range(4)})
data['LLJ_CAT'] = llj_cat
df_out = pd.DataFrame(data, index=dates_lljDays)
print(df_out)

# Write the table to CSV
outfile = path_to_out.joinpath('sallj-types-loadings.csv')
df_out.to_csv(outfile)


In [None]:
## Save time series of all NDJFM days with SALLJ types

# Dates of ALL NDJFM days
dates_allDays = era.time.values

# Category for every NDJFM day: the cluster label on LLJ days, 0 otherwise.
# A single label-based reindex replaces the per-date np.where search, which
# scanned the full date array once for every LLJ day.
# NOTE: reindex needs unique LLJ dates and raises if any are duplicated.
llj_cat_allDays = (
    pd.Series(llj_cat, index=dates_lljDays)
    .reindex(dates_allDays, fill_value=0)
    .to_numpy(dtype=int)
)

# Create dataframe
data = {'LLJ_CAT': llj_cat_allDays}
df_out = pd.DataFrame(data, index=dates_allDays)
print(df_out)

# Export dataframe as csv
outfile = path_to_out / 'sallj-types-ndjfm.csv'
df_out.to_csv(outfile)
