<a href="https://colab.research.google.com/github/davidnoone/GEOPHYS_NOTEBOOKS/blob/main/ClimateVariability_partial_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Climate variability - solution

In [None]:
# Obtain the data files.
# Each line is a shell command: `test ! -f NAME` succeeds only when NAME is
# missing from the working directory, so wget runs (and writes to NAME via -O)
# only for files that have not been downloaded yet.
!test ! -f meiv2.data && wget -O meiv2.data http://kete.rangi.cloud.edu.au/u/dcn/meiv2.data
!test ! -f ERA5_monthly_surface.nc && wget -O ERA5_monthly_surface.nc http://kete.rangi.cloud.edu.au/u/dcn/ERA5_monthly_surface.nc
!test ! -f ERA5_monthly_4layer.nc && wget -O ERA5_monthly_4layer.nc http://kete.rangi.cloud.edu.au/u/dcn/ERA5_monthly_4layer.nc

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy import linalg
import matplotlib
import matplotlib.pyplot as plt
try:
  import netCDF4 as nc
except:
  !pip install netCDF4
  import netCDF4 as nc

from datetime import datetime, timedelta

from google.colab import files

# ncdump
`ncdump` is a small (Unix) command-line tool for showing the contents of a netCDF file.
Here is a Python function that does something similar.

In [None]:
# Example: list the contents of a netCDF file (a Python stand-in for "ncdump")
def ncdump(nc_fid, verb=True):
    '''
    Summarise a netCDF dataset: global attributes, dimensions and variables.

    The report is modelled on the output of the "ncdump" command-line utility.

    Parameters
    ----------
    nc_fid : netCDF4.Dataset
        An open dataset object; it must provide ncattrs(), getncattr(),
        and the `dimensions` and `variables` mappings.
    verb : Boolean
        When true, the summary is printed; when false nothing is printed and
        only the three collections are returned.

    Returns
    -------
    nc_attrs : list
        Names of the global attributes, as returned by nc_fid.ncattrs()
    nc_dims : list
        Names of the dimensions
    nc_vars : list
        Names of the variables
    '''
    def print_ncattr(key):
        """
        Print the dtype and every attribute of the variable called `key`.

        Parameters
        ----------
        key : unicode
            Name to look up in nc_fid.variables; a KeyError raised during the
            lookup (e.g. a dimension without a variable of the same name)
            produces a warning line instead.
        """
        try:
            entry = nc_fid.variables[key]
            print("\t\ttype:", repr(entry.dtype))
            for attr_name in entry.ncattrs():
                print('\t\t%s:' % attr_name, repr(entry.getncattr(attr_name)))
        except KeyError:
            print("\t\tWARNING: %s does not contain variable attributes" % key)

    # --- global attributes ---
    nc_attrs = nc_fid.ncattrs()
    if verb:
        print("NetCDF Global Attributes:")
        for attr_name in nc_attrs:
            print('\t%s:' % attr_name, repr(nc_fid.getncattr(attr_name)))

    # --- dimensions: name, length, and attributes of the matching variable ---
    nc_dims = list(nc_fid.dimensions)
    if verb:
        print("NetCDF dimension information:")
        for dim_name in nc_dims:
            print("\tName:", dim_name)
            print("\t\tsize:", len(nc_fid.dimensions[dim_name]))
            print_ncattr(dim_name)

    # --- variables: those sharing a name with a dimension were shown above ---
    nc_vars = list(nc_fid.variables)
    if verb:
        print("NetCDF variable information:")
        for var_name in nc_vars:
            if var_name in nc_dims:
                continue
            print('\tName:', var_name)
            print("\t\tdimensions:", nc_fid.variables[var_name].dimensions)
            print("\t\tsize:", nc_fid.variables[var_name].size)
            print_ncattr(var_name)

    return nc_attrs, nc_dims, nc_vars

# Open and read the data

In [None]:
# Open the ERA5 surface netCDF file and read the coordinate axes and the
# 't2m' field; every name assigned here is a notebook global used by later cells.
file_erap = 'ERA5_monthly_4layer.nc'   # defined here but not opened in this cell
file_eras = 'ERA5_monthly_surface.nc'


with nc.Dataset(file_eras,'r') as fid:
    # Uncomment to list the file contents with the ncdump helper defined above
    #ncdump(fid,verb=True)
    time = fid.variables['time'][:]
    lons = fid.variables['longitude'][:]
    lats = fid.variables['latitude'][:]
    data = fid.variables['t2m'][:,:,:]   # unpacked below as (time, lat, lon)

# do some masking: the first and last nn rows along the latitude axis are
# overwritten with zero (in place, so `data` no longer holds the raw values there)
nn =20
data[:,:nn,:] = 0
data[:,-nn:,:] = 0

# Sizes are taken from the data array itself rather than the coordinate vectors
#nlon = np.size(lons)
#nlat = np.size(lats)
ntime, nlat, nlon = np.shape(data)
print ('nlon:',nlon,'nlat:',nlat, 'ntime:',ntime)


# Example to read time from file:
#Set "zero" as per netcdf time units
# NOTE(review): the epoch (1900-01-01) and the unit (hours) are hard-coded here;
# confirm they match the 'units' attribute of the file's time variable.
dt_zero = datetime.strptime("1900-01-01 00:00:00","%Y-%m-%d %H:%M:%S")
year = []
for h in time:
    # each time value is treated as an hour offset from dt_zero
    dt = dt_zero + timedelta(hours=float(h))
    year.append(dt.year)   # integer calendar year only; one entry per time step
print('YEAR:',year)


In [None]:
# Open and read the MEI data file as a whitespace-separated table.
# No line of the file is used as a header (header=None); the column names are
# supplied explicitly: the year followed by the twelve months.
file_mei = 'meiv2.data'
columns = ['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 
                   'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Raw string: '\s' is not a valid escape in a normal string literal and
# triggers a SyntaxWarning on recent Python versions; the regex is unchanged.
df = pd.read_csv(file_mei, sep=r'\s+', header=None, names=columns)


Example function to perform PCA

In [None]:
def pca_eig(Z):
  """
      Principal component analysis by eigen-decomposition of a covariance matrix.
      Z(x,t) = sum_M a(t)e(x)

      Z MUST BE PASSED IN AS CENTRED (mean removed) and is read as a
      2-D array of shape (ntime, nspace).

      The decomposition is always done on the smaller of the two dimensions:
      when ntime < nspace the matrix is transposed first. Any rescaling
      between the two formulations is left to the calling code.

      Returns (eigenvalues are sorted largest first in both cases)
        ntime >= nspace:
            eigenvalues (nspace,),
            eigenvectors (nspace, nspace) with one mode per COLUMN,
            projection of Z onto the eigenvectors (ntime, nspace)
        ntime < nspace (transposed problem, outputs swapped and transposed):
            eigenvalues (ntime,),
            projection of the transposed Z (ntime, nspace) with one mode per ROW,
            eigenvectors (ntime, ntime) with one mode per ROW
  """
  n_rows, n_cols = np.shape(Z)

  # Work in whichever dimension gives the smaller covariance matrix
  swapped = n_rows < n_cols
  if swapped:
      print('PCA: doing transpose to speed up calculation')
      Z = np.transpose(Z)

  # Columns are the variables (rowvar=False); np.cov removes each column mean
  cov = np.cov(Z, rowvar=False)
  print('Covaraince matrix:',np.shape(cov))

  # eigh returns ascending eigenvalues, so reorder to largest-first
  evals, evecs = linalg.eigh(cov)
  order = np.argsort(evals)[::-1]
  evals, evecs = evals[order], evecs[:, order]

  # Expansion coefficients: the data projected onto each eigenvector
  projection = np.dot(Z, evecs)

  if not swapped:
      return evals, evecs, projection

  # Transposed problem: hand back the two matrices transposed and swapped
  return evals, np.transpose(projection), np.transpose(evecs)

In [None]:
# This version using SVD PCA; Z is centered on input
def pca_svd(Z):
    """
       Performs principal component analysis using SVD of the input "Z"
       data matrix. Z should be "centered" on input. (Z = data - mean(data))

       Z = U s V.T

      Equivalence with the PCA problem notes
      Z = A E

      With variance:
           lambda = s^2/(n-1)
           loadings: US
           eigenfunctions: V

      Parameters
      ----------
      Z : 2-D array of shape (ntime, mspace)

      Returns
      -------
      eigenvalues : shape (k,), k = min(ntime, mspace)
          s**2 / (ntime - 1), in the descending order returned by the SVD.
      eigenvectors : shape (k, mspace)
          Rows of V.T (one mode per row) multiplied by sqrt(mspace).
      principal_components : shape (k, ntime)
          (U s).T (one mode per row) divided by sqrt(mspace).

      The two scalings cancel, so principal_components.T @ eigenvectors
      reconstructs Z. Modes are stored by row, which is the layout used when
      space > time; for a column layout use the transposes of the outputs.
    """

    ntime, mspace = np.shape(Z)
    print ('SVD Z shape:',ntime, mspace)

    # Perform Singular Value Decomposition (SVD) on the centered data
    U, s, Vt = np.linalg.svd(Z, full_matrices=False)

    # Compute loadings U*S. Broadcasting scales column j of U by s[j], which
    # is the same product as U @ diag(s) without building a dense k-by-k matrix.
    US = U * s

    # Compute eigenvalues from the singular values
    eigenvalues = s ** 2 / (ntime - 1)

    # One mode per row for both outputs
    eigenvectors = Vt
    principal_components = US.T

    # normalize: move a factor sqrt(mspace) from the time series to the patterns
    nnorm = np.sqrt(mspace)
    eigenvectors = eigenvectors*nnorm
    principal_components = principal_components/nnorm

    return eigenvalues, eigenvectors, principal_components

## Part 1: regression maps

In [None]:
# Target latitude / longitude to locate on the data grid
# NOTE(review): presumably degrees, with lon_pnt in the same convention as `lons` - confirm
lat_pnt, lon_pnt = -36.8509, 174.7645

# Nearest grid index along each axis = smallest absolute coordinate difference
jpnt = np.abs(lats - lat_pnt).argmin()
ipnt = np.abs(lons - lon_pnt).argmin()
print('Clostest point:',lons[ipnt],lats[jpnt])


## Part 2: patterns of variability

In [None]:
# Principal component analysis of the surface field, computed twice (eigen-
# decomposition, then SVD) and plotted side by side for comparison.
print('Doing principal component analysis')

# One row per time step, one column per grid point
z_data = data.reshape(ntime,nlat*nlon)
z_mean = np.mean(z_data,axis=0)
print('Zmean:',np.shape(z_mean))

# Remove the time mean at every grid point. Broadcasting allocates a NEW
# array: reshape normally returns a view, so the earlier in-place loop also
# overwrote `data` with anomalies and the cell was not safe to re-run.
z_data = z_data - z_mean

# NOTE(review): this is the variance across space (axis=1) summed over time.
# It is comparable with the pca_eig eigenvalue sum in the transposed case, but
# the pca_svd eigenvalues sum to the variance across time (ddof=1) summed over
# space - confirm which total is intended for the SVD comparison.
data_variance = np.sum(np.var(z_data, axis=1))

# Perform the PCA: notice order of PC and Evectors when working in time domain
# (`eval` shadows the Python builtin; the name is kept because it is notebook-global)
eval, evec, afac = pca_eig(z_data)

print('RAW: vals:',np.shape(eval),'vec:',np.shape(evec),'pcs:',np.shape(afac))

# Eigenvalues are proportional to variance, so compute the fraction explained
total_variance = np.sum(eval)
variance_explained = eval/total_variance

print('VARIANCE (original, from evals):',data_variance, total_variance)
# Check the magnitudes per mode. Both arrays hold one mode per row, so the
# norm is taken along axis=1 and BEFORE evec becomes 3-D (on the reshaped
# array axis=1 is latitude, which gave an (ntime, nlon) array, not magnitudes).
print('EIGENVECTOR MAGNITUDE:',np.linalg.norm(evec, axis=1))
print('P COMPONENT MAGNITUDE:',np.linalg.norm(afac, axis=1))
print('EIGENVALUES          :',eval)
print('Variance_explained   :',variance_explained*100)

# Back to a map per mode for plotting
evec = evec.reshape(ntime, nlat, nlon)

# Some plots: left column = eigen-decomposition, right column = SVD
num = 1   # index of the mode to plot (0 is the leading mode)
fig, ax = plt.subplots(3, 2, figsize=(10, 12))

cf = ax[0,0].contourf(lons,lats,evec[num,:,:])
# Attach the colorbar to its own panel explicitly instead of relying on
# pyplot's implicit choice of axes in a multi-panel figure.
cb = fig.colorbar(cf, ax=ax[0,0])
ax[0,0].set_title('Eigen-decomposition: pattern of mode %d' % num)
ax[1,0].plot(year,afac[num,:])
ax[1,0].set_title('Eigen-decomposition: time series of mode %d' % num)
ax[2,0].plot(variance_explained*100)
ax[2,0].set_title('Eigen-decomposition: variance explained (%)')

#
# Do it again with SVD
#
eval, evec, afac = pca_svd(z_data)

total_variance = np.sum(eval)
variance_explained = eval/total_variance

print('SVD')
print('VARIANCE (original, from evals):',data_variance, total_variance)
# Same per-mode magnitudes as above, again taken while evec is still 2-D
print('EIGENVECTOR MAGNITUDE:',np.linalg.norm(evec, axis=1))
print('P COMPONENT MAGNITUDE:',np.linalg.norm(afac, axis=1))
print('EIGENVALUES          :',eval)
print('Variance_explained   :',variance_explained*100)

evec = evec.reshape(ntime, nlat, nlon)

cf = ax[0,1].contourf(lons,lats,evec[num,:,:])
cb = fig.colorbar(cf, ax=ax[0,1])
ax[0,1].set_title('SVD: pattern of mode %d' % num)
ax[1,1].plot(year,afac[num,:])
ax[1,1].set_title('SVD: time series of mode %d' % num)
ax[2,1].plot(variance_explained*100)
ax[2,1].set_title('SVD: variance explained (%)')

fig.tight_layout()


