# Experiment 6: PCA on full dataset (U-velocity)

## Imports

In [1]:
%matplotlib notebook

In [2]:
# Import packages:
import mikeio
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import os
import sys
import pickle as pkl

sys.path.append("../")
plt.style.use("seaborn-v0_8-whitegrid")

from Scripts import my_functions as mf

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
from IPython.display import HTML
from tqdm import tqdm

## Data Extraction

In [3]:
## Locate Data/DHI_yr_sim/Area.dfsu relative to the current directory.

# The project root is taken to be two levels above the working directory:
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Full path to the dfsu file:
path = os.path.join(project_root, "Data/DHI_yr_sim/Area.dfsu")

# Open the file lazily just to obtain its time axis:
time = mikeio.open(path).time

# Load the U-velocity item (~ 3.5 min), from the record at index 48 up to
# the last record.
# NOTE(review): the first 48 records are skipped, presumably as model
# spin-up -- confirm the reason for this offset.
data = mikeio.read(
    path,
    time=slice(time[48], time[-1]),
    items="U velocity",
)

In [4]:
data

<mikeio.Dataset>
dims: (time:18143, element:17980)
time: 1996-12-19 00:00:00 - 1997-12-31 23:00:00 (18143 records)
geometry: Dfsu2D (17980 elements, 10460 nodes)
items:
  0:  U velocity <u velocity component> (meter per sec)

In [5]:
# Extract the U-velocity values as a plain array
# (rows are time steps, columns are mesh elements):
data_vals = data.U_velocity.values

# Fit a standardising scaler on the full data set:
scaler = StandardScaler()
scaler.fit(data_vals)

# Apply the fitted scaler to obtain the standardised features:
data_vals_scaled = scaler.transform(data_vals)

## Full PCA

In [6]:
# Locations of the scaler / PCA objects pickled by an earlier run:
scaler_cache = "../Data_Results/Exp_6_scaler.pkl"
pca_cache = "../Data_Results/Exp_6_pca.pkl"

# Auxiliary flag: 1 -> scaler and PCA must be fitted, 0 -> they were loaded.
compute = 1

# Reuse the cached results only when *both* files are present.
# NOTE: pickle can execute arbitrary code on load -- only load files that
# were written by this notebook.
if os.path.exists(scaler_cache) and os.path.exists(pca_cache):

    # Load scaler:
    with open(scaler_cache, "rb") as f:
        scaler = pkl.load(f)

    # Load pca:
    with open(pca_cache, "rb") as f:
        ipca = pkl.load(f)

    # Nothing left to compute:
    compute = 0

print(f"compute = {compute}")

compute = 0


In [7]:
# Fit the scaler and the incremental PCA from scratch when no cached
# objects were loaded:
if compute:

    # Standardise the raw values with a freshly fitted scaler:
    scaler = StandardScaler()
    scaler.fit(data_vals)
    data_vals_scaled = scaler.transform(data_vals)

    # Fit IncrementalPCA on the standardised data in batches of 250 rows:
    ipca = IncrementalPCA(batch_size=250)
    ipca.fit(data_vals_scaled)

In [8]:
# Save scaler and ipca objects (in Coding/Data_Results):
if compute:

    # Make sure the output directory exists, so the results of a long fit
    # are not lost to a FileNotFoundError on a fresh checkout:
    os.makedirs("../Data_Results", exist_ok=True)

    with open("../Data_Results/Exp_6_scaler.pkl", "wb") as f:
        pkl.dump(scaler, f)

    with open("../Data_Results/Exp_6_pca.pkl", "wb") as f:
        pkl.dump(ipca, f)

### Explained Variance

In [9]:
# Plot the PCA explained variance ratio (E.V.R.) on an explicit Axes:
fig1, ax1 = plt.subplots(figsize=(7.5, 5))

evr = ipca.explained_variance_ratio_
pca_x_ticks = np.arange(1, ipca.n_components_ + 1, 1)

# Cumulative E.V.R. (red, dashed):
ax1.plot(pca_x_ticks, np.cumsum(evr), "o", linestyle="dashed", color="red")

# Individual E.V.R. per component (blue, dotted):
ax1.plot(pca_x_ticks, evr, "o", linestyle="dotted", color="blue")

# Keep the components whose explained variance exceeds 1, i.e. that carry
# more variance than a single standardised input feature:
pc_keep = (ipca.explained_variance_ > 1).sum()

# Mark the cutoff with a vertical line:
ax1.vlines(pc_keep, 0, 1, colors="black", linestyles="dashed")

ax1.set_xscale("symlog")
ax1.set_xlabel("Number of components")
ax1.set_ylabel("Explained variance ratio")

# Tick marks at the powers of two 1, 2, 4, ..., 512:
pca_x_gridlines = 2 ** np.arange(10)
ax1.set_xticks(pca_x_gridlines)
ax1.set_xticklabels([str(tick) for tick in pca_x_gridlines])

# Set legend (labels follow the order in which the artists were added):
ax1.legend(
    ["Cumulative", "Individual", f"PC cutoff ({pc_keep})"],
    loc="right",
    frameon=True,
    fancybox=True,
    shadow=True,
    framealpha=1,
    facecolor="lightgrey",
)

ax1.set_title("PCA explained variance ratio")
plt.show()

<IPython.core.display.Javascript object>

In [10]:
# Save figure (create the output directory first so that a fresh checkout
# without a Figures folder does not raise FileNotFoundError):
os.makedirs("../Figures", exist_ok=True)
fig1.savefig("../Figures/Exp_6_full_PCA_EVR.png", dpi=100)

### Reconstruction vs Original

In [11]:
# Choose principal components (the first pc_keep rows of the component matrix):
princomps = ipca.components_[:pc_keep]

# Standardise the first 100 time steps with the *current* scaler.
# The scaler may have been replaced by a cached one after data_vals_scaled
# was first computed, so scaling here guarantees that the forward transform
# and the inverse transform below use the same scaler object.
data_vals_scaled_subset = scaler.transform(data_vals[:100])

# Transform features: centre with the PCA mean (as ipca.transform does)
# and project onto the retained components:
data_vals_scaled_pca = (data_vals_scaled_subset - ipca.mean_) @ princomps.T

# Reconstruct features: map back to feature space and restore the PCA mean:
data_vals_scaled_recon = data_vals_scaled_pca @ princomps + ipca.mean_

# Rescale features back to the original units:
data_vals_recon = scaler.inverse_transform(data_vals_scaled_recon)


In [12]:
def _symmetric_clim(values, q=0.99):
    """Return a rounded, symmetric colour limit for ``values``.

    The limit is the ``q``-quantile of ``|values|``, rounded to the decimal
    place of its leading digit (whole numbers for limits >= 1, at most five
    decimals for small limits).

    If rounding would collapse the limit to 0 (very small fields), the
    unrounded quantile is returned instead, so that ``vmin``/``vmax`` do not
    degenerate to a zero-width colour range.
    """
    limit = np.quantile(np.abs(values), q)
    digits = int(np.sum([limit * (10**i) < 1 for i in range(5)]))
    rounded = np.round(limit, digits)
    return rounded if rounded > 0 else limit


def _plot_field(field, values, ax, title):
    """Write ``values`` into ``field`` and draw it on ``ax`` with a symmetric
    "seismic" colour scale and the given ``title``."""
    field.values = values
    clim = _symmetric_clim(field.values)
    field.plot(ax=ax, cmap="seismic", vmin=-clim, vmax=clim)
    ax.set_title(title)


# Select plot times and plot data.
# The indices must lie within the rows available in data_vals_recon.
plot_times = [12, 22]
plot_data = data_vals[plot_times]
plot_data_recon = data_vals_recon[plot_times]

# Template objects whose values are overwritten for every panel:
plot_obj = data.U_velocity[0].copy()
plot_rec_obj = data.U_velocity[0].copy()

# One row per plot time, three columns (original, reconstruction, difference).
# squeeze=False keeps axs two-dimensional even for a single plot time.
fig2, axs = plt.subplots(len(plot_times), 3,
                         figsize=(15, 5 * len(plot_times)),
                         squeeze=False)

# Loop over the plot times:
for t in range(len(plot_times)):

    # Plot the original data:
    _plot_field(plot_obj, plot_data[t], axs[t, 0],
                f"Original data @ Time: {plot_times[t]}")

    # Plot the reconstructed data:
    _plot_field(plot_rec_obj, plot_data_recon[t], axs[t, 1],
                f"Reconstructed data @ Time: {plot_times[t]}")

    # Compute the RMSE between original and reconstruction:
    rmse_diff = mf.rmse(plot_data[t], plot_data_recon[t])

    # Plot the difference:
    _plot_field(plot_obj, plot_data[t] - plot_data_recon[t], axs[t, 2],
                f"Difference.\nRMSE: {rmse_diff:.6f}")

fig2.suptitle(f"PCA ({pc_keep}-PC) reconstruction comparison\n",
              fontsize=16)

fig2.tight_layout()

plt.show()


<IPython.core.display.Javascript object>

In [13]:
# Save figure (create the output directory first so that a fresh checkout
# without a Figures folder does not raise FileNotFoundError):
os.makedirs("../Figures", exist_ok=True)
fig2.savefig(f"../Figures/Exp_6_full_PCA_{pc_keep}PC_comparison.png", dpi=100)