# Experiment 13: Model building using full dataset (Surface Elevation, U- and V-velocity)

## Imports

In [1]:
%matplotlib notebook

In [2]:
# Import packages:
import mikeio
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import os
import sys
import pickle as pkl
import pandas as pd

sys.path.append("../")
plt.style.use("seaborn-v0_8-whitegrid")

from Scripts import my_functions as mf
from Scripts import my_models3 as mm

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.linear_model import LinearRegression, Ridge

from IPython.display import HTML
from tqdm import tqdm

## Setup

### Combine data

In [3]:
## Build absolute paths to the input files (Area.dfsu lives under Data/DHI_yr_sim).

# Base directory: two levels above the current working directory.
path = os.path.abspath(os.path.join(os.getcwd(), *[os.pardir] * 2))

# Area file (dfsu) and wind file (dfs0):
path_area, path_wind = (
    os.path.join(path, rel_path)
    for rel_path in (
        "Data/DHI_yr_sim/Area.dfsu",
        "Data/DHI_yr_sim/HD_OERESUND_CREA6_1997_v2.m21fm - Result Files/wind.dfs0",
    )
)

# Water-level boundary conditions (north and south):
path_bc_north, path_bc_south = (
    os.path.join(path, rel_path)
    for rel_path in (
        "Data/DUMP/waterlevel_bc/waterlevel_north.dfs1",
        "Data/DUMP/waterlevel_bc/waterlevel_south.dfs1",
    )
)

# Open the dfsu file; as the last expression of the cell its summary is displayed.
mikeio.open(path_area)

Dfsu2D
number of elements: 17980
number of nodes: 10460
projection: LONG/LAT
items:
  0:  Surface elevation <Surface Elevation> (meter)
  1:  Total water depth <Water Depth> (meter)
  2:  U velocity <u velocity component> (meter per sec)
  3:  V velocity <v velocity component> (meter per sec)
time: 18191 steps with dt=1800.0s
      1996-12-18 00:00:00 -- 1997-12-31 23:00:00

In [4]:
%%time

# Auxiliary flag: stays 1 unless a cached dataframe is loaded below.
compute = 1

# Reuse the combined dataframe pickled by an earlier run, if it is available:
if os.path.exists("../../Data/my_data/data.pkl"):
    with open("../../Data/my_data/data.pkl", "rb") as cache_file:
        df_full = pkl.load(cache_file)

    # The cache was loaded, so the following cells have nothing to recompute.
    compute = 0

print(f"compute = {compute}")

compute = 0
Wall time: 9.44 s


In [5]:
%%time

# Compute the combined data if not available: (~ 10 min)
if compute:

    # Time axis of the area file; passed to every read below so that all inputs
    # are loaded for the same time steps.
    time_data = mikeio.open(path_area).time

    # Load files:
    zuv_data = mikeio.read(path_area, time=time_data)
    wind_data = mikeio.read(path_wind, time=time_data)

    bc_north_data = mikeio.read(path_bc_north, time=time_data)
    bc_south_data = mikeio.read(path_bc_south, time=time_data)

    # Extract values of surface elevation, u-velocity and v-velocity from zuv_data:
    z_vals = zuv_data["Surface elevation"].values
    u_vals = zuv_data["U velocity"].values
    v_vals = zuv_data["V velocity"].values

    # Extract values of u-velocity and v-velocity from wind_data, one column per item.
    # NOTE(review): assumes items 0-24 hold the u-component and items 25-49 the
    # v-component of the wind file — TODO confirm against wind.dfs0.
    wu_vals = np.concatenate([wind_data[i].values.reshape(-1, 1) for i in range(25)], axis=1)
    wv_vals = np.concatenate([wind_data[i].values.reshape(-1, 1) for i in range(25, 50)], axis=1)

    # Extract values of bc_north_data and bc_south_data:
    bcn_vals = bc_north_data["North"].values
    bcs_vals = bc_south_data["South"].values

    # Create dataframes; the prefix identifies the variable group of every column:
    df_z = pd.DataFrame(z_vals).add_prefix("z_")
    df_u = pd.DataFrame(u_vals).add_prefix("u_")
    df_v = pd.DataFrame(v_vals).add_prefix("v_")

    df_wu = pd.DataFrame(wu_vals).add_prefix("wu_")
    df_wv = pd.DataFrame(wv_vals).add_prefix("wv_")

    df_bcn = pd.DataFrame(bcn_vals).add_prefix("bcn_")
    df_bcs = pd.DataFrame(bcs_vals).add_prefix("bcs_")

    # Combine everything column-wise:
    df_full = pd.concat([df_z, df_u, df_v,
                         df_bcn, df_bcs,
                         df_wu, df_wv], axis=1)

    # Set datetime as index. set_index returns a new frame instead of modifying
    # df_full in place, so the result has to be bound back to the name.
    # NOTE(review): a data.pkl written before this fix holds a frame without the
    # datetime index; delete the cache to regenerate it if the index is needed.
    df_full = df_full.set_index(time_data)
    

Wall time: 0 ns


In [6]:
%%time

# Save combined data:
if compute:

    # Dump df_full, the complete combined frame. `df` is only created in a later
    # cell (as a 3200-row subset), so it is either undefined here or the wrong data.
    with open("../../Data/my_data/data.pkl", "wb") as f:
        pkl.dump(df_full, f)


Wall time: 0 ns


### Create or load PCA and scaler:

In [7]:
# Auxiliary flag: stays 1 unless both cached objects are loaded below.
compute = 1

# The cache is only used when the scaler AND the PCA pickle both exist:
if (os.path.exists("../Data_Results/Exp_13_scaler.pkl")
        and os.path.exists("../Data_Results/Exp_13_pca.pkl")):

    with open("../Data_Results/Exp_13_scaler.pkl", "rb") as scaler_file:
        scaler = pkl.load(scaler_file)

    with open("../Data_Results/Exp_13_pca.pkl", "rb") as pca_file:
        ipca = pkl.load(pca_file)

    # Both objects were loaded, so nothing needs to be recomputed.
    compute = 0

print(f"compute = {compute}")

compute = 0


In [8]:
%%time

# Work on the first 3200 rows only, as an independent copy of the full frame:
df = df_full.head(3200).copy()

Wall time: 967 ms


## Extract data:

#### Split data:

In [9]:
%%time 

# Split by row order: the first half of the rows is the training set, the rest the test set.
tts = len(df) // 2

df_train, df_test = df.iloc[:tts], df.iloc[tts:]

print(df_train.shape, df_test.shape)

(1600, 54004) (1600, 54004)
Wall time: 2 ms


#### Feature extractor method:

In [10]:
def extract_features(df):
    """Split a combined dataframe into one value array per variable group.

    Columns are selected by name prefix. Every pattern is anchored with ``^`` so
    a prefix only matches at the start of a column name (``u_`` never picks up
    ``wu_`` columns, ``z_`` never picks up a name that merely contains ``z_``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are named ``z_*``, ``u_*``, ``v_*``, ``bcn_*``,
        ``bcs_*``, ``wu_*`` and ``wv_*``.

    Returns
    -------
    dict
        Maps ``"z"``, ``"u"``, ``"v"``, ``"bcn"``, ``"bcs"``, ``"wu"`` and
        ``"wv"`` (in that order) to the ``.values`` array of the matching columns.
    """
    # Surface elevation, U-/V-velocity, north/south BC, U-/V-wind velocity:
    prefixes = ("z", "u", "v", "bcn", "bcs", "wu", "wv")

    return {prefix: df.filter(regex=f"^{prefix}_").values for prefix in prefixes}

**Prediction model comparison method:**

In [11]:
def compare_models(models):
    """Plot the train and test RMSE curves of several models in one figure.

    Training errors are drawn as solid lines over the training time steps and
    test errors as dotted lines, in the same colour per model, over the test
    time steps. The y-axis is logarithmic and a dashed vertical line marks the
    train/test boundary.

    Parameters
    ----------
    models : sequence
        Objects exposing a ``name`` attribute and a ``model`` mapping with
        ``"train_errors"`` and ``"test_errors"`` entries that support ``.min()``.

    Notes
    -----
    Reads the notebook-level ``df_train`` and ``df_test`` to size the x-axis, so
    the error arrays must have matching lengths.
    """
    n_train = len(df_train)
    x_train = range(n_train)
    x_test = range(n_train, n_train + len(df_test))

    plt.figure(figsize=(12, 8), dpi=100)
    plt.title("Model comparison", fontsize=16)

    # Plot training errors (colours come from the active colour cycle):
    train_lines = []
    for model in models:
        train_line, = plt.plot(x_train, model.model["train_errors"])
        train_lines.append(train_line)

    # Plot test errors, reusing the colour of the matching training curve:
    for model, train_line in zip(models, train_lines):
        plt.plot(x_test, model.model["test_errors"],
                 color=train_line.get_color(),
                 linestyle="dotted")

    # Smallest error over ALL curves (train and test), capped at 10. It is
    # determined before the ticks are chosen so that test errors also count.
    min_err = 10
    for model in models:
        for key in ("train_errors", "test_errors"):
            lowest = model.model[key].min()
            if lowest < min_err:
                min_err = lowest

    # Decade ticks from 1 downwards; keep only two ticks below the smallest error.
    my_yticks = [1] + [1/(10**i) for i in range(1, 10)]
    good_yticks = np.argwhere(np.array(my_yticks) < min_err)
    if len(good_yticks) > 2:
        my_yticks = my_yticks[:good_yticks[2][0]]

    # Dashed vertical line at the train/test boundary, spanning the tick range:
    split_line = plt.vlines(x=n_train, ymin=my_yticks[-1], ymax=my_yticks[0],
                            color="black", linestyle="dashed")

    plt.xlabel("Time steps", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)

    # Pass the handles explicitly so every label is bound to the intended artist
    # instead of relying on the order in which artists were added to the axes.
    plt.legend(train_lines + [split_line],
               [model.name for model in models] + ["Train-Test-Split"],
               fontsize=11, frameon=True, fancybox=True,
               shadow=True, framealpha=1, facecolor="lightgrey")

    plt.yscale("log")
    plt.yticks(my_yticks)

    plt.show()
    

## Models

In [12]:
# See Scripts/my_models3.py for the source code of the models.

In [13]:
import warnings
# NOTE(review): this installs a global "ignore" filter, so warnings raised by
# every later cell in this session (deprecations, numerical issues) are hidden.
warnings.filterwarnings("ignore")
# M is used below to create the models (M.BaselineModel, M.ReconModel,
# M.RegressionModel) from the truncated dataframe `df`.
# NOTE(review): "standard" and "pca" presumably select the scaler and the
# dimensionality reduction — confirm against Scripts/my_models3.py.
M = mm.MyModels(df, "standard", "pca")

Init was run.


In [14]:
# Passed to the models created below: as the third positional argument of
# M.BaselineModel and as train_frac= of M.RegressionModel.
train_frac = 0.25

In [15]:
# NOTE(review): undefined name — evaluating it raises NameError (see the output
# below). Presumably a deliberate stop so that "Run All" halts before the model
# cells; confirm, and consider replacing it with an explicit, documented guard.
ewoi2u3r3u94toejv

NameError: name 'ewoi2u3r3u94toejv' is not defined

In [None]:
# Reference models for "z": the "Collective" and "Coordinate" baselines,
# followed by reconstruction models with 1, 10 and 100 latent dimensions.
baseline_models = [
    M.BaselineModel(["z"], "Collective", train_frac),
    M.BaselineModel(["z"], "Coordinate", train_frac),
]
baseline_models += [M.ReconModel(["z"], latent_dim=dim) for dim in (1, 10, 100)]

In [None]:
# Setup models (with 1 latent space dimension):
models = []

for var in ["z"]:
    for state_lag in range(1, 4):
        for bc in [True, False]:
            for wind in [True, False]:
                if wind or bc:
                    # Wind and/or BC inputs are used: sweep every extra lag/lead pair.
                    extra_settings = [(lag, lead) for lag in range(3) for lead in range(3)]
                else:
                    # Neither wind nor BC: build one model with explicit zero extra
                    # lag/lead rather than reusing whatever values the loop
                    # variables still held from the previous iteration.
                    # NOTE(review): assumes RegressionModel does not need a non-zero
                    # extra_lag/extra_lead when wind and bc are both False — confirm.
                    extra_settings = [(0, 0)]

                for extra_lag, extra_lead in extra_settings:
                    models.append(
                        M.RegressionModel("linear", [var], [var], wind, bc,
                                          state_lag, extra_lag, extra_lead,
                                          latent_dim=1, train_frac=train_frac)
                    )

# Keep a dedicated name for this list; `models` is rebound by later cells.
models_ls1 = models

In [None]:
# Number of latent_dim=1 regression models built in the previous cell:
print(len(models_ls1))

In [None]:
# Setup models (with 10 latent space dimensions):
models = []

for var in ["z"]:
    for state_lag in range(1, 4):
        for bc in [True, False]:
            for wind in [True, False]:
                if wind or bc:
                    # Wind and/or BC inputs are used: sweep every extra lag/lead pair.
                    extra_settings = [(lag, lead) for lag in range(3) for lead in range(3)]
                else:
                    # Neither wind nor BC: build one model with explicit zero extra
                    # lag/lead rather than reusing whatever values the loop
                    # variables still held from the previous iteration.
                    # NOTE(review): assumes RegressionModel does not need a non-zero
                    # extra_lag/extra_lead when wind and bc are both False — confirm.
                    extra_settings = [(0, 0)]

                for extra_lag, extra_lead in extra_settings:
                    models.append(
                        M.RegressionModel("linear", [var], [var], wind, bc,
                                          state_lag, extra_lag, extra_lead,
                                          latent_dim=10, train_frac=train_frac)
                    )

# Keep a dedicated name for this list; `models` is rebound by later cells.
models_ls10 = models

In [None]:
# Setup models (with 100 latent space dimensions):
models = []

for var in ["z"]:
    for state_lag in range(1, 4):
        for bc in [True, False]:
            for wind in [True, False]:
                if wind or bc:
                    # Wind and/or BC inputs are used: sweep every extra lag/lead pair.
                    extra_settings = [(lag, lead) for lag in range(3) for lead in range(3)]
                else:
                    # Neither wind nor BC: build one model with explicit zero extra
                    # lag/lead rather than reusing whatever values the loop
                    # variables still held from the previous iteration.
                    # NOTE(review): assumes RegressionModel does not need a non-zero
                    # extra_lag/extra_lead when wind and bc are both False — confirm.
                    extra_settings = [(0, 0)]

                for extra_lag, extra_lead in extra_settings:
                    models.append(
                        M.RegressionModel("linear", [var], [var], wind, bc,
                                          state_lag, extra_lag, extra_lead,
                                          latent_dim=100, train_frac=train_frac)
                    )

# Keep a dedicated name for this list; `models` is rebound by later cells.
models_ls100 = models

## Model testing

In [None]:
# NOTE(review): undefined name — evaluating it raises NameError. Presumably a
# deliberate stop so that "Run All" halts before the expensive fitting cells;
# confirm, and consider replacing it with an explicit, documented guard.
dlja3wroupok

In [None]:
%%time
# Fit every baseline model and run its prediction, printing progress per model.
n_models = len(baseline_models)

for i, model in enumerate(baseline_models):
    print(f"Running model ({i+1}/{n_models}): {model.name}")
    fitted = model.fit()
    fitted.predict()

In [None]:
%%time

# Fit and predict with every latent_dim=1 regression model.
# The progress total comes from the list that is iterated (models_ls1), not from
# `models`, which other cells rebind to different lists.
n_models = len(models_ls1)

for i, model in enumerate(models_ls1):
    print(f"Running model ({i+1}/{n_models}): {model.name}")
    model.fit().predict()
    

In [None]:
%%time

# Fit and predict with every latent_dim=10 regression model.
# The progress total comes from the list that is iterated (models_ls10), not from
# `models`, which other cells rebind to different lists.
n_models = len(models_ls10)

for i, model in enumerate(models_ls10):
    print(f"Running model ({i+1}/{n_models}): {model.name}")
    model.fit().predict()

In [None]:
%%time

# Fit and predict with every latent_dim=100 regression model.
# The progress total comes from the list that is iterated (models_ls100), not from
# `models`, which other cells rebind to different lists.
n_models = len(models_ls100)

for i, model in enumerate(models_ls100):
    print(f"Running model ({i+1}/{n_models}): {model.name}")
    model.fit().predict()

In [None]:
# NOTE(review): scratch inspection of the stored "z" latent space. Only the last
# expression of a cell is displayed, so the value of the first line is discarded.
# The two lines differ in reading latent_spaces from a Storage() instance vs. the
# Storage class itself — keep whichever matches Scripts/my_models3.py.
mm.Storage().latent_spaces["z"]
mm.Storage.latent_spaces["z"]

In [None]:
# Checking performance of models:
# NOTE(review): rmses_ls1 is only assigned in a cell further down, so on a fresh
# top-to-bottom run this cell fails with NameError — it belongs after that cell.
plt.figure()
plt.title("Mean RMSEs of tested models with 1 dimension latent space.")
plt.plot(rmses_ls1, "-o", alpha=0.5)
plt.yscale("log")
# Ticks at 1 and 0.1; the y-limits below restrict the view to that single decade.
my_yticks = [1/(10**(i)) for i in range(0,2)] 
plt.yticks(my_yticks)
plt.ylim(my_yticks[-1], my_yticks[0])
plt.show()

In [None]:
# NOTE(review): `model` is whatever the most recently executed loop left bound
# (hidden state), and axis=1 differs from the axis=0 used when filling
# rmses_ls1 — confirm which axis is intended.
test = mf.rmse(model.output_data["z"], model.output_preds["z"], axis=1)

In [None]:
# Rebuild the latent_dim=1 model grid (same settings as the "Setup models (with 1
# latent space dimension)" cell) into a fresh `models` list.
models = []

for var in ["z"]:
    for state_lag in range(1, 4):
        for bc in [True, False]:
            for wind in [True, False]:
                if wind or bc:
                    # Wind and/or BC inputs are used: sweep every extra lag/lead pair.
                    extra_settings = [(lag, lead) for lag in range(3) for lead in range(3)]
                else:
                    # Neither wind nor BC: build one model with explicit zero extra
                    # lag/lead rather than reusing whatever values the loop
                    # variables still held from the previous iteration.
                    # NOTE(review): assumes RegressionModel does not need a non-zero
                    # extra_lag/extra_lead when wind and bc are both False — confirm.
                    extra_settings = [(0, 0)]

                for extra_lag, extra_lead in extra_settings:
                    models.append(
                        M.RegressionModel("linear", [var], [var], wind, bc,
                                          state_lag, extra_lag, extra_lead,
                                          latent_dim=1, train_frac=train_frac)
                    )


In [None]:
%%time

n_models = len(models)

# Mean RMSE of every model for "z", rounded to 6 decimals (one entry per model).
rmses_ls1 = []

for i, model in enumerate(models):
    print(f"Running model ({i+1}/{n_models}): {model.name}")

    model.fit().predict()

    # Index with the explicit "z" key instead of `var`, which was only available
    # here as a loop variable leaked from the model-building cell.
    z_rmse = mf.rmse(model.output_data["z"], model.output_preds["z"], axis=0)
    rmses_ls1.append(np.round(np.mean(z_rmse), 6))

rmses_ls1 = np.array(rmses_ls1)

In [None]:
# Print each index 0..2 next to its successor 1..3.
for i, j in enumerate(range(1, 4)):
    print(i, j)

In [None]:
# Mean RMSE (rounded to 6 decimals) of the first model in `models` for "z":
np.round(np.mean(mf.rmse(models[0].output_data["z"], models[0].output_preds["z"], axis=0)), 6)

In [None]:
# Three reference models first, then the PCA regression variants for ar = 1, 2, 3.
# A fresh mm.my_models() instance is created for every single model.
# NOTE(review): earlier cells use mm.MyModels; confirm that mm.my_models is still
# provided by Scripts/my_models3.py.
models = [mm.my_models().Baseline(),
          mm.my_models().Coordinate_Baseline(),
          mm.my_models().PCAReconstruction()]

for ar in (1, 2, 3):
    models.append(mm.my_models().PCA_Multistep_Regression_Z(pca_bs=100, pca_comps=10, ar=ar))
    models.append(mm.my_models().PCA_Multistep_Regression_BC(pca_bs=100, pca_comps=10, ar=ar))
    models.append(mm.my_models().PCA_Multistep_Regression_Z_BC(pca_bs=100, pca_comps=10, ar=ar))

    if ar == 1:
        # PCA_Regression_Z_BC takes no ar argument and is added once, right
        # after the ar=1 group, so list positions stay as before.
        models.append(mm.my_models().PCA_Regression_Z_BC(pca_bs=100, pca_comps=10))

# Run every model on the train/test split:
for model in models:
    print(f"Running model: {model.name}")
    model.run(df_train, df_test)


In [None]:
# Call plot_errors() on every model in `models`, one call per model:
for model in models:
    model.plot_errors()
    


In [None]:
# Summary table: mean train/test error per model plus its rank (0 = lowest error).
# It is stored as `df_summary` so that `df`, the data frame used by the earlier
# cells, is not overwritten when this cell runs.
train_means = np.array([i.model["train_errors"].mean() for i in models])
test_means = np.array([i.model["test_errors"].mean() for i in models])

df_summary = pd.DataFrame({"Names": [i.name for i in models],
                           "Avg. Train. Err.": train_means,
                           # argsort applied twice turns the values into ranks.
                           "Avg. Train. Rank.": np.argsort(np.argsort(train_means)),
                           "Avg. Test. Err.": test_means,
                           "Avg. Test. Rank.": np.argsort(np.argsort(test_means))})
df_summary

In [None]:
# Compare the last three models of the list with models[2] in one figure:
compare_models(models[-3:]+[models[2]])

In [None]:
# Compare all models of the list in one figure:
compare_models(models)

In [None]:
# BC multistep regression with ar=3 and a single PCA component: run it on the
# train/test split and plot its errors.
m = mm.my_models().PCA_Multistep_Regression_BC(pca_bs=100, pca_comps=1, ar=3)
m.run(df_train, df_test)
m.plot_errors()




In [None]:
# Display the "y_train_pred" entry stored on the model that was run last
# (presumably its predictions on the training data — confirm in my_models3.py).
m.model["y_train_pred"]


In [None]:
# Same BC multistep regression (ar=3), now with two PCA components; note that
# this rebinds `m`, replacing the single-component model.
m = mm.my_models().PCA_Multistep_Regression_BC(pca_bs=100, pca_comps=2, ar=3)
m.run(df_train, df_test)
m.plot_errors()