# Experiment 15: Pipeline for model building, training, parameter optimization, and evaluation.

### Setup

In [1]:
# Import necessary libraries
import mikeio
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
import os
import sys
import pickle as pkl
import dill
import pandas as pd
import sklearn
import re

sys.path.append("../")
plt.style.use("seaborn-v0_8-whitegrid")

from Scripts import my_functions as mf, my_models as mm

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV, train_test_split, cross_val_score, cross_validate

from xgboost import XGBRegressor as XGBR

from IPython.display import HTML
from tqdm import tqdm

### Load data from memory:

In [2]:
# Auxilliary variable:
compute = 1

In [3]:
%%time 

# Try to load results from earlier runs:
if compute:
    
    # Load combined data if available:
    if os.path.exists("../../Data/my_data/data.pkl"):
        
        # Load dataframe:
        with open("../../Data/my_data/data.pkl", "rb") as f:
            df_full = dill.load(f)
            
        # Change compute to 0:
        compute = 0
        
print(f"compute = {compute}")  

compute = 0
Wall time: 9.17 s


In [4]:
print(df_full.shape)

(18143, 54004)


In [5]:
# Isolate small part of data:
df = df_full.iloc[:100,]
df.shape

(100, 54004)

### Storage class

In [6]:
# Create a Storage class that holds useful functions and variables:
class Storage:

    """Class for storing data and functions."""

    # Method for initializing:
    def __init__(self):

        self.data = {}
        self.scalers = {}
        self.projectors = {}

        self.n_every = 10
        self.scalers_fitted = False

        self.max_dim = min(500, len(df) // self.n_every)
        self.bat_size = 512

        self.debug = False

    # Method for loading data into storage:    
    def load_data(self, dataframe):

        """Loads data from dataframe into dictionary of dataframes."""

        # Rename:
        df = dataframe
            
        # Dictionary w/ variables and regexes:
        my_var_dict = {"z"   : "z_", 
                        "u"   : "^u_",
                        "v"   : "^v_",
                        "bcn" : "bcn_",
                        "bcs" : "bcs_",
                        "wu"  : "wu_",
                        "wv"  : "wv_"}

        # Loop over variables:
        for key, regex in my_var_dict.items():

            # Filter:
            df_temp = df.filter(regex=regex)

            # Store:
            self.data[key] = df_temp

        # Return:
        return 

    # Method for fetching data from storage:
    def get_data(self, variables):

        """Fetches dictionaries corresponding to variables."""
            
        # Initialize:
        data = {}

        # Loop over variables:
        for key in variables:

            # Store:
            data[key] = self.data[key]

        # Return:
        return data

    # Method for loading scalers:
    def load_scalers(self, scaler, variables):
        
        """Loads scalers for each variable."""
            
        # Loop over variables:
        for key in variables:
            
            # Store:
            self.scalers[key] = scaler

        # Return:
        return

    # Method for loading projectors:
    def load_projectors(self, projector, variables):
        
        """Loads projectors for each variable."""
        
        # Set max dimensions:
        projector.n_components = self.max_dim

        # Loop over variables:
        for key in variables:
            
            # Store:
            self.projectors[key] = projector
            
        # Return:
        return

    # Method for fetching scalers:
    def get_scalers(self, variables):
        
        """Fetches dictionaries corresponding to variables."""
            
        # Initialize:
        scalers = {}

        # Loop over variables:
        for key in variables:

            # Store:
            scalers[key] = self.scalers[key]

        # Return:
        return scalers

    # Method for fetching projectors:
    def get_projectors(self, variables):
        
        """Fetches dictionaries corresponding to variables."""
            
        # Initialize:
        projectors = {}

        # Loop over variables:
        for key in variables:

            # Store:
            projectors[key] = self.projectors[key]

        # Return:
        return projectors

    # Method for fitting scalers:
    def fit_scalers(self, variables):
            
        """Fits scalers for each variable."""
        
        # Initialize:
        fit_data = None

        # Loop over variables:
        for key in variables:
            
            # Use subset of data:
            fit_data = self.data[key].iloc[::self.n_every].values

            # Fit:
            self.scalers[key].fit(fit_data)
        
        # Set flag:
        self.scalers_fitted = True

        # Return:
        return

    # Method for fitting projectors:
    def fit_projectors(self, variables):
            
        """Fits projectors for each variable."""
        
        # Initialize:
        fit_data = None

        # Loop over variables:
        for key in variables:
            
            # Use subset of data:
            fit_data = self.data[key].iloc[::self.n_every].values

            # Check if scalers have been fitted:
            if self.scalers_fitted:
                    
                # Transform:
                fit_data = self.scalers[key].transform(fit_data)

            # Fit:
            self.projectors[key].fit(fit_data)
        
        # Return:
        return

    # Method for converting dictionary of dataframes into dataframe:
    def dicts2df(self, data, variables):
                
            """Converts dictionary of dataframes into dataframe."""
            
            # Initialize:
            df = None
            df_list = []
    
            # Loop over variables:
            for key in variables:
                
                # Append:
                df_list.append(data[key])

            # Concatenate:
            df = pd.concat(df_list, axis=1)

            # Return:
            return df

    # Method for converting dataframe into dictionary of dataframes:
    def df2dicts(self, data, variables):
            
        """Recovers dictionary of dataframes from data."""
        
        # Initialize:
        reconverted_data = {}

        if variables is None:
            return None

        # Loop over variables:
        for key in variables:
            
            # Filter dataframe:
            data_tmp = data.filter(regex=f"^{key}")

            # Overwrite
            reconverted_data[key] = data_tmp

        # Return:
        return reconverted_data
    
    # Method for scaling data:
    def scale_data(self, direction, data, variables):
            
        """Scales data for each variable."""
        
        # Initialize:
        scaled_data = {}
        data_tmp = None

        # Loop over variables:
        for key in variables:

            indices = data[key].index

            if direction == "forward":
                
                # Transform:
                data_tmp = self.scalers[key].transform(data[key])

            if direction == "backward":
                
                # Inverse transform:
                data_tmp = self.scalers[key].inverse_transform(data[key])

            # Get shape:
            h, w = data_tmp.shape
            
            # Overwrite:
            scaled_data[key] = self.data[key].iloc[:h, :w].copy()

            scaled_data[key].iloc[:, :] = data_tmp

            scaled_data[key].index = indices

        # Return:
        return scaled_data
    
    # Method for projecting data:
    def project_data(self, direction, data, variables):
            
        """Projects data for each variable."""
        
        # Initialize:
        projected_data = {}
        data_tmp = {}

        # Loop over variables:
        for key in variables:
            
            indices = data[key].index

            if direction == "forward":
                
                # Transform:
                data_tmp = self.projectors[key].transform(data[key])

            if direction == "backward":
                
                # Inverse transform:
                data_tmp = self.projectors[key].inverse_transform(data[key])

            # Get shape:
            h, w = data_tmp.shape

            # Overwrite:
            projected_data[key] = self.data[key].iloc[:h, :w].copy()

            projected_data[key].iloc[:, :] = data_tmp

            projected_data[key].index = indices

        # Return:
        return projected_data

    # Method for lagging data:
    def lag_n_lead_data(self, data, variables, lag, lead):
            
        """Lags and leads data for each variable and returns data as a single dataframe. Lag must be at least 1 and lead must be at least 0.
        
        """
        
        # Initialize:
        shift_data = {}
        data_tmp = None

        # Create data list:
        data_list = []
        
        if variables is None:
            data_list.append(None)

        else:
                
            # If lag is needed:
            if lag > 0:
                
                # Iterate over lags (in reverse):
                for i in range(1, lag+1)[::-1]:
                    
                    # Iterate over variables:
                    for key in variables:

                        # Create copy:
                        data_copy = data[key].copy()
                        
                        # Create copy of copy:
                        data_copy_copy = data_copy.copy()

                        # Fix column names:
                        data_copy_copy.columns = \
                            [f"lag_{i}_{col}" for col in data_copy_copy.columns]
                        
                        # Append shifted data:
                        data_list.append(data_copy_copy.shift(i))
                
            else:
                data_list.append(None)

            # If lead is needed:
            if lead > 0:
        
                # Iterate over leads:
                for i in range(0, lead):

                    # Iterate over variables:
                    for key in variables:
                        
                        # Create copy:
                        data_copy = data[key].copy()

                        # Create copy of copy:
                        data_copy_copy = data_copy.copy()

                        # Fix column names:
                        if i == 0:
                            data_copy_copy.columns = \
                                [f"{col}" for col in data_copy_copy.columns]
                        else:
                            data_copy_copy.columns = \
                                [f"lead_{i}_{col}" for col in data_copy_copy.columns]
                        
                        # Append shifted data:
                        data_list.append(data_copy_copy.shift(-i))

            else:
                data_list.append(None)


        # Concatenate:
        data_tmp = pd.concat(data_list, axis=1).dropna()

        # Overwrite:
        shift_data = data_tmp

        # Return:
        return shift_data

    # Method for splitting data:
    def split_data(self, data, train_frac):
                
            """Splits data into train and test sets."""
            
            # Split:
            train_data, test_data = train_test_split(data, train_size=train_frac, shuffle=False)
    
            # Return:
            return train_data, test_data
     
    
# Create storage object:
storage = Storage()

#### Pipeline classes:

In [19]:
# PIPELINE PROCESSING BLOCKS:

class Setup(BaseEstimator, TransformerMixin):
    def __init__(self, scaler, projector, vars):
        print("Setup.__init__")
        self.scaler = scaler
        self.projector = projector
        self.vars = vars
        storage.load_scalers(scaler, vars)
        storage.load_projectors(projector, vars)
        storage.fit_scalers(vars)
        storage.fit_projectors(vars)

    def fit(self, X, y=None):
        print("Setup.fit") 
        return self
    
    def transform(self, X):
        print("Setup.transform")
        return X # Return X as a tuple


class GetDicts(BaseEstimator, TransformerMixin):
    """ Gets dictionaries of dataframes."""
    def __init__(self, input_vars, extra_vars):
        if storage.debug:
            print("GetDicts.__init__")

        self.input_vars = input_vars
        self.extra_vars = extra_vars
        self.output_vars = input_vars
        self.y = None
        
    def fit(self, X, y):
        if storage.debug:
            print("GetDicts.fit")

        # Convert y to dictionary:
        self.y = storage.df2dicts(y, self.output_vars)
        return self
    
    def transform(self, X):

        if storage.debug:
            print("GetDicts.transform")

        # Convert X to dictionary:
        X1 = storage.df2dicts(X, self.input_vars)
        E1 = storage.df2dicts(X, self.extra_vars)

        # Output handling:
        if self.y is not None:
            y1 = self.y
            self.y = None

            return self.input_vars, self.extra_vars, self.output_vars, X1, E1, y1 
        
        else:
            return self.input_vars, self.extra_vars, self.output_vars, X1, E1
        

class DataScaler(BaseEstimator, TransformerMixin):
    """ Scales data."""
    def __init__(self, direction, scaler=None):
        if storage.debug:
            print("DataScaler.__init__")
        self.direction = direction
        
        if scaler == "standard":
            self.scaler = StandardScaler()
        else:
            self.scaler = scaler
            
        self.y_org = None
        self.y = None


    def fit(self, X, y=None):
        """ Always receives a tuple of (X, E, y) and returns self."""
        if storage.debug:
            print("DataScaler.fit")

        if self.direction == "forward":
            
            # Extract variables:
            input_vars, _, _, _, _, y1 = X

            # Store data:
            self.y_org = y1
            self.y = y1

            # Conditional: 
            if self.scaler is not None:

                # Load and fit scalers:
                storage.load_scalers(self.scaler, input_vars)
                storage.fit_scalers(input_vars)

                # Scale data:
                self.y = storage.scale_data(self.direction, self.y, input_vars)
           
            return self

        if self.direction == "backward":
            return self


    def transform(self, X):
        """ Can receive a tuple of (X, E, y) or (X, E) and returns a tuple of (X, E, y) or (X, E). """
        if storage.debug:
            print("DataScaler.transform")

        if self.direction == "forward":
                
            # Input handling:
            if len(X) == 6:
                input_vars, extra_vars, output_vars, X1, E1, _ = X 
            
            elif len(X) == 5:
                input_vars, extra_vars, output_vars, X1, E1 = X  

            # Conditional:
            if self.scaler is not None:

                # Scale data:
                X1 = storage.scale_data(self.direction, X1, input_vars)

            # Output handling:
            if self.y is not None:
                y1 = self.y
                self.y = None

                return self.scaler, input_vars, extra_vars, output_vars, X1, E1, y1 

            else:
                return self.scaler, input_vars, extra_vars, output_vars, X1, E1 

        if self.direction == "backward":
            
            # Extract data:
            scaler, output_vars, y_pred = X

            if scaler is not None:
                    
                # Rescale data:
                y_pred = storage.scale_data(self.direction, y_pred, output_vars)
            
            return y_pred


class DataProjector(BaseEstimator, TransformerMixin):
    """ Projects data."""
    def __init__(self, direction, projector=None, n_dim=1):
        if storage.debug:
            print("DataProjector.__init__")
        self.direction = direction
        self.n_dim = n_dim
        if projector == "ipca":
            self.projector = IncrementalPCA(n_components=self.n_dim)
        else:
            self.projector = projector
        
        self.y_org = None
        self.y = None

    def fit(self, X, y=None):
        if storage.debug:
            print("DataProjector.fit")

        if self.direction == "forward":

            # Extract variables:
            _, input_vars, _, _, _, _, y1 = X

            # Store data:
            self.y_org = y1
            self.y = y1

            # Conditional:
            if self.projector is not None:

                # Load and fit projectors:
                storage.load_projectors(self.projector, input_vars)
                storage.fit_projectors(input_vars)

                # Project data:
                self.y = storage.project_data(self.direction, self.y, input_vars)
            
            return self

        if self.direction == "backward":
            return self

    def transform(self, X):
        if storage.debug:
            print("DataProjector.transform")

        if self.direction == "forward":

            # Input handling:
            if len(X) == 7:
                scaler, input_vars, extra_vars, output_vars, X1, E1, _ = X 
            
            elif len(X) == 6:
                scaler, input_vars, extra_vars, output_vars, X1, E1 = X 
                  
            if self.projector is not None:

                # Project data: 
                X1 = storage.project_data(self.direction, X1, input_vars)

            # Output handing:
            if self.y is not None:
                y1 = self.y
                self.y = None

                return scaler, self.projector, input_vars, extra_vars, output_vars, X1, E1, y1 
            
            else:
                return scaler, self.projector, input_vars, extra_vars, output_vars, X1, E1
        
        if self.direction == "backward":

            # Extract data:
            scaler, projector, _, _, output_vars, y_pred = X
            
            if projector is not None:
                    
                # Reproject data:
                y_pred = storage.project_data(self.direction, y_pred, output_vars)

            return scaler, output_vars, y_pred


class DataLagger(BaseEstimator, TransformerMixin):
    """ Lags and leads data."""
    def __init__(self, lag, lead):
        if storage.debug:
            print("DataLagger.__init__")
        self.lag = lag
        self.lead = lead
        self.y = None

    def fit(self, X, y=None):
        if storage.debug:
            print("DataLagger.fit")

        return self

    def transform(self, X):
        if storage.debug:
            print("DataLagger.transform")
        
        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, self.y = X

        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X

        # Conditional:
        if self.y is not None:
            self.y = storage.lag_n_lead_data(self.y, output_vars, lag=0, lead=1)

        # Lag and lead:
        X1 = storage.lag_n_lead_data(X1, input_vars, self.lag, self.lead)        

        # Output handling:
        if self.y is not None:
            y1 = self.y
            self.y = None
            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1, y1
        
        else:
            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1  


class ExtraLagger(BaseEstimator, TransformerMixin):
    """ Lags and leads extra variables."""
    def __init__(self, lag, lead):
        if storage.debug:
            print("ExtraLagger.__init__")
        self.lag = lag
        self.lead = lead
        self.y = None


    def fit(self, X, y=None):
        if storage.debug:
            print("ExtraLagger.fit")
        return self


    def transform(self, X):
        if storage.debug:
            print("ExtraLagger.transform")

        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, self.y = X
        
        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X


        # Lag and lead:
        E1 = storage.lag_n_lead_data(E1, extra_vars, self.lag, self.lead)

        
        # Output handling:
        if self.y is not None:
            y1 = self.y
            self.y = None

            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1, y1  
        
        else:
            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1 


class FixIndices(BaseEstimator, TransformerMixin):
    """ Finds common indices of X, y, and E, and strips away the rest."""
    def __init__(self):
        if storage.debug:
            print("FixIndices.__init__")
        self.y = None

    def fit(self, X, y=None):
        if storage.debug:
            print("FixIndices.fit")

        return self

    def transform(self, X):
        if storage.debug:
            print("FixIndices.transform")

        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, self.y = X  

        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X  


        # Find common indices:
        indices = X1.index.intersection(E1.index)

        # Conditional:
        if self.y is not None:
            indices = indices.intersection(self.y.index)
    
        # Overwrite:
        X1 = X1.loc[indices, :]
        E1 = E1.loc[indices, :]


        # Output handling:
        if self.y is not None:
            y1 = self.y
            self.y = None

            # Overwrite:
            y1 = y1.loc[indices, :]

            indices = None

            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1, y1 
        
        else:
            return scaler, projector, input_vars, extra_vars, output_vars, X1, E1  


# PIPELINE MODEL BLOCKS:
class LinearRegressor(BaseEstimator, TransformerMixin):
    def __init__(self):
        if storage.debug:
            print("LinearRegressor.__init__")
        self.model = LinearRegression()
        self.y = None

    def fit(self, X, y=None):
        if storage.debug:
            print("LinearRegressor.fit")

        # Extract data:
        _, _, _, _, _, X1, E1, y1 = X

        # Combine X and E:
        train_input = pd.concat([X1, E1], axis=1)
        train_target = y1

        # Fit:
        self.model.fit(X=train_input, y=train_target)
        
        self.y = y1

        return self


    def transform(self, X, y=None):
        if storage.debug:
            print("LinearRegressor.transform")

        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, _ = X

        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X

        # Combine X and E:
        test_input = pd.concat([X1, E1], axis=1)

        y_pred = pd.DataFrame(index=test_input.index, columns=self.y.columns)

        # Predict:
        y_pred.values[:] = self.model.predict(test_input)
    
        return scaler, projector, input_vars, extra_vars, output_vars, y_pred


class XGBoostRegressor(BaseEstimator, TransformerMixin):
    def __init__(self, n_estimators, max_depth, learning_rate,
                 random_state=42, verbosity=10):

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        self.random_state = random_state
        self.verbosity = verbosity
        

        if storage.debug:
            print("XGBRegressor.__init__")

        self.model = XGBR(n_estimators=self.n_estimators,
                          max_depth=self.max_depth, 
                          learning_rate=self.learning_rate, 
                          random_state=self.random_state, 
                          verbosity=self.verbosity)
        self.y = None

    def fit(self, X, y=None):

        if storage.debug:
            print("XGBRegressor.fit")

        # Extract data:
        _, _, _, _, _, X1, E1, y1 = X

        # Combine X and E:
        train_input = pd.concat([X1, E1], axis=1)
        train_target = y1

        # Fit:
        self.model.fit(X=train_input, y=train_target)
        
        self.y = y1

        return self


    def transform(self, X, y=None):
        if storage.debug:
            print("XGBRegressor.transform")

        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, _ = X

        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X

        # Combine X and E:
        test_input = pd.concat([X1, E1], axis=1)

        y_pred = pd.DataFrame(index=test_input.index, columns=self.y.columns)

        # Predict:
        y_pred.values[:] = self.model.predict(test_input)
    
        return scaler, projector, input_vars, extra_vars, output_vars, y_pred

class DMDc():

    def __init__(self, input_modes, extra_modes):
        self.input_modes = input_modes
        self.extra_modes = extra_modes

    def fit(self, X, E, y):
        """ Fits the DMDc model to the data."""

        # Renaming:
        X_prime = y.values
        X_c = E.values
        X = X.values
        p = self.input_modes + self.extra_modes
        r = self.input_modes
        n = X.shape[1]

        # Step 0: Flip matrices (Columns=Snapshots)
        X_prime = X_prime.T
        X = X.T
        X_c = X_c.T
            
        # Step 1: Construct Omega:
        Omega = np.concatenate([X, X_c], axis=0)
        
        # Step 2: Compute the (tilde)-SVD of Omega
        U_tilde, S_tilde, VT_tilde = np.linalg.svd(Omega, full_matrices=0)
        
        S_tilde_org = S_tilde
        
        # Step 2.1: Truncate with value p (> r):
        U_tilde = U_tilde[:, :p]
        S_tilde = np.diag(S_tilde[:p])
        VT_tilde = VT_tilde[:p, :]
        
        # Step 2.2: Split U_tilde into 2:
        U_tilde1 = U_tilde[:n, :]
        U_tilde2 = U_tilde[n:, :]
        
        # Step 3: Compute the (hat)-SVD of X_prime:
        U_hat, S_hat, VT_hat = np.linalg.svd(X_prime, full_matrices=0)
        
        S_hat_org = S_hat
        
        # Step 3.1: Truncate with value r (< p):
        U_hat = U_hat[:, :r]
        S_hat = np.diag(S_hat[:r])
        VT_hat = VT_hat[:r, :]
        
        # Step 4: Compute A and B:
        XVS = np.linalg.solve(S_tilde.T, (X_prime @ VT_tilde.T).T).T

        A_tilde = U_hat.T @ XVS @ U_tilde1.T @ U_hat
        B_tilde = U_hat.T @ XVS @ U_tilde2.T
        
        # Step 5: Eigenvalue decomposition:
        Lambda, W = np.linalg.eig(A_tilde)
        
        # Extras: Correct the ordering of eigenvalues and eigenvectors:
        idx = Lambda.argsort()[::-1]
        Lambda = np.diag(Lambda[idx])
        W = W[:, idx]
        
        # Step 6: Compute the dynamic modes of A:
        Phi = XVS @ U_tilde1.T @ U_hat @ W 

        # Keep data:
        self.A_tilde = A_tilde
        self.B_tilde = B_tilde
        self.Phi = Phi
        self.U_hat = U_hat

        return self

    def predict(self, X, E):
        """ Predicts the next value from data."""

        X = X.T
        E = E.T

        X_tilde = self.U_hat.T @ X
        
        y_tilde = self.A_tilde @ X_tilde + self.B_tilde @ E

        y = self.U_hat @ y_tilde

        return y.T
    
    def predict_linked(self, X, E):
        """ Predicts the next value from data and previous predictions."""

        X = X.T
        E = E.T

        X_tilde = self.U_hat.T @ X[:, 0]

        x_tilde_list = []

        for k in tqdm(range(X.shape[1])):
            X_tilde = self.A_tilde @ X_tilde + self.B_tilde @ E[:, k]
            x_tilde_list.append(X_tilde)


        y = self.U_hat @ np.array(x_tilde_list).T

        return y.T

class DMDcRegressor(BaseEstimator, TransformerMixin):
    def __init__(self, input_modes=None, extra_modes=None):

        if storage.debug:
            print("DMDcRegressor.__init__")

        self.input_modes = input_modes
        self.extra_modes = extra_modes

        self.model = DMDc(self.input_modes, self.extra_modes)

        self.y = None

    def fit(self, X, y=None):

        if storage.debug:
            print("DMDcRegressor.fit")

        # Extract data:
        _, _, _, _, _, X1, E1, y1 = X

        # Fit:
        self.model.fit(X1, E1, y1)
        
        self.y = y1

        return self


    def transform(self, X, y=None):
        if storage.debug:
            print("DMDRegressor.transform")

        # Input handling:
        if len(X) == 8:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1, _ = X

        elif len(X) == 7:
            scaler, projector, input_vars, extra_vars, output_vars, X1, E1 = X

        y_pred = pd.DataFrame(index=X1.index, columns=self.y.columns)

        # Predict:
        y_pred.values[:] = self.model.predict(X1, E1)
    
        return scaler, projector, input_vars, extra_vars, output_vars, y_pred
      

# PIPELINE POSTPROCESSING BLOCKS:
class RecoverDict(BaseEstimator, TransformerMixin):
    """ Recovers dictionary of dataframes from data."""
    def __init__(self):
        if storage.debug:
            print("RecoverDict.__init__")

    def fit(self, X, y=None):
        if storage.debug:
            print("RecoverDict.fit")
        return self

    def transform(self, X):
        if storage.debug:
            print("RecoverDict.transform")

        # Extract data:
        scaler, projector, input_vars, extra_vars, output_vars, y_pred = X

        y_pred = storage.df2dicts(y_pred, output_vars)

        return scaler, projector, input_vars, extra_vars, output_vars, y_pred


class DummyEstimator(BaseEstimator, TransformerMixin):
    """ A simple class to ensure that all fit and transform methods of classes before the Dummy are called, due to the way sklearn pipelines work. """
    
    def __init__(self, test=False):
        if storage.debug:
            print("DummyEstimator.__init__")
        self.test = test

    def fit(self, X, y=None):
        if storage.debug:
            print("DummyEstimator.fit")
        return self
    
    def predict(self, X):
        if storage.debug:
            print("DummyEstimator.predict")

        if self.test:

            # Extract data:
            scaler, projector, input_vars, extra_vars, output_vars, y_pred = X

            # Check projector:
            if projector is not None:

                y_pred = storage.project_data("backward", y_pred, output_vars)

            # Check scaler:
            if scaler is not None:

                y_pred = storage.scale_data("backward", y_pred, output_vars)

            return y_pred
        
        else:
            return X

   
### TSCV MIGHT BE USEFUL HERE ###
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html    


In [22]:
# Load data into storage:
storage.load_data(df)

# Define input and extra variables:
input_vars = ["z", "u", "v"]
extra_vars = ["bcs", "bcn", "wu", "wv"]

# Load inputs, extra and outputs:
input_data_dict = storage.get_data(input_vars)
extra_data_dict = storage.get_data(extra_vars)
output_data_dict = storage.get_data(input_vars)

# Concatenate inputs, extras and outputs:
input_data_df = storage.dicts2df(input_data_dict, input_vars)
extra_data_df = storage.dicts2df(extra_data_dict, extra_vars)
output_data_df = storage.dicts2df(output_data_dict, input_vars)

# Define input and target variables for pipeline:
X_df = pd.concat([input_data_df, extra_data_df], axis=1)
y_df = output_data_df

In [None]:
# Define pipeline:
pipeline_lr = Pipeline([
    ("get_dicts", GetDicts(input_vars=input_vars, 
                           extra_vars=extra_vars)),
    ("scaler", DataScaler(direction="forward", scaler="standard")),
    ("projector", DataProjector(direction="forward", projector="ipca")),
    ("lag_n_lead_data", DataLagger(lag=1, lead=0)),
    ("lag_n_lead_extra", ExtraLagger(lag=1, lead=1)),
    ("merge_indices", FixIndices()),
    ("model", LinearRegressor()),
    ("dict_recover", RecoverDict()),
    ("reprojector", DataProjector(direction="backward", projector=None)),
    ("rescaler", DataScaler(direction="backward", scaler=None)),
    ("estimator", DummyEstimator())
], verbose=False)

# Define pipeline:
pipeline_xgb = Pipeline([
    ("get_dicts", GetDicts(input_vars=input_vars, 
                           extra_vars=extra_vars)),
    ("scaler", DataScaler(direction="forward", scaler=StandardScaler())),
    ("projector", DataProjector(direction="forward", projector=IncrementalPCA())),
    ("lag_n_lead_data", DataLagger(lag=1, lead=0)),
    ("lag_n_lead_extra", ExtraLagger(lag=1, lead=1)),
    ("merge_indices", FixIndices()),
    ("model", XGBoostRegressor(n_estimators=1, max_depth=1, learning_rate=1)),
    ("dict_recover", RecoverDict()),
    ("reprojector", DataProjector(direction="backward", projector=None)),
    ("rescaler", DataScaler(direction="backward", scaler=None)),
    ("estimator", DummyEstimator())
], verbose=False)

# Define pipeline:
pipeline_dmdc = Pipeline([
    ("get_dicts", GetDicts(input_vars=input_vars, 
                           extra_vars=extra_vars)),
    ("scaler", DataScaler(direction="forward", scaler=None)),
    ("projector", DataProjector(direction="forward", projector=None)),
    ("lag_n_lead_data", DataLagger(lag=1, lead=0)),
    ("lag_n_lead_extra", ExtraLagger(lag=1, lead=1)),
    ("merge_indices", FixIndices()),
    ("model", DMDcRegressor(input_modes=2, extra_modes=2)),
    ("dict_recover", RecoverDict()),
    ("reprojector", DataProjector(direction="backward", projector=None)),
    ("rescaler", DataScaler(direction="backward", scaler=None)),
    ("estimator", DummyEstimator())
], verbose=False)




# Define parameter grid:
param_grid_lr = {
    "get_dicts__input_vars": [["z"]],
    "get_dicts__extra_vars": [[None], ["bcn", "bcs"], ["wu", "wv"], ["bcn", "bcs", "wu", "wv"]],
    "projector__n_dim": [10**(i) for i in range(0, 4)],
    "lag_n_lead_data__lag": [i for i in range(1, 6)],
    "lag_n_lead_data__lead": [0],
    "lag_n_lead_extra__lag": [i for i in range(1, 6)],
    "lag_n_lead_extra__lead": [i for i in range(1, 6)],
}

# Define parameter grid:
param_grid_xgb = {
    "get_dicts__input_vars": [["z"]],
    "get_dicts__extra_vars": [[None], ["bcn", "bcs"], ["wu", "wv"], ["bcn", "bcs", "wu", "wv"]],
    "projector__n_dim": [int(10**(i)) for i in range(0, 4)],    
    "lag_n_lead_data__lag": [i for i in range(1, 6)],
    "lag_n_lead_data__lead": [0],
    "lag_n_lead_extra__lag": [i for i in range(1, 6)],
    "lag_n_lead_extra__lead": [i for i in range(1, 6)],
    "model__n_estimators": [int(i*10) for i in range(1, 6)],
    "model__max_depth": [i for i in range(1, 6)],
    "model__learning_rate": [0.1, 0.01, 0.001],
}

# Define parameter grid:
param_grid_dmdc = {
    "get_dicts__input_vars": [["z"]],
    "get_dicts__extra_vars": [[None], ["bcn", "bcs"], ["wu", "wv"], ["bcn", "bcs", "wu", "wv"]],
    "lag_n_lead_data__lag": [i for i in range(1, 6)],
    "lag_n_lead_data__lead": [0],
    "lag_n_lead_extra__lag": [i for i in range(1, 6)],
    "lag_n_lead_extra__lead": [i for i in range(1, 6)],
    "model__input_modes": [1, 10, 100, 500],
    "model__extra_modes": [1, 10, 100],
}


# Get mikeio area data:
img_data = mf.get_mikeio_format()

mesh_weights = img_data.geometry.get_element_area()

# Define custom scoring setup:
custom_scorer = make_scorer(mf.compute_metric, 
                            greater_is_better=False,
                            kwargs={"metric": "rmse", 
                                    "axis": "time",
                                    "weights": mesh_weights,
                                    "neg": True})

# Define cross-validation setup:
cv = TimeSeriesSplit(n_splits=2)

# Set debug mode:
storage.debug = False

# Test cross-validation:
if 1:
    scores_lr = cross_validate(pipeline_lr, X_df, y_df, 
                               cv=cv, scoring=custom_scorer, 
                               return_estimator=False, 
                               verbose=1, n_jobs=1, error_score="raise")

    # Print scores:
    print(f"Scores (LinearRegressor): {scores_lr['test_score']}")
    print(f"Mean score (LinearRegressor): {np.mean(scores_lr['test_score'])}")
    print(f"Std score (LinearRegressor): {np.std(scores_lr['test_score'])}")
    
    

    scores_dmdc = cross_validate(pipeline_dmdc, X_df, y_df, cv=cv, scoring=custom_scorer,
                            return_estimator=False,
                            verbose=1, n_jobs=1, error_score="raise")

    # Print scores:
    print(f"Scores: (DMDcRegressor) {scores_dmdc['test_score']}")
    print(f"Mean score:(DMDcRegressor) {np.mean(scores_dmdc['test_score'])}")
    print(f"Std score: (DMDcRegressor) {np.std(scores_dmdc['test_score'])}")



    scores_xgb = cross_validate(pipeline_xgb, X_df, y_df,
                                cv=cv, scoring=custom_scorer,
                                return_estimator=False,
                                verbose=1, n_jobs=1, error_score="raise")

    # Print scores:
    print(f"Scores: (XGBRegressor) {scores_xgb['test_score']}")
    print(f"Mean score (XGBRegressor): {np.mean(scores_xgb['test_score'])}")
    print(f"Std score (XGBRegressor): {np.std(scores_xgb['test_score'])}")



import warnings
warnings.filterwarnings("ignore")

    
if 0:     

    max_iters = 1

    # Test randomized search:
    search_lr = RandomizedSearchCV(pipeline_lr, param_grid_lr, n_iter=max_iters,
                                scoring=custom_scorer, n_jobs=1,
                                cv=cv, verbose=10, refit=False,
                                random_state=42, error_score="raise")

    
    search_lr.fit(X_df, y_df)

    # Print results:
    print(f"Best score (LinearRegressor): {search_lr.best_score_}")
    print(f"Best params (LinearRegressor): {search_lr.best_params_}")


    # Test randomized search:
    search_xgb = RandomizedSearchCV(pipeline_xgb, param_grid_xgb, n_iter=max_iters, 
                                    scoring=custom_scorer, n_jobs=1, cv=cv, 
                                    verbose=2, refit=False, random_state=42)

    search_xgb.fit(X_df, y_df)

    # Print results:
    print(f"Best score (XGBRegressor): {search_xgb.best_score_}")
    print(f"Best params (XGBRegressor): {search_xgb.best_params_}")

    
    # Test randomized search:
    search_dmdc = RandomizedSearchCV(pipeline_dmdc, param_grid_dmdc, n_iter=max_iters, scoring=custom_scorer, n_jobs=1, cv=cv, verbose=2, refit=False, random_state=42)

    search_dmdc.fit(X_df, y_df)

    # Print results:
    print(f"Best score (DMDcRegressor): {search_dmdc.best_score_}")
    print(f"Best params (DMDcRegressor): {search_dmdc.best_params_}")




[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   26.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Scores (LinearRegressor): [0.13452574 0.10604277]
Mean score (LinearRegressor): 0.12028425808049349
Std score (LinearRegressor): 0.014241484613632685
