In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import validation_curve
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import KFold
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

pio.templates.default = "plotly_white"
pcolors = px.colors.qualitative.T10
pcolors25 = px.colors.qualitative.Alphabet


In [2]:
def generate_bwu(owu):
    """Unfold an observation-wise (OWU) table into a batch-wise (BWU) table.

    Parameters
    ----------
    owu : pandas.DataFrame
        Frame with a two-level MultiIndex whose first level is named ``"run"``.
        The second level is unstacked into columns. A ``"timesteps"`` column,
        if present, is dropped first.

    Returns
    -------
    pandas.DataFrame
        One row per run (in the sorted order produced by ``groupby``), a fresh
        ``RangeIndex``, and one column per (variable, second-level value) pair
        named ``"<variable>:<value>"``. An empty frame is returned when ``owu``
        has no rows.
    """
    if 'timesteps' in owu.columns:
        owu = owu.drop(["timesteps"], axis=1)

    # Unstack each run on its own and concatenate once. Collecting the pieces
    # first avoids relying on the first group key matching the first index
    # entry (which fails for unsorted input) and avoids quadratic re-copying.
    per_run = [run.unstack(level=1) for _, run in owu.groupby("run")]
    if not per_run:
        return pd.DataFrame()
    bwuindex = pd.concat(per_run)

    # Flatten the (variable, step) column MultiIndex into "variable:step" labels.
    bwu_columns = [f"{variable}:{step}" for variable, step in bwuindex.columns]
    return pd.DataFrame(bwuindex.to_numpy(), columns=bwu_columns)


def generate_y(bwu, return_aggr=False):
    """Derive the end-of-run target from the unfolded titer trajectory.

    Parameters
    ----------
    bwu : pandas.DataFrame
        Batch-wise unfolded table; every column whose name starts with
        ``"X:Titer"`` is read, in column order, as the titer trajectory.
    return_aggr : bool, default False
        When True return the ``"Y:Aggr"`` column, otherwise ``"Y:Titer"``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame indexed like ``bwu`` holding the final value of
        the selected quantity for each row.
    """
    titer_cols = [name for name in bwu.columns if name.startswith("X:Titer")]
    targets = pd.DataFrame(columns=["Y:Titer", "Y:Aggr"], index=bwu.index)
    k_aggr = 10**-7

    # One pass per experiment (row of the BWU table).
    for run in list(bwu.index):
        titer = bwu.loc[run, titer_cols]
        prod = [0]
        aggr = [0]
        for step in range(1, len(titer)):
            current = titer.iloc[step]
            titer_delta = titer.iloc[step] - titer.iloc[step - 1]

            # Provisional entries; both are overwritten at the end of the step.
            prod.append(current)
            aggr.append(k_aggr * (current**2))

            # First estimate of the aggregate increment, used to correct the
            # product increment, which then yields the final aggregate increment.
            aggr_delta = aggr[step] - aggr[step - 1]
            prod_delta = titer_delta - 2 * aggr_delta
            aggr_delta = k_aggr * (prod[step - 1] + prod_delta) ** 2

            aggr[step] = aggr[step - 1] + aggr_delta
            prod[step] = prod[step - 1] + prod_delta

        targets.loc[run, "Y:Titer"] = prod[-1]
        targets.loc[run, "Y:Aggr"] = aggr[-1]

    chosen = "Y:Aggr" if return_aggr else "Y:Titer"
    return pd.DataFrame(targets[chosen])


def vip(X, model):
    """Compute Variable Importance in Projection (VIP) scores for a fitted PLS model.

    Parameters
    ----------
    X : array-like with a ``shape`` of ``(m, p)``
        Only the number of variables ``p`` is read from it.
    model : object
        Fitted model exposing ``x_scores_`` (T), ``x_weights_`` (W) and
        ``y_loadings_`` (c in the formula).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(p,)`` with one VIP score per variable:
        ``VIP_j = sqrt(p * sum_i(SS_i * (w_ji / ||w_i||)^2) / sum_i(SS_i))``.
    """
    t = model.x_scores_
    w = model.x_weights_
    q = model.y_loadings_

    _, p = X.shape

    # SS(c_i t_i) for every latent variable: diagonal of (T^t T)(c^t c).
    ss = np.diag(t.T @ t @ q.T @ q)

    # Squared, column-normalised weights. The column norms are computed once
    # here instead of once per (variable, component) pair inside a loop.
    w_normalised_sq = (w / np.linalg.norm(w, axis=0)) ** 2

    return np.sqrt(p * (w_normalised_sq @ ss) / np.sum(ss))

In [3]:
def r2(y, y_pred):
    """Return ``r2_score(y, y_pred)`` rounded to three decimals."""
    score = r2_score(y, y_pred)
    return round(score, 3)


def absolute_rmse(y, y_pred):
    """Return the root mean squared error of ``y_pred`` against ``y``, rounded to three decimals."""
    rmse = root_mean_squared_error(y, y_pred)
    return round(rmse, 3)


def relative_rmse(y, y_pred):
    """Return the RMSE divided by the standard deviation of ``y``, rounded to three decimals.

    The standard deviation is taken over all elements of ``y`` (``np.std`` on
    the array without an axis argument).
    """
    rmse = root_mean_squared_error(y, y_pred)
    spread = np.std(np.array(y))
    return round(rmse / spread, 3)

# Dataset

In [4]:
# Dataset variant to load; switch by (un)commenting one of the two assignments.
# data_type = 'interpolation'
data_type = 'extrapolation'
# Folder holding the CSV files of the selected variant (ends with a slash).
root_path = f'dataset/datahow_2022/{data_type}/'

def read_owu_v4(file, root_path = 'dataset/datahow_2022/interpolation/', t_steps=15, max_runs=10):
    """Load an observation-wise (OWU) CSV as a frame indexed by ``(run, time)``.

    Parameters
    ----------
    file : str
        File stem; ``{root_path}/{file}.csv`` is read.
    root_path : str
        Folder containing the CSV files.
    t_steps : int, default 15
        Number of time steps per run. Only used when the CSV has no ``run``
        column and the index has to be rebuilt.
    max_runs : int or None, default 10
        Keep only runs whose id is below this value; ``None`` keeps all runs.

    Returns
    -------
    pandas.DataFrame
        Data indexed by a ``("run", "time")`` MultiIndex.

    Raises
    ------
    ValueError
        If the index has to be rebuilt and the row count is not
        ``num_runs * t_steps``.
    """
    owu_df = pd.read_csv(f'{root_path}/{file}.csv')

    if 'run' not in owu_df.columns:
        # The companion DOE file is only needed to learn the number of runs,
        # so it is read only on this branch.
        num_runs = len(pd.read_csv(f'{root_path}/{file}_doe.csv'))
        if len(owu_df) != num_runs * t_steps:
            raise ValueError(
                f"{file}.csv has {len(owu_df)} rows, expected "
                f"{num_runs} runs x {t_steps} time steps."
            )
        owu_df.index = pd.MultiIndex.from_product(
            [list(range(num_runs)), list(range(t_steps))], names=["run", "time"]
        )
    else:
        owu_df = owu_df.set_index(['run', 'time'])

    if max_runs is None:
        return owu_df
    return owu_df.loc[owu_df.index.get_level_values('run') < max_runs, :]

def read_doe(file, root_path= 'dataset/datahow_2022/interpolation/', max_runs=10):
    """Load the design-of-experiments (DOE) table for a set of runs.

    Parameters
    ----------
    file : str
        File stem; ``{root_path}/{file}.csv`` is read.
    root_path : str
        Folder containing the CSV files.
    max_runs : int or None, default 10
        Keep only rows whose positional index is below this value; ``None``
        keeps every row.

    Returns
    -------
    pandas.DataFrame
        The columns ``feed_start``, ``feed_end``, ``Glc_feed_rate``, ``Glc_0``
        and ``VCD_0`` for the retained rows.
    """
    doe_df = pd.read_csv(
        f'{root_path}/{file}.csv',
        usecols=["feed_start", "feed_end", "Glc_feed_rate", "Glc_0", "VCD_0"],
    )
    if max_runs is None:
        return doe_df
    return doe_df.loc[doe_df.index < max_runs, :]

In [5]:
# Training set: observation-wise table, DOE factors, batch-wise unfolding and
# the end-of-run titer target derived from it.
owu = read_owu_v4('owu', root_path=root_path)
doe = read_doe('owu_doe', root_path=root_path)
bwu = generate_bwu(owu)
tar = generate_y(bwu, return_aggr=False)

# Test set: same four steps on the `owu_test` files.
owu_test = read_owu_v4('owu_test', root_path=root_path)
doe_test = read_doe('owu_test_doe', root_path=root_path)
bwu_test = generate_bwu(owu_test)
tar_test = generate_y(bwu_test, return_aggr=False)

# PLS Introduction
<details>
<summary>
<font size="3" color="black">
<b> ⏏︎Click to open</b>
</font>
</summary>

<img src="assets/pls_explain.png" alt="Variables Type" width="1000">


PLS 模型允许我们使用更多的变量/列而不会导致模型过度拟合。

在只有一个响应变量 $ y $ 和 $ k $ 个预测变量的情况下，具有 $ h $ 个潜变量的 PLS 回归模型表达如下：

$$ X = T W^t + E$$

$$ y = U c^t + f $$

### Model Explanation

- **X**：原始预测变量矩阵。
- **T**：得分矩阵（潜变量矩阵）。
- **W**：权重矩阵。
- **E**：误差矩阵。
- **y**：响应变量。
- **U**：响应变量的得分矩阵。
- **c**：回归系数向量。
- **f**：响应变量的误差向量。

PLS 模型通过找到一组新的潜变量（得分矩阵 $ T $ 和 $ U $）来解释原始变量和响应变量之间的关系，从而减少数据的维度并避免多重共线性的问题。

### VIP Scores

VIP 分数是用于衡量变量在模型中重要性的指标。对于第 $ j $ 个变量，VIP 分数计算公式如下：

$$ VIP_j = \left( k \sum_{i=1}^h \left(SS(c_i t_i) \left(\frac{w_{ij}}{||w_i||}\right)^2\right) / \sum_{i=1}^h SS(c_i t_i) \right)^{1/2} $$

- **VIP_j**：第 $ j $ 个变量的 VIP 分数。
- **k**：总的预测变量数。
- **h**：潜变量的数量。
- **SS(c_i t_i)**：第 $ i $ 个潜变量的平方和。
- **w_{ij}**：第 $ j $ 个变量在第 $ i $ 个潜变量中的权重。
- **||w_i||**：第 $ i $ 个潜变量权重的范数。

### VIP Usage

- VIP 分数的平方平均值等于 1，因此“大于一规则”通常用作变量选择的标准。即，VIP 分数大于 1 的变量被认为对模型重要，可以优先保留。

### Summary
- PLS 模型通过引入潜变量减少维度，并避免多重共线性的问题，使得我们可以使用更多的变量而不会导致模型过度拟合。
- VIP 分数则帮助我们评估每个变量在模型中的重要性，提供了一个有效的变量选择标准。

</details>

# Data-Driven Models for Simulation

In [6]:
def transform_owu(owu, t_steps=15, batch_first=False):
    """Stack the ``X:`` columns of an OWU frame into a dense 3-D array.

    Parameters
    ----------
    owu : pandas.DataFrame
        Frame indexed by a MultiIndex with levels ``"run"`` and ``"time"``.
    t_steps : int, default 15
        Number of rows every run must contain.
    batch_first : bool, default False
        If True the result has shape ``(runs, t_steps, variables)``,
        otherwise ``(t_steps, variables, runs)``.

    Returns
    -------
    tuple
        ``(X, X_columns)`` where ``X_columns`` lists the columns whose name
        contains ``"X:"`` in their original order.

    Raises
    ------
    ValueError
        If a run does not have exactly ``t_steps`` rows.
    """
    X_columns = [name for name in owu.columns if "X:" in name]
    ordered = owu[X_columns].copy().sort_index(level=["run", "time"])

    n_vars = len(X_columns)
    n_runs = ordered.index.get_level_values("run").nunique()
    shape = (n_runs, t_steps, n_vars) if batch_first else (t_steps, n_vars, n_runs)
    X = np.zeros(shape)

    for position, (run, block) in enumerate(ordered.groupby(level="run")):
        if len(block) != t_steps:
            raise ValueError(f"Run {run} does not have {t_steps} time steps.")

        # Pick the (time, variable) slice belonging to this run and fill it.
        destination = X[position, :, :] if batch_first else X[:, :, position]
        destination[...] = block.values

    return X, X_columns


def plot_rmse_by_latent_vars(all_train_eval, all_valid_eval, latent_vars_options):
    """Plot train and validation RMSE over time, one pair of curves per latent-variable count.

    Parameters
    ----------
    all_train_eval, all_valid_eval : dict
        Nested mappings ``{time_step: {latent_vars: rmse}}``. The time steps
        are taken from ``all_train_eval``.
    latent_vars_options : iterable
        Latent-variable counts to draw; each must be a key of the inner dicts.
    """
    time_steps = list(all_train_eval.keys())

    # Rows index the latent-variable options after the transpose.
    train_by_lv = np.array(
        [[all_train_eval[t][lv] for lv in latent_vars_options] for t in time_steps]
    ).T
    valid_by_lv = np.array(
        [[all_valid_eval[t][lv] for lv in latent_vars_options] for t in time_steps]
    ).T

    fig = go.Figure()
    curve_kinds = (
        ("Train", train_by_lv, "solid"),
        ("Validation", valid_by_lv, "dash"),
    )
    for row, lv in enumerate(latent_vars_options):
        for label, table, dash_style in curve_kinds:
            fig.add_trace(
                go.Scatter(
                    x=list(time_steps),
                    y=table[row],
                    mode='lines+markers',
                    name=f'{label} - {lv}',
                    line=dict(dash=dash_style),
                )
            )

    fig.update_layout(
        title='RMSE by Latent Variables Over Time',
        xaxis_title='Time Steps',
        yaxis_title='RMSE',
        legend_title='Evaluation Type',
        width=1000,
        height=600,
    )
    fig.show()



def plot_multi_step_pls_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns=None,
):
    """Show observed-vs-predicted scatter plots (train and test) for every variable.

    Parameters
    ----------
    X, X_pred : numpy.ndarray
        Observed and predicted training data, indexed as ``[:, variable, run]``.
    X_test, X_test_pred : numpy.ndarray
        Observed and predicted test data with the same layout.
    X_columns : sequence of str, optional
        Variable names, one per entry of the second axis. When omitted,
        positional labels ``var_0, var_1, ...`` are used.

    Notes
    -----
    One figure with two panels is shown per variable; the panel titles carry
    R^2, absolute RMSE and relative RMSE from the notebook's metric helpers.
    NOTE(review): the helpers receive 2-D ``(time, run)`` slices - confirm that
    scikit-learn's multi-output averaging is the intended aggregation.
    """
    if X_columns is None:
        # The default used to crash in ``enumerate(None)``.
        X_columns = [f"var_{idx}" for idx in range(X.shape[1])]

    for var_idx, col in enumerate(X_columns):
        y = X[:, var_idx, :].copy()
        y_pred = X_pred[:, var_idx, :].copy()
        y_test = X_test[:, var_idx, :].copy()
        y_test_pred = X_test_pred[:, var_idx, :].copy()

        # Metrics for training set
        train_r2 = r2(y, y_pred)
        train_abs_rmse = absolute_rmse(y, y_pred)
        train_rel_rmse = relative_rmse(y, y_pred)

        # Metrics for testing set
        test_r2 = r2(y_test, y_test_pred)
        test_abs_rmse = absolute_rmse(y_test, y_test_pred)
        test_rel_rmse = relative_rmse(y_test, y_test_pred)

        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=(
                f"Train Set - {col} <br> R^2 = {train_r2} <br> Abs RMSE = {train_abs_rmse} <br> Rel RMSE = {train_rel_rmse}",
                f"Test Set - {col} <br> R^2 = {test_r2} <br> Abs RMSE = {test_abs_rmse} <br> Rel RMSE = {test_rel_rmse}",
            ),
        )

        # (subplot column, legend label, legend-group prefix, observed, predicted)
        panels = (
            (1, "Train", "train", y, y_pred),
            (2, "Test", "test", y_test, y_test_pred),
        )
        for panel_col, label, group, observed, predicted in panels:
            # One marker series per run; a dedicated loop variable keeps the
            # outer variable index from being shadowed.
            for run_idx in range(observed.shape[1]):
                fig.add_trace(
                    go.Scatter(
                        x=observed[:, run_idx].reshape(-1),
                        y=predicted[:, run_idx].reshape(-1),
                        mode="markers",
                        name=f"Run id in {label} {run_idx}",
                        legendgroup=f"{group}_{run_idx}",
                    ),
                    row=1,
                    col=panel_col,
                )
            # Identity line spanning the predicted range, pinned to its panel.
            fig.add_shape(
                type="line",
                x0=predicted.min(),
                y0=predicted.min(),
                x1=predicted.max(),
                y1=predicted.max(),
                layer="above",
                line=dict(dash="dash"),
                row=1,
                col=panel_col,
            )

        fig.update_layout(width=1600)
        fig.update_xaxes(title="Observed values", row=1, col=1)
        fig.update_xaxes(title="Observed values", row=1, col=2)
        fig.update_yaxes(title="Predicted values", row=1, col=1)
        fig.update_yaxes(title="Predicted values", row=1, col=2)
        fig.show()



def plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns=None,
):
    """Draw a grouped bar chart of relative RMSE per variable for train and test.

    Parameters
    ----------
    X, X_pred : numpy.ndarray
        Observed and predicted training data, indexed as ``[:, variable, run]``.
    X_test, X_test_pred : numpy.ndarray
        Observed and predicted test data with the same layout.
    X_columns : sequence of str, optional
        Variable names, one per entry of the second axis. When omitted,
        positional labels ``var_0, var_1, ...`` are used.
    """
    if X_columns is None:
        # The default used to crash in ``enumerate(None)``.
        X_columns = [f"var_{idx}" for idx in range(X.shape[1])]

    relative_rmse_train = []
    relative_rmse_test = []
    for var_idx, _ in enumerate(X_columns):
        relative_rmse_train.append(
            relative_rmse(X[:, var_idx, :].copy(), X_pred[:, var_idx, :].copy())
        )
        relative_rmse_test.append(
            relative_rmse(X_test[:, var_idx, :].copy(), X_test_pred[:, var_idx, :].copy())
        )

    fig_rmse = go.Figure()
    bar_series = (
        ("Train Set", relative_rmse_train, pcolors[0]),
        ("Test Set", relative_rmse_test, pcolors[1]),
    )
    for label, values, colour in bar_series:
        fig_rmse.add_trace(
            go.Bar(
                x=X_columns,
                y=values,
                name=label,
                marker_color=colour,
                text=[f"{v:.2f}" for v in values],
                textposition="outside",
            )
        )

    fig_rmse.update_layout(
        barmode="group",
        title="Relative RMSE for Each Variables",
        xaxis_title="Feature",
        yaxis_title="Relative RMSE",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )

    fig_rmse.show()


def plot_predicted_profile(X, X_pred, X_columns, select_runs=(0,), height=1000):
    """Plot observed and predicted trajectories of every variable for selected runs.

    Parameters
    ----------
    X, X_pred : numpy.ndarray
        Observed and predicted data, indexed as ``[time, variable, run]``.
    X_columns : sequence of str
        Variable names; one subplot is created per name, five per row.
    select_runs : iterable of int, default (0,)
        Indices along the run axis to draw. Each run gets one colour; its
        predicted curve is dashed.
    height : int, default 1000
        Figure height passed to the layout.
    """
    max_cols_per_row = 5
    num_columns = len(X_columns)
    # Ceiling division. The previous ``(n + 5) // 5`` added a spurious empty
    # row whenever n was a multiple of five.
    num_rows = max(1, -(-num_columns // max_cols_per_row))
    # Take the time axis from the data rather than assuming 15 steps.
    time_axis = list(range(X.shape[0]))

    fig = make_subplots(
        rows=num_rows, cols=min(num_columns, max_cols_per_row),
        subplot_titles=X_columns
    )

    color_palette = px.colors.qualitative.Plotly

    for idx, j in enumerate(select_runs):
        color = color_palette[idx % len(color_palette)]
        for i, _ in enumerate(X_columns):
            row = i // max_cols_per_row + 1
            col = i % max_cols_per_row + 1
            # Only the first subplot contributes legend entries for a run.
            show_legend = (i == 0)
            fig.add_trace(
                go.Scatter(
                    x=time_axis,
                    y=X[:, i, j],
                    name=f"Run {j} Observed",
                    marker=dict(color=color),
                    showlegend=show_legend,
                    legendgroup=f"group_{j}"
                ),
                row=row,
                col=col,
            )
            fig.add_trace(
                go.Scatter(
                    x=time_axis,
                    y=X_pred[:, i, j],
                    name=f"Run {j} Predicted",
                    line=dict(dash="dash"),
                    marker=dict(color=color),
                    showlegend=show_legend,
                    legendgroup=f"group_{j}"
                ),
                row=row,
                col=col,
            )

    fig.update_layout(
        showlegend=True,
        title_text="Process variable evolution for selected runs",
        height=height,
    )
    fig.show()


## BB-PLS1
- Black Box - Partial Least Square Model (PLS1)
- One model per timepoint per process variable is developed, denoted as $PLS1_{i, t}$
- Training: $[Z, X(t = 0)] \rightarrow PLS1_{i, t} \rightarrow X_i(t = t_{model})$

In [7]:
def fit_pls_model(
    X, y, latent_variables=5, polynomial_degree=1, interactions_only=False
):
    """Fit a polynomial-features -> standard-scaler -> PLS regression pipeline.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictors.
    y : array-like
        Response.
    latent_variables : int, default 5
        Requested number of PLS components. For ``polynomial_degree == 1`` it
        is capped at ``min(n_samples, n_features)``; for degree 0 it is forced
        to 1.
    polynomial_degree : int, default 1
        Degree handed to ``PolynomialFeatures``.
    interactions_only : bool, default False
        Passed to ``PolynomialFeatures`` as ``interaction_only``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``features``, ``scaler`` and ``model``.
    """
    include_bias = False
    if polynomial_degree == 0:
        print("Constant model for pls is not allowed!")
        latent_variables = 1
        include_bias = True
    if polynomial_degree == 1:
        # A degree-1 expansion without bias keeps the feature count unchanged,
        # so the cap can be read off the input. This replaces a hard-coded 5
        # and mirrors the bound used by cross_val_predict_pls.
        n_samples, n_features = np.shape(X)
        latent_variables = min(latent_variables, n_samples, n_features)
    poly_features = PolynomialFeatures(
        degree=polynomial_degree,
        interaction_only=interactions_only,
        include_bias=include_bias,
    )

    # Normalise the expanded features before they reach the PLS step.
    standard_scaler = StandardScaler(with_mean=True, with_std=True)

    pls_model = PLSRegression(n_components=latent_variables, scale=True)
    pipe = Pipeline(
        [("features", poly_features), ("scaler", standard_scaler), ("model", pls_model)]
    )

    # Fit PLS model
    pipe.fit(X, y)
    return pipe


def fit_multi_step_pls_model(
    Z,
    X,
    latent_variables=2,
    polynomial_degree=1,
    interactions_only=False,
    best_latent_vars=None,
):
    """Train one BB-PLS1 model per (time step, variable) pair.

    Parameters
    ----------
    Z : array-like
        Predictors shared by every model; passed to ``fit_pls_model`` as ``X``.
    X : numpy.ndarray
        Targets indexed as ``[time, variable, run]``.
    latent_variables : int, default 2
        Number of components used when no per-step override applies.
    polynomial_degree, interactions_only
        Forwarded unchanged to ``fit_pls_model``.
    best_latent_vars : dict, optional
        Mapping ``time step -> number of components``. Steps that are missing
        (or mapped to ``None``) fall back to ``latent_variables``.

    Returns
    -------
    dict
        ``models[t][i]`` is the fitted pipeline for time step ``t`` and
        variable ``i``.
    """
    T, C, _ = X.shape
    models = {}
    for t in range(T):
        # Resolve the component count once per time step without overwriting
        # the caller's default, so a missing key no longer yields None.
        override = best_latent_vars.get(t) if best_latent_vars else None
        n_components = latent_variables if override is None else override

        models[t] = {}
        for i in range(C):
            models[t][i] = fit_pls_model(
                X=Z,
                y=X[t, i, :],
                latent_variables=n_components,
                polynomial_degree=polynomial_degree,
                interactions_only=interactions_only,
            )
    return models


def predict_multi_step_pls_model(Z, multi_step_models, t_steps=15):
    """Predict every variable at every time step with the per-step models.

    Parameters
    ----------
    Z : two-dimensional array-like
        Predictors, one row per run.
    multi_step_models : dict
        ``multi_step_models[t][i]`` must expose ``predict``; keys ``t`` run
        from 0 to ``len(multi_step_models) - 1``.
    t_steps : int, default 15
        Expected number of time steps.

    Returns
    -------
    numpy.ndarray
        Predictions of shape ``(time steps, variables, runs)``.

    Raises
    ------
    ValueError
        If the number of time steps in ``multi_step_models`` differs from
        ``t_steps``.
    """
    n_runs, _ = Z.shape
    n_steps = len(multi_step_models)
    n_vars = len(multi_step_models[0])

    if n_steps != t_steps:
        raise ValueError(f"Models does not have {t_steps} time steps.")

    X_pred = np.zeros((n_steps, n_vars, n_runs))
    for step in range(n_steps):
        step_models = multi_step_models[step]
        for var in range(n_vars):
            X_pred[step, var, :] = step_models[var].predict(Z)

    return X_pred


def cross_val_predict_pls(X, y, latent_variables, cv_folds):
    """Estimate train and validation RMSE of the BB-PLS1 model by shuffled K-fold.

    Parameters
    ----------
    X, y : numpy.ndarray
        Predictors and response; both are indexed with integer arrays.
    latent_variables : int
        Requested number of components, capped per fold at
        ``min(n_train_samples, n_features)``.
    cv_folds : int
        Number of folds (``KFold`` with ``shuffle=True, random_state=42``).

    Returns
    -------
    tuple of float
        Mean training RMSE and mean validation RMSE across folds.
    """
    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    rmse_train = []
    rmse_valid = []

    for fit_idx, held_out_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_held, y_held = X[held_out_idx], y[held_out_idx]

        # Never request more components than the fold can support.
        n_components = min(latent_variables, X_fit.shape[0], X_fit.shape[1])
        model = fit_pls_model(X_fit, y_fit, n_components)

        fitted = model.predict(X_fit)
        held_out = model.predict(X_held)
        rmse_train.append(absolute_rmse(y_fit, fitted))
        rmse_valid.append(absolute_rmse(y_held, held_out))

    return np.mean(rmse_train), np.mean(rmse_valid)


def multi_step_pls_cross_validation_for_each_day(
    doe, owu, X_columns, latent_variables_options, cv_folds=5, t_steps=15
):
    """Cross-validate the BB-PLS1 model for every time step and component count.

    Parameters
    ----------
    doe : pandas.DataFrame
        Predictors, one row per run; ``doe.values`` is used.
    owu : pandas.DataFrame
        Observation-wise data handed to ``transform_owu``.
    X_columns : sequence
        One entry per target variable; entry ``i`` is matched to position ``i``
        of the variable axis returned by ``transform_owu``.
        NOTE(review): assumes this is the column list ``transform_owu`` itself
        returns - confirm at the call site.
    latent_variables_options : iterable of int
        Component counts to evaluate.
    cv_folds : int, default 5
        Number of folds.
    t_steps : int, default 15
        Number of time steps per run (previously hard-coded).

    Returns
    -------
    tuple
        ``(all_train_eval, all_valid_eval, latent_variables_options)`` where
        both dicts map ``time step -> {component count: mean RMSE over
        variables}``.
    """
    all_train_eval = {}
    all_valid_eval = {}

    X, _ = transform_owu(owu, t_steps=t_steps, batch_first=False)
    # The predictors are the same for every model, so extract them once.
    Z = doe.values

    for t in range(t_steps):
        all_train_eval[t] = {}
        all_valid_eval[t] = {}

        for latent_vars in tqdm(latent_variables_options):
            train_eval, valid_eval = [], []

            for i, _ in enumerate(X_columns):
                rmse_train, rmse_valid = cross_val_predict_pls(
                    Z, X[t, i, :], latent_vars, cv_folds
                )
                train_eval.append(rmse_train)
                valid_eval.append(rmse_valid)

            all_train_eval[t][latent_vars] = np.mean(train_eval)
            all_valid_eval[t][latent_vars] = np.mean(valid_eval)

    return all_train_eval, all_valid_eval, latent_variables_options


### Setting

In [8]:
# Number of latent variables
LATENT_VARIABLES = 3
# Polynomial degree of features
POLYNOMIAL_DEGREE_PLS = 1
# Add only interactions between features
INTERACTIONS_ONLY = True

# Number of days of process history
PROCESS_HISTORY = 15

### Data

In [9]:
# Dense (time, variable, run) arrays for the training and the test runs.
# X_columns is assigned by both calls; the value from the test set remains.
X, X_columns = transform_owu(owu, t_steps=15, batch_first=False)
X_test, X_columns = transform_owu(owu_test, t_steps=15, batch_first=False)

### Train

In [10]:
# Fit one BB-PLS1 model per (time step, variable) from the DOE factors alone.
models = fit_multi_step_pls_model(
    Z=doe,
    X=X,
    latent_variables=LATENT_VARIABLES,
    polynomial_degree=POLYNOMIAL_DEGREE_PLS,
    interactions_only=INTERACTIONS_ONLY,
)

### Test

In [11]:
# Predict the full trajectories for the training and the test runs.
X_pred = predict_multi_step_pls_model(Z=doe, multi_step_models=models, t_steps=15)
X_test_pred = predict_multi_step_pls_model(Z=doe_test, multi_step_models=models, t_steps=15)

# Observed-vs-predicted scatter plots, one figure per variable.
plot_multi_step_pls_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

# Relative RMSE per variable, train vs. test.
plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

In [12]:
# Observed and predicted test trajectories for the runs at positions 0 and 9.
plot_predicted_profile(X_test, X_test_pred, X_columns, select_runs=[0,9], height=500)

### K-Fold

In [13]:
# Candidate component counts 1..15. fit_pls_model caps the count at 5 for a
# degree-1 expansion, so larger candidates fit the same model as 5.
latent_vars_options = list(range(1, 16))

# 5-fold cross-validation for every time step and every candidate.
all_train_eval, all_valid_eval, latent_vars_options = multi_step_pls_cross_validation_for_each_day(
    doe,
    owu,
    X_columns=X_columns,
    latent_variables_options=latent_vars_options,
    cv_folds=5
)

  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 15/15 [00:00<00:00, 60.97it/s]
100%|██████████| 15/15 [00:00<00:00, 57.37it/s]
100%|██████████| 15/15 [00:00<00:00, 56.64it/s]
100%|██████████| 15/15 [00:00<00:00, 53.87it/s]
100%|██████████| 15/15 [00:00<00:00, 55.54it/s]
100%|██████████| 15/15 [00:00<00:00, 55.48it/s]
100%|██████████| 15/15 [00:00<00:00, 55.01it/s]
100%|██████████| 15/15 [00:00<00:00, 55.64it/s]
100%|██████████| 15/15 [00:00<00:00, 57.14it/s]
100%|██████████| 15/15 [00:00<00:00, 58.21it/s]
100%|██████████| 15/15 [00:00<00:00, 56.34it/s]
100%|██████████| 15/15 [00:00<00:00, 55.85it/s]
100%|██████████| 15/15 [00:00<00:00, 55.62it/s]
100%|██████████| 15/15 [00:00<00:00, 55.49it/s]
100%|██████████| 15/15 [00:00<00:00, 53.97it/s]


In [14]:
# Plot the RMSE curves
plot_rmse_by_latent_vars(all_train_eval, all_valid_eval, latent_vars_options)

# Pick, for every time step, the component count with the lowest validation RMSE
# (ties resolve to the first candidate in iteration order).
best_latent_vars = {}
for t, evals in all_valid_eval.items():
    best_latent_vars[t] = min(evals, key=evals.get)

print("Best latent variables for each time step:", best_latent_vars)

Best latent variables for each time step: {0: 5, 1: 5, 2: 1, 3: 1, 4: 1, 5: 1, 6: 2, 7: 2, 8: 2, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4}


### Retrain

In [15]:
# Refit with the per-time-step component counts chosen by cross-validation;
# they take precedence over LATENT_VARIABLES inside fit_multi_step_pls_model.
models = fit_multi_step_pls_model(
    Z=doe,
    X=X,
    latent_variables=LATENT_VARIABLES,
    polynomial_degree=POLYNOMIAL_DEGREE_PLS,
    interactions_only=INTERACTIONS_ONLY,
	best_latent_vars=best_latent_vars
)

In [16]:
# Re-evaluate the retrained models on the training and the test runs.
X_pred = predict_multi_step_pls_model(Z=doe, multi_step_models=models, t_steps=15)
X_test_pred = predict_multi_step_pls_model(Z=doe_test, multi_step_models=models, t_steps=15)

# Observed-vs-predicted scatter plots, one figure per variable.
plot_multi_step_pls_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

# Relative RMSE per variable, train vs. test.
plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

## BWU-PLS1
- Batch Wise Unfolded - Partial Least Square Model (PLS1), also called Historical-PLS
- One model per timepoint per process variable is developed, denoted as $PLS1_{i, t}$
	- But, using the process condition and the historical information available until a given time
- Training: $[Z, X(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$
- Testing: $[Z, X(t = 0), X^{predicted}(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$

In [17]:
def fit_hist_pls_model(
    X,
    y,
    latent_variables=15,
):
    """Fit a standard-scaler -> PLS regression pipeline on ``(X, y)``.

    Parameters
    ----------
    X, y : array-like
        Predictors and response.
    latent_variables : int, default 15
        Number of PLS components.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps ``scaler`` and ``model``.
    """
    steps = [
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("model", PLSRegression(n_components=latent_variables)),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)
    return pipe


def fit_multi_step_hist_pls_model(
    doe,
    owu,
    latent_variables,
    t_steps,
    X_columns=None,
    W_columns=None,
    best_latent_vars=None,
):
    """Train one historical (BWU) PLS1 model per (variable, time step) pair.

    For each time step ``t`` in ``1 .. t_steps - 1`` the predictors are the DOE
    factors stacked with the batch-wise unfolded ``X_columns`` and
    ``W_columns`` observed at ``time < t``; the response is each X column at
    ``time == t``.

    Parameters
    ----------
    doe : pandas.DataFrame
        DOE factors, one row per run.
        NOTE(review): rows are matched to runs by position - confirm that
        ``doe`` and ``owu`` list the runs in the same (sorted) order.
    owu : pandas.DataFrame
        Observation-wise data indexed by ``(run, time)``.
    latent_variables : int
        Number of components used when no per-step override applies.
    t_steps : int
        Number of time steps; models are fitted for ``1 .. t_steps - 1``.
    X_columns, W_columns : list of str
        Column names of the process variables and of the inputs. Both are
        required despite the ``None`` defaults.
    best_latent_vars : dict, optional
        Mapping ``time step -> number of components``. Steps that are missing
        (or mapped to ``None``) fall back to ``latent_variables``.

    Returns
    -------
    dict
        ``models[(i, t)]`` is the fitted pipeline for variable index ``i`` at
        time step ``t``.
    """
    models = {}
    time_index = owu.index.get_level_values("time")
    Z = doe.values

    for t in range(1, t_steps):
        in_past = time_index < t
        X_hist = owu.loc[in_past, X_columns]
        y_hist = owu.loc[time_index == t, X_columns]
        if X_hist.empty or y_hist.empty:
            continue

        # The design matrix is identical for every target variable at this
        # step, so it is built once instead of once per variable.
        X_bwu_hist = generate_bwu(X_hist)
        W_bwu_hist = generate_bwu(owu.loc[in_past, W_columns])
        X_preproc = np.hstack([Z, X_bwu_hist.values, W_bwu_hist.values])

        # Resolve the component count without overwriting the caller's
        # default, so a missing key no longer yields None.
        override = best_latent_vars.get(t) if best_latent_vars else None
        n_components = latent_variables if override is None else override

        for i, col in enumerate(X_columns):
            y = y_hist.loc[:, col].values
            models[(i, t)] = fit_hist_pls_model(X_preproc, y, n_components)

    return models


def predict_multi_step_hist_pls_model(
    doe, X0, W, multi_step_models, t_steps, X_columns=None
):
    """Roll the historical PLS1 models forward from the initial state.

    At each time step ``t`` the predictors are the DOE factors stacked with the
    unfolded history of X (measured at the start, predicted afterwards) and
    the unfolded inputs ``W`` at ``time < t``. The predictions are appended to
    the history and feed the next step.

    Parameters
    ----------
    doe : pandas.DataFrame
        DOE factors, one row per run.
    X0 : pandas.DataFrame
        Initial process state indexed by ``(run, time)``.
        NOTE(review): assumed to hold exactly one row (time 0) per run - the
        final reshape requires ``t_steps`` rows per run.
    W : pandas.DataFrame
        Inputs indexed by ``(run, time)`` for all time steps.
    multi_step_models : dict
        Models keyed by ``(variable index, time step)``.
    t_steps : int
        Number of time steps in the produced trajectory.
    X_columns : list of str
        Names of the predicted variables, in model order. Required despite
        the ``None`` default.

    Returns
    -------
    numpy.ndarray
        Trajectories of shape ``(t_steps, variables, runs)``; step 0 is ``X0``.

    Raises
    ------
    KeyError
        If no model exists for a required ``(variable index, time step)``.
    """
    X_hist = X0.copy()
    Z = doe.values
    runs = list(X0.index.get_level_values("run").unique())
    w_time = W.index.get_level_values("time")

    for t in range(1, t_steps):
        # Every variable at this step shares the same predictors, so build
        # them once per step.
        X_bwu_hist = generate_bwu(X_hist)
        W_bwu_hist = generate_bwu(W.loc[w_time < t])
        X_preproc = np.hstack(
            [Z, X_bwu_hist, W_bwu_hist]
        )  # B, X_bwu_columns_length+Z_columns_length

        predictions = {}
        for i, col in enumerate(X_columns):
            model = multi_step_models.get((i, t))
            if model is None:
                raise KeyError(f"No model for variable index {i} at time step {t}.")
            # ravel guards against (n, 1)-shaped predictions.
            predictions[col] = list(np.ravel(model.predict(X_preproc)))

        next_pred = pd.DataFrame(predictions)
        next_pred.index = pd.MultiIndex.from_product(
            [runs, [t]],
            names=["run", "time"],
        )
        X_hist = pd.concat([X_hist, next_pred])

    # Use the caller's horizon; the default of 15 was applied before and
    # broke any other value of t_steps.
    X_pred, _ = transform_owu(X_hist, t_steps=t_steps)
    return X_pred


def cross_val_predict_hist_pls(X, y, latent_variables, cv_folds):
    """Estimate train and validation RMSE of the historical PLS model by shuffled K-fold.

    Parameters
    ----------
    X, y : numpy.ndarray
        Predictors and response; both are indexed with integer arrays.
    latent_variables : int
        Requested number of components, capped per fold at
        ``min(n_train_samples, n_features)``.
    cv_folds : int
        Number of folds (``KFold`` with ``shuffle=True, random_state=42``).

    Returns
    -------
    tuple of float
        Mean training RMSE and mean validation RMSE across folds.
    """
    splitter = KFold(n_splits=cv_folds, shuffle=True, random_state=42)
    rmse_train = []
    rmse_valid = []

    for fit_idx, held_out_idx in splitter.split(X):
        X_fit, y_fit = X[fit_idx], y[fit_idx]
        X_held, y_held = X[held_out_idx], y[held_out_idx]

        # Upper bound for n_components in this fold.
        n_components = min(latent_variables, X_fit.shape[0], X_fit.shape[1])
        model = fit_hist_pls_model(X_fit, y_fit, n_components)

        fitted = model.predict(X_fit)
        held_out = model.predict(X_held)
        rmse_train.append(absolute_rmse(y_fit, fitted))
        rmse_valid.append(absolute_rmse(y_held, held_out))

    return np.mean(rmse_train), np.mean(rmse_valid)


def multi_step_hist_pls_cross_validation_for_each_day(
    doe, owu, X_columns, W_columns, latent_variables_options, cv_folds=5, t_steps=15
):
    """Cross-validate the historical PLS1 model for every time step and component count.

    Parameters
    ----------
    doe : pandas.DataFrame
        DOE factors, one row per run; ``doe.values`` is used.
    owu : pandas.DataFrame
        Observation-wise data indexed by ``(run, time)``.
    X_columns, W_columns : list of str
        Column names of the process variables and of the inputs.
    latent_variables_options : iterable of int
        Component counts to evaluate.
    cv_folds : int, default 5
        Number of folds.
    t_steps : int, default 15
        Number of time steps per run (previously hard-coded).

    Returns
    -------
    tuple
        ``(all_train_eval, all_valid_eval, latent_variables_options)`` where
        both dicts map ``time step (1 .. t_steps - 1) -> {component count:
        mean RMSE over variables}``.

    Raises
    ------
    ValueError
        From ``transform_owu`` when a run does not have ``t_steps`` rows.
    """
    all_train_eval = {}
    all_valid_eval = {}

    # Called only for its validation: it raises when a run does not have
    # exactly t_steps rows. The returned array is not needed here.
    transform_owu(owu, t_steps=t_steps, batch_first=False)

    time_index = owu.index.get_level_values("time")
    Z = doe.values

    for t in range(1, t_steps):
        all_train_eval[t] = {}
        all_valid_eval[t] = {}

        # The design matrix depends only on t, so build it once per step
        # instead of once per (component count, variable) pair.
        X_hist = owu.loc[time_index < t, X_columns]
        y_hist = owu.loc[time_index == t, X_columns]
        has_data = not X_hist.empty and not y_hist.empty
        if has_data:
            W_hist = owu.loc[time_index < t, W_columns]
            X_preproc = np.hstack(
                [Z, generate_bwu(X_hist).values, generate_bwu(W_hist).values]
            )

        for latent_vars in tqdm(latent_variables_options):
            train_eval, valid_eval = [], []

            if has_data:
                for col in X_columns:
                    rmse_train, rmse_valid = cross_val_predict_hist_pls(
                        X_preproc, y_hist.loc[:, col].values, latent_vars, cv_folds
                    )
                    train_eval.append(rmse_train)
                    valid_eval.append(rmse_valid)

            all_train_eval[t][latent_vars] = np.mean(train_eval)
            all_valid_eval[t][latent_vars] = np.mean(valid_eval)

    return all_train_eval, all_valid_eval, latent_variables_options

### Setting

In [18]:
# Default number of PLS components for the historical (BWU) models.
LATENT_VARIABLES = 4
# Process variables to model. The data cell of this section reassigns
# X_columns from transform_owu, so this list only applies until that cell runs.
X_columns = ['X:VCD', 'X:Glc', 'X:Lac', 'X:Titer']  # Specific columns to consider
# Input columns appended to the predictors.
W_columns = ['W:Feed']

### Data

In [19]:
# Dense (time, variable, run) arrays for the training and the test runs.
# X_columns is replaced by the columns transform_owu selects (names containing "X:").
X, X_columns = transform_owu(owu, t_steps=15, batch_first=False)
X_test, X_columns = transform_owu(owu_test, t_steps=15, batch_first=False)

### Train

In [20]:
# Fit the models
models = fit_multi_step_hist_pls_model(
    doe,
    owu,
    t_steps=15,
    latent_variables=LATENT_VARIABLES,
    X_columns=X_columns,
    W_columns=W_columns,
)

### Test

In [21]:
# Initial conditions for predictions (using the first timestep data)
X0 = owu.loc[owu.index.get_level_values('time') < 1, X_columns]

W = owu[['W:Feed']]

# Predict the next steps
X_pred = predict_multi_step_hist_pls_model(
    doe,
    X0,
	W,
    X_columns=X_columns,
    multi_step_models=models,
    t_steps=15,
)


X0_test = owu_test.loc[owu_test.index.get_level_values('time') < 1, X_columns]
W_test = owu_test[['W:Feed']]

# Predict the next steps
X_test_pred = predict_multi_step_hist_pls_model(
    doe_test,
    X0_test,
	W_test,
    X_columns=X_columns,
    multi_step_models=models,
    t_steps=15,
)

In [22]:
# Observed-vs-predicted scatter plots, one figure per variable.
plot_multi_step_pls_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

# Relative RMSE per variable, train vs. test.
plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

In [23]:
# Observed and predicted test trajectories for the first five runs.
plot_predicted_profile(X_test, X_test_pred, X_columns, select_runs=[0, 1, 2, 3, 4], height=500)

### K-Fold

In [24]:
# Candidate component counts 1..19. cross_val_predict_hist_pls caps each one at
# min(training samples in the fold, number of predictors).
latent_vars_options = list(range(1, 20))

# 5-fold cross-validation for every time step >= 1 and every candidate.
all_train_eval, all_valid_eval, latent_vars_options = multi_step_hist_pls_cross_validation_for_each_day(
    doe,
    owu,
    X_columns=X_columns,
    W_columns=W_columns,
    latent_variables_options=latent_vars_options,
    cv_folds=5
)


100%|██████████| 19/19 [00:00<00:00, 22.79it/s]
100%|██████████| 19/19 [00:00<00:00, 22.73it/s]
100%|██████████| 19/19 [00:00<00:00, 23.10it/s]
100%|██████████| 19/19 [00:00<00:00, 23.13it/s]
100%|██████████| 19/19 [00:00<00:00, 22.89it/s]
100%|██████████| 19/19 [00:00<00:00, 22.71it/s]
100%|██████████| 19/19 [00:00<00:00, 22.69it/s]
100%|██████████| 19/19 [00:00<00:00, 22.56it/s]
100%|██████████| 19/19 [00:00<00:00, 22.28it/s]
100%|██████████| 19/19 [00:00<00:00, 22.32it/s]
100%|██████████| 19/19 [00:00<00:00, 22.35it/s]
100%|██████████| 19/19 [00:00<00:00, 22.18it/s]
100%|██████████| 19/19 [00:00<00:00, 21.39it/s]
100%|██████████| 19/19 [00:00<00:00, 21.84it/s]


In [25]:
# Plot the RMSE curves for every candidate component count.
plot_rmse_by_latent_vars(all_train_eval, all_valid_eval, latent_vars_options)

# Pick, for every time step, the component count with the lowest validation RMSE
# (ties resolve to the first candidate in iteration order).
best_latent_vars = {}
for t, evals in all_valid_eval.items():
    best_latent_vars[t] = min(evals, key=evals.get)

print("Best latent variables for each time step:", best_latent_vars)

Best latent variables for each time step: {1: 5, 2: 4, 3: 6, 4: 5, 5: 3, 6: 3, 7: 2, 8: 2, 9: 7, 10: 7, 11: 7, 12: 7, 13: 6, 14: 4}


### Retrain

In [26]:
# Fit the models
models = fit_multi_step_hist_pls_model(
    doe,
    owu,
	t_steps=15,
    latent_variables=LATENT_VARIABLES,
    X_columns=X_columns,
	W_columns=W_columns,
	best_latent_vars=best_latent_vars
)

In [27]:
# Initial conditions for predictions (using the first timestep data)
X0 = owu.loc[owu.index.get_level_values('time') < 1, X_columns]

W = owu[['W:Feed']]

# Predict the next steps
X_pred = predict_multi_step_hist_pls_model(
    doe,
    X0,
	W,
    X_columns=X_columns,
    multi_step_models=models,
    t_steps=15,
)


X0_test = owu_test.loc[owu_test.index.get_level_values('time') < 1, X_columns]
W_test = owu_test[['W:Feed']]

# Predict the next steps
X_test_pred = predict_multi_step_hist_pls_model(
    doe_test,
    X0_test,
	W_test,
    X_columns=X_columns,
    multi_step_models=models,
    t_steps=15,
)

In [28]:
# Observed-vs-predicted scatter plots for the retrained models.
plot_multi_step_pls_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

# Relative RMSE per variable for the retrained models, train vs. test.
plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns = X_columns,
)

In [29]:
# Observed and predicted test trajectories for the first five runs (retrained models).
plot_predicted_profile(X_test, X_test_pred, X_columns, select_runs=[0, 1, 2, 3, 4], height=500)