In [49]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import root_mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [27]:
def generate_bwu(owu):
    """Unfold an observation-wise (OWU) frame into a batch-wise (BWU) frame.

    Parameters
    ----------
    owu : pandas.DataFrame
        Frame with a two-level row index whose first level is named ``"run"``;
        the second level is moved into the columns. It must contain a
        ``"timesteps"`` column, which is dropped before unfolding.

    Returns
    -------
    pandas.DataFrame
        One row per run, in ``groupby("run")`` order, on a fresh ``RangeIndex``.
        Column names are flattened to ``"<variable>:<second-level value>"``.

    Raises
    ------
    ValueError
        If ``owu`` contains no runs.
    """
    owu = owu.drop(["timesteps"], axis=1)

    # Unstack every run on its own and concatenate once. The previous
    # "is this the first run?" test compared the (sorted) group key with the
    # first *row* of the frame, so unsorted input reached the concat branch
    # before anything had been assigned.
    run_rows = [run.unstack(level=1) for _, run in owu.groupby("run")]
    if not run_rows:
        raise ValueError("owu contains no runs to unfold")
    bwuindex = pd.concat(run_rows)

    # Flatten the (variable, second-level value) column pairs into plain strings.
    bwu_columns = [
        str(variable) + ":" + str(step) for variable, step in bwuindex.columns
    ]
    return pd.DataFrame(bwuindex.to_numpy(), columns=bwu_columns)


def generate_y(bwu, return_aggr=False):
    """Derive the end-of-run target from the titer columns of a BWU frame.

    For every row, the columns whose name starts with ``"X:Titer"`` are walked
    in column order. Two running quantities (``prod`` and ``aggr``, both
    starting at 0) are updated at every step after the first one; their final
    values become the targets ``"Y:Titer"`` and ``"Y:Aggr"`` respectively.

    Parameters
    ----------
    bwu : pandas.DataFrame
        Single-index batch-wise unfolded frame.
    return_aggr : bool, default False
        If True return the ``"Y:Aggr"`` column, otherwise ``"Y:Titer"``.

    Returns
    -------
    pandas.DataFrame
        One float64 column (``"Y:Aggr"`` or ``"Y:Titer"``) indexed like ``bwu``.
    """
    titer_columns = [c for c in bwu.columns if c.startswith("X:Titer")]
    k_aggr = 10**-7  # rate constant of the quadratic aggregation term

    final_prod = []
    final_aggr = []

    # iterate through experiments (one row per experiment)
    for x_titer in bwu[titer_columns].to_numpy():
        prod = 0
        aggr = 0
        for i in range(1, len(x_titer)):
            xt_titer = x_titer[i]
            dt_titer = xt_titer - x_titer[i - 1]

            # First estimate of the aggregate increment, based on the raw titer.
            dt_aggr = k_aggr * (xt_titer**2) - aggr
            dt_prod = dt_titer - 2 * dt_aggr
            # Re-evaluate the aggregate increment with the updated product value.
            dt_aggr = k_aggr * (prod + dt_prod) ** 2

            aggr = aggr + dt_aggr
            prod = prod + dt_prod
        final_prod.append(prod)
        final_aggr.append(aggr)

    # Build the result in one go as float64; filling an empty frame cell by
    # cell left the target column with object dtype.
    if return_aggr:
        name, values = "Y:Aggr", final_aggr
    else:
        name, values = "Y:Titer", final_prod
    return pd.DataFrame({name: values}, index=bwu.index, dtype=float)

In [34]:
def vip(x, y, model):
    """Compute Variable Importance in Projection (VIP) scores of a fitted PLS model.

    Implements

        VIP_j = sqrt( p * sum_i( SS_i * (w_ji / ||w_i||)^2 ) / sum_i( SS_i ) )

    where ``SS_i`` is the i-th diagonal element of ``T^t T Q^t Q``.

    Parameters
    ----------
    x : array-like with a 2-D ``shape``
        Predictor matrix; only its number of columns ``p`` is used.
    y : any
        Unused; kept so existing callers keep working.
    model : object
        Fitted model exposing ``x_scores_`` (samples x components),
        ``x_weights_`` (variables x components) and ``y_loadings_``
        (targets x components).

    Returns
    -------
    numpy.ndarray
        1-D array with one VIP score per row of ``x_weights_``.
    """
    t = np.asarray(model.x_scores_)    # score matrix T
    w = np.asarray(model.x_weights_)   # weight matrix W
    q = np.asarray(model.y_loadings_)  # y-loadings Q (the c of the formula)

    _, p = x.shape

    # SS(c_i t_i): one sum of squares per latent variable.
    ss = np.diag(t.T @ t @ q.T @ q)

    # Squared weights, each column normalised by its own norm. The norms are
    # computed once instead of once per variable.
    w_normalised_sq = (w / np.linalg.norm(w, axis=0)) ** 2

    return np.sqrt(p * (w_normalised_sq @ ss) / np.sum(ss))

In [55]:
def fit_pls_model(
    doe, tar, latent_variables=5, polynomial_degree=1, interactions_only=False
):
    """Fit a polynomial-features -> standard-scaler -> PLS pipeline and compute VIP scores.

    Parameters
    ----------
    doe : array-like of shape (n_samples, n_features)
        Model inputs.
    tar : array-like
        Model target(s).
    latent_variables : int, default 5
        Requested number of PLS components. For ``polynomial_degree == 1`` it
        is capped at the number of input columns; for ``polynomial_degree == 0``
        it is forced to 1.
    polynomial_degree : int, default 1
        Degree passed to ``PolynomialFeatures``.
    interactions_only : bool, default False
        Passed to ``PolynomialFeatures(interaction_only=...)``.

    Returns
    -------
    tuple
        ``(X_columns, pls_vip, pipe)``: the generated feature names, the VIP
        score of each feature, and the fitted ``Pipeline``.
    """
    # Define pipeline
    include_bias = False
    if polynomial_degree == 0:
        print("Constant model for pls is not allowed!")
        latent_variables = 1
        include_bias = True
    if polynomial_degree == 1:
        # With purely linear features the model has as many features as `doe`
        # has columns. Cap by that count rather than by a hard-coded 5, which
        # was only right for a five-column design.
        latent_variables = min(latent_variables, np.shape(doe)[1])
    poly_features = PolynomialFeatures(
        degree=polynomial_degree,
        interaction_only=interactions_only,
        include_bias=include_bias,
    )

    # Normalization of the generated features
    standard_scaler = StandardScaler(with_mean=True, with_std=True)

    pls_model = PLSRegression(n_components=latent_variables, scale=True)
    pipe = Pipeline(
        [("features", poly_features), ("scaler", standard_scaler), ("model", pls_model)]
    )

    # Fit PLS model
    pipe.fit(doe, tar)

    # Model features and variable importance. The preprocessed matrix is
    # rebuilt by applying the two fitted preprocessing steps in order.
    X_columns = poly_features.get_feature_names_out()
    X_preproc = pd.DataFrame(
        standard_scaler.transform(poly_features.transform(doe)), columns=X_columns
    )
    pls_vip = vip(X_preproc, tar, pls_model)
    return X_columns, pls_vip, pipe


def plot_pls_model_coef(X_columns, pls_vip):
    """Show the VIP score of every model feature as a bar chart.

    Parameters
    ----------
    X_columns : iterable of str
        Feature names, used as bar labels on the x axis.
    pls_vip : numpy.ndarray
        VIP scores; flattened before plotting.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    axis_labels = {"x": "Variables", "y": "Estimated VIP", "color": "p-value"}
    vip_chart = px.bar(
        x=list(X_columns),
        y=pls_vip.reshape(-1),
        title="VIP scores of PLS model",
        labels=axis_labels,
    )

    # Horizontal reference lines at VIP = 1 (default style) and VIP = 0.8 (gray).
    vip_chart.add_hline(y=1)
    vip_chart.add_hline(y=0.8, line={"color": "gray"})

    vip_chart.update_layout(width=1600)
    vip_chart.show()


def _add_observed_vs_predicted_panel(fig, observed, predicted, col):
    """Add one observed-vs-predicted scatter plus its dashed diagonal to row 1, column ``col``."""
    observed = np.ravel(observed)
    predicted = np.ravel(predicted)

    fig.add_trace(
        go.Scatter(x=observed, y=predicted, mode="markers"),
        row=1,
        col=col,
    )

    # Diagonal spanning the range of the predictions. Flattening first keeps
    # this working for both (n,) and (n, 1) prediction arrays.
    lower = float(predicted.min())
    upper = float(predicted.max())
    fig.add_shape(
        type="line",
        x0=lower,
        y0=lower,
        x1=upper,
        y1=upper,
        layer="below",
        line=dict(dash="dash"),
        row=1,
        col=col,
    )


def plot_pls_model_eval(
    y,
    yhat,
    y_test,
    yhat_test,
    train_r2=None,
    test_r2=None,
    train_abs_rmse=None,
    test_abs_rmse=None,
    train_rel_rmse=None,
    test_rel_rmse=None,
):
    """Plot observed against predicted values for the train and the test set.

    Parameters
    ----------
    y, y_test : array-like
        Observed targets of the train / test set.
    yhat, yhat_test : array-like
        Matching predictions; 1-D and single-column 2-D arrays are accepted.
    train_r2, test_r2, train_abs_rmse, test_abs_rmse, train_rel_rmse, test_rel_rmse : optional
        Metric values that are only interpolated into the subplot titles.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    # Plot observed vs predicted
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            f"Train Set <br> R^2 = {train_r2} <br> Abs RMSE = {train_abs_rmse} <br> Rel RMSE = {train_rel_rmse}",
            f"Test Set <br> R^2 = {test_r2} <br> Abs RMSE = {test_abs_rmse} <br> Rel RMSE = {test_rel_rmse}",
        ),
    )

    # Train set on the left, test set on the right.
    _add_observed_vs_predicted_panel(fig, y, yhat, col=1)
    _add_observed_vs_predicted_panel(fig, y_test, yhat_test, col=2)

    fig.update_layout(width=1600)
    for col in (1, 2):
        fig.update_xaxes(title="Observed values", row=1, col=col)
        fig.update_yaxes(title="Predicted values", row=1, col=col)
    fig.show()


def plot_pls_scores(X_columns, pipe, pc_x_axis=1, pc_y_axis=2):
    """Plot the scores of two PLS components and the loadings of each of them.

    Parameters
    ----------
    X_columns : sequence of str
        Feature names, used as x-axis categories of the loadings bar charts.
    pipe : sklearn.pipeline.Pipeline
        Fitted pipeline whose ``"model"`` step exposes ``x_scores_`` and
        ``x_loadings_``.
    pc_x_axis, pc_y_axis : int, default 1 and 2
        1-based numbers of the components shown on the x / y axis of the
        scores plot.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    pls_model = pipe["model"]

    # Component numbers are 1-based for the user and 0-based for the arrays.
    # The same offset must be used for scores and loadings so that both
    # describe the same components.
    x_ix = pc_x_axis - 1
    y_ix = pc_y_axis - 1

    fig = make_subplots(
        rows=2,
        cols=2,
        specs=[[{"colspan": 2}, None], [{}, {}]],
        subplot_titles=(
            "Scores Plot ",
            "Loadings of Principal Component - " + str(pc_x_axis),
            "Loadings of Principal Component - " + str(pc_y_axis),
        ),
    )
    fig.add_trace(
        go.Scatter(
            x=pls_model.x_scores_[:, x_ix],
            y=pls_model.x_scores_[:, y_ix],
            mode="markers",
            name="Scores",
        ),
        row=1,
        col=1,
    )
    # Loadings of the x-axis component go to the bottom-left subplot.
    fig.add_bar(
        x=X_columns,
        y=pls_model.x_loadings_[:, x_ix],
        name="Loadings PC - " + str(pc_x_axis),
        row=2,
        col=1,
    )
    # Loadings of the y-axis component go to the bottom-right subplot.
    fig.add_bar(
        x=X_columns,
        y=pls_model.x_loadings_[:, y_ix],
        name="Loadings PC - " + str(pc_y_axis),
        row=2,
        col=2,
    )
    fig.update_layout(height=1000)
    fig.show()

# Dataset

In [36]:
def read_owu_v3(file, root_path = 'dataset/datahow_2022/interpolation/'):
    """Read ``<root_path>/<file>.csv`` and index it by ``run`` and ``time``.

    Parameters
    ----------
    file : str
        File name without the ``.csv`` extension.
    root_path : str
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        The CSV content with ``['run', 'time']`` as a two-level row index.
    """
    csv_path = f'{root_path}/{file}.csv'
    return pd.read_csv(csv_path).set_index(['run', 'time'])

def read_doe(file, root_path= 'dataset/datahow_2022/interpolation/'):
    """Read the design columns from ``<root_path>/<file>.csv``.

    Only ``feed_start``, ``feed_end``, ``Glc_feed_rate``, ``Glc_0`` and
    ``VCD_0`` are loaded; every other column of the file is ignored.

    Parameters
    ----------
    file : str
        File name without the ``.csv`` extension.
    root_path : str
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        The selected columns on the default row index.
    """
    design_columns = ["feed_start", "feed_end", "Glc_feed_rate", "Glc_0", "VCD_0"]
    return pd.read_csv(f'{root_path}/{file}.csv', usecols=design_columns)

In [37]:
# Read the raw files first: the run/time-indexed trajectories and the matching designs.
owu, owu_test = read_owu_v3('owu'), read_owu_v3('owu_test')
doe, doe_test = read_doe('owu_doe'), read_doe('owu_test_doe')

# Unfold both sets batch-wise, then derive the "Y:Titer" target from each of them.
bwu, bwu_test = generate_bwu(owu), generate_bwu(owu_test)
tar, tar_test = (
    generate_y(unfolded, return_aggr=False) for unfolded in (bwu, bwu_test)
)

# Data-Driven Models for CQAs

- Aim of predicting the final titer

<details>
<summary>
<font size="3" color="black">
<b>PLS Introduction ⏏︎Click to open</b>
</font>
</summary>

<img src="assets/pls_explain.png" alt="Variables Type" width="1000">



PLS 模型允许我们使用更多的变量/列而不会导致模型过度拟合。

在只有一个响应变量 $ y $ 和 $ k $ 个预测变量的情况下，具有 $ h $ 个潜变量的 PLS 回归模型表达如下：

$$ X = T W^t + E$$

$$ y = U c^t + f $$

### 模型解释

- **X**：原始预测变量矩阵。
- **T**：得分矩阵（潜变量矩阵）。
- **W**：权重矩阵。
- **E**：误差矩阵。
- **y**：响应变量。
- **U**：响应变量的得分矩阵。
- **c**：回归系数向量。
- **f**：响应变量的误差向量。

PLS 模型通过找到一组新的潜变量（得分矩阵 $ T $ 和 $ U $）来解释原始变量和响应变量之间的关系，从而减少数据的维度并避免多重共线性的问题。

### VIP (变量重要性投影) 分数

VIP 分数是用于衡量变量在模型中重要性的指标。对于第 $ j $ 个变量，VIP 分数计算公式如下：

$$ VIP_j = \left( k \sum_{i=1}^h \left(SS(c_i t_i) \left(\frac{w_{ij}}{||w_i||}\right)^2\right) / \sum_{i=1}^h SS(c_i t_i) \right)^{1/2} $$

- **VIP_j**：第 $ j $ 个变量的 VIP 分数。
- **k**：总的预测变量数。
- **h**：潜变量的数量。
- **SS(c_i t_i)**：第 $ i $ 个潜变量所解释的响应变量平方和，即 $ c_i^2 \, t_i^t t_i $。
- **w_{ij}**：第 $ j $ 个变量在第 $ i $ 个潜变量中的权重。
- **||w_i||**：第 $ i $ 个潜变量权重的范数。

### VIP 分数的应用

- 所有变量的 VIP 分数平方的平均值等于 1，因此“大于一规则”通常用作变量选择的标准。即，VIP 分数大于 1 的变量被认为对模型重要，可以优先保留。

### 总结

- PLS 模型通过引入潜变量减少维度，并避免多重共线性的问题，使得我们可以使用更多的变量而不会导致模型过度拟合。
- VIP 分数则帮助我们评估每个变量在模型中的重要性，提供了一个有效的变量选择标准。

</details>

## BB-PLS1
- Black Box - Partial Least Square Model (PLS1)
- Training: $[Z, X(t = 0)] \rightarrow PLS1 \rightarrow Y_{Final}$

* Input matrix: "doe". This corresponds to the values of the manipulated process parameters for each experiment.
* Output target: "tar". This corresponds to the final value of titer at the end of each experiment. (or aggregates)
* Select the number of latent variables for the model (the maximum number of latent variables is 5, equal to the number of variables in the input matrix).

### Data

In [38]:
# Inputs and target side by side; assign() returns a new frame, so `doe` is left untouched.
dataset = doe.assign(Target=tar)
dataset.head()

Unnamed: 0,feed_start,feed_end,Glc_feed_rate,Glc_0,VCD_0,Target
0,2.5,10.0,12.5,45.0,0.55,1565.310032
1,1.94898,10.244898,18.622449,40.714286,0.27449,854.449233
2,3.295918,9.346939,14.642857,17.857143,0.127551,1206.096655
3,3.479592,9.020408,15.561224,47.857143,0.843878,1587.937536
4,1.459184,9.591837,7.295918,33.571429,0.421429,811.55299


### Train

In [44]:
""" Number of latent Variables """
LATENT_VARIABLES = 5
""" Polynomial degree of features """
POLYNOMIAL_DEGREE_PLS = 1
""" Add only interaction between features"""
INTERACTIONS_ONLY = False

# Training: fit the pipeline and get the feature names together with their VIP scores.
X_columns, pls_vip, pipe = fit_pls_model(
    doe,
    tar,
    latent_variables=LATENT_VARIABLES,
    polynomial_degree=POLYNOMIAL_DEGREE_PLS,
    interactions_only=INTERACTIONS_ONLY,
)

# Bar chart of the VIP score of every feature.
plot_pls_model_coef(X_columns, pls_vip)

### Test

In [51]:
def _evaluate_split(features, target):
    """Predict one data split and return (predictions, R^2, absolute RMSE, relative RMSE).

    The three metrics are rounded to 3 decimals; the relative RMSE is the
    RMSE divided by the standard deviation of the observed target.
    """
    predicted = pipe.predict(features)
    rmse = root_mean_squared_error(target, predicted)
    return (
        predicted,
        round(pipe.score(features, target), 3),
        round(rmse, 3),
        round(rmse / np.std(np.array(target)), 3),
    )


# Make predictions and calculate error metrics for both splits
X, y = doe, tar
X_test, y_test = doe_test, tar_test
yhat, train_r2, train_abs_rmse, train_rel_rmse = _evaluate_split(X, y)
yhat_test, test_r2, test_abs_rmse, test_rel_rmse = _evaluate_split(X_test, y_test)

# Plot
plot_pls_model_eval(
    y,
    yhat,
    y_test,
    yhat_test,
    train_r2=train_r2,
    test_r2=test_r2,
    train_abs_rmse=train_abs_rmse,
    test_abs_rmse=test_abs_rmse,
    train_rel_rmse=train_rel_rmse,
    test_rel_rmse=test_rel_rmse,
)

### PLS Scores and Loadings

In [56]:
# Scores plot and loadings bar charts, requesting components 1 and 2 of the fitted PLS step.
plot_pls_scores(
    X_columns,
    pipe,
    pc_x_axis=1,
    pc_y_axis=2,
)

## BWU-PLS1
- Batch Wise Unfolded - Partial Least Square Model (PLS1), also called Historical-PLS
- One model per timepoint per process variable is developed, denoted as $PLS1_{i, t}$
	- But, using the process condition and the historical information available until a given time
- Training: $[Z, X(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$
- Testing: $[Z, X(t = 0), X^{predicted}(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$

# Data-Driven Models for Simulation

## BB-PLS1
- Black Box - Partial Least Square Model (PLS1)
- One model per timepoint per process variable is developed, denoted as $PLS1_{i, t}$
- Training: $[Z, X(t = 0)] \rightarrow PLS1_{i, t} \rightarrow X_i(t = t_{model})$

## BWU-PLS1
- Batch Wise Unfolded - Partial Least Square Model (PLS1), also called Historical-PLS
- One model per timepoint per process variable is developed, denoted as $PLS1_{i, t}$
	- But, using the process condition and the historical information available until a given time
- Training: $[Z, X(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$
- Testing: $[Z, X(t = 0), X^{predicted}(t < t_{model})] \rightarrow PLS1_{i, t} \rightarrow X_i(t=t_{model})$

## Instant-ANN
- ANN models per variable per time point
- Training: $[Z, X(t = t_{model} - 1)] \rightarrow ANN_{i, t} \rightarrow X_i(t = t_{model})$
- Testing: $[Z, \hat{X}(t = t_{model} - 1)] \rightarrow ANN_{i, t} \rightarrow X_i(t = t_{model})$

## OWU-ANN
- a single model is used for all time points
- Training: $[Z, X(t = t_{model} - 1)] \rightarrow ANN_i \rightarrow X_i(t = t_{model})$
- Testing: $[Z, \hat{X}(t = t_{model} - 1)] \rightarrow ANN_i \rightarrow X_i(t = t_{model})$