### Import libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.integrate import odeint

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import r2_score, root_mean_squared_error
from kernels import SubspaceKernel

import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pio.templates.default = "plotly_white"
pcolors = px.colors.qualitative.T10
pcolors25 = px.colors.qualitative.Alphabet

import warnings

warnings.filterwarnings("ignore")

In [2]:
# data_type = 'interpolation'
data_type = "interpolation"
root_path = f"dataset/datahow_2022/{data_type}/"


def read_owu_v4(file, root_path="dataset/datahow_2022/interpolation/"):
    data = pd.read_csv(f"{root_path}/{file}.csv")
    owu_df = data.copy()
    num_runs = len(pd.read_csv(f"{root_path}/{file}_doe.csv"))
    if "run" not in owu_df.columns:
        owu_df.index = pd.MultiIndex.from_product(
            [list(range(num_runs)), list(range(15))], names=["run", "time"]
        )
    else:
        owu_df.set_index(["run", "time"], inplace=True)
    return owu_df


def read_doe(file, root_path="dataset/datahow_2022/interpolation/"):
    data = pd.read_csv(
        f"{root_path}/{file}.csv",
        usecols=["feed_start", "feed_end", "Glc_feed_rate", "Glc_0", "VCD_0"],
    )
    doe_df = data.copy()
    return doe_df

In [3]:
def owu_to_tensor(owu_raw, t_steps=15, batch_first=False):
    """
    Convert a raw OWU dataframe into tensor format for further processing.

    Parameters:
    owu_raw (pd.DataFrame): Input dataframe with hierarchical index (run, time).
    t_steps (int): Number of time steps for each run.
    batch_first (bool): Whether the batch dimension should be the first dimension in the output tensors.

    Returns:
    tuple: A tuple containing:
        - X (np.ndarray): Tensor of shape (B, T, C_X) if batch_first is True, otherwise (T, C_X, B).
        - F (np.ndarray): Tensor of shape (B, T, C_F) if batch_first is True, otherwise (T, C_F, B).
        - X_columns (list): List of column names corresponding to X features.
        - F_columns (list): List of column names corresponding to F features.

    Raises:
    ValueError: If the number of time steps in any run does not match t_steps.
    """

    owu = owu_raw.copy()
    owu = owu.sort_index(level=["run", "time"])

    X_columns = [col for col in owu.columns if "X:" in col]
    F_columns = [col for col in owu.columns if "W:" in col]

    C_X = len(X_columns)
    C_F = len(F_columns)
    B = owu.index.get_level_values("run").nunique()
    T = t_steps

    if batch_first:
        X = np.zeros((B, T, C_X))
        F = np.zeros((B, T, C_F))

    else:
        X = np.zeros((T, C_X, B))
        F = np.zeros((T, C_F, B))

    for i, (run, group) in enumerate(owu.groupby(level="run")):
        X_group = group[X_columns].copy()
        F_group = group[F_columns].copy()

        if len(group) != T:
            raise ValueError(f"Run {run} does not have {T} time steps.")

        if batch_first:
            X[i, :, :] = X_group.values
            F[i, :, :] = F_group.values
        else:
            X[:, :, i] = X_group.values
            F[:, :, i] = F_group.values

    return X, F, X_columns, F_columns


def doe_to_tensor(doe_raw, batch_first=False):
    """
    Convert a raw DOE dataframe into tensor format for further processing.

    Parameters:
    doe_raw (pd.DataFrame): Input dataframe.
    batch_first (bool): Whether the batch dimension should be the first dimension in the output tensor.

    Returns:
    tuple: A tuple containing:
        - Z (np.ndarray): Tensor of shape (B, T, C_Z) if batch_first is True, otherwise (T, C_Z, B).
        - Z_columns (list): List of column names corresponding to Z features.
    """
    doe = doe_raw.copy()
    doe = doe.sort_index()
    Z_columns = [col for col in doe.columns]

    C_Z = len(Z_columns)
    T = 1
    B = doe.shape[0]

    if batch_first:
        Z = np.zeros((B, T, C_Z))
        Z[:, 0, :] = doe.values
    else:
        Z = np.zeros((T, C_Z, B))
        Z[0, :, :] = doe.values.T

    return Z, Z_columns

In [4]:
def r2(y, y_pred):
    """
    Calculate the R² (coefficient of determination) score for the given true and predicted values.

    Parameters:
    y (np.ndarray): True values array of shape (B, T).
    y_pred (np.ndarray): Predicted values array of shape (B, T).

    Returns:
    float: The R² score rounded to 3 decimal places.
    """
    return round(r2_score(y.flatten(), y_pred.flatten()), 3)


def absolute_rmse(y, y_pred):
    """
    Calculate the absolute Root Mean Squared Error (RMSE) for the given true and predicted values.

    Parameters:
    y (np.ndarray): True values array of shape (B, T).
    y_pred (np.ndarray): Predicted values array of shape (B, T).

    Returns:
    float: The absolute RMSE rounded to 3 decimal places.
    """
    return round(root_mean_squared_error(y, y_pred), 3)


def relative_rmse(y, y_pred):
    """
    Calculate the relative Root Mean Squared Error (RMSE) for the given true and predicted values.

    Parameters:
    y (np.ndarray): True values array of shape (B, T).
    y_pred (np.ndarray): Predicted values array of shape (B, T).

    Returns:
    float: The relative RMSE (normalized by the standard deviation of y) rounded to 3 decimal places.
    """
    return round(root_mean_squared_error(y, y_pred) / np.std(np.array(y)), 3)

### **Process Parameters**

Please insert the values of the process manipulated variables:

- Feed start (day): day at which Glc feed is started
- Feed end (day): ay at which Glc feed is stopped
- Feed rate: mass rate (g/L/day) at which Glc is feed (continuous feed over 24 hours)
- Initial Glc concentration (g/L): Glc at time t = 0
- Initial VCD (10^6 cell/mL): VCD at time t = 0

* X:VCD: (10^6 cell/mL)
* X:Glc: (g/L)
* X:Lac: (g/L)
* X:Titer: (mg/L)
* W:Feed: (g/L/day)

### Load dataset

In [5]:
owu = read_owu_v4("owu", root_path=root_path)
doe = read_doe("owu_doe", root_path=root_path)

owu_test = read_owu_v4("owu_test", root_path=root_path)
doe_test = read_doe("owu_test_doe", root_path=root_path)

In [6]:
TIME_STEP = 24

In [7]:
def tensor_preparation(
    owu,
    owu_test,
    doe,
    doe_test,
    t_steps=15,
    time_step=24,
    batch_first=True,
    init_volume=1000,
    feed_items=["X:Glc"],
):
    # Convert state variable data to tensors
    X_train, F_train, X_columns, F_columns = owu_to_tensor(
        owu, t_steps=t_steps, batch_first=batch_first
    )
    X_test, F_test, X_columns, F_columns = owu_to_tensor(
        owu_test, t_steps=t_steps, batch_first=batch_first
    )

    # Convert experimental condition variable data to tensors
    Z_train, Z_columns = doe_to_tensor(doe, batch_first=batch_first)
    Z_test, Z_columns = doe_to_tensor(doe_test, batch_first=batch_first)

    # Calculate volume
    V_train = (init_volume + F_train.sum(axis=-1, keepdims=True).cumsum(axis=1)) / 1000
    V_test = (init_volume + F_test.sum(axis=-1, keepdims=True).cumsum(axis=1)) / 1000

    # Print original shapes
    print(f"\n>>> Original Shape: ")
    print_shapes(
        X_matrixs=(X_train, X_test),
        F_matrixs=(F_train, F_test),
        Z_matrixs=(Z_train, Z_test),
        V_matrixs=(V_train, V_test),
        X_columns=X_columns,
        Z_columns=Z_columns,
        F_columns=F_columns,
    )

    # Create masks and apply them
    time_mask = np.ones(t_steps)

    feed_mask = np.zeros(len(X_columns)).astype(int)
    feed_mask[[X_columns.index(item) for item in feed_items]] = 1

    F_train = (feed_mask[None, None, :] * F_train) / time_step
    F_test = (feed_mask[None, None, :] * F_test) / time_step

    Z_train = time_mask[None, :, None] * Z_train
    Z_test = time_mask[None, :, None] * Z_test

    X_matrixs = (X_train, X_test)
    F_matrixs = (F_train, F_test)
    Z_matrixs = (Z_train, Z_test)
    V_matrixs = (V_train, V_test)
    columns = (X_columns, F_columns, Z_columns)

    # Print unfolding shapes
    print(f"\n>>> Unfolding Shape: ")
    print_shapes(
        X_matrixs=X_matrixs,
        F_matrixs=F_matrixs,
        Z_matrixs=Z_matrixs,
        V_matrixs=V_matrixs,
        X_columns=X_columns,
        Z_columns=Z_columns,
        F_columns=F_columns,
    )
    return X_matrixs, F_matrixs, Z_matrixs, V_matrixs, columns


def print_shapes(
    X_matrixs,
    F_matrixs,
    Z_matrixs,
    V_matrixs,
    X_columns=None,
    Z_columns=None,
    F_columns=None,
):
    X_train, X_test = X_matrixs
    F_train, F_test = F_matrixs
    Z_train, Z_test = Z_matrixs
    V_train, V_test = V_matrixs

    print("--- X Matrix ---")
    print(f"{'X Columns:':<20} {X_columns} on axis -1")
    print(f"{'X Shape:':<20} {X_train.shape}")
    print(f"{'X Test Shape:':<20} {X_test.shape}")

    print("--- F Matrix ---")
    print(f"{'F Columns:':<20} {F_columns} on axis -1")
    print(f"{'F Shape:':<20} {F_train.shape}")
    print(f"{'F Test Shape:':<20} {F_test.shape}")

    print("--- Z Matrix ---")
    print(f"{'Z Columns:':<20} {Z_columns} on axis -1")
    print(f"{'Z Shape:':<20} {Z_train.shape}")
    print(f"{'Z Test Shape:':<20} {Z_test.shape}")

    print("--- V Matrix ---")
    print(f"{'Volume Columns:':<20} After feed Volume")
    print(f"{'V Shape:':<20} {V_train.shape}")
    print(f"{'V Test Shape:':<20} {V_test.shape}")


# Extract Key Structure
X_matrixs, F_matrixs, Z_matrixs, V_matrixs, columns = tensor_preparation(
    owu,
    owu_test,
    doe,
    doe_test,
    t_steps=15,
    time_step=24,
    batch_first=True,
    init_volume=1000,
    feed_items=["X:Glc"],
)

# Get train data, test data
X_trainval, X_test = X_matrixs
F_trainval, F_test = F_matrixs
Z_trainval, Z_test = Z_matrixs
V_trainval, V_test = V_matrixs
X_columns, F_columns, Z_columns = columns


>>> Original Shape: 
--- X Matrix ---
X Columns:           ['X:VCD', 'X:Glc', 'X:Lac', 'X:Titer'] on axis -1
X Shape:             (50, 15, 4)
X Test Shape:        (50, 15, 4)
--- F Matrix ---
F Columns:           ['W:Feed'] on axis -1
F Shape:             (50, 15, 1)
F Test Shape:        (50, 15, 1)
--- Z Matrix ---
Z Columns:           ['feed_start', 'feed_end', 'Glc_feed_rate', 'Glc_0', 'VCD_0'] on axis -1
Z Shape:             (50, 1, 5)
Z Test Shape:        (50, 1, 5)
--- V Matrix ---
Volume Columns:      After feed Volume
V Shape:             (50, 15, 1)
V Test Shape:        (50, 15, 1)

>>> Unfolding Shape: 
--- X Matrix ---
X Columns:           ['X:VCD', 'X:Glc', 'X:Lac', 'X:Titer'] on axis -1
X Shape:             (50, 15, 4)
X Test Shape:        (50, 15, 4)
--- F Matrix ---
F Columns:           ['W:Feed'] on axis -1
F Shape:             (50, 15, 4)
F Test Shape:        (50, 15, 4)
--- Z Matrix ---
Z Columns:           ['feed_start', 'feed_end', 'Glc_feed_rate', 'Glc_0', 'VCD_0'

### Target calculation

In [8]:
def central_differences(X, F, sign_mask=None, time_step=24):
    """
    Calculate central differences for the input data X with feed rates F and sign adjustments using sign_mask.

    Parameters:
    X (numpy.ndarray): Input data array of shape (batch_size, time_steps, variables)
    F (numpy.ndarray): Feed rates array of shape (batch_size, time_steps, variables)
    sign_mask (numpy.ndarray): Array to adjust signs for each variable, shape (variables,)
    time_step (int): Time step size for calculating derivatives

    Returns:
    numpy.ndarray: Derivatives matrix of the same shape as X
    """

    # Initial derivatives matrix
    Y = np.zeros_like(X)

    # Initial time step derivatives
    Y[:, 0, :] = (sign_mask[None, :] * (X[:, 1, :] - X[:, 0, :])) / time_step + F[:, 0, :]

    # Central time step derivatives
    Y[:, 1:-1, :] = (sign_mask[None, None, :] * ((X[:, 2:, :] - X[:, :-2, :]) / 2)) / time_step + F[:, 1:-1, :]

    # Final time step derivatives
    Y[:, -1, :] = (sign_mask[None, :] * (X[:, -1, :] - X[:, -2, :])) / time_step + F[:, -2, :]
    return Y

In [9]:
sign_mask = np.array([1, -1, 1, 1])  # [C, ]

# B, T, C
Y_trainval = central_differences(
    X_trainval, F_trainval, sign_mask=sign_mask, time_step=TIME_STEP
)

# B, T, C
Y_test = central_differences(X_test, F_test, sign_mask=sign_mask, time_step=TIME_STEP)

### Inspect target

In [10]:
def plot_state_data_color_by_rate(X, Y, t_steps=15, sign_mask=None, X_columns=None):
    """
    Plot state data colored by growth/consumption rates.

    Parameters:
    X (np.ndarray): State data array of shape (B, T, C)
    Y (np.ndarray): Rate data array of shape (B, T, C)
    t_steps (int): Number of time steps.
    sign_mask (list): List indicating the type of behavior (growth or consumption) for each feature/variable.
    X_columns (list): List colume name of features/variables in owu dataframe
    Returns:
    None
    """

    t = np.arange(t_steps)
    NUM_RUNS = X.shape[0]
    behavior = ["growth" if m == 1 else "consumption" for m in sign_mask]

    # pcolors = px.colors.qualitative.Plotly
    fig = make_subplots(
        rows=2,
        cols=min(4, len(X_columns)),
        subplot_titles=[f"{var} over time - {var} rate" for var in X_columns]
        + [
            f"{X_columns[0]} vs {var} - {var} {behavior[i]} rate"
            for i, var in enumerate(X_columns)
        ],
        horizontal_spacing=0.1,
        vertical_spacing=0.15,
    )

    for i, var in enumerate(X_columns):
        fig.add_trace(
            go.Scatter(
                x=np.repeat(t, NUM_RUNS),
                y=X[:, :, i].T.flatten(),
                mode="markers",
                marker=dict(
                    color=Y[:, :, i].T.flatten(),
                    colorscale=px.colors.sequential.Viridis,
                    showscale=True,
                    colorbar=dict(len=1.0, x=0.45 if i % 2 == 0 else 1.0),
                ),
            ),
            row=1,
            col=i + 1,
        )
        fig.update_xaxes(title_text="Time", row=1, col=i + 1)
        fig.update_yaxes(title_text=var, row=1, col=i + 1)

    for i, var in enumerate(X_columns):
        fig.add_trace(
            go.Scatter(
                x=X[:, :, 0].T.flatten(),
                y=X[:, :, i].T.flatten(),
                mode="markers",
                marker=dict(
                    color=Y[:, :, i].T.flatten(),
                    colorscale=px.colors.sequential.Viridis,
                ),
            ),
            row=2,
            col=i + 1,
        )
        fig.update_xaxes(title_text=X_columns[0], row=2, col=i + 1)
        fig.update_yaxes(title_text=var, row=2, col=i + 1)

    fig.update_layout(
        title_text="State plots of process colored by growth/consumption rates",
        showlegend=False,
        height=1000,
    )

    fig.show()

In [11]:
plot_state_data_color_by_rate(
    X_trainval, Y_trainval, t_steps=15, sign_mask=sign_mask, X_columns=X_columns
)

In [12]:
plot_state_data_color_by_rate(
    X_test, Y_test, t_steps=15, sign_mask=sign_mask, X_columns=X_columns
)

### Train and test set

In [13]:
def flatten_dataset(X, Z, F, Y, X_columns=None, Z_columns=None, verbose=False):
    """
    Flatten the input datasets (X, Z, F, Y) for further processing.

    Parameters:
    X (np.ndarray): Input state data array of shape (B, T, C)
    Z (np.ndarray): Additional input data array of shape (B, T, Z)
    F (np.ndarray): Feed rates array of shape (B, T, C)
    Y (np.ndarray): Target data array of shape (B, T, C)
    X_columns (list): List of column names for X features.
    Z_columns (list): List of column names for Z features.

    Returns:
    tuple: A tuple containing:
        - X_flat (np.ndarray): Flattened state data array of shape (B*T, C).
        - F_flat (np.ndarray): Flattened feed rates array of shape (B*T, C).
        - Y_flat (np.ndarray): Flattened target data array of shape (B*T, C).

    Prints:
    - Shape and column information of the flattened arrays.

    Raises:
    ValueError: If X, Z, F, and Y do not have the same batch size and time steps.
    """
    # Assert that X, Z, F, and Y have the same batch size (B) and time steps (T)
    assert (
        X.shape[:2] == Z.shape[:2] == F.shape[:2] == Y.shape[:2]
    ), "X, Z, F, and Y must have the same batch size and time steps"

    B, T, C = X.shape

    # X_flat = np.concatenate([X.reshape(B*T, C), Z[:,:,:-2].reshape(B*T, -1)], axis=1)

    X_flat = X.reshape(B * T, C)
    Y_flat = Y.reshape(B * T, C)
    F_flat = F.reshape(B * T, C)

    if verbose:
        print(f"Flatten Shape: ")
        print(f"X + Z Columns : {X_columns}+{Z_columns}")
        print(f"X Columns: {X_columns}")
        print(f"X Shape: {X_flat.shape}")
        print(f"F Shape: {F_flat.shape}")
        print(f"X Columns: {X_columns}")
        print(f"Derivative X Shape: {Y_flat.shape}")
    return X_flat, F_flat, Y_flat

In [15]:
X_trainval_flat, F_trainval_flat, Y_trainval_flat = flatten_dataset(
    X_trainval,
    Z_trainval,
    F_trainval,
    Y_trainval,
    X_columns=X_columns,
    Z_columns=Z_columns,
)
X_test_flat, F_test_flat, Y_test_flat = flatten_dataset(
    X_test,
	Z_test,
	F_test,
	Y_test,
	X_columns=X_columns, 
	Z_columns=Z_columns
)

### Train one model

In [17]:
def fit_gp_model(X, y):
    """
    Fit a Gaussian Process (GP) model to the given data.

    Parameters:
    X (np.ndarray): Input features array of shape (n_samples, n_features).
    y (np.ndarray): Target values array of shape (n_samples,).

    Returns:
    GaussianProcessRegressor: The fitted GP model.
    """

    n_features = X.shape[-1]

    # Apply RBF kernel to the normal features
    raw_feature_kernel = RBF(
        length_scale=[1e-1] * n_features, length_scale_bounds=(1e-2, 1e2)
    )

    # The SubspaceKernel ensures that the RBF kernel is only applied to the first 10 features
    feature_kernel = SubspaceKernel(
        raw_feature_kernel, ids_to_apply=np.arange(0, n_features)
    )

    # Noise kernal
    noise_kernel = WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1))

    # Combine the kernel and allow for Noise
    full_kernel = 1**2 * feature_kernel + noise_kernel
    gp_model = GaussianProcessRegressor(kernel=full_kernel, n_restarts_optimizer=3)

    # Fit GP model
    gp_model.fit(X, y)
    return gp_model

In [18]:
models = {}
for i, var_name in tqdm(
    enumerate(X_columns), desc="Training: ", total=len(X_columns), leave=True, ncols=80
):

    model = fit_gp_model(X_trainval_flat, Y_trainval_flat[:, i])
    models[i] = model

Training: 100%|███████████████████████████████████| 4/4 [00:21<00:00,  5.48s/it]


### Predict by one model

In [19]:
def ode_fcn(t, y, models, feeds=None, sign_mask=None):
    """
    Calculate the derivatives of the state variables at time t for the ODE solver.

    Parameters:
    t (float): Current time point.
    y (np.ndarray): Current state of the system, a list of state variables.
    models (dict): Dictionary of models where keys are indices (corresponding to the indices of state variables in X) \
                    and values are the respective predictive models for each state variable.
    feeds (np.ndarray): Array of feed rates with shape (T, C), where T is the number of time steps and C is the number of features/variables.
    sign_mask (list or np.ndarray): Array indicating the type of behavior for each variable, with -1 for consumption and 1 for growth.

    Returns:
    np.ndarray: Derivatives of the state variables.
    """

    # Current time index
    time_idx = int(t // 24)

    # Ensure time index does not exceed feed array bounds
    time_idx = min(time_idx, feeds.shape[0] - 1)

    # Get current feed rate
    current_feeds = feeds[time_idx]

    # Initialize derivatives array
    dX_dt = np.zeros(len(models))

    # Calculate the derivative for each state variable using the respective model and feed rate
    for i, model in models.items():
        dX_dt[i] = sign_mask[i] * model.predict(y.reshape(1, -1))[0] + current_feeds[i]

    return dX_dt

In [20]:
def run_hybrid(X, Z, F, models, t_end=14, time_step=24, sign_mask=None):
    """
    Run a hybrid simulation using the provided models and input data.

    Parameters:
    X (np.ndarray): State data array of shape (B, T, C) 
    Z (np.ndarray): Initial condition array of shape (B, T, C) 
    F (np.ndarray): Feed rates array of shape (B, T, C) 
    models (dict): Dictionary of models where keys are indices (corresponding to the indices of state variables in X) \
        and values are the respective predictive models for each state variable.
    t_end (int): End time of the simulation in days.
    time_step (int): Time step size for the simulation in hours.
    sign_mask (list or np.ndarray): Array indicating the type of behavior for each variable, with -1 for consumption and 1 for growth.

    Returns:
    np.ndarray: Predicted state data array of shape (B, T, C).
    """

    X_pred = np.zeros_like(X)

    for i in tqdm(
        range(len(X)), desc="Simulating: ", total=len(X), leave=True, ncols=80
    ):
        # Initial Values
        init_values = np.array([Z[i, 0, -1], Z[i, 0, -2], 0, 0])

        # Get all the time-steps on which we want to predict
        t_eval = np.arange(0, t_end * time_step + time_step, time_step)  # T,

        # Get feed rate data
        feeds = F[i, :, :]

        X_pred[i, :, :] = odeint(
            func=ode_fcn,
            y0=init_values,
            t=t_eval,
            args=(models, feeds, sign_mask),
            tfirst=True,
        )  # T, C

    return X_pred

In [21]:
X_trainval_pred = run_hybrid(
    X=X_trainval,
    Z=Z_trainval,
    F=F_trainval,
    models=models,
    t_end=14,
    time_step=24,
    sign_mask=sign_mask,
)
X_test_pred = run_hybrid(
    X=X_test,
    Z=Z_test,
    F=F_test,
    models=models,
    t_end=14,
    time_step=24,
    sign_mask=sign_mask,
)

Simulating: 100%|███████████████████████████████| 50/50 [00:19<00:00,  2.53it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:19<00:00,  2.50it/s]


### Model performance

In [22]:
def evaluate_model_performance(
    X_train, X_train_pred, X_test, X_test_pred, X_columns=None
):
    """
    Evaluate and print the performance of the model on training and testing data.

    Parameters:
    X_columns (list): List of column names for X features.
    X_train (np.ndarray): Actual training data array of shape (B, T, C).
    X_train_pred (np.ndarray): Predicted training data array of shape (B, T, C).
    X_test (np.ndarray): Actual testing data array of shape (B, T, C).
    X_test_pred (np.ndarray): Predicted testing data array of shape (B, T, C).

    Returns:
    None
    """
    print("Training")
    train_rmses = []
    train_r2s = []
    for i, var in enumerate(X_columns):
        # Calculate error metrics
        train_rmse = relative_rmse(X_train[:, :, i], X_train_pred[:, :, i])
        train_r2 = r2(X_train[:, :, i], X_train_pred[:, :, i])
        print(f"{var} Train RMSE: {train_rmse}, R2: {train_r2}")
        train_rmses.append(train_rmse)
        train_r2s.append(train_r2)

    print("\nTesting")
    test_rmses = []
    test_r2s = []
    for i, var in enumerate(X_columns):
        # Calculate error metrics
        test_rmse = relative_rmse(X_test[:, :, i], X_test_pred[:, :, i])
        test_r2 = r2(X_test[:, :, i], X_test_pred[:, :, i])
        print(f"{var} Test RMSE: {test_rmse}, R2: {test_r2}")
        test_rmses.append(test_rmse)
        test_r2s.append(test_r2)

    return train_rmses, test_rmses, train_r2s, test_r2s

In [23]:
evaluate_model_performance(
    X_trainval, X_trainval_pred, X_test, X_test_pred, X_columns=X_columns
)

Training
X:VCD Train RMSE: 0.185, R2: 0.96
X:Glc Train RMSE: 0.262, R2: 0.922
X:Lac Train RMSE: 0.11, R2: 0.983
X:Titer Train RMSE: 0.157, R2: 0.963

Testing
X:VCD Test RMSE: 0.117, R2: 0.985
X:Glc Test RMSE: 0.306, R2: 0.889
X:Lac Test RMSE: 0.058, R2: 0.996
X:Titer Test RMSE: 0.133, R2: 0.969


([0.185, 0.262, 0.11, 0.157],
 [0.117, 0.306, 0.058, 0.133],
 [0.96, 0.922, 0.983, 0.963],
 [0.985, 0.889, 0.996, 0.969])

In [24]:
def plot_multi_step_gp_model_eval(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_columns=None,
):
    """
    Plot the evaluation of multi-step GP model predictions for training and testing data.

    Parameters:
    X (np.ndarray): Actual training data array of shape (B, T, C).
    X_pred (np.ndarray): Predicted training data array of shape (B, T, C).
    X_test (np.ndarray): Actual testing data array of shape (B, T, C).
    X_test_pred (np.ndarray): Predicted testing data array of shape (B, T, C).
    X_columns (list): List of column names for X features.

    Returns:
    None
    """
    for i, col in enumerate(X_columns):
        y = X[:, :, i].copy()
        y_pred = X_pred[:, :, i].copy()
        y_test = X_test[:, :, i].copy()
        y_test_pred = X_test_pred[:, :, i].copy()

        # Metrics for training set
        train_r2 = r2(y, y_pred)
        train_abs_rmse = absolute_rmse(y, y_pred)
        train_rel_rmse = relative_rmse(y, y_pred)

        # Metrics for testing set
        test_r2 = r2(y_test, y_test_pred)
        test_abs_rmse = absolute_rmse(y_test, y_test_pred)
        test_rel_rmse = relative_rmse(y_test, y_test_pred)

        # Plot observed vs predicted
        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=(
                f"Train Set - {col} <br> R^2 = {train_r2} <br> Abs RMSE = {train_abs_rmse} <br> Rel RMSE = {train_rel_rmse}",
                f"Test Set - {col} <br> R^2 = {test_r2} <br> Abs RMSE = {test_abs_rmse} <br> Rel RMSE = {test_rel_rmse}",
            ),
        )

        # Train set plot
        NUM_TRAIN = X.shape[0]
        for i in range(NUM_TRAIN):
            fig.add_trace(
                go.Scatter(
                    x=y[i].reshape(-1),
                    y=y_pred[i].reshape(-1),
                    mode="markers",
                    name=f"Run id in Train {i}",
                    legendgroup=f"train_{i}",
                ),
                row=1,
                col=1,
            )
        fig.add_shape(
            type="line",
            x0=y_pred.min(),
            y0=y_pred.min(),
            x1=y_pred.max(),
            y1=y_pred.max(),
            layer="above",
            line=dict(dash="dash"),
        )

        # Test set plot
        NUM_TEST = X_test.shape[0]
        for j in range(NUM_TEST):
            fig.add_trace(
                go.Scatter(
                    x=y_test[j].reshape(-1),
                    y=y_test_pred[j].reshape(-1),
                    mode="markers",
                    name=f"Run id in Test {j}",
                    legendgroup=f"test_{j}",
                ),
                row=1,
                col=2,
            )
        fig.add_shape(
            type="line",
            x0=y_test_pred.min(),
            y0=y_test_pred.min(),
            x1=y_test_pred.max(),
            y1=y_test_pred.max(),
            layer="above",
            line=dict(dash="dash"),
            row=1,
            col=2,
        )

        fig.update_layout(width=1600)
        fig.update_xaxes(title="Observed values", row=1, col=1)
        fig.update_xaxes(title="Observed values", row=1, col=2)
        fig.update_yaxes(title="Predicted values", row=1, col=1)
        fig.update_yaxes(title="Predicted values", row=1, col=2)
        fig.show()


def plot_relative_rmse_by_variables(
    X,
    X_pred,
    X_test,
    X_test_pred,
    X_train_pred_std=None,
    X_val_pred_std=None,
    X_columns=None,
):
    """
    Plot the evaluation relative rmse for training and testing data with optional standard deviation.

    Parameters:
    X (np.ndarray): Actual training data array of shape (B, T, C).
    X_pred (np.ndarray): Predicted training data array of shape (B, T, C).
    X_test (np.ndarray): Actual testing data array of shape (B, T, C).
    X_test_pred (np.ndarray): Predicted testing data array of shape (B, T, C).
    X_train_pred_std (np.ndarray): Predicted training data standard deviation array of shape (B, T, C).
    X_val_pred_std (np.ndarray): Predicted testing data standard deviation array of shape (B, T, C).
    X_columns (list): List of column names for X features.

    Returns:
    None
    """
    relative_rmse_train = []
    relative_rmse_test = []
    train_std = []
    test_std = []

    for i, col in enumerate(X_columns):
        y = X[:, :, i].copy()
        y_pred = X_pred[:, :, i].copy()
        y_test = X_test[:, :, i].copy()
        y_test_pred = X_test_pred[:, :, i].copy()

        # Metrics for training set
        train_rel_rmse = relative_rmse(y, y_pred)
        relative_rmse_train.append(train_rel_rmse)

        # Calculate standard deviation if provided and scale by std of y
        if X_train_pred_std is not None:
            scaled_train_std = np.mean(X_train_pred_std[:, :, i]) / np.std(y)
            train_std.append(scaled_train_std)

        # Metrics for testing set
        test_rel_rmse = relative_rmse(y_test, y_test_pred)
        relative_rmse_test.append(test_rel_rmse)

        # Calculate standard deviation if provided and scale by std of y_test
        if X_val_pred_std is not None:
            scaled_test_std = np.mean(X_val_pred_std[:, :, i]) / np.std(y_test)
            test_std.append(scaled_test_std)

    fig_rmse = go.Figure()
    fig_rmse.add_trace(
        go.Bar(
            x=X_columns,
            y=relative_rmse_train,
            name="Train Set",
            marker_color=pcolors[0],
            text=[f"{v:.2f}" for v in relative_rmse_train],
            textposition="outside",
            error_y=dict(type="data", array=train_std) if train_std else None,
        )
    )

    fig_rmse.add_trace(
        go.Bar(
            x=X_columns,
            y=relative_rmse_test,
            name="Test Set",
            marker_color=pcolors[1],
            text=[f"{v:.2f}" for v in relative_rmse_test],
            textposition="outside",
            error_y=dict(type="data", array=test_std) if test_std else None,
        )
    )

    fig_rmse.update_layout(
        barmode="group",
        title="Relative RMSE for Each Variable",
        xaxis_title="Feature",
        yaxis_title="Relative RMSE",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5),
    )

    fig_rmse.show()


def plot_predicted_profile_uncertainty(
    X, X_pred_mean, X_pred_std=None, X_columns=None, select_runs=[0], height=1000
):
    """
    Plot the predicted profiles for selected runs with uncertainty.

    Parameters:
    X (np.ndarray): Actual data array of shape (B, T, C).
    X_pred_mean (np.ndarray): Predicted mean data array of shape (B, T, C).
    X_pred_std (np.ndarray): Predicted standard deviation data array of shape (B, T, C).
    X_columns (list): List of column names for X features.
    select_runs (list): List of run indices to be plotted.
    height (int): Height of the plot.

    Returns:
    None
    """
    max_cols_per_row = 5
    num_columns = len(X_columns)
    num_rows = (num_columns + max_cols_per_row) // max_cols_per_row

    fig = make_subplots(
        rows=num_rows, cols=min(num_columns, max_cols_per_row), subplot_titles=X_columns
    )
    color_palette = px.colors.qualitative.Plotly
    for idx, j in enumerate(select_runs):
        color = color_palette[idx % len(color_palette)]
        for i, c in enumerate(X_columns):
            row = i // max_cols_per_row + 1
            col = i % max_cols_per_row + 1
            show_legend = i == 0

            # 绘制观测值
            fig.add_trace(
                go.Scatter(
                    x=list(range(X.shape[1])),
                    y=X[j, :, i],
                    name=f"Run {j} Observed",
                    marker=dict(color=color),
                    showlegend=show_legend,
                    legendgroup=f"group_{j}",
                ),
                row=row,
                col=col,
            )

            # 绘制预测均值
            fig.add_trace(
                go.Scatter(
                    x=list(range(X.shape[1])),
                    y=X_pred_mean[j, :, i],
                    name=f"Run {j} Predicted",
                    line=dict(dash="dash", color=color),
                    marker=dict(color=color),
                    showlegend=show_legend,
                    legendgroup=f"group_{j}",
                ),
                row=row,
                col=col,
            )

            if X_pred_std is not None:
                # 绘制预测不确定性（±2 标准差）
                fig.add_trace(
                    go.Scatter(
                        x=list(range(X.shape[1])),
                        y=X_pred_mean[j, :, i] + 2 * X_pred_std[j, :, i],
                        fill=None,
                        name=f"Run {j} Predicted Upper Bound",
                        mode="lines",
                        line=dict(color=color, width=0),
                        showlegend=False,
                        legendgroup=f"group_{j}",
                    ),
                    row=row,
                    col=col,
                )
                fig.add_trace(
                    go.Scatter(
                        x=list(range(X.shape[1])),
                        y=X_pred_mean[j, :, i] - 2 * X_pred_std[j, :, i],
                        fill="tonexty",
                        name=f"Run {j} Predicted Lower Bound",
                        mode="lines",
                        line=dict(color=color, width=0),
                        fillcolor="rgba(128, 128, 128, 0.3)",  # 使用合法的rgba颜色表示
                        showlegend=False,
                        legendgroup=f"group_{j}",
                    ),
                    row=row,
                    col=col,
                )

    fig.update_layout(
        showlegend=True,
        title_text="Process variable evolution for selected runs",
        height=height,
    )
    fig.show()

In [25]:
plot_multi_step_gp_model_eval(
    X_trainval,
    X_trainval_pred,
    X_test,
    X_test_pred,
    X_columns=X_columns,
)

plot_relative_rmse_by_variables(
    X_trainval,
    X_trainval_pred,
    X_test,
    X_test_pred,
    X_columns=X_columns,
)

In [26]:
plot_predicted_profile_uncertainty(
    X_test,
    X_test_pred,
    X_columns=X_columns,
    select_runs=[0, 1, 2, 3, 4, 19],
    height=500,
)

## Ensemble

In [67]:
def predict_with_ensemble(models_list, X, Z, F, sign_mask, t_end=14, time_step=24):
    predictions = np.zeros((len(models_list), X.shape[0], X.shape[1], X.shape[2]))

    for i, models in tqdm(
        enumerate(models_list),
        desc="Ensembles inference: ",
        total=len(models_list),
        leave=True,
        ncols=80,
    ):
        predictions[i] = run_hybrid(
            X, Z, F, models, t_end=t_end, time_step=time_step, sign_mask=sign_mask
        )
    mean_prediction = np.mean(predictions, axis=0)
    std_prediction = np.std(predictions, axis=0)
    return mean_prediction, std_prediction


def evaluate_ensemble_models(
    models_list, X_train, X_val, Z_train, Z_val, F_train, F_val, sign_mask
):
    print("Ensemble models predicting on train set")
    X_train_pred_mean, X_train_pred_std = predict_with_ensemble(
        models_list, X_train, Z_train, F_train, sign_mask
    )
    print("Ensemble models predicting on validation set")
    X_val_pred_mean, X_val_pred_std = predict_with_ensemble(
        models_list, X_val, Z_val, F_val, sign_mask
    )

    return X_train_pred_mean, X_train_pred_std, X_val_pred_mean, X_val_pred_std

### K-Fold Ensemble

In [34]:
def train_gp_models_cv(X, Y, Z, F, X_columns, Z_columns, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    models_list = []

    for k, (train_index, val_index) in enumerate(kf.split(X)):
        X_train, X_val = X[train_index], X[val_index]
        Y_train, Y_val = Y[train_index], Y[val_index]
        Z_train, Z_val = Z[train_index], Z[val_index]
        F_train, F_val = F[train_index], F[val_index]

        X_flat, F_flat, Y_flat = flatten_dataset(
            X_train, Z_train, F_train, Y_train, X_columns=X_columns, Z_columns=Z_columns
        )

        models = {}
        for i, var_name in tqdm(
            enumerate(X_columns),
            desc=f"Training - {k}th fold: ",
            total=len(X_columns),
            leave=True,
            ncols=80,
        ):
            model = fit_gp_model(X_flat, Y_flat[:, i])
            models[i] = model
        models_list.append(models)

    return models_list

In [31]:
models_list = train_gp_models_cv(
    X_trainval,
    Y_trainval,
    Z_trainval,
    F_trainval,
    X_columns=X_columns,
    Z_columns=Z_columns,
    n_splits=10,
)

Training - 0th fold: 100%|████████████████████████| 4/4 [00:20<00:00,  5.22s/it]
Training - 1th fold: 100%|████████████████████████| 4/4 [00:18<00:00,  4.69s/it]
Training - 2th fold: 100%|████████████████████████| 4/4 [00:17<00:00,  4.35s/it]
Training - 3th fold: 100%|████████████████████████| 4/4 [00:17<00:00,  4.30s/it]
Training - 4th fold: 100%|████████████████████████| 4/4 [00:15<00:00,  3.79s/it]
Training - 5th fold: 100%|████████████████████████| 4/4 [00:19<00:00,  4.94s/it]
Training - 6th fold: 100%|████████████████████████| 4/4 [00:15<00:00,  3.85s/it]
Training - 7th fold: 100%|████████████████████████| 4/4 [00:17<00:00,  4.49s/it]
Training - 8th fold: 100%|████████████████████████| 4/4 [00:16<00:00,  4.15s/it]
Training - 9th fold: 100%|████████████████████████| 4/4 [00:17<00:00,  4.47s/it]


In [35]:
X_train, X_val, Y_train, Y_val, Z_train, Z_val, F_train, F_val = train_test_split(
    X_trainval, Y_trainval, Z_trainval, F_trainval, test_size=0.2, random_state=42
)

In [36]:
X_train_pred_mean, X_train_pred_std, X_val_pred_mean, X_val_pred_std = (
    evaluate_ensemble_models(
        models_list,
        X_train,
        X_val,
        Z_train,
        Z_val,
        F_train,
        F_val,
        sign_mask,
    )
)

Ensemble models predicting on train set


Simulating: 100%|███████████████████████████████| 40/40 [00:14<00:00,  2.67it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:15<00:00,  2.56it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:24<00:00,  1.65it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:14<00:00,  2.79it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:08<00:00,  4.61it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:13<00:00,  2.99it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:18<00:00,  2.18it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:14<00:00,  2.83it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:09<00:00,  4.35it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:13<00:00,  2.94it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [02:26<00:00, 14.65s/it]


Ensemble models predicting on validation set


Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.68it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.51it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:06<00:00,  1.48it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.85it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:02<00:00,  4.50it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.93it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:05<00:00,  1.87it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.68it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:02<00:00,  4.27it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.77it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [00:38<00:00,  3.87s/it]


In [37]:
evaluate_model_performance(
    X_train,
    X_train_pred_mean,
    X_val,
    X_val_pred_mean,
    X_columns=X_columns,
)

Training
X:VCD Train RMSE: 0.204, R2: 0.953
X:Glc Train RMSE: 0.27, R2: 0.918
X:Lac Train RMSE: 0.131, R2: 0.975
X:Titer Train RMSE: 0.183, R2: 0.946

Testing
X:VCD Test RMSE: 0.173, R2: 0.966
X:Glc Test RMSE: 0.313, R2: 0.878
X:Lac Test RMSE: 0.114, R2: 0.984
X:Titer Test RMSE: 0.144, R2: 0.968


([0.204, 0.27, 0.131, 0.183],
 [0.173, 0.313, 0.114, 0.144],
 [0.953, 0.918, 0.975, 0.946],
 [0.966, 0.878, 0.984, 0.968])

In [40]:
plot_relative_rmse_by_variables(
    X_train,
    X_train_pred_mean,
    X_val,
    X_val_pred_mean,
    # X_train_pred_std=X_train_pred_std,
    # X_val_pred_std=X_val_pred_std,
    X_columns=X_columns,
)

In [41]:
X_test_pred_mean, X_test_pred_std = predict_with_ensemble(
    models_list, X_test, Z_test, F_test, sign_mask
)

Simulating: 100%|███████████████████████████████| 50/50 [00:18<00:00,  2.66it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:18<00:00,  2.64it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:29<00:00,  1.68it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:17<00:00,  2.79it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:10<00:00,  4.64it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:16<00:00,  3.00it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:28<00:00,  1.76it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:17<00:00,  2.82it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:10<00:00,  4.57it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:18<00:00,  2.77it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [03:07<00:00, 18.80s/it]


In [77]:
plot_predicted_profile_uncertainty(
    X_test,
    X_test_pred_mean,
    X_test_pred_std,
    X_columns=X_columns,
    select_runs=[0, 1, 2, 3, 19],
    height=500,
)

### Bootstrap Ensemble on Concentrations

In [50]:
def bootstrap_resampling(
    X, Y, Z, F, n_boot, train_ratio=0.9, replace=False, random_state=None
):
    """
    Generate bootstrap sample pairs.

    Parameters:
    X, Y, Z, F : numpy.ndarray
        Input matrices with shape (B, T, C).
    n_boot : int
        Number of bootstrap sample pairs to generate.
    train_ratio : float, optional
        Ratio of the number of training samples to the total number of experiments. Default is 0.5.
    replace : bool, optional
        Whether to sample with replacement. Default is False.
    random_state : int or None, optional
        Random seed. Default is None.

    Returns:
    list of tuple
        A list of bootstrap sample pairs, where each element is a tuple (train_data, val_data).
    """
    if random_state is not None:
        np.random.seed(random_state)

    n_experiments = X.shape[0]
    n_train = int(train_ratio * n_experiments)
    n_val = n_experiments - n_train

    bootstrap_samples = []
    for _ in range(n_boot):
        train_indices = np.random.choice(n_experiments, n_train, replace=replace)
        val_indices = np.array(
            [exp for exp in range(n_experiments) if exp not in train_indices]
        )

        train_data = (
            X[train_indices],
            Y[train_indices],
            Z[train_indices],
            F[train_indices],
        )
        val_data = (X[val_indices], Y[val_indices], Z[val_indices], F[val_indices])

        bootstrap_samples.append((train_data, val_data))

    return bootstrap_samples


def train_gp_models_bootstrap(
    X,
    Y,
    Z,
    F,
    X_columns,
    Z_columns,
    n_boot=10,
    train_ratio=0.5,
    replace=True,
    random_state=42,
):
    """
    Train Gaussian Process (GP) models using bootstrap resampling.

    Parameters:
    X, Y, Z, F : numpy.ndarray
        Input matrices with shape (B, T, C).
    X_columns, Z_columns : list of str
        Column names for X and Z matrices.
    n_boot : int, optional
        Number of bootstrap sample pairs to generate. Default is 10.
    train_ratio : float, optional
        Ratio of the number of training samples to the total number of experiments. Default is 0.5.
    replace : bool, optional
        Whether to sample with replacement. Default is True.
    random_state : int or None, optional
        Random seed. Default is 42.

    Returns:
    list of dict
        A list of dictionaries, where each dictionary contains trained GP models for the bootstrap sample.
    """
    # Generate bootstrap samples
    bootstrap_datasets = bootstrap_resampling(
        X,
        Y,
        Z,
        F,
        n_boot=n_boot,
        train_ratio=train_ratio,
        replace=replace,
        random_state=random_state,
    )

    models_list = []

    for b, (train_data, val_data) in enumerate(bootstrap_datasets):
        X_train, Y_train, Z_train, F_train = train_data
        X_flat, F_flat, Y_flat = flatten_dataset(
            X_train, Z_train, F_train, Y_train, X_columns=X_columns, Z_columns=Z_columns
        )

        models = {}
        for i, var_name in tqdm(
            enumerate(X_columns),
            desc=f"Training - {b}th boot: ",
            total=len(X_columns),
            leave=True,
            ncols=80,
        ):
            model = fit_gp_model(X_flat, Y_flat[:, i])
            models[i] = model
        models_list.append(models)

    return models_list

In [51]:
models_list = train_gp_models_bootstrap(
    X_trainval,
    Y_trainval,
    Z_trainval,
    F_trainval,
    X_columns,
    Z_columns,
    n_boot=10,
	train_ratio=0.5,
    replace=False,
    random_state=42,
)

Training - 0th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.34s/it]
Training - 1th boot: 100%|████████████████████████| 4/4 [00:04<00:00,  1.01s/it]
Training - 2th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.45s/it]
Training - 3th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.29s/it]
Training - 4th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.39s/it]
Training - 5th boot: 100%|████████████████████████| 4/4 [00:04<00:00,  1.03s/it]
Training - 6th boot: 100%|████████████████████████| 4/4 [00:04<00:00,  1.17s/it]
Training - 7th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.36s/it]
Training - 8th boot: 100%|████████████████████████| 4/4 [00:03<00:00,  1.00it/s]
Training - 9th boot: 100%|████████████████████████| 4/4 [00:05<00:00,  1.38s/it]


In [63]:
# 预测和评估模型
X_train, X_val, Y_train, Y_val, Z_train, Z_val, F_train_sub, F_val = (
    train_test_split(X_trainval, Y_trainval, Z_trainval, F_trainval, test_size=0.2, random_state=42)
)

X_train_pred_mean, X_train_pred_std, X_val_pred_mean, X_val_pred_std = (
    evaluate_ensemble_models(
        models_list,
        X_train,
        X_val,
        Z_train,
        Z_val,
        F_train,
        F_val,
        sign_mask,
    )
)

Ensemble models predicting on train set


Simulating: 100%|███████████████████████████████| 40/40 [00:06<00:00,  6.55it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:06<00:00,  6.30it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:06<00:00,  6.27it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:08<00:00,  4.86it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:13<00:00,  3.04it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:06<00:00,  5.88it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:07<00:00,  5.65it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:13<00:00,  3.06it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:07<00:00,  5.44it/s]
Simulating: 100%|███████████████████████████████| 40/40 [00:08<00:00,  4.46it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [01:23<00:00,  8.35s/it]


Ensemble models predicting on validation set


Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  6.69it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  6.15it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  5.84it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:02<00:00,  4.66it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  2.95it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  5.78it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  5.73it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:03<00:00,  3.09it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:01<00:00,  5.31it/s]
Simulating: 100%|███████████████████████████████| 10/10 [00:02<00:00,  4.47it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [00:21<00:00,  2.12s/it]


In [53]:
evaluate_model_performance(
    X_train,
    X_train_pred_mean,
    X_val,
    X_val_pred_mean,
    X_columns=X_columns,
)

Training
X:VCD Train RMSE: 0.198, R2: 0.955
X:Glc Train RMSE: 0.278, R2: 0.911
X:Lac Train RMSE: 0.176, R2: 0.954
X:Titer Train RMSE: 0.229, R2: 0.915

Testing
X:VCD Test RMSE: 0.161, R2: 0.968
X:Glc Test RMSE: 0.302, R2: 0.894
X:Lac Test RMSE: 0.149, R2: 0.97
X:Titer Test RMSE: 0.217, R2: 0.923


([0.198, 0.278, 0.176, 0.229],
 [0.161, 0.302, 0.149, 0.217],
 [0.955, 0.911, 0.954, 0.915],
 [0.968, 0.894, 0.97, 0.923])

In [54]:
plot_relative_rmse_by_variables(
    X_train,
    X_train_pred_mean,
    X_val,
    X_val_pred_mean,
    # X_train_pred_std=X_train_pred_std,
    # X_val_pred_std=X_val_pred_std,
    X_columns=X_columns,
)

In [55]:
X_test_pred_mean, X_test_pred_std = predict_with_ensemble(
    models_list, X_test, Z_test, F_test, sign_mask
)

Simulating: 100%|███████████████████████████████| 50/50 [00:07<00:00,  6.67it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:07<00:00,  6.28it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:08<00:00,  6.04it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:10<00:00,  4.56it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:16<00:00,  3.08it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:08<00:00,  6.11it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:09<00:00,  5.55it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:16<00:00,  2.96it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:09<00:00,  5.05it/s]
Simulating: 100%|███████████████████████████████| 50/50 [00:11<00:00,  4.33it/s]
Ensembles inference: 100%|██████████████████████| 10/10 [01:46<00:00, 10.64s/it]


In [57]:
plot_predicted_profile_uncertainty(
    X_test,
    X_test_pred_mean,
    X_test_pred_std,
    X_columns=X_columns,
    select_runs=[0, 1, 2, 3, 19],
    height=500,
)

### Bootstrap Ensemble on Rates

In [72]:
def predict_with_ensemble_rates(models_list, X, Z, F, sign_mask, t_end=14, time_step=24):
    """使用集成模型进行预测"""
    def ode_fcn(t, y, models_list, feeds=None, sign_mask=None):
        """计算状态变量的导数"""
        time_idx = int(t // 24)
        time_idx = min(time_idx, feeds.shape[0] - 1)
        current_feeds = feeds[time_idx]
        
        dX_dt = np.zeros(len(models_list[0]))
        for i in range(len(dX_dt)):
            predictions = [model[i].predict(y.reshape(1, -1))[0] for model in models_list]
            dX_dt[i] = sign_mask[i] * np.mean(predictions) + current_feeds[i]
        
        return dX_dt

    X_pred = np.zeros_like(X)
    for i in tqdm(range(len(X)), desc="Simulating: ", total=len(X), leave=True, ncols=80):
        init_values = np.array([Z[i, 0, -1], Z[i, 0, -2], 0, 0])
        t_eval = np.arange(0, t_end * time_step + time_step, time_step)
        feeds = F[i, :, :]
        
        X_pred[i, :, :] = odeint(
            func=ode_fcn,
            y0=init_values,
            t=t_eval,
            args=(models_list, feeds, sign_mask),
            tfirst=True,
        )
    
    return X_pred

def evaluate_ensemble_rates_models(
    models_list, X_train, X_val, Z_train, Z_val, F_train, F_val, sign_mask
):
    print("Ensemble models predicting on train set")
    X_train_pred = predict_with_ensemble_rates(
        models_list, X_train, Z_train, F_train, sign_mask
    )
    print("Ensemble models predicting on validation set")
    X_val_pred = predict_with_ensemble_rates(
        models_list, X_val, Z_val, F_val, sign_mask
    )

    return X_train_pred, X_val_pred

In [73]:
X_train_pred, X_val_pred = (
    evaluate_ensemble_rates_models(
        models_list,
        X_train,
        X_val,
        Z_train,
        Z_val,
        F_train,
        F_val,
        sign_mask,
    )
)

Ensemble models predicting on train set


Simulating: 100%|███████████████████████████████| 40/40 [02:05<00:00,  3.15s/it]


Ensemble models predicting on validation set


Simulating: 100%|███████████████████████████████| 10/10 [00:32<00:00,  3.28s/it]


In [74]:
evaluate_model_performance(
    X_train,
    X_train_pred,
    X_val,
    X_val_pred,
    X_columns=X_columns,
)

Training
X:VCD Train RMSE: 0.298, R2: 0.896
X:Glc Train RMSE: 0.329, R2: 0.87
X:Lac Train RMSE: 0.147, R2: 0.969
X:Titer Train RMSE: 0.225, R2: 0.92

Testing
X:VCD Test RMSE: 0.167, R2: 0.966
X:Glc Test RMSE: 0.299, R2: 0.897
X:Lac Test RMSE: 0.119, R2: 0.982
X:Titer Test RMSE: 0.131, R2: 0.975


([0.298, 0.329, 0.147, 0.225],
 [0.167, 0.299, 0.119, 0.131],
 [0.896, 0.87, 0.969, 0.92],
 [0.966, 0.897, 0.982, 0.975])

In [75]:
X_test_pred = predict_with_ensemble_rates(
    models_list, X_test, Z_test, F_test, sign_mask
)

Simulating: 100%|███████████████████████████████| 50/50 [02:38<00:00,  3.18s/it]


In [76]:
plot_predicted_profile_uncertainty(
    X_test,
    X_test_pred,
    X_columns=X_columns,
    select_runs=[0, 1, 2, 3, 19],
    height=500,
)