In [1]:
import pandas as pd
import numpy as np
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
# Path to the Excel workbook. The driver cell below passes it to both
# get_feed_concentrations (sheet 'feed conc') and process_bioprocess_data (sheet 'data').
file_data = 'data/data.xlsx'

In [3]:
# ==============================================================================
# 1. 优化后的 Feed Concentration 解析函数
# ==============================================================================
def get_feed_concentrations(file_path):
    """Read the feed concentration of each target metabolite from the workbook.

    The 'feed conc' sheet is loaded with ``pandas.read_excel``. Column headers
    are taken as metabolite names and the row at positional index 1 supplies
    the values (per the original author's note, the row before it holds the
    units). Only headers that, once stripped, belong to the fixed target set
    are kept.

    Parameters
    ----------
    file_path : str or path-like
        Workbook passed straight to ``pandas.read_excel``.

    Returns
    -------
    dict[str, float]
        Stripped metabolite name -> feed concentration. A cell that cannot be
        converted to ``float`` is left out and reported with a warning.
        NOTE(review): a blank cell is presumably read as NaN and is kept as
        NaN here, exactly as before -- confirm that is intended.
    """
    import warnings

    df_raw = pd.read_excel(file_path, sheet_name='feed conc')

    target_mets = {'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glc', 'Gln', 'Glu', 'Pyr',
                   'Gly', 'His', 'Ile', 'Lac', 'Leu', 'Lys', 'Met', 'Nh4', 'Phe',
                   'Pro', 'Ser', 'Thr', 'Tyr', 'Val'}
    names = df_raw.columns.values
    values = df_raw.iloc[1].values  # positional row 1: the value row

    feed_concs = {}
    for raw_name, raw_value in zip(names, values):
        # Non-string headers (e.g. numeric column labels) can never be a metabolite.
        if not isinstance(raw_name, str):
            continue
        met = raw_name.strip()
        if met not in target_mets:
            continue
        try:
            feed_concs[met] = float(raw_value)
        except (TypeError, ValueError):
            # TypeError covers non-numeric objects such as dates or None, which
            # previously escaped the handler and aborted the whole parse.
            warnings.warn(
                f"Feed concentration for {met!r} is not numeric ({raw_value!r}); skipped.",
                stacklevel=2,
            )

    return feed_concs

In [None]:
# ==============================================================================
# 2. 数据处理主逻辑 (计算 Mr/IR)
# ==============================================================================
def _reacted_amount(volume_l, conc, sample_vol_l, feed_amount=None):
    """Return ``V * c - accum`` for one species of one run.

    ``accum`` starts at the initial amount ``V[0] * c[0]`` and, at every later
    time point, gains ``feed_amount[t]`` (if given) and loses
    ``sample_vol_l[t] * c[t]``. The loop keeps the running sum in the same
    order as a step-by-step update so results are reproducible bit for bit.
    """
    conc_arr = conc.to_numpy(dtype=float)
    removed = sample_vol_l.to_numpy(dtype=float) * conc_arr
    added = None if feed_amount is None else feed_amount.to_numpy(dtype=float)

    accum = np.zeros(len(conc_arr))
    accum[0] = volume_l.iloc[0] * conc_arr[0]
    for t in range(1, len(conc_arr)):
        if added is None:
            accum[t] = accum[t - 1] - removed[t]
        else:
            accum[t] = accum[t - 1] + added[t] - removed[t]

    return (volume_l * conc) - accum


def process_bioprocess_data(data_file, feed_concs, dw_switch_time=100,
                            dw_factor_early=0.2161, dw_factor_late=0.2875):
    """Compute reacted amounts (``Mr_*``) for every experiment in the 'data' sheet.

    Parameters
    ----------
    data_file : str or path-like
        Workbook read with ``pandas.read_excel(..., sheet_name='data')``. It
        must provide the columns 'Time', 'Experiment', 'V', 'feed volume',
        'sample', 'X' and 'mAb', plus one column per metabolite.
    feed_concs : dict[str, float]
        Metabolite name -> concentration in the feed. Metabolites without a
        matching column in the sheet are skipped.
    dw_switch_time, dw_factor_early, dw_factor_late : float, optional
        'X' is multiplied by ``dw_factor_early`` where ``Time < dw_switch_time``
        and by ``dw_factor_late`` elsewhere. The defaults reproduce the
        previously hard-coded values (per the original notes, a conversion
        from Mcell/mL to gDW/L).

    Returns
    -------
    pandas.DataFrame
        One row per (experiment, time point), experiments in order of first
        appearance and rows sorted by 'Time' within each experiment. Columns:
        'Time', 'Experiment', 'V_L', 'Xv', 'Mr_Xv', 'Conc_mAb', 'Mr_mAb', then
        'Conc_<met>' / 'Mr_<met>' for each processed metabolite.

    Notes
    -----
    'V', 'feed volume' and 'sample' are divided by 1000 (mL -> L per the
    original notes). 'feed volume' is used as a per-step increment, not a
    cumulative total. Rows whose 'Experiment' is missing are ignored.
    """
    df = pd.read_excel(data_file, sheet_name='data')

    # --- Cleaning ---
    # Drop the leading units row (recognised by 'h' in the Time column).
    if not df.empty and str(df.iloc[0]['Time']).strip() == 'h':
        df = df.drop(0).reset_index(drop=True)

    # Force everything except the experiment id to numeric; junk becomes NaN.
    cols_to_numeric = df.columns.drop(['Experiment'])
    df[cols_to_numeric] = df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')

    print(f"开始处理 {df['Experiment'].nunique()} 个实验批次...")

    processed_dfs = []
    # groupby drops rows with a missing experiment id, which would otherwise
    # form an empty group and fail on the first `.iloc[0]`. sort=False keeps
    # the experiments in order of first appearance.
    for _, run in df.groupby('Experiment', sort=False):
        group = run.sort_values('Time').reset_index(drop=True)

        # --- Volumes, converted from mL to L ---
        V_L = group['V'] / 1000.0
        Feed_Vol_Incr_L = group['feed volume'] / 1000.0  # already incremental: no diff
        Sample_Vol_L = group['sample'] / 1000.0
        Time = group['Time']

        res = group[['Time', 'Experiment']].copy()
        res['V_L'] = V_L

        # --- Biomass: no feed contribution, only sampling removes mass ---
        dw_factors = np.where(Time < dw_switch_time, dw_factor_early, dw_factor_late)
        Xv_conc = group['X'] * dw_factors
        res['Xv'] = Xv_conc
        res['Mr_Xv'] = _reacted_amount(V_L, Xv_conc, Sample_Vol_L)

        # --- Product (mAb): no feed contribution, only sampling removes mass ---
        mAb_conc = group['mAb']
        res['Conc_mAb'] = mAb_conc
        res['Mr_mAb'] = _reacted_amount(V_L, mAb_conc, Sample_Vol_L)

        # --- Metabolites: the feed adds mass, sampling removes it ---
        for met, feed_c in feed_concs.items():
            if met not in group.columns:
                continue
            conc = group[met]
            res[f'Conc_{met}'] = conc
            res[f'Mr_{met}'] = _reacted_amount(
                V_L, conc, Sample_Vol_L, feed_amount=Feed_Vol_Incr_L * feed_c
            )

        processed_dfs.append(res)

    df_final = pd.concat(processed_dfs, ignore_index=True)
    return df_final

In [13]:
# Step 1: parse the feed recipe from the workbook.
print("正在解析 Feed 配方...")
feed_concs = get_feed_concentrations(file_data)

# Step 2: compute the accumulated and reacted amounts for every run.
print("正在计算 Accum 和 Mr...")
df_mr = process_bioprocess_data(file_data, feed_concs)

# Step 3: write the result table to disk.
output_csv = 'processed_data_IR_final.csv'
df_mr.to_csv(output_csv, index=False)

# Summary report, framed by two 30-character rules.
print(
    "-" * 30,
    f"处理完成。",
    f"输出文件: {output_csv}",
    f"数据维度: {df_mr.shape}",
    f"包含的列 (前10个): {list(df_mr.columns[:10])}",
    "-" * 30,
    sep="\n",
)

# Sanity preview of the first rows for experiment 1 (glucose), to eyeball the
# order of magnitude of the reacted amount.
print("数据预览 (Br1, Glucose):")
print(df_mr.loc[df_mr['Experiment'] == 1, ['Time', 'Conc_Glc', 'Mr_Glc']].head())

正在解析 Feed 配方...
正在计算 Accum 和 Mr...
开始处理 9 个实验批次...
------------------------------
处理完成。
输出文件: processed_data_IR_final.csv
数据维度: (189, 53)
包含的列 (前10个): ['Time', 'Experiment', 'V_L', 'Xv', 'Mr_Xv', 'Conc_mAb', 'Mr_mAb', 'Conc_Ala', 'Mr_Ala', 'Conc_Arg']
------------------------------
数据预览 (Br1, Glucose):
   Time   Conc_Glc    Mr_Glc
0     0  72.550000  0.000000
1    12  65.198513 -1.827211
2    24  56.939987 -4.015453
3    36  48.140276 -6.477662
4    48  39.084531 -9.140549


In [15]:
def validate_matrix(exp_id, df_mr, feature_names, org_mat_file):
    """Compare the computed reacted-amount matrix of one experiment with a reference.

    For every ``Mr_*`` column the function prints the maximum absolute error,
    the maximum relative error, the R2 score against the reference and a
    status flag.

    Parameters
    ----------
    exp_id : int
        Experiment to check. It selects rows of ``df_mr`` by equality on the
        'Experiment' column and, as ``exp_id - 1``, the matching entry of the
        reference file (i.e. it is treated as 1-based).
    df_mr : pandas.DataFrame
        Table containing 'Experiment' and exactly 25 columns starting with 'Mr_'.
    feature_names : sequence of str
        Display labels, matched *by position* to the ``Mr_*`` columns in the
        order they appear in ``df_mr``. Rows beyond its length are labelled
        ``Var_<i>``.
        NOTE(review): nothing checks that this order matches the column
        order -- verify it when reading the printed table.
    org_mat_file : str or path-like
        MATLAB file loaded with ``scipy.io.loadmat``; the reference matrix is
        read from ``['data']['m_r'][0][exp_id - 1]``.

    Raises
    ------
    ValueError
        If ``exp_id`` has no rows in ``df_mr`` or the reference matrix does
        not cover the computed one (fewer variables or a different number of
        time points).
    AssertionError
        If ``df_mr`` does not contain exactly 25 ``Mr_*`` columns.
    """
    # Rows of the requested experiment only.
    df_exp = df_mr[df_mr['Experiment'] == exp_id].copy()
    if df_exp.empty:
        raise ValueError(f"Experiment {exp_id!r} has no rows in df_mr.")

    # Expected total: Xv, mAb and 23 metabolites.
    mr_cols = [col for col in df_mr.columns if col.startswith('Mr_')]

    assert len(mr_cols) == 25, f"expected 25 'Mr_' columns, found {len(mr_cols)}"

    # Variables along rows, time points along columns.
    calculated = df_exp[mr_cols].values.T
    original = sio.loadmat(org_mat_file)['data']['m_r'][0][exp_id-1]

    n_vars = calculated.shape[0]

    # Fail early with a readable message instead of a broadcasting error
    # raised from inside the comparison loop.
    if (original.ndim != 2 or original.shape[0] < n_vars
            or original.shape[1] != calculated.shape[1]):
        raise ValueError(
            f"Reference matrix shape {original.shape} is incompatible with "
            f"computed matrix shape {calculated.shape}."
        )

    print(f"{'Feature':<10} | {'Max Abs Err':<12} | {'Max Rel Err':<12} | {'R2 Score':<10} | {'Status'}")
    print("-" * 65)

    for i in range(n_vars):
        y_calc = calculated[i, :]
        y_true = original[i, :]

        # 1. Absolute error.
        abs_diff = np.abs(y_calc - y_true)
        max_abs_err = np.max(abs_diff)

        # 2. Relative error; the small offset keeps the denominator non-zero.
        with np.errstate(divide='ignore', invalid='ignore'):
            rel_diff = abs_diff / (np.abs(y_true) + 1e-6)
            max_rel_err = np.max(rel_diff)

        # 3. R2 score as a measure of how well the trends agree.
        r2 = r2_score(y_true, y_calc)

        # Verdict: pass on R2 alone, but fail outright when the error exceeds
        # 10% relative AND 0.1 absolute.
        status = "✅" if r2 > 0.99 else "⚠️"
        if max_rel_err > 0.1 and max_abs_err > 0.1: status = "❌"

        name = feature_names[i] if i < len(feature_names) else f"Var_{i}"
        print(f"{name:<10} | {max_abs_err:.2e}     | {max_rel_err:.2e}     | {r2:.4f}     | {status}")

# Display labels for the 25 compared variables: biomass, product, then the metabolites.
metabolites_order = [
    'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glc', 'Gln', 'Glu', 'Pyr',
    'Gly', 'His', 'Ile', 'Lac', 'Leu', 'Lys', 'Met', 'Nh4', 'Phe',
    'Pro', 'Ser', 'Thr', 'Tyr', 'Val',
]
feature_names = ['Xv', 'mAb', *metabolites_order]

# Run the check for experiment 1 against the reference .mat file.
validate_matrix(
    exp_id=1,
    df_mr=df_mr,
    feature_names=feature_names,
    org_mat_file='data/data.mat',
)

Feature    | Max Abs Err  | Max Rel Err  | R2 Score   | Status
-----------------------------------------------------------------
Xv         | 7.66e-02     | 6.22e-02     | 0.9892     | ⚠️
mAb        | 1.14e-13     | 1.16e-10     | 1.0000     | ✅
Ala        | 8.88e-16     | 3.58e-16     | 1.0000     | ✅
Arg        | 3.55e-15     | 2.80e-15     | 1.0000     | ✅
Asn        | 4.44e-16     | 3.71e-15     | 1.0000     | ✅
Asp        | 2.22e-16     | 1.69e-15     | 1.0000     | ✅
Cys        | 4.44e-16     | 1.77e-15     | 1.0000     | ✅
Glc        | 3.55e-15     | 1.49e-16     | 1.0000     | ✅
Gln        | 3.55e-15     | 4.74e-14     | 1.0000     | ✅
Glu        | 2.22e-16     | 6.78e-15     | 1.0000     | ✅
Pyr        | 3.55e-15     | 2.34e-15     | 1.0000     | ✅
Gly        | 3.55e-15     | 2.00e-16     | 1.0000     | ✅
His        | 8.88e-16     | 1.20e-15     | 1.0000     | ✅
Ile        | 7.11e-15     | 1.24e-14     | 1.0000     | ✅
Lac        | 0.00e+00     | 0.00e+00     | 1.0000     | ✅


In [16]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# ==============================================================================
# 1. PCA 分析与 S 矩阵构建
# ==============================================================================
def build_reaction_correlation_matrix(csv_file, n_components=7):
    """Fit a PCA on the scaled reacted amounts and return the loading matrix.

    Parameters
    ----------
    csv_file : str or path-like
        CSV holding the 'Mr_Xv', 'Mr_mAb' and 23 'Mr_<metabolite>' columns.
    n_components : int, optional
        Number of principal components to keep (default 7).

    Returns
    -------
    S_matrix : numpy.ndarray
        ``pca.components_.T`` with each row multiplied by that variable's
        scale factor; shape (25, n_components).
    pca : sklearn.decomposition.PCA
        The fitted PCA object.
    max_vals : numpy.ndarray
        Per-variable scale factors (maximum absolute value, with zeros
        replaced by 1.0).
    """
    table = pd.read_csv(csv_file)

    # Fixed variable order: biomass, product, then the metabolites.
    species = (
        'Xv', 'mAb',
        'Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glc', 'Gln', 'Glu', 'Pyr',
        'Gly', 'His', 'Ile', 'Lac', 'Leu', 'Lys', 'Met', 'Nh4', 'Phe',
        'Pro', 'Ser', 'Thr', 'Tyr', 'Val',
    )
    mr_cols = [f'Mr_{name}' for name in species]

    # Rows are all time points of all experiments stacked; columns are variables.
    reacted = table[mr_cols].values

    # Scale every column by its largest absolute value; an all-zero column
    # keeps a divisor of 1 so the division stays defined.
    scale = np.abs(reacted).max(axis=0)
    scale[scale == 0] = 1.0
    reacted_scaled = reacted / scale

    pca = PCA(n_components=n_components, svd_solver='full')
    pca.fit(reacted_scaled)

    # Report how much variance the retained components explain.
    explained_var = pca.explained_variance_ratio_.sum()
    print(f"PCA ({n_components} components) 累积解释方差: {explained_var:.4%}")

    # Undo the scaling on the loadings so each column is expressed in the
    # original units of the variables.
    loadings = pca.components_.T
    S_matrix = loadings * scale[:, np.newaxis]

    return S_matrix, pca, scale

# Build the matrix from the CSV written by the preprocessing step and report its shape.
S, pca_model, scalers = build_reaction_correlation_matrix(
    csv_file='processed_data_IR_final.csv',
)
print("S 矩阵形状:", S.shape)

PCA (7 components) 累积解释方差: 99.9337%
S 矩阵形状: (25, 7)


In [None]:
import torch
import torch.nn as nn
import numpy as np

class HybridModelBase(nn.Module):
    """Base class for the hybrid models: owns the fixed S matrix and the mixing step.

    Parameters
    ----------
    S_matrix : array-like or torch.Tensor
        Matrix of shape (n_species, n_latent). It is copied, cast to float32
        and stored as the non-trainable buffer ``S``.
    device : str or torch.device, optional
        Device the buffer is created on (default 'cpu').
    """

    def __init__(self, S_matrix, device='cpu'):
        super().__init__()
        # Records the construction-time device only; it is not updated by a later `.to()`.
        self.device = device

        # Register S as a buffer: it is saved in the state dict and follows
        # `.to()` / `.double()`, but is never touched by the optimiser.
        # as_tensor + detach + clone accepts arrays and tensors alike and
        # guarantees the buffer does not share memory with the caller's data.
        s_tensor = torch.as_tensor(S_matrix, dtype=torch.float32).detach().clone().to(device)
        self.register_buffer('S', s_tensor)

    @property
    def S_matrix(self):
        """Read-only alias of the buffer ``S``.

        Exposed as a property so it always reflects the buffer's current
        device and dtype instead of holding a second, possibly stale tensor.
        """
        return self.S

    def hybrid_forward_step(self, scores):
        """Map latent scores to reacted-amount increments: ``dMr = scores @ S.T``.

        ``scores`` has the latent dimension last, e.g. (Batch, n_latent); the
        result has n_species last, e.g. (Batch, n_species).
        """
        dMr = torch.matmul(scores, self.S.t())
        return dMr

class HybridLSTM(HybridModelBase):
    """Hybrid LSTM model: Linear+ReLU encoder -> LSTM -> Linear -> S-matrix mixing.

    The original author describes this as the best LSTM architecture from the
    paper, In(27)-ReLU(16)-LSTM(7)-Rate(7); the extra Linear(latent, latent)
    after the LSTM was added by the author for flexibility.
    """

    def __init__(self, S_matrix, input_dim=27, hidden_dim=16, latent_dim=7, device='cpu'):
        super().__init__(S_matrix, device)

        # Per-time-step feed-forward encoder.
        self.encoder = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())

        # Recurrent core; batch_first=True means inputs are (Batch, Seq, Feature).
        self.lstm = nn.LSTM(input_size=hidden_dim, hidden_size=latent_dim, batch_first=True)

        # Rate layer mapping the LSTM state to the latent scores.
        self.fc_out = nn.Linear(latent_dim, latent_dim)

    def forward(self, x_seq, hidden_state=None):
        """Run the model on a batch of sequences.

        Parameters
        ----------
        x_seq : torch.Tensor
            Input of shape (Batch, Seq_Len, input_dim).
        hidden_state : tuple of torch.Tensor, optional
            Initial LSTM state, forwarded unchanged to ``nn.LSTM``.

        Returns
        -------
        dMr : torch.Tensor
            (Batch, Seq_Len, n_species) reacted-amount increments.
        scores : torch.Tensor
            (Batch, Seq_Len, latent_dim) latent scores.
        new_hidden : tuple of torch.Tensor
            Final LSTM state.
        """
        n_batch, n_steps, n_feat = x_seq.shape

        # Encode every time step independently: collapse batch and time,
        # apply the encoder, then restore the (Batch, Seq, hidden) layout.
        encoded = self.encoder(x_seq.reshape(-1, n_feat))
        encoded = encoded.reshape(n_batch, n_steps, -1)

        lstm_out, new_hidden = self.lstm(encoded, hidden_state)
        scores = self.fc_out(lstm_out)

        # Mix the scores through the fixed S matrix, step by step.
        return self.hybrid_forward_step(scores), scores, new_hidden

class HybridFFNN(HybridModelBase):
    """Hybrid feed-forward model: three Tanh hidden layers -> scores -> S-matrix mixing.

    The original author describes this as the typical FFNN architecture from
    the paper, In(27)-Tanh(8)-Tanh(8)-Tanh(8)-Rate(7).
    """

    def __init__(self, S_matrix, input_dim=27, hidden_dim=8, latent_dim=7, device='cpu'):
        super().__init__(S_matrix, device)

        # Input layer, two further hidden layers, then the linear score layer.
        layers = [nn.Linear(input_dim, hidden_dim), nn.Tanh()]
        for _ in range(2):
            layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.Tanh()])
        layers.append(nn.Linear(hidden_dim, latent_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x_seq):
        """Apply the network to each time step independently.

        ``x_seq`` is (Batch, Seq, input_dim). Returns ``(dMr, scores)`` with
        shapes (Batch, Seq, n_species) and (Batch, Seq, latent_dim).
        """
        scores = self.net(x_seq)
        return self.hybrid_forward_step(scores), scores