In [15]:
import pandas as pd
import numpy as np

import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances

# Data Structure

### Variable Type
<img src="assets/variables_type.png" alt="Variables Type" width="1000">

### Unfolding type
<img src="assets/unfolding_type.png" alt="Unfolding Type" width="1000">

# OWU Dataset
- Observation-wise unfolding

In [16]:
def read_owu_v1(file):
	root_path = 'dataset/datahow_2020/insilico_data'
	data = pd.read_excel(f'{root_path}/{file}.xlsx')
	col_names = ["run", "timesteps", "X:VCD", "X:Glc", "X:Gln", "X:NH4", "X:Lac", "X:Titer", "W:pH", "W:Temp", "F:Feed_Glc", "F:Feed_Gln",]
	owu_df = data.copy()
	owu_df.columns = col_names
	owu_df['time'] = (owu_df.timesteps / 24).astype(int)
	owu_df.set_index(['run', 'time'], inplace=True)
	return owu_df


def read_owu_v2(file):
	root_path = 'dataset/datahow_concise'
	data = pd.read_csv(f'{root_path}/{file}.csv')
	owu_df = data.copy()
	owu_df.set_index(['run', 'time'], inplace=True)
	return owu_df


def read_owu_v3(file):
	root_path = 'dataset/datahow_2022'
	data = pd.read_csv(f'{root_path}/{file}.csv')
	owu_df = data.copy()
	owu_df.set_index(['run', 'time'], inplace=True)
	return owu_df

In [17]:
owu_ood = read_owu_v3('owu_ood')
owu_ood_titer = read_owu_v3('owu_ood_titer')
owu_ood_high_titer = read_owu_v3('owu_ood_high_titer')
owu_ood_low_titer = read_owu_v3('owu_ood_low_titer')

# BWU Dataset
- Batch-wise unfolding

In [18]:
def generate_bwu(owu):
    owu = owu.drop(["timesteps"],axis=1)
    # Input: multiindex OWU
    # Output: singleindex BWU
    for run_ix, run in owu.groupby("run"):
        if run_ix == owu.index.get_level_values('run')[0]:
            bwuindex = run.unstack(level=1)
        else:
            bwuindex = pd.concat([bwuindex, run.unstack(level=1)])
    bwu_columns = [str(bwuindex.columns.get_level_values(0)[i])+str(":")+str(bwuindex.columns.get_level_values(1)[i]) 
                   for i in range(len(bwuindex.columns.get_level_values(0)))]
    bwu = pd.DataFrame(bwuindex.to_numpy(), columns=bwu_columns)
    
    return bwu

In [19]:
bwu = generate_bwu(owu_ood)
bwu.head()

Unnamed: 0,X:VCD:0,X:VCD:1,X:VCD:2,X:VCD:3,X:VCD:4,X:VCD:5,X:VCD:6,X:VCD:7,X:VCD:8,X:VCD:9,...,W:Feed:5,W:Feed:6,W:Feed:7,W:Feed:8,W:Feed:9,W:Feed:10,W:Feed:11,W:Feed:12,W:Feed:13,W:Feed:14
0,0.75,2.083893,5.067346,9.700998,13.808836,15.305127,14.346967,12.044545,9.268963,6.497916,...,15.0,15.0,15.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.678571,1.788242,4.197866,8.024606,11.64425,12.98902,11.780328,9.032983,5.956852,3.496495,...,16.632653,16.632653,16.632653,16.632653,16.632653,0.0,0.0,0.0,0.0,0.0
2,0.576531,1.696046,4.475798,9.544466,15.165314,18.470742,18.751854,16.999519,14.363996,11.604577,...,12.142857,12.142857,12.142857,12.142857,12.142857,0.0,0.0,0.0,0.0,0.0
3,0.984694,2.924958,7.444333,14.42048,20.570813,23.127551,22.33065,19.648806,16.312824,13.028844,...,15.204082,15.204082,15.204082,15.204082,15.204082,0.0,0.0,0.0,0.0,0.0
4,0.515306,1.328392,3.138892,6.312312,10.046362,12.466191,12.598864,10.871897,8.153435,5.329461,...,19.081633,19.081633,19.081633,19.081633,19.081633,0.0,0.0,0.0,0.0,0.0


In [20]:
def generate_y(bwu, return_aggr=False):
    # Input: singleindex BWU
    # Output: singleindex BWU having only target
    titer_column = [c for c in bwu.columns if c.startswith("X:Titer")]
    targets = pd.DataFrame(columns=["Y:Titer", "Y:Aggr"], index=bwu.index)

    # iterate through experiments
    for j in list(bwu.index):
        x_titer = bwu.loc[j, titer_column]
        x_prod = [0]
        x_aggr = [0]
        k_aggr = 10**-7
        for i in range(len(x_titer)):
            if i == 0:
                continue
            xt_titer = x_titer.iloc[i]
            dt_titer = x_titer.iloc[i] - x_titer.iloc[i - 1]
            x_prod.append(xt_titer)
            x_aggr.append(k_aggr * (xt_titer**2))

            dt_aggr = x_aggr[i] - x_aggr[i - 1]
            dt_prod = dt_titer - 2 * dt_aggr
            dt_aggr = k_aggr * (x_prod[i - 1] + dt_prod) ** 2

            x_aggr[i] = x_aggr[i - 1] + dt_aggr
            x_prod[i] = x_prod[i - 1] + dt_prod
        y_prod = x_prod[-1]
        y_aggr = x_aggr[-1]

        targets.loc[j, "Y:Titer"] = y_prod
        targets.loc[j, "Y:Aggr"] = y_aggr
    if return_aggr:
        target = targets["Y:Aggr"]
    else:
        target = targets["Y:Titer"]

    return pd.DataFrame(target)

# Target Dataset

In [21]:
tar = generate_y(bwu, return_aggr=False)

In [22]:
tar.head()

Unnamed: 0,Y:Titer
0,1034.889209
1,598.214512
2,1616.008651
3,2216.397102
4,677.286486


# Stratified Sampling for Intrepolation

In [23]:
def stratified_sampling_for_interpolation(
    data, data_type="bwu", n_clusters=5, test_size=0.2, random_state=42,
    verbose=False,
):
    """Split dataset based on clustering for interpolation purpose"""
    if data_type == "owu":
        bwu = generate_bwu(data)
    elif data_type in ["bwu", "doe"]:
        bwu = data

    # Normalize
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)

    # Run PCA
    pca = PCA()
    bwu_pca = pca.fit_transform(bwu_normalized)

    # Clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
    clusters = kmeans.fit_predict(bwu_pca)

    bwu["cluster"] = clusters

    if verbose:
        fig = px.scatter(
            x=bwu_pca[:, 0], 
            y=bwu_pca[:, 1], 
            color=bwu['cluster'].astype(str),
            title="Cluster Visualization in PCA Space",
            labels={"x": "Principal Component 1", "y": "Principal Component 2", "color": "Cluster"}
        )
        fig.update_layout(width=800, height=600)
        fig.show();

    # Stratify sampling
    train_idx = []
    test_idx = []
    for cluster in np.unique(clusters):
        cluster_data = bwu[bwu["cluster"] == cluster]
        train, test = train_test_split(
            cluster_data,
            test_size=test_size,
            random_state=random_state,
            stratify=cluster_data[["cluster"]],
        )
        train_idx.extend(train.index)
        test_idx.extend(test.index)

    if data_type in ["doe", "bwu"]:
        train_set = bwu.loc[train_idx].drop(columns=["cluster"])
        test_set = bwu.loc[test_idx].drop(columns=["cluster"])
    else:
        # for owu, two index, first is run, second is time
        train_run_idx = data.index.get_level_values("run").isin(train_idx)
        test_run_idx = data.index.get_level_values("run").isin(test_idx)

        train_set = data[train_run_idx]
        test_set = data[test_run_idx]

    return train_set, test_set

In [24]:
train_set, test_set = stratified_sampling_for_interpolation(
    owu_ood, data_type="owu", n_clusters=3, test_size=0.2, random_state=42,
	verbose=True
)
# train_set.to_csv('dataset/datahow_2020/insilico_data/interpolation/owu.csv')
# test_set.to_csv('dataset/datahow_2020/insilico_data/interpolation/owu_test.csv')

# Greedy Maximized Pairwise distance for Extrapolation

In [25]:
def select_max_distance_experiments(bwu, num_experiments=5):
    # Standardize the data
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)
    
    # Calculate pairwise distance matrix
    distance_matrix = pairwise_distances(bwu_normalized, metric='l2')
    
    # Initialize the selected experiments list
    selected_experiments = [bwu.index[0]]  # Start with the first experiment
    
    for _ in range(1, num_experiments):
        max_dist = 0
        next_exp = None
        
        for i in range(bwu.shape[0]):
            if i in selected_experiments:
                continue
            # Calculate the minimum distance from this experiment to the selected experiments
            min_dist_to_selected = min(distance_matrix[i, j] for j in selected_experiments)
            
            if min_dist_to_selected > max_dist:
                max_dist = min_dist_to_selected
                next_exp = i
                
        selected_experiments.append(next_exp)
    
    return bwu.iloc[selected_experiments]

In [31]:
# get titer groups
time_14_df = owu_ood.xs(14, level='time')
high_titer_index = time_14_df[time_14_df['X:Titer'] > 1000].index.tolist()
low_titer_index = time_14_df[time_14_df['X:Titer'] < 1000].index.tolist()

# split owu by titer groups
owu_ood_high_titer = owu_ood[owu_ood.index.get_level_values('run').isin(high_titer_index)]
owu_ood_low_titer = owu_ood[owu_ood.index.get_level_values('run').isin(low_titer_index)]
owu_ood_high_titer.to_csv('dataset/datahow_2022/owu_ood_high_titer.csv')
owu_ood_low_titer.to_csv('dataset/datahow_2022/owu_ood_low_titer.csv')

# split doe by titer groups
owu_ood_doe = pd.read_csv('dataset/datahow_2022/owu_ood_doe.csv', usecols=["feed_start", "feed_end", "Glc_feed_rate", "Glc_0", "VCD_0"])
owu_ood_doe_high_titer = owu_ood_doe[owu_ood_doe.index.isin(high_titer_index)]
owu_ood_doe_low_titer = owu_ood_doe[owu_ood_doe.index.isin(low_titer_index)]
owu_ood_doe_high_titer.to_csv('dataset/datahow_2022/owu_ood_doe_high_titer.csv')
owu_ood_doe_low_titer.to_csv('dataset/datahow_2022/owu_ood_doe_low_titer.csv')

# read owu groups
owu_ood_high_titer = pd.read_csv('dataset/datahow_2022/owu_ood_high_titer.csv')
owu_ood_low_titer = pd.read_csv('dataset/datahow_2022/owu_ood_low_titer.csv')
owu_ood_high_titer.set_index(['run', 'time'], inplace=True)
owu_ood_low_titer.set_index(['run', 'time'], inplace=True)

# read doe groups
owu_ood_doe_high_titer = pd.read_csv('dataset/datahow_2022/owu_ood_doe_high_titer.csv', index_col=0)
owu_ood_doe_low_titer = pd.read_csv('dataset/datahow_2022/owu_ood_doe_low_titer.csv', index_col=0)

In [33]:
bwu_ood_high_titer = generate_bwu(owu_ood_high_titer)
selected_index = select_max_distance_experiments(bwu_ood_high_titer, num_experiments=5).index
print(selected_index)

owu_ood_high_titer_selected = owu_ood_high_titer[owu_ood_high_titer.index.get_level_values('run').isin(selected_index)]
owu_ood_high_titer_remains = owu_ood_high_titer[~owu_ood_high_titer.index.get_level_values('run').isin(selected_index)]

owu_ood_doe_high_titer_selected = owu_ood_doe_high_titer[owu_ood_doe_high_titer.index.isin(selected_index)]
owu_ood_doe_high_titer_remains = owu_ood_doe_high_titer[~owu_ood_doe_high_titer.index.isin(selected_index)]

owu = pd.concat([owu_ood_low_titer, owu_ood_high_titer_selected])
owu_doe = pd.concat([owu_ood_doe_low_titer, owu_ood_doe_high_titer_selected])

owu_test = owu_ood_high_titer_remains.copy()
owu_test_doe = owu_ood_doe_high_titer_remains.copy()


Index([0, 2, 11, 20, 22], dtype='int64')


In [35]:
owu.to_csv('dataset/datahow_2022/extrapolation/owu.csv', index=False)
owu_test.to_csv('dataset/datahow_2022/extrapolation/owu_test.csv', index=False)

owu_doe.to_csv('dataset/datahow_2022/extrapolation/owu_doe.csv', index=False)
owu_test_doe.to_csv('dataset/datahow_2022/extrapolation/owu_test_doe.csv', index=False)