### Import Library

In [27]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy.stats import chi2
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.covariance import EmpiricalCovariance

### PCA Introduction
* abnormalities in runs/variables
* interpretation scores/loadings
* differences between OWU/BWU

<img src="assets/pca_explain.png" alt="PCA Explain" width="1000">

# PCA of OWU matrix

In [2]:
def read_owu(file):
	root_path = 'dataset/datahow_2020/insilico_data'
	data = pd.read_excel(f'{root_path}/{file}.xlsx')
	col_names = ["run", "timesteps", "X:VCD", "X:Glc", "X:Gln", "X:NH4", "X:Lac", "X:Titer", "W:pH", "W:Temp", "F:Feed_Glc", "F:Feed_Gln",]
	owu_df = data.copy()
	owu_df.columns = col_names
	owu_df['time'] = (owu_df.timesteps / 24).astype(int)
	owu_df.set_index(['run', 'time'], inplace=True)
	return owu_df

In [3]:
owu = read_owu('rawdata')

### Visualize the OWU matrix

In [4]:
owu.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,timesteps,X:VCD,X:Glc,X:Gln,X:NH4,X:Lac,X:Titer,W:pH,W:Temp,F:Feed_Glc,F:Feed_Gln
run,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,0,0.4,60.0,3.0,0.1,0.1,0.0,6.9,37.5,0.0,0.0
1,1,24,1.015551,82.789425,2.071044,0.945923,5.29222,8.891501,6.9,37.5,30.0,1.0
1,2,48,2.063143,73.895885,1.504454,3.228866,15.598221,23.47959,6.9,37.5,30.0,1.0
1,3,72,2.73515,106.519232,0.374085,3.251472,33.851222,73.414915,6.9,37.5,30.0,1.0
1,4,96,4.106711,102.865067,0.174484,4.377329,43.038763,150.955189,6.9,37.5,30.0,1.0
1,5,120,4.723016,88.211028,0.0,5.223006,36.49669,189.689869,6.9,37.5,30.0,1.0
1,6,144,4.932028,168.364784,0.0,5.769891,74.991427,346.178775,7.0,37.5,30.0,1.0
1,7,168,4.827316,133.85714,0.039018,4.991701,105.321299,348.716061,7.0,37.5,30.0,1.0
1,8,192,5.74548,199.244957,0.136537,5.620637,125.286756,322.79823,7.0,37.5,0.0,0.0
1,9,216,4.781973,234.815768,0.0,5.993455,121.762157,430.402555,7.0,37.5,0.0,0.0


### Plot correlation matrix

In [5]:
def plot_owu_correlation(owu):
    owu = owu.drop(["timesteps"],axis=1)
    fig = px.imshow(owu.corr().round(3), text_auto=True)
    fig.update_layout(title="OWU Correlation Matrix among X variables", width=1000)
    fig.show();

In [6]:
plot_owu_correlation(owu)

### Unnormalized PCA

PCA is run on the OWU matrix, but without any variable normalization.

In [7]:
def plot_explained_variance_owu_raw(owu):
    owu = owu.drop(["timesteps"], axis=1)
    owupca = PCA()
    owupca.fit(owu)
    owupca_x = list(range(1, owupca.n_components_ + 1))

    fig = px.line(
        x=owupca_x,
        y=np.cumsum(owupca.explained_variance_ratio_),
        color=px.Constant("Cumulative explained variance"),
        labels=dict(
            x="Principal component index", y="Explained Variance Ratio", color="Legend"
        ),
    )
    fig.add_bar(
        x=owupca_x,
        y=owupca.explained_variance_ratio_,
        name="Individual explained variance",
    )
    fig.update_layout(width=1000)
    fig.show();

In [8]:
plot_explained_variance_owu_raw(owu)

### Normalized PCA

- PCA is run on the OWU matrix, but this time the variables are first normalized with respect to their mean and standard deviation. 
- This ensures that each variable has equal contribution to the variance explained by the principal components


In [9]:
def plot_explained_variance_owu(owu):
    owu = owu.drop(["timesteps"], axis=1)
    scaler = StandardScaler()
    owu_normalized = scaler.fit_transform(owu)
    # Run PCA on scaled data
    owupca = PCA()
    owupca.fit(owu_normalized)
    owupca_x = list(range(1, owupca.n_components_ + 1))

    fig = px.line(
        x=owupca_x,
        y=np.cumsum(owupca.explained_variance_ratio_),
        color=px.Constant("Cumulative explained variance"),
        labels=dict(
            x="Principal component index", y="Explained Variance Ratio", color="Legend"
        ),
    )
    fig.add_bar(
        x=owupca_x,
        y=owupca.explained_variance_ratio_,
        name="Individual explained variance",
    )
    fig.update_layout(width=1000,
                      title='OWU PCA Explained Variance')
    fig.show()

In [10]:
plot_explained_variance_owu(owu)

### Plot scores and loadings

In [11]:
def plot_scores_loadings_owu(
    owu, pc_x_axis=1, pc_y_axis=2, highlight_run=0, highlight_type="Run_id",
    width=1200, height=900,
):
    owu = owu.drop(["timesteps"], axis=1)
    scaler = StandardScaler()
    owu_normalized = scaler.fit_transform(owu)
    # Run PCA on scaled data
    owupca = PCA()
    owupca.fit(owu_normalized)
    owupca_x = list(range(1, owupca.n_components_ + 1))

    # Run PCA analysis
    owu_components = owupca.fit_transform(owu_normalized)
    owu_loadings = owupca.components_.T * np.sqrt(owupca.explained_variance_)
    owu_features = list(owu.columns)
    owu_explained = owupca.explained_variance_ratio_

    # Create 2 rows and 1 cols subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["OWU PCA scores plot", "OWU PCA loadings plot"],
        vertical_spacing=0.08,
    )

    # Score plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=1,
        col=1,
    )
    text_data = [
        f"Run {owu.index.get_level_values(0)[i]}, Time {owu.index.get_level_values(1)[i]}"
        for i in range(len(owu))
    ]

    # Score plots for highlight type
    if highlight_type == "Run_id":
        color_idx = owu.index.get_level_values("run")
    elif highlight_type == "Time_id":
        color_idx = owu.index.get_level_values("time")
    elif highlight_type == "Titer_14":
        color_idx = np.repeat(np.array(owu["X:Titer"][:, 14]), 15)
    # elif highlight_type in doe.columns:
    #     color_idx = np.repeat(np.array(doe[highlight_type]), 15)
    else:
        color_idx = None
    fig.add_trace(
        go.Scatter(
            x=owu_components[:, pc_x_axis - 1],
            y=owu_components[:, pc_y_axis - 1],
            mode="markers",
            marker=dict(size=10, color=color_idx, showscale=True),
            marker_size=10,
            text=text_data,
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Score plots for highlight run
    highlight_run_ix = owu.index.get_level_values("run") == highlight_run
    fig.add_trace(
        go.Scatter(
            x=owu_components[highlight_run_ix, pc_x_axis - 1],
            y=owu_components[highlight_run_ix, pc_y_axis - 1],
            mode="lines+markers",
            marker_size=10,
            marker_color="black",
            line_color="black",
            text=[
                f"Run {highlight_run}, Time {owu.index.get_level_values(1)[i]}"
                for i in range(len(owu))
            ],
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Loading plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=2,
        col=1,
    )
    for i, feature in enumerate(owu_features):
        fig.add_shape(
            type="line",
            x0=0,
            y0=0,
            x1=owu_loadings[i, pc_x_axis - 1],
            y1=owu_loadings[i, pc_y_axis - 1],
            row=2,
            col=1,
        )
        fig.add_annotation(
            x=owu_loadings[i, pc_x_axis - 1],
            y=owu_loadings[i, pc_y_axis - 1],
            ax=0,
            ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
            row=2,
            col=1,
        )

    fig.update_layout(width=width, height=height, showlegend=False)
    fig.update_xaxes(
        title=f"Principal Component - {pc_x_axis} ({owu_explained[pc_x_axis-1].round(2)}%)",
        row=2,
        col=1,
    )
    fig.update_yaxes(
        title=f"Principal Component - {pc_y_axis} ({owu_explained[pc_y_axis-1].round(2)}%)",
        row=1,
        col=1,
    )
    fig.update_yaxes(
        title=f"Principal Component - {pc_y_axis} ({owu_explained[pc_y_axis-1].round(2)}%)",
        row=2,
        col=1,
    )
    fig.show()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "OWU Loadings of Principal Component - " + str(pc_x_axis),
            "OWU Loadings of Principal Component - " + str(pc_y_axis),
        ),
    )
    fig.add_bar(x=owu_features, y=owu_loadings[:, pc_x_axis - 1], row=1, col=1)
    fig.add_bar(x=owu_features, y=owu_loadings[:, pc_y_axis - 1], row=1, col=2)
    fig.update_layout(width=width, height=height/2, showlegend=False)
    fig.show()

In [12]:
plot_scores_loadings_owu(owu, pc_x_axis=1, pc_y_axis=2, highlight_run=77, highlight_type='Time_id')

### Mahalanobis distance of observations 
- equal to the euclidean distance in PCA space

In [13]:
def plot_mahalanobis_distance_owu(owu, highlight_run=0, width=1600):
    owu = owu.drop(["timesteps"], axis=1)
    scaler = StandardScaler()
    owu_normalized = scaler.fit_transform(owu)

    # Run PCA on scaled data
    owupca = PCA()
    owupca.fit(owu_normalized)

    # Run PCA analysis
    owu_components = owupca.fit_transform(owu_normalized)

    # Calculate mahalanobis distance
    owu_mahalanobis = (
        EmpiricalCovariance().fit(owu_components).mahalanobis(owu_components)
    )
    T2threshold = chi2.ppf(0.95, owu_components.shape[1] - 1)

    # Plot mahalanobis distance
    fig = px.bar(
        x=list(range(len(owu_mahalanobis))),
        y=owu_mahalanobis,
        labels=dict(x="OWU Observation Id.", y="Mahalanobis distance", color="Run_id"),
        title="OWU Mahalanobis distance with 95% Confidence Interval",
        color=owu.index.get_level_values(0),
        color_continuous_scale=px.colors.cyclical.HSV,
    )
    fig.add_hline(y=T2threshold)

    highlight_run_ix = owu.index.get_level_values("run") == highlight_run

    fig.add_trace(
        go.Bar(
            x=list(np.array(range(len(owu_mahalanobis)))[highlight_run_ix]),
            y=owu_mahalanobis[highlight_run_ix],
            marker_color="black",
        )
    )
    fig.update_layout(width=width, showlegend=False, coloraxis_showscale=True)
    fig.update_layout(barmode="overlay")
    fig.show()

In [14]:
plot_mahalanobis_distance_owu(owu, highlight_run=10, width=1400)

# PCA of BWU matrix

In [15]:
def generate_bwu(owu):
    owu = owu.drop(["timesteps"],axis=1)
    # Input: multiindex OWU
    # Output: singleindex BWU
    for run_ix,run in owu.groupby("run"):
        if run_ix == 1: 
            bwuindex = run.unstack(level=1)
        else:
            bwuindex = pd.concat([bwuindex, run.unstack(level=1)])
    bwu_columns = [str(bwuindex.columns.get_level_values(0)[i])+str(":")+str(bwuindex.columns.get_level_values(1)[i]) 
                   for i in range(len(bwuindex.columns.get_level_values(0)))]
    bwu = pd.DataFrame(bwuindex.to_numpy(), columns=bwu_columns)
    
    return bwu

In [16]:
bwu = generate_bwu(owu)

### Visualize the BWU matrix

In [17]:
bwu

Unnamed: 0,X:VCD:0,X:VCD:1,X:VCD:2,X:VCD:3,X:VCD:4,X:VCD:5,X:VCD:6,X:VCD:7,X:VCD:8,X:VCD:9,...,F:Feed_Gln:5,F:Feed_Gln:6,F:Feed_Gln:7,F:Feed_Gln:8,F:Feed_Gln:9,F:Feed_Gln:10,F:Feed_Gln:11,F:Feed_Gln:12,F:Feed_Gln:13,F:Feed_Gln:14
0,0.4,1.015551,2.063143,2.735150,4.106711,4.723016,4.932028,4.827316,5.745480,4.781973,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.4,0.953393,1.786357,1.991316,3.897274,4.040470,3.695521,3.945214,4.182197,3.642935,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.4,1.914412,3.033244,2.429069,4.003605,5.821906,6.183043,5.095282,4.591090,3.885602,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.2,0.485637,1.470946,2.234323,4.404896,4.952949,5.638315,4.904156,5.515848,4.060728,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4,0.2,0.701393,1.722991,1.884669,2.493436,3.742112,4.650736,4.441868,4.666359,5.576590,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.2,0.324911,0.703203,1.446786,1.949236,2.364267,2.868030,3.071294,3.677680,2.746184,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
96,0.2,0.239439,0.489605,0.654519,1.303005,1.662527,1.872537,2.686625,3.299594,2.893775,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
97,0.2,0.533646,1.187276,1.933980,2.238092,2.292502,2.798951,3.300981,2.828262,2.819626,...,6.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.3,1.009951,2.550745,3.316546,3.733694,4.687383,6.424874,4.562861,4.601619,4.104420,...,3.5,3.5,3.5,3.5,3.5,3.5,0.0,0.0,0.0,0.0


In [18]:
def plot_bwu_correlation(bwu, width=1000):
    fig = px.imshow(bwu.corr().round(3), text_auto=True)
    fig.update_layout(title="BWU Correlation Matrix among X variables", width=width)
    fig.show();

### Plot correlation matrix

In [19]:
plot_bwu_correlation(bwu, width=900)

### Normalized PCA


In [20]:
def plot_explained_variance_bwu(bwu, pca_n_components=15, width=1000):
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)
    
    # Run PCA on scaled data
    bwupca = PCA(n_components=min(min(bwu.shape), pca_n_components))
    bwupca.fit(bwu_normalized)
    bwupca_x = list(range(1, bwupca.n_components_ + 1))

    fig = px.line(
        x=bwupca_x,
        y=np.cumsum(bwupca.explained_variance_ratio_),
        color=px.Constant("Cumulative explained variance"),
        labels=dict(
            x="Principal component index", y="Explained Variance Ratio", color="Legend"
        ),
    )
    fig.add_bar(
        x=bwupca_x,
        y=bwupca.explained_variance_ratio_,
        name="Individual explained variance",
    )
    fig.update_layout(width=width,
                      title='BWU PCA Explained Variance')
    fig.show()


In [21]:
plot_explained_variance_bwu(bwu, pca_n_components=15, width=1000)

### Plot scores and loadings

In [22]:
def plot_scores_loadings_bwu(
    bwu, pc_x_axis=1, pc_y_axis=2, highlight_type="Run_id", pca_n_components=15, 
    width=1200, height=900,
):
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)
    # Run PCA on scaled data
    bwupca = PCA(n_components=min(min(bwu.shape), pca_n_components))
    bwupca.fit(bwu_normalized)
    bwupca_x = list(range(1, bwupca.n_components_ + 1))

    # Run PCA analysis
    bwu_components = bwupca.fit_transform(bwu_normalized)
    bwu_loadings = bwupca.components_.T * np.sqrt(bwupca.explained_variance_)
    bwu_features = list(bwu.columns)
    bwu_explained = bwupca.explained_variance_ratio_

    # Create 2 rows and 1 cols subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["BWU PCA scores plot", "BWU PCA loadings plot"],
        vertical_spacing=0.08,
    )

    # Score plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=1,
        col=1,
    )
    text_data = [
        f"Run {bwu.index.get_level_values(0)[i]}"
        for i in range(len(bwu))
    ]

    # Score plots for highlight type
    if highlight_type == "Run_id":
        color_idx=np.array(bwu.index.get_level_values(0))
    elif highlight_type == "Titer_14":
        color_idx = np.array(bwu["X:Titer:14"])
    # elif highlight_type in doe.columns:
    #     color_idx = np.array(doe[PLOT_COLOR])
    else:
        color_idx = None
    fig.add_trace(
        go.Scatter(
            x=bwu_components[:, pc_x_axis - 1],
            y=bwu_components[:, pc_y_axis - 1],
            mode="markers",
            marker=dict(size=10, color=color_idx, showscale=True),
            marker_size=10,
            text=text_data,
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Loading plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=2,
        col=1,
    )
    for i, feature in enumerate(bwu_features):
        fig.add_shape(
            type="line",
            x0=0,
            y0=0,
            x1=bwu_loadings[i, pc_x_axis - 1],
            y1=bwu_loadings[i, pc_y_axis - 1],
            row=2,
            col=1,
        )
        fig.add_annotation(
            x=bwu_loadings[i, pc_x_axis - 1],
            y=bwu_loadings[i, pc_y_axis - 1],
            ax=0,
            ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
            row=2,
            col=1,
        )

    fig.update_layout(width=width, height=height, showlegend=False)
    fig.update_xaxes(
        title=f"Principal Component - {pc_x_axis} ({bwu_explained[pc_x_axis-1].round(2)}%)",
        row=2,
        col=1,
    )
    fig.update_yaxes(
        title=f"Principal Component - {pc_y_axis} ({bwu_explained[pc_y_axis-1].round(2)}%)",
        row=1,
        col=1,
    )
    fig.update_yaxes(
        title=f"Principal Component - {pc_y_axis} ({bwu_explained[pc_y_axis-1].round(2)}%)",
        row=2,
        col=1,
    )
    fig.show()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "BWU Loadings of Principal Component - " + str(pc_x_axis),
            "BWU Loadings of Principal Component - " + str(pc_y_axis),
        ),
    )
    fig.add_bar(x=bwu_features, y=bwu_loadings[:, pc_x_axis - 1], row=1, col=1)
    fig.add_bar(x=bwu_features, y=bwu_loadings[:, pc_y_axis - 1], row=1, col=2)
    fig.update_layout(width=width, height=height/2, showlegend=False)
    fig.show()

In [23]:
plot_scores_loadings_bwu(bwu, pc_x_axis=1, pc_y_axis=2, pca_n_components=15,  highlight_type='Run_id')

### Mahalanobis distance of observations 
- equal to the euclidean distance in PCA space

In [24]:
def plot_mahalanobis_distance_bwu(bwu, pca_n_components=15, highlight_run=0, width=1600):
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)

    # Run PCA on scaled data
    bwu_pca = PCA(n_components=min(min(bwu.shape), pca_n_components))
    bwu_pca.fit(bwu_normalized)

    # Run PCA analysis
    bwu_components = bwu_pca.fit_transform(bwu_normalized)

    # Calculate Mahalanobis distance
    bwu_mahalanobis = EmpiricalCovariance().fit(bwu_components).mahalanobis(bwu_components)
    T2threshold = chi2.ppf(0.95, bwu_components.shape[1] - 1)

    # Plot Mahalanobis distance
    fig = px.bar(
        x=list(range(len(bwu_mahalanobis))),
        y=bwu_mahalanobis,
        labels=dict(x="BWU Observation Id.", y="Mahalanobis distance", color="Run_id"),
        title="BWU Mahalanobis distance with 95% Confidence Interval",
        color=bwu.index,
        color_continuous_scale=px.colors.cyclical.HSV,
    )
    fig.add_hline(y=T2threshold, line_dash="dash", annotation_text="95% Threshold", annotation_position="top right")

    # Highlight specific run
    highlight_run_ix = bwu.index == highlight_run

    fig.add_trace(
        go.Bar(
            x=list(np.array(range(len(bwu_mahalanobis)))[highlight_run_ix]),
            y=bwu_mahalanobis[highlight_run_ix],
            marker_color="black",
        )
    )
    fig.update_layout(width=width, showlegend=False, coloraxis_showscale=True)
    fig.update_layout(barmode="overlay")
    fig.show()

In [25]:
plot_mahalanobis_distance_bwu(bwu, highlight_run=99, width=1600)

# Stratified Sampling

In [41]:
def stratified_sampling_for_interpolation(
    data, data_type="bwu", n_clusters=5, test_size=0.2, random_state=42,
    verbose=False,
):
    """Split dataset based on clustering for interpolation purpose"""
    if data_type == "owu":
        bwu = generate_bwu(data)
    elif data_type in ["bwu", "doe"]:
        bwu = data

    # Normalize
    scaler = StandardScaler()
    bwu_normalized = scaler.fit_transform(bwu)

    # Run PCA
    pca = PCA()
    bwu_pca = pca.fit_transform(bwu_normalized)

    # Clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters = kmeans.fit_predict(bwu_pca)

    bwu["cluster"] = clusters

    if verbose:
        fig = px.scatter(
            x=bwu_pca[:, 0], 
            y=bwu_pca[:, 1], 
            color=bwu['cluster'].astype(str),
            title="Cluster Visualization in PCA Space",
            labels={"x": "Principal Component 1", "y": "Principal Component 2", "color": "Cluster"}
        )
        fig.update_layout(width=800, height=600)
        fig.show();

    # Stratify sampling
    train_idx = []
    test_idx = []
    for cluster in np.unique(clusters):
        cluster_data = bwu[bwu["cluster"] == cluster]
        train, test = train_test_split(
            cluster_data,
            test_size=test_size,
            random_state=random_state,
            stratify=cluster_data[["cluster"]],
        )
        train_idx.extend(train.index)
        test_idx.extend(test.index)

    if data_type in ["doe", "bwu"]:
        train_set = bwu.loc[train_idx].drop(columns=["cluster"])
        test_set = bwu.loc[test_idx].drop(columns=["cluster"])
    else:
        # for owu, two index, first is run, second is time
        train_run_idx = data.index.get_level_values("run").isin(train_idx)
        test_run_idx = data.index.get_level_values("run").isin(test_idx)

        train_set = data[train_run_idx]
        test_set = data[test_run_idx]

    return train_set, test_set

In [42]:
train_set, test_set = stratified_sampling_for_interpolation(
    owu, data_type="owu", n_clusters=5, test_size=0.2, random_state=42,
	verbose=True
)

In [43]:
test_set

Unnamed: 0_level_0,Unnamed: 1_level_0,timesteps,X:VCD,X:Glc,X:Gln,X:NH4,X:Lac,X:Titer,W:pH,W:Temp,F:Feed_Glc,F:Feed_Gln
run,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20,0,0,0.200000,20.000000,3.000000,0.100000,0.100000,0.000000,6.9,36.0,0.0,0.0
20,1,24,0.465052,12.344838,1.930211,0.995202,1.557998,8.444000,6.9,36.0,0.0,0.0
20,2,48,0.823930,14.878785,1.247687,1.008069,6.053220,21.431336,6.9,36.0,0.0,0.0
20,3,72,1.615609,9.630062,0.400731,1.198852,14.785587,30.070268,6.9,36.0,5.0,6.0
20,4,96,2.375978,7.224935,4.691588,3.049815,18.807655,86.763520,6.9,36.0,5.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
92,10,240,2.331238,186.882758,20.355289,23.613723,99.552011,700.302829,7.1,36.0,30.0,6.0
92,11,264,1.951967,227.126447,21.106760,23.439616,109.905780,700.650801,7.1,36.0,30.0,6.0
92,12,288,1.676322,196.634521,18.388996,30.926542,82.337328,778.432280,7.1,36.0,30.0,6.0
92,13,312,1.335594,302.457306,31.741710,40.371787,110.847810,710.228495,7.1,36.0,0.0,0.0


In [45]:
train_set.to_csv('dataset/datahow_2020/insilico_data/interpolation/owu.csv')
test_set.to_csv('dataset/datahow_2020/insilico_data/interpolation/owu_test.csv')