### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EmpiricalCovariance
from scipy.stats import chi2

# Make "plotly_white" the default template for figures created from here on.
pio.templates.default = "plotly_white"
# Qualitative colour palettes from plotly express.
# NOTE(review): neither name is referenced by the cells visible in this notebook.
pcolors = px.colors.qualitative.T10
pcolors25 = px.colors.qualitative.Alphabet

### PCA Introduction
* Detect abnormalities in runs and variables
* Interpret PCA scores and loadings
* Compare the observation-wise unfolded (OWU) and batch-wise unfolded (BWU) views of the data

<img src="assets/pca_explain.png" alt="PCA Explain" width="1000">

# PCA of OWU matrix

In [3]:
def read_owu_v1(file):
    """Read an OWU table from an Excel file of the 2020 in-silico dataset.

    Parameters
    ----------
    file : str
        File name without the ``.xlsx`` extension, looked up under
        ``dataset/datahow_2020/insilico_data``.

    Returns
    -------
    pandas.DataFrame
        The sheet with its columns renamed to the fixed 12-name schema below,
        indexed by ``('run', 'time')`` where ``time`` is ``timesteps / 24``
        truncated to an integer.
    """
    excel_path = f'dataset/datahow_2020/insilico_data/{file}.xlsx'
    frame = pd.read_excel(excel_path)

    # Positional rename: the sheet must have exactly these 12 columns in order.
    frame.columns = [
        "run", "timesteps",
        "X:VCD", "X:Glc", "X:Gln", "X:NH4", "X:Lac", "X:Titer",
        "W:pH", "W:Temp",
        "F:Feed_Glc", "F:Feed_Gln",
    ]
    frame['time'] = (frame['timesteps'] / 24).astype(int)
    return frame.set_index(['run', 'time'])


def read_owu_v2(file):
    """Read an OWU table from a CSV file in ``dataset/datahow_concise``.

    ``file`` is the file name without the ``.csv`` extension. The CSV must
    contain ``run`` and ``time`` columns; they become the returned
    DataFrame's two-level index.
    """
    csv_path = f'dataset/datahow_concise/{file}.csv'
    return pd.read_csv(csv_path).set_index(['run', 'time'])


def read_owu_v3(file, root_path = 'dataset/datahow_2022'):
    """Read an OWU table from ``{root_path}/{file}.csv``.

    Parameters
    ----------
    file : str
        File name without the ``.csv`` extension.
    root_path : str
        Folder containing the file; joined to the file name with ``/``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by its ``run`` and ``time`` columns.
    """
    csv_path = f'{root_path}/{file}.csv'
    return pd.read_csv(csv_path).set_index(['run', 'time'])

In [4]:
# Load both OWU tables from the interpolation folder; each comes back indexed by (run, time).
# NOTE(review): `owu_test` is not used by any cell visible in this notebook.
owu = read_owu_v3('owu', root_path='dataset/datahow_2022/interpolation/')
owu_test = read_owu_v3('owu_test', root_path='dataset/datahow_2022/interpolation/')

### Visualize the OWU matrix

In [5]:
# Preview the first 10 rows of the (run, time)-indexed OWU table.
owu.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,timesteps,X:VCD,X:Glc,X:Lac,X:Titer,W:Feed
run,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,0.0,0.55,45.0,0.0,0.0,0.0
0,1,1.0,1.725244,44.008188,1.489389,0.054632,0.0
0,2,2.0,4.779558,41.089619,5.872389,1.311539,12.5
0,3,3.0,10.278397,40.271478,16.487303,15.870869,12.5
0,4,4.0,15.886799,40.083603,35.542541,85.824621,12.5
0,5,5.0,18.568089,35.783073,60.77645,245.221131,12.5
0,6,6.0,18.099141,30.505201,87.483607,468.616199,12.5
0,7,7.0,15.811653,26.66724,112.033694,706.666289,12.5
0,8,8.0,12.920362,25.389475,132.740373,924.761704,12.5
0,9,9.0,10.123559,26.873536,149.296217,1108.215155,12.5


### Plot correlation matrix

In [6]:
def plot_owu_correlation(owu):
    """Show a heatmap of the pairwise correlations between OWU variables.

    The ``timesteps`` column is excluded; correlations are rounded to three
    decimals and printed inside each cell.
    """
    correlations = owu.drop(columns=["timesteps"]).corr().round(3)
    heatmap = px.imshow(correlations, text_auto=True)
    heatmap.update_layout(
        title="OWU Correlation Matrix among X variables",
        width=1000,
    )
    heatmap.show()

In [7]:
# Correlation heatmap of the OWU variables (the function drops `timesteps`).
plot_owu_correlation(owu)

### Unnormalized PCA

PCA is run on the OWU matrix, but without any variable normalization.

In [8]:
def plot_explained_variance_owu_raw(owu):
    """Plot explained variance of a PCA fitted on the unscaled OWU variables.

    The ``timesteps`` column is dropped, a full PCA is fitted on the remaining
    columns as-is, and the figure shows the per-component explained variance
    ratio as bars with the cumulative ratio as a line.
    """
    variables = owu.drop(columns=["timesteps"])
    pca_model = PCA().fit(variables)

    ratios = pca_model.explained_variance_ratio_
    component_ids = list(range(1, pca_model.n_components_ + 1))

    axis_labels = dict(
        x="Principal component index", y="Explained Variance Ratio", color="Legend"
    )
    figure = px.line(
        x=component_ids,
        y=np.cumsum(ratios),
        color=px.Constant("Cumulative explained variance"),
        labels=axis_labels,
    )
    figure.add_bar(x=component_ids, y=ratios, name="Individual explained variance")
    figure.update_layout(width=1000)
    figure.show()

In [9]:
# Explained variance of PCA on the raw (unscaled) OWU variables.
plot_explained_variance_owu_raw(owu)

### Normalized PCA

- PCA is run on the OWU matrix again, but this time each variable is first standardized: its mean is subtracted and the result is divided by its standard deviation.
- This puts every variable on the same scale, so variables with large numeric ranges no longer dominate the principal components.


In [10]:
def plot_explained_variance_owu(owu):
    """Plot explained variance of a PCA fitted on the standardized OWU variables.

    The ``timesteps`` column is dropped, the remaining columns are scaled with
    ``StandardScaler``, and a full PCA is fitted. The figure shows the
    per-component explained variance ratio as bars and the cumulative ratio as
    a line.
    """
    variables = owu.drop(columns=["timesteps"])
    standardized = StandardScaler().fit_transform(variables)
    pca_model = PCA().fit(standardized)

    ratios = pca_model.explained_variance_ratio_
    component_ids = list(range(1, pca_model.n_components_ + 1))

    axis_labels = dict(
        x="Principal component index", y="Explained Variance Ratio", color="Legend"
    )
    figure = px.line(
        x=component_ids,
        y=np.cumsum(ratios),
        color=px.Constant("Cumulative explained variance"),
        labels=axis_labels,
    )
    figure.add_bar(x=component_ids, y=ratios, name="Individual explained variance")
    figure.update_layout(width=1000, title='OWU PCA Explained Variance')
    figure.show()

In [11]:
# Explained variance of PCA on the standardized OWU variables.
plot_explained_variance_owu(owu)

### Plot scores and loadings

In [12]:
def plot_scores_loadings_owu(
    owu,
    pc_x_axis=1,
    pc_y_axis=2,
    highlight_run=0,
    highlight_type="Run_id",
    width=1200,
    height=900,
):
    """Plot PCA scores and loadings of the standardized OWU matrix.

    Two figures are shown: (1) a scores scatter stacked above a loadings
    vector plot for the chosen pair of components, and (2) side-by-side bar
    charts of the loadings of each chosen component.

    Parameters
    ----------
    owu : pandas.DataFrame
        OWU table indexed by levels named ``run`` and ``time`` and containing
        a ``timesteps`` column, which is dropped before the analysis.
    pc_x_axis, pc_y_axis : int
        1-based indices of the principal components on the x and y axes.
    highlight_run : scalar
        Run id whose observations are over-plotted in black and connected by
        a line.
    highlight_type : str
        Marker colouring: ``"Run_id"`` (run index), ``"Time_id"`` (time
        index), ``"Titer_14"`` (each observation gets its run's ``X:Titer``
        value at time 14). Any other value leaves markers uncoloured.
    width, height : int
        Size of the first figure in pixels; the second uses half the height.

    Raises
    ------
    IndexError
        If ``pc_x_axis`` or ``pc_y_axis`` is not between 1 and the number of
        fitted components. Previously 0 silently selected the last component.
    """
    owu = owu.drop(columns=["timesteps"])
    owu_normalized = StandardScaler().fit_transform(owu)

    # One fit_transform both fits the PCA and projects the observations.
    owupca = PCA()
    owu_components = owupca.fit_transform(owu_normalized)

    n_components = owupca.n_components_
    for axis_name, pc in (("pc_x_axis", pc_x_axis), ("pc_y_axis", pc_y_axis)):
        if not 1 <= pc <= n_components:
            raise IndexError(
                f"{axis_name}={pc} is out of range; valid components are 1..{n_components}"
            )
    x_ix, y_ix = pc_x_axis - 1, pc_y_axis - 1

    # Loadings = eigenvectors scaled by the standard deviation of each component.
    owu_loadings = owupca.components_.T * np.sqrt(owupca.explained_variance_)
    owu_features = list(owu.columns)
    owu_explained = owupca.explained_variance_ratio_

    # explained_variance_ratio_ is a fraction in [0, 1]; convert it to a real
    # percentage before appending the "%" sign.
    x_title = f"Principal Component - {pc_x_axis} ({100 * owu_explained[x_ix]:.1f}%)"
    y_title = f"Principal Component - {pc_y_axis} ({100 * owu_explained[y_ix]:.1f}%)"

    run_ids = owu.index.get_level_values("run")
    time_ids = owu.index.get_level_values("time")

    # Create 2 rows and 1 cols subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["OWU PCA scores plot", "OWU PCA loadings plot"],
        vertical_spacing=0.08,
    )

    # Score plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=1,
        col=1,
    )
    text_data = [f"Run {run}, Time {time}" for run, time in zip(run_ids, time_ids)]

    # Score plots for highlight type
    if highlight_type == "Run_id":
        color_idx = run_ids
    elif highlight_type == "Time_id":
        color_idx = time_ids
    elif highlight_type == "Titer_14":
        # Look up each run's titer at time 14 and broadcast it to all of that
        # run's observations by run id, instead of assuming 15 rows per run.
        titer_at_14 = owu["X:Titer"].xs(14, level="time")
        color_idx = titer_at_14.reindex(run_ids).to_numpy()
    # A DoE-column highlight (`highlight_type in doe.columns`) was sketched in
    # the original but `doe` is not defined in this notebook.
    else:
        color_idx = None
    fig.add_trace(
        go.Scatter(
            x=owu_components[:, x_ix],
            y=owu_components[:, y_ix],
            mode="markers",
            # Only draw a colour bar when there is something to colour by.
            marker=dict(size=10, color=color_idx, showscale=color_idx is not None),
            text=text_data,
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Score plots for highlight run
    highlight_run_ix = run_ids == highlight_run
    fig.add_trace(
        go.Scatter(
            x=owu_components[highlight_run_ix, x_ix],
            y=owu_components[highlight_run_ix, y_ix],
            mode="lines+markers",
            marker_size=10,
            marker_color="black",
            line_color="black",
            # Hover text must come from the highlighted rows only so that each
            # point is labelled with its own time value.
            text=[
                f"Run {highlight_run}, Time {time}"
                for time in time_ids[highlight_run_ix]
            ],
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Loading plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=2,
        col=1,
    )
    for i, feature in enumerate(owu_features):
        fig.add_shape(
            type="line",
            x0=0,
            y0=0,
            x1=owu_loadings[i, x_ix],
            y1=owu_loadings[i, y_ix],
            row=2,
            col=1,
        )
        fig.add_annotation(
            x=owu_loadings[i, x_ix],
            y=owu_loadings[i, y_ix],
            ax=0,
            ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
            row=2,
            col=1,
        )

    fig.update_layout(width=width, height=height, showlegend=False)
    # Both subplots share the same pair of components, so label all axes.
    fig.update_xaxes(title=x_title)
    fig.update_yaxes(title=y_title)
    fig.show()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "OWU Loadings of Principal Component - " + str(pc_x_axis),
            "OWU Loadings of Principal Component - " + str(pc_y_axis),
        ),
    )
    fig.add_bar(x=owu_features, y=owu_loadings[:, x_ix], row=1, col=1)
    fig.add_bar(x=owu_features, y=owu_loadings[:, y_ix], row=1, col=2)
    fig.update_layout(width=width, height=height / 2, showlegend=False)
    fig.show()

In [13]:
# Scores/loadings for PC1 vs PC2, markers coloured by the time index, run 0 traced in black.
plot_scores_loadings_owu(owu, pc_x_axis=1, pc_y_axis=2, highlight_run=0, highlight_type='Time_id')

### Mahalanobis distance of observations 
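- equal to the Euclidean distance in PCA space once every score is scaled to unit variance (whitened scores); scikit-learn's `mahalanobis()` reports the squared value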

In [14]:
def plot_mahalanobis_distance_owu(owu, highlight_run=0, width=1600):
    """Bar-plot the squared Mahalanobis distance of every OWU observation.

    The ``timesteps`` column is dropped, the remaining variables are
    standardized and projected onto all principal components, and the
    distance of each observation is computed in that score space. A dashed
    horizontal line marks the 95% chi-square cutoff.

    Parameters
    ----------
    owu : pandas.DataFrame
        OWU table indexed by levels named ``run`` and ``time`` and containing
        a ``timesteps`` column.
    highlight_run : scalar
        Run id whose bars are over-plotted in black.
    width : int
        Figure width in pixels.
    """
    owu = owu.drop(columns=["timesteps"])
    owu_normalized = StandardScaler().fit_transform(owu)

    # One fit_transform both fits the PCA and projects the observations.
    owu_components = PCA().fit_transform(owu_normalized)

    # EmpiricalCovariance.mahalanobis returns *squared* distances, which is
    # the quantity the chi-square quantile below applies to.
    owu_mahalanobis = (
        EmpiricalCovariance().fit(owu_components).mahalanobis(owu_components)
    )
    # Squared Mahalanobis distances of p-dimensional Gaussian data follow a
    # chi-square law with p degrees of freedom (p = number of components),
    # not p - 1 as used previously.
    n_dims = owu_components.shape[1]
    threshold = chi2.ppf(0.95, n_dims)

    run_ids = owu.index.get_level_values("run")

    # Plot mahalanobis distance
    fig = px.bar(
        x=list(range(len(owu_mahalanobis))),
        y=owu_mahalanobis,
        labels=dict(
            x="OWU Observation Id.", y="Squared Mahalanobis distance", color="Run_id"
        ),
        title="OWU Mahalanobis distance with 95% Confidence Interval",
        color=run_ids,
        color_continuous_scale=px.colors.cyclical.HSV,
    )
    # Same threshold styling as the BWU version of this plot.
    fig.add_hline(
        y=threshold,
        line_dash="dash",
        annotation_text="95% Threshold",
        annotation_position="top right",
    )

    # Over-plot the bars of the highlighted run in black.
    highlight_run_ix = run_ids == highlight_run
    fig.add_trace(
        go.Bar(
            x=list(np.flatnonzero(highlight_run_ix)),
            y=owu_mahalanobis[highlight_run_ix],
            marker_color="black",
        )
    )
    fig.update_layout(width=width, showlegend=False, coloraxis_showscale=True)
    fig.update_layout(barmode="overlay")
    fig.show()

In [15]:
# Mahalanobis distances of all OWU observations, with run 10 highlighted in black.
plot_mahalanobis_distance_owu(owu, highlight_run=10, width=1400)

# PCA of BWU matrix

In [16]:
def generate_bwu(owu):
    """Unfold a (run, time)-indexed OWU table into a one-row-per-run BWU table.

    Parameters
    ----------
    owu : pandas.DataFrame
        OWU table with a two-level index whose first level is named ``run``
        and whose second level is the time step, plus a ``timesteps`` column
        (dropped here). The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per run, in sorted run order, with a fresh 0..n-1 index (the
        original run ids are not kept). Columns are named
        ``"<variable>:<time>"``, grouped by variable.

    Raises
    ------
    ValueError
        If ``owu`` contains no runs.
    """
    owu = owu.drop(columns=["timesteps"])

    # Pivot the time level of every run into columns. Collect the pieces and
    # concatenate once: this works whatever the first run id is (the old
    # `run_ix == 0` check failed when run ids did not start at 0) and avoids
    # re-copying the growing frame on every iteration.
    run_rows = [run.unstack(level=1) for _, run in owu.groupby("run")]
    if not run_rows:
        raise ValueError("generate_bwu: the OWU table contains no runs")
    bwuindex = pd.concat(run_rows)

    # Flatten the (variable, time) column MultiIndex into "variable:time".
    bwu_columns = [f"{variable}:{time}" for variable, time in bwuindex.columns]
    bwu = pd.DataFrame(bwuindex.to_numpy(), columns=bwu_columns)

    return bwu

In [17]:
# Unfold the OWU table into one row per run (columns named "<variable>:<time>").
bwu = generate_bwu(owu)

### Visualize the BWU matrix

In [18]:
# Preview the first rows of the BWU table.
bwu.head()

Unnamed: 0,X:VCD:0,X:VCD:1,X:VCD:2,X:VCD:3,X:VCD:4,X:VCD:5,X:VCD:6,X:VCD:7,X:VCD:8,X:VCD:9,...,W:Feed:5,W:Feed:6,W:Feed:7,W:Feed:8,W:Feed:9,W:Feed:10,W:Feed:11,W:Feed:12,W:Feed:13,W:Feed:14
0,0.55,1.725244,4.779558,10.278397,15.886799,18.568089,18.099141,15.811653,12.920362,10.123559,...,12.5,12.5,12.5,12.5,12.5,0.0,0.0,0.0,0.0,0.0
1,0.27449,0.871876,2.589262,6.529319,12.160305,15.860577,15.657798,12.758719,8.910175,5.459967,...,18.622449,18.622449,18.622449,18.622449,18.622449,0.0,0.0,0.0,0.0,0.0
2,0.127551,0.394527,1.182078,3.245514,7.422644,12.957409,16.94004,17.855081,16.416292,13.883034,...,14.642857,14.642857,14.642857,14.642857,0.0,0.0,0.0,0.0,0.0,0.0
3,0.843878,2.609587,6.769304,12.917048,17.673164,18.962614,17.510225,14.7839,11.821098,9.128513,...,15.561224,15.561224,15.561224,15.561224,0.0,0.0,0.0,0.0,0.0,0.0
4,0.421429,1.320115,3.754135,8.637166,14.486637,18.036687,18.229667,16.021779,11.657944,7.802797,...,7.295918,7.295918,7.295918,7.295918,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
def plot_bwu_correlation(bwu, width=1000):
    """Show a heatmap of the pairwise correlations between BWU columns.

    Correlations are rounded to three decimals and printed inside each cell;
    ``width`` is the figure width in pixels.
    """
    correlations = bwu.corr().round(3)
    heatmap = px.imshow(correlations, text_auto=True)
    heatmap.update_layout(
        title="BWU Correlation Matrix among X variables",
        width=width,
    )
    heatmap.show()

### Plot correlation matrix

In [20]:
# Correlation heatmap of all BWU columns.
plot_bwu_correlation(bwu, width=900)

### Normalized PCA


In [21]:
def plot_explained_variance_bwu(bwu, pca_n_components=15, width=1000):
    """Plot explained variance of a PCA fitted on the standardized BWU matrix.

    Parameters
    ----------
    bwu : pandas.DataFrame
        BWU table; every column is standardized with ``StandardScaler``.
    pca_n_components : int
        Upper bound on the number of components; the PCA uses the smaller of
        this value and the smallest dimension of ``bwu``.
    width : int
        Figure width in pixels.
    """
    standardized = StandardScaler().fit_transform(bwu)

    # PCA cannot extract more components than min(n_rows, n_columns).
    n_keep = min(min(bwu.shape), pca_n_components)
    pca_model = PCA(n_components=n_keep).fit(standardized)

    ratios = pca_model.explained_variance_ratio_
    component_ids = list(range(1, pca_model.n_components_ + 1))

    axis_labels = dict(
        x="Principal component index", y="Explained Variance Ratio", color="Legend"
    )
    figure = px.line(
        x=component_ids,
        y=np.cumsum(ratios),
        color=px.Constant("Cumulative explained variance"),
        labels=axis_labels,
    )
    figure.add_bar(x=component_ids, y=ratios, name="Individual explained variance")
    figure.update_layout(width=width, title='BWU PCA Explained Variance')
    figure.show()


In [22]:
# Explained variance of PCA on the standardized BWU matrix, capped at 15 components.
plot_explained_variance_bwu(bwu, pca_n_components=15, width=1000)

### Plot scores and loadings

In [23]:
def plot_scores_loadings_bwu(
    bwu, pc_x_axis=1, pc_y_axis=2, highlight_type="Run_id", pca_n_components=15, 
    width=1200, height=900,
):
    """Plot PCA scores and loadings of the standardized BWU matrix.

    Two figures are shown: (1) a scores scatter stacked above a loadings
    vector plot for the chosen pair of components, and (2) side-by-side bar
    charts of the loadings of each chosen component.

    Parameters
    ----------
    bwu : pandas.DataFrame
        BWU table with one row per run; the row index is used as the run id.
    pc_x_axis, pc_y_axis : int
        1-based indices of the principal components on the x and y axes.
    highlight_type : str
        Marker colouring: ``"Run_id"`` (row index) or ``"Titer_14"`` (the
        ``X:Titer:14`` column). Any other value leaves markers uncoloured.
    pca_n_components : int
        Upper bound on the number of components; the PCA uses the smaller of
        this value and the smallest dimension of ``bwu``.
    width, height : int
        Size of the first figure in pixels; the second uses half the height.

    Raises
    ------
    IndexError
        If ``pc_x_axis`` or ``pc_y_axis`` is not between 1 and the number of
        fitted components. Previously 0 silently selected the last component.
    """
    bwu_normalized = StandardScaler().fit_transform(bwu)

    # One fit_transform both fits the PCA and projects the runs.
    bwupca = PCA(n_components=min(min(bwu.shape), pca_n_components))
    bwu_components = bwupca.fit_transform(bwu_normalized)

    n_components = bwupca.n_components_
    for axis_name, pc in (("pc_x_axis", pc_x_axis), ("pc_y_axis", pc_y_axis)):
        if not 1 <= pc <= n_components:
            raise IndexError(
                f"{axis_name}={pc} is out of range; valid components are 1..{n_components}"
            )
    x_ix, y_ix = pc_x_axis - 1, pc_y_axis - 1

    # Loadings = eigenvectors scaled by the standard deviation of each component.
    bwu_loadings = bwupca.components_.T * np.sqrt(bwupca.explained_variance_)
    bwu_features = list(bwu.columns)
    bwu_explained = bwupca.explained_variance_ratio_

    # explained_variance_ratio_ is a fraction in [0, 1]; convert it to a real
    # percentage before appending the "%" sign.
    x_title = f"Principal Component - {pc_x_axis} ({100 * bwu_explained[x_ix]:.1f}%)"
    y_title = f"Principal Component - {pc_y_axis} ({100 * bwu_explained[y_ix]:.1f}%)"

    run_ids = bwu.index.get_level_values(0)

    # Create 2 rows and 1 cols subplots
    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=["BWU PCA scores plot", "BWU PCA loadings plot"],
        vertical_spacing=0.08,
    )

    # Score plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=1,
        col=1,
    )
    text_data = [f"Run {run}" for run in run_ids]

    # Score plots for highlight type
    if highlight_type == "Run_id":
        color_idx = np.array(run_ids)
    elif highlight_type == "Titer_14":
        color_idx = np.array(bwu["X:Titer:14"])
    # A DoE-column highlight (`highlight_type in doe.columns`) was sketched in
    # the original but `doe` is not defined in this notebook.
    else:
        color_idx = None
    fig.add_trace(
        go.Scatter(
            x=bwu_components[:, x_ix],
            y=bwu_components[:, y_ix],
            mode="markers",
            # Only draw a colour bar when there is something to colour by.
            marker=dict(size=10, color=color_idx, showscale=color_idx is not None),
            text=text_data,
            textposition="top center",
        ),
        row=1,
        col=1,
    )

    # Loading plots
    fig.add_trace(
        go.Scatter(
            x=[0, 0], y=[0, 0], mode="markers", marker_color="black", name="origin"
        ),
        row=2,
        col=1,
    )
    for i, feature in enumerate(bwu_features):
        fig.add_shape(
            type="line",
            x0=0,
            y0=0,
            x1=bwu_loadings[i, x_ix],
            y1=bwu_loadings[i, y_ix],
            row=2,
            col=1,
        )
        fig.add_annotation(
            x=bwu_loadings[i, x_ix],
            y=bwu_loadings[i, y_ix],
            ax=0,
            ay=0,
            xanchor="center",
            yanchor="bottom",
            text=feature,
            row=2,
            col=1,
        )

    fig.update_layout(width=width, height=height, showlegend=False)
    # Both subplots share the same pair of components, so label all axes.
    fig.update_xaxes(title=x_title)
    fig.update_yaxes(title=y_title)
    fig.show()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=(
            "BWU Loadings of Principal Component - " + str(pc_x_axis),
            "BWU Loadings of Principal Component - " + str(pc_y_axis),
        ),
    )
    fig.add_bar(x=bwu_features, y=bwu_loadings[:, x_ix], row=1, col=1)
    fig.add_bar(x=bwu_features, y=bwu_loadings[:, y_ix], row=1, col=2)
    fig.update_layout(width=width, height=height/2, showlegend=False)
    fig.show()

In [24]:
# Scores/loadings for PC1 vs PC2 of the BWU matrix, markers coloured by the row (run) index.
plot_scores_loadings_bwu(bwu, pc_x_axis=1, pc_y_axis=2, pca_n_components=15,  highlight_type='Run_id')

### Mahalanobis distance of observations 
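- equal to the Euclidean distance in PCA space once every score is scaled to unit variance (whitened scores); scikit-learn's `mahalanobis()` reports the squared value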

In [25]:
def plot_mahalanobis_distance_bwu(bwu, pca_n_components=15, highlight_run=0, width=1600):
    """Bar-plot the squared Mahalanobis distance of every BWU row (run).

    The BWU columns are standardized and projected onto at most
    ``pca_n_components`` principal components; the distance of each run is
    computed in that score space. A dashed horizontal line marks the 95%
    chi-square cutoff.

    Parameters
    ----------
    bwu : pandas.DataFrame
        BWU table with one row per run; the row index is used as the run id.
    pca_n_components : int
        Upper bound on the number of components; the PCA uses the smaller of
        this value and the smallest dimension of ``bwu``.
    highlight_run : scalar
        Index value of the row whose bar is over-plotted in black.
    width : int
        Figure width in pixels.
    """
    bwu_normalized = StandardScaler().fit_transform(bwu)

    # One fit_transform both fits the PCA and projects the runs.
    bwu_pca = PCA(n_components=min(min(bwu.shape), pca_n_components))
    bwu_components = bwu_pca.fit_transform(bwu_normalized)

    # EmpiricalCovariance.mahalanobis returns *squared* distances, which is
    # the quantity the chi-square quantile below applies to.
    bwu_mahalanobis = EmpiricalCovariance().fit(bwu_components).mahalanobis(bwu_components)
    # Squared Mahalanobis distances of p-dimensional Gaussian data follow a
    # chi-square law with p degrees of freedom (p = number of retained
    # components), not p - 1 as used previously.
    n_dims = bwu_components.shape[1]
    threshold = chi2.ppf(0.95, n_dims)

    # Plot Mahalanobis distance
    fig = px.bar(
        x=list(range(len(bwu_mahalanobis))),
        y=bwu_mahalanobis,
        labels=dict(
            x="BWU Observation Id.", y="Squared Mahalanobis distance", color="Run_id"
        ),
        title="BWU Mahalanobis distance with 95% Confidence Interval",
        color=bwu.index,
        color_continuous_scale=px.colors.cyclical.HSV,
    )
    fig.add_hline(
        y=threshold,
        line_dash="dash",
        annotation_text="95% Threshold",
        annotation_position="top right",
    )

    # Over-plot the bar of the highlighted run in black.
    highlight_run_ix = np.asarray(bwu.index == highlight_run)
    fig.add_trace(
        go.Bar(
            x=list(np.flatnonzero(highlight_run_ix)),
            y=bwu_mahalanobis[highlight_run_ix],
            marker_color="black",
        )
    )
    fig.update_layout(width=width, showlegend=False, coloraxis_showscale=True)
    fig.update_layout(barmode="overlay")
    fig.show()

In [26]:
# Mahalanobis distances of all BWU rows, with the row whose index is 99 highlighted in black.
plot_mahalanobis_distance_bwu(bwu, highlight_run=99, width=1600)