# Training autoencoders on cytokine data
To run this notebook, you need:
 - Tensorflow Python package (tested with version 2.0)
 - Preprocessed cytokine time series data in the ``data/processed/`` folder. The list of necessary files can be found a few cells below (testing and training on separate experiments). 


## Motivation
The goal is to see if we can recover the latent space from an unsupervised approach, and to show that the supervised approach with cytokine integrals rather than concentrations is probably slightly better. To achieve this, we use a simple two-layer autoencoder with a bottleneck. We use the Keras module in Tensorflow. 

## Import packages, define useful classes

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import psutil, pickle, json
import os, sys
main_dir_path = os.path.abspath('../')
sys.path.insert(0, main_dir_path)

# Processing: min-max scaling of the data
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.neural_network import MLPClassifier

# Neural networks: keras for autoencoder
from tensorflow import keras as ks
import tensorflow as tf
from utils.autoencoder_models import Autoencoder, Autoencoder_regul, save_autoencoder, load_autoencoder
import random as python_random

# Custom scripts
import utils.custom_pandas as cpd
from utils.mi_time_window import compute_mi_timecourse
from utils.discrete_continuous_info import discrete_continuous_info_fast
from ltspcyt.scripts.neural_network import import_WT_output

In [None]:
# Render matplotlib figures inline, below the cell that creates them
%matplotlib inline

In [None]:
# Plot parameters for Science: compact figures with small fonts,
# all set in a single update of matplotlib's rcParams
plt.rcParams.update({
    "figure.figsize": (2.5, 2.25),
    "axes.labelsize": 8.,
    "legend.fontsize": 8.,
    "axes.labelpad": 0.5,
    "xtick.labelsize": 7.,
    "ytick.labelsize": 7.,
    "legend.title_fontsize": 8.,
    "axes.titlesize": 8.,
    "font.size": 8.,
    # For larger display of small graphs in the notebook
    "figure.dpi": 120,
})

In [None]:
# One color per kind of latent space, reused by the comparison figures below.
# The "hls" palette is generated once and indexed for the two unsupervised models.
hls_colors = sns.color_palette("hls", 3)
different_ls_colors = {
    # already set by main text figures: choose between ["goldenrod", "maroon"]
    "Ag. classifier": mpl.colors.to_rgba("maroon"),
    "Autoencoder": hls_colors[1],
    "PCA": hls_colors[2],
}
# Display the chosen colors
sns.palplot(different_ls_colors.values())

In [None]:
# CPU count for multiprocessing with the right number of jobs
# (logical=False asks psutil for the number of physical cores only).
# NOTE(review): cpu_count is not used in the cells visible here -- confirm it is still needed.
cpu_count = psutil.cpu_count(logical=False)

## Autoencoder class
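The autoencoder classes (`Autoencoder`, `Autoencoder_regul`) are built with the Keras model API. They are defined in the module `utils.autoencoder_models` and imported in the first code cell above.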

## Import data, select training and test sets

In [None]:
# Datasets of interest: the main OT-1 experiments
main_ot1_datasets = [
    'Activation_2',
    'Activation_TCellNumber_1',
    'Activation_Timeseries_1',
    'CD25MutantTimeSeries_OT1_Timeseries_2',
    'HighMI_1-1', 'HighMI_1-2', 'HighMI_1-3', 'HighMI_1-4',
    'NewPeptideComparison_OT1_Timeseries_20',
    'PeptideComparison_OT1_Timeseries_18',
    'PeptideComparison_OT1_Timeseries_19',
    'PeptideComparison_OT1_Timeseries_20',
    'PeptideComparison_OT1_Timeseries_21',
    'PeptideComparison_OT1_Timeseries_22',
    'PeptideComparison_OT1_Timeseries_23',
    'PeptideTumorComparison_OT1_Timeseries_1',
    'TCellNumber_2',
    'TCellNumber_OT1_Timeseries_7',
]

# Import the processed data, then keep only the rows (concentrations and
# datasets) and the columns (cytokines) of interest, and sort the index.
df = import_WT_output(folder=os.path.join(main_dir_path, "data", "processed"))
rows_kept = (
    df.index.isin(["1uM", "100nM", "10nM", "1nM"], level="Concentration")
    & df.index.isin(main_ot1_datasets, level="Data")
)
cols_kept = df.columns.isin(["IFNg", "IL-17A", "IL-2", "IL-6", "TNFa"], level="Cytokine")
df = df.loc[rows_kept, cols_kept].sort_index()

In [None]:
# Peptides used for training; the test data is restricted to the same peptides
training_peps = ["N4", "Q4", "T4", "V4", "G4", "E1"]

# Experiments used for training
train_sets = [
    'PeptideComparison_OT1_Timeseries_18',
    'PeptideComparison_OT1_Timeseries_19',
    'PeptideComparison_OT1_Timeseries_20',
    'PeptideTumorComparison_OT1_Timeseries_1',
    'TCellNumber_OT1_Timeseries_7',
    'Activation_Timeseries_1',
]

# Separate experiments kept for testing
test_sets = [
    'Activation_2',
    'Activation_TCellNumber_1',
    'CD25MutantTimeSeries_OT1_Timeseries_2',
    'PeptideComparison_OT1_Timeseries_21',
    'PeptideComparison_OT1_Timeseries_22',
    'PeptideComparison_OT1_Timeseries_23',
    'HighMI_1-1', 'HighMI_1-2',
    'HighMI_1-3', 'HighMI_1-4',
]

# Same three steps for each split: keep its experiments, keep only the
# "integral" feature, then keep only the training peptides.
df_train = (
    df.loc[df.index.isin(train_sets, level="Data")]
    .xs("integral", level="Feature", axis=1)
    .loc[lambda d: d.index.isin(training_peps, level="Peptide")]
)
df_test = (
    df.loc[df.index.isin(test_sets, level="Data")]
    .xs("integral", level="Feature", axis=1)
    .loc[lambda d: d.index.isin(training_peps, level="Peptide")]
)

In [None]:
# Min-max scaling with the bounds of the training data only;
# the test data is scaled with those same bounds.
df_min = df_train.min(axis=0)
df_max = df_train.max(axis=0)
df_train_norm = df_train.sub(df_min, axis="columns").div(df_max - df_min, axis="columns")
df_test_norm = df_test.sub(df_min, axis="columns").div(df_max - df_min, axis="columns")

## Train an autoencoder
Optimal results can apparently be obtained by using only the time interval 20 - 70 hours. 
But let me first try with the full time courses. 

Apply regularization to the first layer, to try to force the autoencoder out of the linear, PCA regime. 

In [None]:
def plot_training_history(hist, df_train, df_test):
    """Plot the loss and the R^2 score per epoch of a training run.

    The R^2 score is recovered from the logged loss, assuming that loss is a
    mean squared error averaged over every entry of the data: the residual
    sum of squares is then ``loss * size`` and ``R^2 = 1 - RSS / TSS``, where
    TSS is the total sum of squares of the data around its column means.
    NOTE(review): if the model adds other terms to its loss (e.g.
    regularization penalties), they are counted as reconstruction error
    here -- confirm.

    Args:
        hist: object with a ``history`` dict holding the per-epoch ``'loss'``
            and ``'val_loss'`` values (e.g. what a Keras ``fit`` returns).
        df_train (pd.DataFrame): data on which the training loss was computed.
        df_test (pd.DataFrame): data on which the validation loss was computed.

    Returns:
        fig (matplotlib figure): figure with two panels.
        axes (np.ndarray): the two axes, loss on the left and R^2 on the right.
        r2_history (np.ndarray): training R^2 at each epoch.
        val_r2_history (np.ndarray): validation R^2 at each epoch.
    """
    def _r2_from_mse(mse_per_epoch, df_ref):
        # Convert a per-entry MSE into R^2 with respect to the data in df_ref
        total_ss = ((df_ref - df_ref.mean(axis=0))**2).sum().sum()
        return 1.0 - np.asarray(mse_per_epoch) * df_ref.size / total_ss

    # Calculate R2
    r2_history = _r2_from_mse(hist.history['loss'], df_train)
    # The validation loss is an average over the entries of df_test, so it is
    # scaled by df_test.size (not df_train.size) to get its sum of squares.
    val_r2_history = _r2_from_mse(hist.history['val_loss'], df_test)

    # Remember that the MSE is with respect to quantities scaled 0-1, so 0.01 is 1% error. 
    fig, axes = plt.subplots(1, 2)
    fig.set_size_inches(3.25, 2.25)
    axes = axes.flatten()
    train_line_args = dict(label="Training", color="xkcd:navy", lw=2.5, ls="-")
    test_line_args = dict(label="Validation", color="xkcd:aqua", ls="--", lw=2.5)
    epochs = np.arange(1, len(r2_history)+1)
    axes[0].plot(epochs, hist.history['loss'], **train_line_args)
    axes[0].plot(epochs, hist.history['val_loss'], **test_line_args)
    axes[1].plot(epochs, r2_history,  **train_line_args)
    axes[1].plot(epochs, val_r2_history, **test_line_args)
    
    # Annotate the best of the two final R2 values, just below the lower curve
    max_r2 = max(r2_history[-1], val_r2_history[-1])
    min_r2 = min(r2_history[-1], val_r2_history[-1])
    axes[1].annotate("{:.2f}".format(max_r2), xy=(epochs[-1], min_r2*0.95), ha="right", va="top", size=7)
    
    axes[0].set_ylabel('Loss (MSE)')
    axes[0].set_xlabel('No. epoch')
    axes[0].legend(loc="upper right", fontsize=7)
    axes[1].legend(loc="lower right", fontsize=7)
    axes[1].set(ylabel=r"$R^2$ score", xlabel="No. epoch")
    axes[0].set_yscale("log")
    # Add minor ticks. minor_thresholds=(subset, all): subset is number of axis range decades below
    # which only a subset of ticks is shown, all the threshold below which all minor ticks are labeled, 
    # and if decades > subset, no minor ticks are shown. 
    axes[0].yaxis.set_minor_formatter(mpl.ticker.LogFormatterSciNotation(minor_thresholds=(2, 1)))
    axes[0].tick_params(which="minor", axis="y", labelsize=6)
    # Reference line at the perfect score
    axes[1].axhline(1.0, ls=":", color="grey")
    fig.tight_layout()
    return fig, axes, r2_history, val_r2_history

In [None]:
# Seeding Python, numpy, tensorflow all necessary for reproducibility. 
# NOTE(review): hash randomization is fixed when the interpreter starts, so setting
# PYTHONHASHSEED from a running kernel presumably only affects processes started
# afterwards -- confirm whether it should be set before launching Jupyter.
%env PYTHONHASHSEED 0
np.random.seed(3245134)
python_random.seed(345345)

# Global Tensorflow seed.
# NOTE(review): an earlier version of this comment cited seed 2413598 as giving a
# similar latent space orientation, but the value set below is 2143598 -- confirm
# which one is intended.
tf_seed_regul = 2143598
tf.random.set_seed(tf_seed_regul)

# Creating the regularized autoencoder
# (presumably 5 input features and a 2-dimensional bottleneck -- TODO confirm
# against the signature of Autoencoder_regul)
autoenc_regul = Autoencoder_regul(5, 2)
# Nope: with regularization the latent space is a lot worse, everything on top of each other. 
autoenc_regul.compile(optimizer='adam', loss=ks.losses.MeanSquaredError())  #  gives poor results. 
# Inputs and targets are the same data (autoencoder). The training arrays are cast
# to float32; the validation arrays are passed as they are.
history_regul = autoenc_regul.fit(df_train_norm.values.astype(np.float32), df_train_norm.values.astype(np.float32),                 
            epochs=30,
            shuffle=True,
            validation_data=(df_test_norm.values, df_test_norm.values))

In [None]:
# Training history of the regularized autoencoder (loss and R^2 per epoch)
fig, axes, _, _ = plot_training_history(history_regul, df_train_norm, df_test_norm)
axes[1].set(ylim=(0, 1.1))
fig.set_size_inches(3.25, 2.)
# fig.savefig("figures/autoencoder_training_history_integrals_regul.pdf", transparent=True)
plt.show()
plt.close()

In [None]:
# Training data: encode into the two latent variables, then decode back
vals_train_encoded_reg = autoenc_regul.encoder(df_train_norm.to_numpy(dtype=np.float32)).numpy()
df_train_encoded_reg = pd.DataFrame(
    vals_train_encoded_reg,
    index=df_train_norm.index,
    columns=pd.Index(["LS1", "LS2"], name="Latent variable"),
    dtype=np.float32,
)
vals_train_recon_reg = autoenc_regul.decoder(df_train_encoded_reg.to_numpy(dtype=np.float32)).numpy()
df_train_recon_reg = pd.DataFrame(
    vals_train_recon_reg, index=df_train_norm.index, columns=df_train_norm.columns
)

# Test data: same two steps with the same trained model
vals_test_encoded_reg = autoenc_regul.encoder(df_test_norm.to_numpy(dtype=np.float32)).numpy()
df_test_encoded_reg = pd.DataFrame(
    vals_test_encoded_reg,
    index=df_test_norm.index,
    columns=pd.Index(["LS1", "LS2"], name="Latent variable"),
    dtype=np.float32,
)
vals_test_recon_reg = autoenc_regul.decoder(df_test_encoded_reg.to_numpy(dtype=np.float32)).numpy()
df_test_recon_reg = pd.DataFrame(
    vals_test_recon_reg, index=df_test_norm.index, columns=df_test_norm.columns
)

In [None]:
# Latent space trajectories of the training data, one panel per dataset.
# Rows are ordered by time and seaborn is told not to re-sort them (sort=False).
g = sns.relplot(
    data=df_train_encoded_reg.sort_index(level="Time").reset_index(),
    x="LS1", y="LS2", kind="line", sort=False,
    hue="Peptide", size="Concentration", style='TCellNumber', col="Data",
)
# g.fig.savefig("figures/regularized_autoencoder_latent_space_traindata.pdf", transparent=True)

### Visualize reconstructions
To see how well this autoencoder fares compared to the decoders we have trained for the paper. As it turns out, this autoencoder has basically the same reconstruction artifacts that we would get with simple reconstruction by linear regression. It does not beat that linear baseline. 

In [None]:
# Compare the data ("Real") to its reconstruction by the autoencoder ("Recon"):
# time courses of each cytokine (columns) in each dataset (rows), for 100k T cells.
df_train_both = pd.concat(
    {"Real": df_train_norm, "Recon": df_train_recon_reg},
    names=["Source"], axis=0,
)
sns.relplot(
    data=(
        df_train_both.stack()
        .xs("100k", level="TCellNumber")
        .sort_index(level="Time")
        .to_frame()   # the stacked values end up in a column named 0
        .reset_index()
    ),
    x="Time", y=0, kind="line",
    hue="Peptide", size="Concentration", style='Source',
    row="Data", col="Cytokine",
)

## New antigens
Where do they go with each autoencoder above? Do they line up properly?

In [None]:
new_peps = ["A2", "Y3"]

# Integrals of all peptides, scaled with the bounds of the training data
df_allpeps_norm = (df.xs("integral", level="Feature", axis=1) - df_min) / (df_max - df_min)
# There are a couple of peptides we don't use in general
df_allpeps_norm = df_allpeps_norm.drop(["Q7", "A8"], level="Peptide", axis=0)

# Label each row "Train" if its peptide was used for training, "New" otherwise,
# and store that label as an extra index level
is_training_pep = df_allpeps_norm.index.isin(training_peps, level="Peptide")
df_allpeps_norm["Peptide set"] = np.where(is_training_pep, "Train", "New")
df_allpeps_norm = df_allpeps_norm.set_index(["Peptide set"], append=True)

In [None]:
# Quick check of the size of the normalized table: (number of rows, number of columns)
df_allpeps_norm.shape

In [None]:
# All peptides (training and new ones) through the regularized autoencoder:
# encode into the two latent variables, then decode back
vals_allpeps_encoded_reg = autoenc_regul.encoder(df_allpeps_norm.to_numpy(dtype=np.float32)).numpy()
df_allpeps_encoded_reg = pd.DataFrame(
    vals_allpeps_encoded_reg,
    index=df_allpeps_norm.index,
    columns=pd.Index(["LS1", "LS2"], name="Latent variable"),
    dtype=np.float32,
)

vals_allpeps_recon_reg = autoenc_regul.decoder(df_allpeps_encoded_reg.to_numpy(dtype=np.float32)).numpy()
df_allpeps_recon_reg = pd.DataFrame(
    vals_allpeps_recon_reg, index=df_allpeps_norm.index, columns=df_allpeps_norm.columns
)

In [None]:
def plot_latent_space(df_ls, allpeps, x="LS1", y="LS2", 
                      dset_choice="PeptideComparison_OT1_Timeseries_20", tcn_choice="100k"):
    """Plot latent space trajectories of one dataset, plus G4 and E1 from all datasets.

    Args:
        df_ls (pd.DataFrame): latent space coordinates. The index must have
            the levels "Data", "Peptide", "Concentration", "TCellNumber",
            "Time" and "Peptide set"; the columns must include x and y.
        allpeps (list of str): peptides to assign a color to; colors are taken
            in order from the current seaborn color cycle.
        x, y (str): columns plotted on the horizontal and vertical axes.
        dset_choice (str): dataset (label in index level "Data") to plot.
        tcn_choice (str): T cell number (label in level "TCellNumber") to keep.

    Returns:
        g (seaborn.FacetGrid): the grid created by seaborn.relplot.
    """
    # Cycles through the color palette if there are more peptides than colors
    palette = dict(zip(allpeps, sns.color_palette(n_colors=len(allpeps))))
    allpeps_order = ["N4", "A2", "Y3", "Q4", "T4", "V4", "G4", "E1"]
    size_order = ["1uM", "100nM", "10nM", "1nM"]
    
    # Choose one training dataset, the one with most trajectories ideally
    df_chosen = df_ls.xs(dset_choice, level="Data", drop_level=False)
    # Add E1, G4 from all datasets
    # (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    df_weak = df_ls.loc[df_ls.index.isin(["G4", "E1"], level="Peptide")]
    df_plot = pd.concat([df_chosen, df_weak]).droplevel("Data")

    # Rows are ordered by time and seaborn is told not to re-sort them
    g = sns.relplot(data=(df_plot.sort_index(level="Time").xs(tcn_choice, level="TCellNumber").reset_index()), 
                x=x, y=y, sort=False, kind="line", height=1.75, legend=True,
                hue="Peptide", hue_order=allpeps_order, palette=palette,
                size="Concentration", size_order=size_order,
                style='Peptide set', style_order=["Train", "New"])
    
    return g

In [None]:
# Latent space of the regularized autoencoder for one dataset,
# with a special line style for the new antigens
chosen_dset = "PeptideComparison_OT1_Timeseries_20"
g = plot_latent_space(df_allpeps_encoded_reg, training_peps+new_peps, 
                      dset_choice=chosen_dset, tcn_choice="100k")
ax_ls = g.fig.axes[0]
ax_ls.set_xticks([])
ax_ls.set_yticks([])
ax_ls.set_xlabel(r"LS${}_1$ (a.u.)", labelpad=3.)
ax_ls.set_ylabel(r"LS${}_2$ (a.u.)", labelpad=3.)
ax_ls.set_aspect("equal")
# Replace the seaborn legend by a more compact one, placed to the right of the axes
g.legend.set_visible(False)
handles, labels = ax_ls.get_legend_handles_labels()
# Resize this grid's own figure (not `fig`, which is left over from an earlier cell)
g.fig.set_size_inches(1.75*2, 1.75)
ax_ls.legend(handles, labels, fontsize=6, ncol=2, loc="upper left", bbox_to_anchor=(1, 1), 
             frameon=False, columnspacing=1.)
g.fig.tight_layout()
#g.fig.savefig("figures/autoencoder_regularized_latent_space_{}.pdf".format(chosen_dset), 
#              transparent=True, bbox_inches="tight")
plt.show()
plt.close()

# PCA latent space
This is another way to get a latent space. Does it look like the autoencoder space?

In [None]:
# PCA fitted on the normalized training data: two components for the latent
# space, and all five components to see the full variance spectrum
pcmodel = PCA(n_components=2).fit(df_train_norm)
pcmodel_full = PCA(n_components=5).fit(df_train_norm)

# Project the training, test and all-peptides data on the two components
df_train_pca = pd.DataFrame(
    pcmodel.transform(df_train_norm),
    index=df_train_norm.index,
    columns=pd.Index(["PC1", "PC2"], name="Principal component"),
)
df_test_pca = pd.DataFrame(
    pcmodel.transform(df_test_norm),
    index=df_test_norm.index,
    columns=pd.Index(["PC1", "PC2"], name="Principal component"),
)
df_allpeps_pca = pd.DataFrame(
    pcmodel.transform(df_allpeps_norm),
    index=df_allpeps_norm.index,
    columns=pd.Index(["PC1", "PC2"], name="Principal component"),
)

# Fraction of the variance explained by each component
print(pcmodel.explained_variance_ratio_)
print(pcmodel_full.explained_variance_ratio_)

In [None]:
# PCA latent space for one dataset, with a special line style for the new antigens
g = plot_latent_space(
    df_allpeps_pca, training_peps+new_peps, x="PC1", y="PC2",
    dset_choice="PeptideComparison_OT1_Timeseries_20", tcn_choice="100k",
)
ax_pca = g.fig.axes[0]
# Arbitrary units: hide the ticks
ax_pca.set(xticks=[], yticks=[])
ax_pca.set_xlabel("PC 1 (a.u.)", labelpad=3.)
ax_pca.set_ylabel("PC 2 (a.u.)", labelpad=3.)
#g.fig.savefig("figures/pca_latent_space_{}.pdf".format(chosen_dset), 
#              transparent=True, bbox_inches="tight")
plt.show()
plt.close()

# Mutual information
How does the autoencoder's latent space compare to our latent space and to PCA? 

In [None]:
# Dictionary concatenating the autoencoder and the classifier's latent spaces
def create_dict_ls(dfs_dict, dset_choice, tcn_choice):
    """Select the same datasets and T cell number in each latent space.

    For every dataframe, keep the rows of the chosen datasets; if that
    selection contains no E1 peptide, add the E1 rows of all datasets.
    Then keep one T cell number, unless tcn_choice is "all" or None.

    Args:
        dfs_dict (dict): latent space name -> pd.DataFrame whose index has
            at least the levels "Data", "Peptide" and "TCellNumber".
        dset_choice (list of str or str): dataset(s) to keep, i.e. labels
            in the index level "Data".
        tcn_choice (str or None): label to select in the level "TCellNumber"
            (that level is then dropped); "all" or None keeps every label.

    Returns:
        dict_ls (dict): latent space name -> selected pd.DataFrame.

    Raises:
        KeyError: if E1 needs to be added but is absent from a dataframe,
            or if tcn_choice is not a label of the level "TCellNumber".
    """
    # Accept a single dataset name as well as a list of names
    if isinstance(dset_choice, str):
        dset_choice = [dset_choice]
    dict_ls = {}
    for k, d in dfs_dict.items():
        dat = d.loc[d.index.isin(dset_choice, level="Data")]
        if "E1" not in dat.index.get_level_values("Peptide").unique():
            # pd.concat replaces DataFrame.append, removed in pandas 2.0
            e1_rows = d.xs("E1", level="Peptide", drop_level=False)
            dat = pd.concat([dat, e1_rows]).sort_index()
        if tcn_choice not in ["all", None]:
            dat = dat.xs(tcn_choice, level="TCellNumber")
        dict_ls[k] = dat
    return dict_ls

In [None]:
# Projection matrix, loaded from the saved input weights of the trained network
projmat = np.load(
    os.path.join(
        main_dir_path, "data", "trained-networks",
        "mlp_input_weights-thomasRecommendedTraining.npy",
    )
)

# Project the normalized data of all peptides and label the two resulting columns
df_allpeps_proj = df_allpeps_norm.dot(projmat).set_axis(
    pd.Index(["LS1", "LS2"], name='Variable'), axis=1
)

In [None]:
# Latent spaces to compare, keyed by the label used in the figures
dict_all_dfs_ls = {
    "Autoencoder": df_allpeps_encoded_reg,
    "Ag. classifier": df_allpeps_proj,
    "PCA": df_allpeps_pca,
}

# Combine all training datasets to have better statistics,
# but use only 100k T cells, otherwise there is too much overlap.
dsets_chosen = train_sets
tcn_chosen = "100k"
dict_latent_spaces = create_dict_ls(dict_all_dfs_ls, dsets_chosen, tcn_chosen)

# Compute MI over time windows, with respect to the "Peptide" label
df_mi_time_encoded, max_mi_encoded = compute_mi_timecourse(
    dict_latent_spaces, q="Peptide", window=3, overlap=False, knn=3*3, speed="fast"
)

In [None]:
# Plot the mutual information over time for each latent space model
models_order = ["Ag. classifier", "Autoencoder", "PCA"]
g = sns.relplot(data=df_mi_time_encoded.reset_index().melt(id_vars=["Time"], value_name="MI", var_name="LS model"), 
            x="Time", y="MI", hue="LS model", kind="line", height=3., palette=different_ls_colors, style="LS model", lw=3., 
               hue_order=models_order, style_order=models_order)
# Improve seaborn plot
g.fig.axes[0].set_ylabel("MI (bits)")
g.fig.axes[0].set_xlabel("Time (h)")
g.fig.set_size_inches(2.25, 1.75)
#g.fig.subplots_adjust(right=3/3.25)
# NOTE(review): this only sets a plain attribute on the legend and appears to have
# no effect on its placement -- confirm before relying on it.
g.legend.loc = 1  # if required you can set the loc
g.legend.set_bbox_to_anchor([0.95, 0.45])  # coordinates of lower left of bounding box
# NOTE(review): the repeated tight_layout calls are kept as they were; presumably
# they let the layout settle -- confirm whether a single call is enough.
g.fig.tight_layout()
g.fig.tight_layout()
# Thinner lines in the legend than in the graph.
# Legend.legendHandles was renamed legend_handles in Matplotlib 3.7 and the old
# name was removed in 3.9: use the new name when it exists.
legend_handles = getattr(g.legend, "legend_handles", None)
if legend_handles is None:
    legend_handles = g.legend.legendHandles
for legobj in legend_handles:
    legobj.set_linewidth(2.0)
g.fig.tight_layout()
#g.fig.savefig("figures/mi_vs_time_different_latent_spaces_{}.pdf".format("train"), 
#              transparent=True, bbox_inches="tight")
plt.show()
plt.close()

## Export results
This export was used to plot the network diagrams with proper weights between nodes. That plotting code is not included in the package, so there is no use in exporting the results here. 

# Train classifiers on top of the different latent spaces
and compare to the original classifier. 

In [None]:
def init_peps_cytos_concs():
    """Return the peptides, cytokines and concentrations used for the classifiers.

    Returns:
        train_peptides (list of str): training peptides, E1 first and N4 last.
        keep_cytokines (list of str): cytokine names, sorted alphabetically.
        keep_conc (list of str): concentrations, from 1uM down to 1nM.
    """
    # Keep the train_peptides order as in the original code. E1, weakest, is first. 
    train_peptides = list(reversed(["N4", "Q4", "T4", "V4", "G4", "E1"]))
    keep_cytokines = sorted(["IFNg", "IL-17A", "IL-2", "IL-6", "TNFa"])
    keep_conc = ["1uM", "100nM", "10nM", "1nM"]
    return train_peptides, keep_cytokines, keep_conc


In [None]:
train_peptides, keep_cytokines, keep_conc = init_peps_cytos_concs()

# Integer class label of each training peptide (its position in train_peptides),
# for the peptides that are present in the data
available_peps = df_allpeps_norm.index.get_level_values("Peptide").unique()
peptide_dict = {pep: label for label, pep in enumerate(train_peptides)
                if pep in available_peps}

# Class label of every training sample, for each latent space
targets_enc = df_train_encoded_reg.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)
targets_pca = df_train_pca.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)

# Two kinds of classifiers on top of each latent space; the pairs differ only
# by their random seed
ridge_args = dict(alpha=1e-1, fit_intercept=True, max_iter=5000, solver="auto")
pca_classif = RidgeClassifier(random_state=43251, **ridge_args)
auto_classif = RidgeClassifier(random_state=249874198, **ridge_args)

mlp_args = dict(activation="tanh", hidden_layer_sizes=(), max_iter=5000,
                solver="adam", learning_rate="adaptive", alpha=0.01)
pca_nn = MLPClassifier(random_state=1345209, **mlp_args)
auto_nn = MLPClassifier(random_state=9247148, **mlp_args)

# Fit with cross-validation to have a reliable score estimate
crossval_args = dict(cv=5, return_train_score=True, return_estimator=True)
auto_scores = cross_validate(auto_classif, df_train_encoded_reg.values, targets_enc, **crossval_args)
pca_scores = cross_validate(pca_classif, df_train_pca.values, targets_pca, **crossval_args)

print("Starting training of neural networks...")

auto_scores_nn = cross_validate(auto_nn, df_train_encoded_reg.values, targets_enc, **crossval_args)
print("Finished network for autoencoder; starting network for PCA...")
pca_scores_nn = cross_validate(pca_nn, df_train_pca.values, targets_pca, **crossval_args)

# Rename test_score to cv_score 
for scores in (auto_scores, pca_scores, auto_scores_nn, pca_scores_nn):
    scores["cv_score"] = scores["test_score"]

# Mean and standard deviation of the training score over the folds
for scores in (auto_scores, pca_scores, auto_scores_nn, pca_scores_nn):
    print(scores["train_score"].mean(), "pm", scores["train_score"].std())

In [None]:
# Validation with test data: class label of every test sample
targets_test_enc = df_test_encoded_reg.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)
targets_test_pca = df_test_pca.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)

# Replace "test_score" by the accuracy of each cross-validation estimator
# on the test experiments
for scores, test_features, test_targets in [
    (auto_scores, df_test_encoded_reg.values, targets_test_enc),
    (pca_scores, df_test_pca.values, targets_test_pca),
    (auto_scores_nn, df_test_encoded_reg.values, targets_test_enc),
    (pca_scores_nn, df_test_pca.values, targets_test_pca),
]:
    scores["test_score"] = np.asarray(
        [cl.score(test_features, test_targets) for cl in scores["estimator"]]
    )

In [None]:
# Import projection matrix of our latent space classifier,
# generate its latent space on the train and test data here
# and train new output layer classifiers on it. 
# The score may be slightly different from the original classifier
# because we remove the offsets and tanh applied on the latent space by it. 
projmat = np.load(
    os.path.join(
        main_dir_path, "data", "trained-networks",
        "mlp_input_weights-thomasRecommendedTraining.npy",
    )
)
df_proj_train, df_proj_test = df_train_norm.dot(projmat), df_test_norm.dot(projmat)

# Class label of every training sample
targets_mlp = df_proj_train.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)

# Network without hidden layer, cross-validated on the projected training data
classif_mlp = MLPClassifier(
    hidden_layer_sizes=(), activation="tanh", solver="adam", alpha=0.01,
    learning_rate="adaptive", max_iter=5000, random_state=92448,
)
print("Starting cross-validation of classifier trained on the original classifier's latent space")
classif_scores = cross_validate(classif_mlp, df_proj_train.values, targets_mlp, **crossval_args)
print(classif_scores["train_score"].mean(), "pm", classif_scores["train_score"].std())

In [None]:
# Keep the cross-validation score under its own key, then replace "test_score"
# by the accuracy of each cross-validation estimator on the test experiments
classif_scores["cv_score"] = classif_scores["test_score"]
targets_test_mlp = df_proj_test.index.get_level_values("Peptide").map(peptide_dict).values.astype(int)
classif_scores["test_score"] = np.asarray([
    estimator.score(df_proj_test.values, targets_test_mlp)
    for estimator in classif_scores["estimator"]
])

In [None]:
# Train, cross-validation and test accuracy for each kind of latent space
# (mean over the cross-validation folds, standard deviation as error bar)
fig, ax = plt.subplots()
x = np.arange(3)
for mod, sc_dict in zip(["Ag. classifier", "PCA", "Autoencoder"], [classif_scores, pca_scores_nn, auto_scores_nn]):
    clr = different_ls_colors[mod]
    y = [sc_dict["train_score"].mean(), sc_dict["cv_score"].mean(), sc_dict["test_score"].mean()]
    yerr = [sc_dict["train_score"].std(), sc_dict["cv_score"].std(), sc_dict["test_score"].std()]
    # Scores are fractions: convert to percent
    ax.errorbar(x, np.asarray(y)*100, yerr=np.asarray(yerr)*100, ls="-", marker="o", ms=8, label=mod, 
               color=clr, mfc=clr, mec=clr)
ax.legend(title="Latent space", fontsize=7, frameon=False, loc="lower left")
ax.set_xticks(x)
ax.set_xticklabels(["Train", "Cross-validate", "Test"])
ax.set_ylabel("Accuracy for Ag. prediction (%)")
fig.tight_layout()
# fig.savefig("figures/train_cross_test_scores_different_latent_spaces.pdf", bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Both MI and accuracy on one plot
fig, axes = plt.subplots(1, 2)
fig.set_size_inches(5.25, 1.75)

lw_graph = 3.
lw_leg = 2.
handlelength=3.

# MI plot
models_order = ["Ag. classifier", "Autoencoder", "PCA"]
lstyles = {"Ag. classifier":"-", "Autoencoder":"--", "PCA":":"}
ax = axes[0]
# Use the MI time courses computed earlier in this notebook (df_mi_time_encoded).
# Sort them by time once, so the x and y values of each curve stay aligned.
mi_by_time = df_mi_time_encoded.sort_index()
times = mi_by_time.index.get_level_values("Time")
for m in models_order:
    ax.plot(times, mi_by_time[m].values, color=different_ls_colors[m],
            ls=lstyles[m], lw=lw_graph, label=m)
ax.set_ylabel("MI (bits)")
ax.set_xlabel("Time (h)")
leg = ax.legend(title="Latent Space", handlelength=handlelength, fontsize=8, frameon=False)
# Legend.legendHandles was renamed legend_handles in Matplotlib 3.7 and the old
# name was removed in 3.9: use the new name when it exists.
leg_handles = getattr(leg, "legend_handles", None)
if leg_handles is None:
    leg_handles = leg.legendHandles
for legobj in leg_handles:
    legobj.set_linewidth(lw_leg)

# Score plot: mean over the cross-validation folds, standard deviation as error bar
ax = axes[1]
x = np.arange(3)
for mod, sc_dict in zip(["Ag. classifier", "PCA", "Autoencoder"], [classif_scores, pca_scores_nn, auto_scores_nn]):
    clr = different_ls_colors[mod]
    y = [sc_dict["train_score"].mean(), sc_dict["cv_score"].mean(), sc_dict["test_score"].mean()]
    yerr = [sc_dict["train_score"].std(), sc_dict["cv_score"].std(), sc_dict["test_score"].std()]
    # Scores are fractions: convert to percent
    ax.errorbar(x, np.asarray(y)*100, yerr=np.asarray(yerr)*100, marker="o", ms=8, label=mod, 
               color=clr, mfc=clr, mec=clr, lw=lw_graph, ls=lstyles[mod])
ax.set_xticks(x)
ax.set_xticklabels(["Train", "Cross-validate", "Test"])
ax.set_ylabel("Accuracy for quality (%)")
fig.tight_layout(w_pad=6.)
#fig.savefig("figures/mi_accuracies_different_latent_spaces_{}.pdf".format("train"), 
#              transparent=True, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Plot the three latent spaces on the same figure, let the legend run over three rows
def three_ls_plots(df_ag, df_au, df_pc, allpeps):
    """Plot the PCA, autoencoder and antigen classifier latent spaces in three rows.

    Each row shows the 100k T cells trajectories of the dataset
    "PeptideComparison_OT1_Timeseries_20", plus G4 and E1 from all datasets.

    Args:
        df_ag (pd.DataFrame): antigen classifier latent space, columns "LS1", "LS2".
        df_au (pd.DataFrame): autoencoder latent space, columns "LS1", "LS2".
        df_pc (pd.DataFrame): PCA latent space, columns "PC1", "PC2", in a
            column index named "Principal component".
        allpeps (list of str): peptides to assign a color to; colors are taken
            in order from the current seaborn color cycle.
        All three dataframes must have the index levels "Data", "Peptide",
        "Concentration", "TCellNumber", "Time" and "Peptide set".

    Returns:
        g (seaborn.FacetGrid): the grid created by seaborn.relplot.
    """
    # Cycles through the color palette if there are more peptides than colors
    palette = dict(zip(allpeps, sns.color_palette(n_colors=len(allpeps))))
    allpeps_order = ["N4", "A2", "Y3", "Q4", "T4", "V4", "G4", "E1"]
    size_order = ["1uM", "100nM", "10nM", "1nM"]
    dset_choice = ["PeptideComparison_OT1_Timeseries_20"]
    df_plot_combined = pd.concat({"Ag. classifier":df_ag, 
                    "Autoencoder":df_au, 
                    "PCA":df_pc.rename({"PC1":"LS1", "PC2":"LS2"}, level="Principal component", axis=1)}, 
                    names=["Latent space"], axis=0)

    # Chosen dataset, plus G4 and E1 from all datasets. The two selections are
    # concatenated while they still have the same index levels; only then is
    # one T cell number selected (which drops that level) and the "Data"
    # level dropped, in the same order as in plot_latent_space.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    in_chosen_dset = df_plot_combined.index.isin(dset_choice, level="Data")
    is_g4_or_e1 = df_plot_combined.index.isin(["G4", "E1"], level="Peptide")
    df_plot = pd.concat([df_plot_combined.loc[in_chosen_dset],
                         df_plot_combined.loc[is_g4_or_e1]])
    df_plot = df_plot.xs("100k", level="TCellNumber").droplevel("Data")
    
    # Rows are ordered by time and seaborn is told not to re-sort them
    g = sns.relplot(data=df_plot.sort_index(level="Time").reset_index(), 
                x="LS1", y="LS2", sort=False, kind="line", height=1.75, legend=True,
                hue="Peptide", hue_order=allpeps_order, palette=palette,
                size="Concentration", size_order=size_order, row="Latent space",
                style='Peptide set', style_order=["Train", "New"],
                facet_kws={'sharey': False, 'sharex': False}, 
                row_order=["PCA", "Autoencoder", "Ag. classifier"])
    # Arbitrary units: no ticks, no tick labels, no panel titles
    for ax in g.axes.flatten():
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xticklabels([])
        ax.set_yticklabels([])
        ax.set_title("")
    # Axes follow row_order: PCA first, then the two other latent spaces
    g.fig.axes[0].set_ylabel(r"PC$_2$")
    g.fig.axes[0].set_xlabel(r"PC$_1$", labelpad=2.5)
    g.fig.axes[1].set_xlabel(r"LS$_1$", labelpad=2.5)
    g.fig.axes[1].set_ylabel(r"LS$_2$")
    g.fig.axes[2].set_xlabel(r"LS$_1$", labelpad=2.5)
    g.fig.axes[2].set_ylabel(r"LS$_2$")
    
    g.fig.set_size_inches(5.25/2, 1.75*3)
    g.fig.tight_layout(h_pad=4.)
    # Leave room on the right for the legend
    g.fig.subplots_adjust(right=0.6)
    
    return g

In [None]:
# Projection of all peptides on the antigen classifier's latent space.
# The input weights of the network are the ones loaded in `projmat` (file
# "mlp_input_weights-thomasRecommendedTraining.npy"); no `mlp` object is
# defined in this notebook.
df_proj_allpeps = df_allpeps_norm.dot(projmat)
df_proj_allpeps.columns = pd.Index(["LS1", "LS2"], name="Variable")
g = three_ls_plots(df_proj_allpeps, df_allpeps_encoded_reg, df_allpeps_pca, training_peps+new_peps)
# g.fig.savefig("figures/latent_spaces_three_models.pdf", transparent=True)
plt.show()
plt.close()

#### Credits
Author of this script: frbourassa

Keras-derived Autoencoder classes inspired by the official documentation and written by frbourassa. 