# Supplementary panels about cytokine reconstruction

To run this notebook, you need:
- processed cytokine time series in `data/processed/` (default: NewPeptideComparison_20.hdf; the cells below read `PeptideComparison_3.hdf`, `PeptideComparison_4.hdf`, `PeptideComparison_8.hdf` and `TCellNumber_1.hdf`);
- weights of the trained neural network in `data/trained-networks/`;
- to have run `reconstruct_cytokines_fromLSdata.ipynb` with non-linear and linear reconstruction methods, and saved the results in `results/reconstruction/`;
- to have run `generate_synthetic_data.ipynb` and saved its results in `results/reconstruction/`.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os, sys
# Resolve the parent of the current working directory and put it first on
# sys.path, so that the `utils` modules imported just below can be found.
# The same path is reused throughout the notebook to locate data/, results/ and figures/.
main_dir_path = os.path.abspath('../')
sys.path.insert(0, main_dir_path)

import utils.plotting_recon as pltrecon
import utils.custom_pandas as custom_pd
import utils.plotting_3d as plt3d
from utils.recon_scaling import scale_back

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Make every subsequent savefig call default to a transparent background.
plt.rcParams['savefig.transparent'] = True

# 3D plots to show the 2D manifold
And how it changes with different T cell numbers, experiments and cell types. 

In [None]:
# Cytokines handed to the 3D plotting helpers in the cells below.
chosen_cytokines = ["IFNg", "IL-2", "TNFa"]

# Network input weights, stored as a .npy array and transposed after loading.
weights_path = os.path.join(
    main_dir_path, "data", "trained-networks",
    "mlp_input_weights-thomasRecommendedTraining.npy")
proj_mat = np.load(weights_path).T

# Processed time series; keep only the rows under the "100k" label of the
# first index level (presumably the T cell number -- confirm against the file).
dense_path = os.path.join(
    main_dir_path, "data", "processed", "PeptideComparison_4.hdf")
dense_data = pd.read_hdf(dense_path).loc["100k"]

In [None]:
# 3D view of the chosen cytokines for the "integral" feature, drawn by the
# project's one-latent-plane helper.
feat = "integral"
fig, ax = plt3d.cytokines_one_latent_plane(
    dense_data, proj_mat, chosen_cytokines, feat=feat)

# Camera angle, panel size and layout.
ax.view_init(azim=200, elev=30.)
panel_size = (2.4, 2.4)
fig.set_size_inches(*panel_size)
fig.tight_layout()

# Uncomment to save the panel.
# NOTE(review): the file name mentions PeptideComparison20 while the data
# plotted here is loaded from PeptideComparison_4.hdf -- confirm the label.
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "3d_plot_{}_PeptideComparison20.pdf".format(feat)), 
#        bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Same 3D view as for the integrals, now for the "concentration" feature.
feat = "concentration"
fig, ax = plt3d.cytokines_one_latent_plane(
    dense_data, proj_mat, chosen_cytokines, feat=feat)

# Camera angle, panel size and layout.
ax.view_init(azim=230, elev=45.)
panel_size = (2.4, 2.4)
fig.set_size_inches(*panel_size)
fig.tight_layout()

# Uncomment to save the panel.
# NOTE(review): the file name mentions PeptideComparison20 while the data
# plotted here is loaded from PeptideComparison_4.hdf -- confirm the label.
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "3d_plot_{}_PeptideComparison20.pdf".format(feat)), 
#        bbox_inches='tight')
plt.show()
plt.close()

## 3D plot for different T cell numbers and different experiments – removed from paper
With different T cell numbers or experiments, the 2D manifold changes slightly. This makes it impossible to reconstruct the cytokines of all T cell numbers with the same coefficients, and it explains why reconstruction is never perfect on the reconstruction-optimization and test data (which must come from different experiments/replicates to avoid overfitting). 

In [None]:
# Planes for the different T cell numbers of a single data set.
feat = "integral"
tcellnum_path = os.path.join(main_dir_path, "data", "processed", "TCellNumber_1.hdf")

# Wrap the frame in a one-key "Data" index level on top of its existing index.
# NOTE(review): the key reads "PeptideComparison 20" although the file loaded
# is TCellNumber_1.hdf -- confirm the intended label.
dense_data2 = pd.concat(
    [pd.read_hdf(tcellnum_path)], names=["Data"], keys=["PeptideComparison 20"])

camera = {"azim": 0, "elev": 20}
fig, ax = plt3d.cytokines_dataset_tcellstate_planes(
    dense_data2, proj_mat, chosen_cytokines,
    hue_level="TCellNumber", feat=feat, init_view=camera)

panel_size = (2.4, 2.4)
fig.set_size_inches(*panel_size)
fig.tight_layout()
# Uncomment to save the panel.
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "3d_plot_{}_TCellNumber_OT1_Timeseries_7.pdf".format(feat)), 
#        bbox_inches='tight')
plt.show()
plt.close()

In [None]:
# Compare several experiments, all at 100K T cells.
foldr = os.path.join(main_dir_path, "data", "processed")
dset_names = ["PeptideComparison_4", "PeptideComparison_3", "PeptideComparison_8"]
dset_frames = [pd.read_hdf(os.path.join(foldr, name + ".hdf")) for name in dset_names]

# Only the last data set is filtered: keep its "100k" rows of the
# TCellNumber level, without dropping that level.
dset_frames[-1] = dset_frames[-1].xs("100k", level="TCellNumber", drop_level=False)

# Stack the frames under a new outer index level called "Data".
dense_data2 = pd.concat(dset_frames, names=["Data"], keys=dset_names)

camera = {"azim": 12, "elev": 28}
fig, ax = plt3d.cytokines_dataset_tcellstate_planes(
    dense_data2, proj_mat, chosen_cytokines,
    hue_level="TCellNumber", feat="integral", init_view=camera)

panel_size = (2.4, 2.4)
fig.set_size_inches(*panel_size)
fig.tight_layout()
# Uncomment to save the panel (the file name uses `feat` set in an earlier cell).
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "3d_plot_{}_3dsets_comparison.pdf".format(feat)), 
#        bbox_inches='tight')
plt.show()
plt.close()

## Going back to $\log_{10}$ scale for cytokine concentrations
This is important so the y axes do not look arbitrary on the cytokine reconstruction plots. 

The transformation we need to undo is the following:
```
if typ == "integral":
    df_wt[typ] = (df_wt[typ] - df_min)/(df_max - df_min)
else:   # for conc and deriv, the constant rescaling term disappears. 
    df_wt[typ] = df_wt[typ]/(df_max - df_min)
```

We reconstruct with the scaled data, because the quadratic coefficients were learned on scaled data and the latent space is scaled as well; only after reconstruction do we scale back. 

In [None]:
# Min/max values stored next to the trained network; only the entries of the
# "integral" feature are kept, since those are what scale_back is given below.
minmaxfile = os.path.join(
    main_dir_path, "data", "trained-networks",
    "min_max-thomasRecommendedTraining.hdf")
df_min = pd.read_hdf(minmaxfile, key="df_min").xs("integral", level="Feature")
df_max = pd.read_hdf(minmaxfile, key="df_max").xs("integral", level="Feature")

# Linear reconstruction
Not using model curves yet; this is just (smoothed) data projected to latent space, and reconstructed back. 

## Load reconstruction and latent space (proj) data

In [None]:
# File-name template: {} is replaced by "proj", "recon" or "wt".
fnames1 = os.path.join(
    main_dir_path, "results", "reconstruction", "df_{}_linear_HighMI_1.hdf")
splits = ("train", "test")

# Latent-space projections are used as stored.
dflin_proj_train, dflin_proj_test = [
    pd.read_hdf(fnames1.format("proj"), key=split) for split in splits]

# Reconstructed cytokines, passed through scale_back with the loaded min/max.
dflin_recon_train, dflin_recon_test = [
    scale_back(pd.read_hdf(fnames1.format("recon"), key=split), df_min, df_max)
    for split in splits]

# The "wt" frames the reconstructions are compared against, scaled back the same way.
dflin_wt_train, dflin_wt_test = [
    scale_back(pd.read_hdf(fnames1.format("wt"), key=split), df_min, df_max)
    for split in splits]

In [None]:
# Show the distinct values of the "Concentration" index level in the
# linear-reconstruction training data.
print(dflin_wt_train.index.get_level_values("Concentration").unique())

In [None]:
# Test data versus its linear reconstruction ("concentration" feature); the
# helper returns a mapping indexed by data set name.
shown_peptides = ["N4", "A2", "Y3", "Q4", "T4", "V4"]
figdict = pltrecon.plot_recon_true(
    dflin_wt_test, dflin_recon_test,
    feature="concentration", toplevel="Data",
    sharey=True, do_legend=True, pept=shown_peptides)

# First legend of the chosen data set's figure, needed as an extra artist
# if the savefig call below is re-enabled.
dset = "HighMI_1-2"
leg = figdict[dset].legends[0]
#figdict[dset].savefig(os.path.join(main_dir_path, "figures", "supp", 
#        "supp_figure_linear_recon_concentrations_{}.pdf".format(dset)), 
#        transparent=True, bbox_extra_artists=(leg,), bbox_inches='tight')

# Accurate reconstruction
$$ c_i = Q_{i1}n_1 + Q_{i2}n_2 + Q_{i3} n_1^2 + Q_{i4} n_2^2 + Q_{i5} n_1 n_2 + Q_{i6} N_1 + Q_{i7} N_2 $$

In [None]:
# File-name template: {} is replaced by "proj", "recon" or "wt".
fnames2 = os.path.join(
    main_dir_path, "results", "reconstruction", "df_{}_nonlinear_HighMI_1.hdf")
splits = ("train", "test")

# Latent-space projections are used as stored.
dfacc_proj_train, dfacc_proj_test = [
    pd.read_hdf(fnames2.format("proj"), key=split) for split in splits]

# Reconstructed cytokines, passed through scale_back with the loaded min/max.
dfacc_recon_train, dfacc_recon_test = [
    scale_back(pd.read_hdf(fnames2.format("recon"), key=split), df_min, df_max)
    for split in splits]

# The "wt" frames the reconstructions are compared against, scaled back the same way.
dfacc_wt_train, dfacc_wt_test = [
    scale_back(pd.read_hdf(fnames2.format("wt"), key=split), df_min, df_max)
    for split in splits]

In [None]:
# Test data versus its non-linear ("accurate") reconstruction; the helper
# returns a mapping indexed by data set name.
shown_peptides = ["N4", "A2", "Y3", "Q4", "T4", "V4"]
figdict_acc = pltrecon.plot_recon_true(
    dfacc_wt_test, dfacc_recon_test,
    feature="concentration", toplevel="Data",
    sharey=True, do_legend=True, pept=shown_peptides)

# First legend of the chosen data set's figure, needed as an extra artist
# if the savefig call below is re-enabled.
dset = "HighMI_1-2"
leg = figdict_acc[dset].legends[0]
#figdict_acc[dset].savefig(os.path.join(main_dir_path, "figures", "supp", 
#        "supp_figure_include-integrals_recon_concentrations_{}.pdf".format(dset)), 
#         transparent=True, bbox_extra_artists=(leg,), bbox_inches='tight')

# Reconstruction residuals
Probably just a small plot of the average residuals across all peptides? Or a distribution of residuals, like min, max, average, and std or median?  

In [None]:
# Residuals = "concentration" columns of the scaled-back data minus the
# scaled-back reconstruction, for each method (lin / acc) and each split.
conc = "concentration"
dflin_residuals_test = dflin_wt_test.loc[:, conc] - dflin_recon_test
dflin_residuals_train = dflin_wt_train.loc[:, conc] - dflin_recon_train
dfacc_residuals_test = dfacc_wt_test.loc[:, conc] - dfacc_recon_test
dfacc_residuals_train = dfacc_wt_train.loc[:, conc] - dfacc_recon_train

In [None]:
# Summary of the test residuals, averaged over the reconstructed data sets,
# with one entry per reconstruction method.
residuals_by_method = {
    "Linear": dflin_residuals_test["concentration"],
    "Accurate": dfacc_residuals_test["concentration"],
}
fig, axes, axleg = pltrecon.plot_residuals_summary(
    list(residuals_by_method.values()), list(residuals_by_method.keys()),
    sharey_all=True, legend_loc="side")
# fig.savefig("panels_recon/residuals_reconstruction_summary_HighMI_1.pdf", transparent=True, bbox_inches="tight")

### Model+reconstruction residuals on cytokine data
See the Google Colab notebook for main figure 3A, where the p-values are computed. The residuals plot will be a pretext to explain, in a caption, the meaning of the p-values and how they were computed. 

## Hausdorff dimension of the 2D manifold
See separate Python script. 

# Synthetic data from sampled parameters

### Note: to run this part, need to run `generate_synthetic_data.ipynb` and save its results first
 
Specifically, objects that need to be available for import:
- df_params_synth_sigmoid_freealpha_selectdata.hdf
- df_recon_synth_sigmoid_freealpha_selectdata.hdf
- df_latent_synth_sigmoid_freealpha_selectdata.hdf
- quadratic_tanh_pipeline_selectdata.pkl
- scalerkde_dict_sigmoid_freealpha_selectdata.pkl
- ser_v2v1_synth_selectdata.hdf
- tanh_norm_factors_integrals_selectdata.pkl
- v2v1_kde_sigmoid_freealpha_selectdata.pkl

In [None]:
# Import the necessary scripts.
from ltspcyt.scripts.reconstruction import ScalerKernelDensity, QuadraticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import pickle

In [None]:
# Load the objects saved under results/reconstruction/ for the chosen option.
option = "selectdata"  # "HighMI_1"
folder = os.path.join(main_dir_path, "results", "reconstruction")

# Caution: unpickling can execute arbitrary code, so only load pickle files
# that you generated yourself.
kde_pickle = os.path.join(folder, "scalerkde_dict_sigmoid_freealpha_{}.pkl".format(option))
with open(kde_pickle, "rb") as hd:
    scaler_kdes_dict = pickle.load(hd)

# We might want to plot the regression coefficients as a heatmap?
pipe_pickle = os.path.join(folder, "quadratic_tanh_pipeline_{}.pkl".format(option))
with open(pipe_pickle, "rb") as hd:
    pipe = pickle.load(hd)

# Synthetic parameters, reconstructed cytokines and latent-space curves.
# Only the reconstructed cytokines go through scale_back.
params_path = os.path.join(folder, "df_params_synth_sigmoid_freealpha_{}.hdf".format(option))
recon_path = os.path.join(folder, "df_recon_synth_sigmoid_freealpha_{}.hdf".format(option))
latent_path = os.path.join(folder, "df_latent_synth_sigmoid_freealpha_{}.hdf".format(option))

df_params_synth = pd.read_hdf(params_path)
df_recon_synth = scale_back(pd.read_hdf(recon_path), df_min, df_max)
df_latent_synth = pd.read_hdf(latent_path)

### Kernel densities

In [None]:
# Sorted array of the distinct "Time" index values, converted to float.
time_labels = df_latent_synth.index.get_level_values("Time").unique()
times = np.sort(np.asarray([float(t) for t in time_labels]))

# Peptides to draw, and the fixed order that decides each peptide's colour.
peps_to_plot = ["N4", "A2", "Y3", "Q4", "T4", "V4", "E1"]
peps_color_order = ["N4", "Q4", "T4", "V4", "G4", "E1", "A2", "Y3", "A8", "Q7"]

# The i-th peptide of the colour order gets the i-th colour of the current
# seaborn palette; the plotting palette is then restricted to peps_to_plot.
current_palette = sns.color_palette()
peps_colors = {pep: current_palette[rank] for rank, pep in enumerate(peps_color_order)}
peps_palette = {pep: peps_colors[pep] for pep in peps_to_plot}

# Replicate label used to select curves in the cells below.
replic_choice = "2"

In [None]:
# Pair plot of the per-peptide scaler/KDE objects over the columns of the
# synthetic-parameter frame, one hue per peptide.
fig, [axes, axleg, leg] = pltrecon.pairplot_scalerkdes(
    {p: scaler_kdes_dict[p] for p in peps_to_plot}, peps_to_plot, df_params_synth.columns,
    hues=peps_palette, do_leg=True, res=31, plot_type="fill", fontsize=8)

# Fix some xticks: in column j, only rows j and below are touched
# (presumably the panels that exist in the pair-plot grid -- confirm in pltrecon).
xticks_per_column = [[0, 2, 4], [0, 2, 4], [-np.pi/2, 0, np.pi/3]]
for col, ticks in enumerate(xticks_per_column):
    for row in range(col, len(axes)):
        axes[row][col].set_xticks(ticks)
axes[-1][2].set_xticklabels([r"$-\pi/2$", r"$0$", r"$\pi/3$"])

fig.set_size_inches(4., 4.)
fig.tight_layout(h_pad=0.1, w_pad=0.1)

# savefig does not create missing folders, so make sure the output folder
# exists before writing (no-op when it is already there).
supp_dir = os.path.join(main_dir_path, "figures", "supp")
os.makedirs(supp_dir, exist_ok=True)
fig.savefig(os.path.join(supp_dir, "kdes_for_data_generation_{}.pdf".format(option)),
            transparent=True, bbox_inches="tight")
plt.show()
plt.close()

### Latent space trajectories thus generated
Maybe also plot tanh of integrals, just to see what that looks like? 
A column of 2 or 4 plots.  

In [None]:
# Columns of the synthetic latent-space frame to draw, one panel each.
feats = [("concentration", "Node 1"), ("concentration", "Node 2"), 
        ("tanh integral", "Node 1"), ("tanh integral", "Node 2")]
# The panel count follows the feature list instead of a hard-coded 4.
fig, axes = plt.subplots(len(feats), 1, sharex=True)

# For each peptide, plot every quantity for the chosen replicate.
for pep in peps_to_plot:
    # Earlier peptides in peps_to_plot are drawn on top of later ones.
    pep_zorder = len(peps_palette) - peps_to_plot.index(pep)
    for i, feat_key in enumerate(feats):
        y = df_latent_synth.loc[(pep, replic_choice, times), feat_key].values
        # Only the first panel carries the peptide label, so a figure-level
        # legend would list each peptide once.
        axes[i].plot(times, y, color=peps_palette[pep], lw=2.,
                     label=(pep if i == 0 else None), zorder=pep_zorder)

# Label the y axes
for i, ylbl in enumerate(["n_1(t)", "n_2(t)", r"\tanh [N_1(t)\,/\,\bar{N}_1]", r"\tanh [N_2(t)\,/\,\bar{N}_2]"]):
    axes[i].set_ylabel(r"${}$".format(ylbl), size=8)
    axes[i].tick_params(axis="both", labelsize=7, width=1., length=3.)

# Add a legend: not necessary, there is already one for the KDEs
#fig.legend(*axes[0].get_legend_handles_labels(), loc="upper left", bbox_to_anchor=(0.95, 0.98), fontsize=7)
fig.set_size_inches(2., 4.)
fig.tight_layout()

# savefig does not create missing folders, so make sure the output folder
# exists before writing (no-op when it is already there).
supp_dir = os.path.join(main_dir_path, "figures", "supp")
os.makedirs(supp_dir, exist_ok=True)
fig.savefig(os.path.join(supp_dir, 
        "latentspace_data_generation_replicate{}_{}.pdf".format(replic_choice, option)), 
         bbox_inches="tight", transparent=True)
plt.show()
plt.close()

### Cytokine trajectories thus generated
Each cytokine in a column, all peptides in the same row of plots. So it's a single row of five plots. 

In [None]:
# Quick look at the scaled-back synthetic reconstruction: show the first rows
# with the notebook's rich table display instead of printing the whole frame as text.
df_recon_synth.head()

In [None]:
# Reconstructed synthetic cytokines: one panel per cytokine, one curve per peptide.
cytokines = ["IFNg", "IL-2", "IL-17A", "IL-6", "TNFa"]
cytokines_nice = [r"IFN-$\gamma$", "IL-2", "IL-17A", "IL-6", "TNF"]
# The panel count follows the cytokine list instead of a hard-coded 5.
fig, axes = plt.subplots(1, len(cytokines), sharex=True, sharey=True)

# replic_choice is reused here as a position in this list of concentration labels.
# NOTE(review): the latent-space frame is indexed with replic_choice directly,
# while this frame is indexed with a concentration label -- confirm the mapping.
conc_choice = ["1uM", "100nM", "10nM", "1nM"][int(replic_choice)]

# For each peptide, plot every cytokine at the chosen concentration.
for pep in peps_to_plot:
    # Earlier peptides in peps_to_plot are drawn on top of later ones.
    pep_zorder = len(peps_palette) - peps_to_plot.index(pep)
    for i, cyto in enumerate(cytokines):
        y = df_recon_synth.loc[(pep, conc_choice, times), ("concentration", cyto)].values
        # Only the first panel carries the peptide label.
        axes[i].plot(times, y, color=peps_palette[pep], lw=2.,
                     label=(pep if i == 0 else None), zorder=pep_zorder)

# Title each panel with its cytokine name and style the ticks
for i, ylbl in enumerate(cytokines_nice):
    axes[i].set_title(ylbl, size=8)
    axes[i].tick_params(axis="both", labelsize=7, width=1., length=3.)

# Shared y axis: only the leftmost panel gets a y label.
axes[0].set_ylabel(r"$\log_{10}$(cyto)", size=8)
for i in range(1, len(cytokines)):
    axes[i].set_ylabel("")
    
for i in range(len(cytokines)):
    axes[i].set_xlabel("Time [h]", size=8)

# Add a legend: not necessary, there is already one for the KDEs
#fig.legend(*axes[0].get_legend_handles_labels(), loc="upper left", bbox_to_anchor=(0.95, 0.98), fontsize=7)
fig.set_size_inches(6., 1.5)
fig.tight_layout()

# savefig does not create missing folders, so make sure the output folder
# exists before writing (no-op when it is already there).
supp_dir = os.path.join(main_dir_path, "figures", "supp")
os.makedirs(supp_dir, exist_ok=True)
fig.savefig(os.path.join(supp_dir, 
        "generated_cytokines_replicate{}_{}.pdf".format(replic_choice, option)), 
          bbox_inches="tight", transparent=True)
plt.show()
plt.close()