# Supplementary panels showing how latent space models fit data

To run this notebook, you need:
- To have run `fit_latentspace_models.ipynb` for each model and saved the results in `results/fits/`

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import json

import os, sys
main_dir_path = os.path.abspath('../')
sys.path.insert(0, main_dir_path)

from utils.plotting_fits import (add_hue_size_style_legend, timecourse_smallplots, 
    latentspace_smallplot, barplots_levels, paramspace_smallplots, create_cmap_seed, 
    create_midseeded_clist, add_legend_subtitles_huemaps, create_labeling)
import utils.custom_pandas as custom_pd

In [None]:
# Global matplotlib settings applied to every figure saved from this notebook
plt.rcParams.update({'savefig.transparent': True})

# LaTeX markup (without the surrounding $) for each fitted-parameter name;
# used further down to rename the columns of the parameter dataframes
param_formatting_dict = dict(
    theta=r"\theta",
    vt=r"v_{t2}",
    t0=r"t_0",
    tau0=r"\tau_0",
    v0=r"v_0",
    a0=r"a_0",
    v1=r"v_{t1}",
    alpha=r"\alpha",
    beta=r"\beta",
    gamma=r"\gamma",
)

## Removing unavailable time points
By default, the df_compare dataframes (with spline and model time series at every hour) extend time series to 72 hours. If an experiment did not extend to 72 hours, missing time points are set equal to the last available time point. We need to remove those artificial time points, which do not match the data, to have a truthful estimate of the residuals and of the average residuals when we group per certain levels (e.g. per time). 

I have compiled elsewhere, with a small script, the list of the last available time point for each experiment. It is saved in the JSON file ``results/fits/last_time_point_per_experiment.json``. 

In [None]:
with open(os.path.join(main_dir_path, "results", "fits", "last_time_point_per_experiment.json"), "r") as h:
    last_times_dict = json.load(h)

In [None]:
# Call this function on each df_compare that I import
def remove_artificial_times(df, max_t_dict):
    """Drop, dataset by dataset, the rows whose time exceeds the last measured time.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe whose row index has (at least) the levels "Data" and "Time".
    max_t_dict : dict
        Last available time point of each experiment, keyed by the labels of
        the "Data" level. A dataset missing from the dict is reported with a
        printed message and kept up to 72.

    Returns
    -------
    pd.DataFrame
        The trimmed per-dataset pieces concatenated back together, with "Data"
        as the outermost index level.
    """
    data_labels = df.index.get_level_values("Data").unique()
    trimmed = {}
    for label in data_labels:
        try:
            last_time = max_t_dict[label]
        except KeyError:
            print("Could not find last experimental time for {}".format(label))
            last_time = 72
        sub_df = df.xs(label, level="Data", axis=0)
        # Time labels are cast to float before the comparison
        times = sub_df.index.get_level_values("Time").astype(float)
        trimmed[label] = sub_df.loc[times <= last_time]
    return pd.concat(trimmed, names=["Data"], axis=0)

# Constant velocity model
Show $N_i(t)$, $N_1$ vs $N_2$, $n_i(t)$. 
Also show parameter space.

In [None]:
plots_height = 2.1
plots_width = 3.

In [None]:
# Spline and model time series for the constant velocity fits
df_compare_velo = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", 
                            "df_compare_Constant_velocity_reg10_selectdata.hdf"))
# Drop the time points beyond the last measurement of each experiment
df_compare_velo = remove_artificial_times(df_compare_velo, last_times_dict)
# Dataset shown in the example panels of this model
dset_velo = "Activation_Timeseries_1"
# Restrict this model's dataframe to four peptides
df_compare_velo = custom_pd.xs_slice(df_compare_velo, name="Peptide", lvl_slice=["N4", "Q4", "T4", "V4"], axis=0)

In [None]:
print(df_compare_velo.index.get_level_values("Data").unique())

In [None]:
# Integrals
# Time courses of the "integral" feature for dset_velo, 100k T cells, 
# restricted to the 1uM and 10nM concentrations
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_velo.xs(dset_velo, level="Data", axis=0)
                    .xs("100k", level="TCellNumber", axis=0)
                    .xs("integral", level="Feature", axis=0),
                 name="Concentration", lvl_slice=["1uM", "10nM"], axis=0),
    feat_name="LS", maxwidth=1.5, do_leg=False, 
    fontsize=6, handlelength=2.)
# No legend requested here: the figure is 3/4 of the width used for panels drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "constant_velocity_integrals_latent_timecourses.pdf"))

In [None]:
# N1 vs N2
# Latent space plot of the "integral" feature for dset_velo, 100k T cells, all concentrations
fig, [ax1, axleg, leg] = latentspace_smallplot(
    df_compare_velo.xs(dset_velo, level="Data", axis=0)
                        .xs("100k", level="TCellNumber", axis=0)
                        .xs("integral", level="Feature", axis=0),
    feat_name="LS", maxwidth=1.5, do_leg=True, 
    fontsize=6, handlelength=2.)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width, plots_height)
fig.tight_layout()
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "constant_velocity_integrals_latent_trajectories.pdf"))
plt.show()
plt.close()

In [None]:
# Parameters
# Fitted parameters of the constant velocity model
df_params_velo = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", "df_params_Constant_velocity_reg10_selectdata.hdf"))

# Remove concentrations below 1 nM
df_params_velo = custom_pd.xs_slice(df_params_velo, name="Concentration", 
                                   lvl_slice=["1uM", "300nM", "100nM", "30nM", "10nM", 
                                             "3nM", "1nM"], axis=0)

# Correct column names so they are formatted to math
# Start from an identity map so that columns without a LaTeX form keep their name
param_name_map = {a:a for a in df_params_velo.columns}
param_name_map.update(param_formatting_dict)
df_params_velo.columns = df_params_velo.columns.map(param_name_map)

In [None]:
# Parameter space of the constant velocity model, rows with 80k or 100k T cells only; 
# hue encodes the peptide and marker size the concentration
fig, [axes, axleg, leg] = paramspace_smallplots(
    df_params_velo.loc[df_params_velo.index.isin(["80k", "100k"], level="TCellNumber")], 
    hue_level_name="Peptide", style_level_name=None, size_level_name="Concentration", 
    do_leg=True, fontsize=6, handlelength=1, maxsize=5., ncol=2)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width*1.75, plots_height*2)
fig.tight_layout(w_pad=0.5, h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "param_plots_constant_velocity_100k.pdf"), 
#    transparent=True, bbox_inches="tight", bbox_extra_artists=(leg,))
plt.show()
plt.close()

# Force model with matching, fixed alpha
We want to show the fits of $n_1(t)$, $n_2(t)$ in more detail. 

In [None]:
# Spline and model time series for the fixed-alpha model (file "Sigmoid_reg04")
df_compare_fixalpha = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", 
                                    "df_compare_Sigmoid_reg04_selectdata.hdf"))
# Drop the time points beyond the last measurement of each experiment
df_compare_fixalpha = remove_artificial_times(df_compare_fixalpha, last_times_dict)
# Dataset shown in the example panels of this model
dset_fixalpha = "Activation_Timeseries_1"

In [None]:
print(df_compare_fixalpha.index.get_level_values("Data").unique())

In [None]:
# Integrals
# Time courses of the "integral" feature for dset_fixalpha, 100k T cells, 
# restricted to the 1uM and 10nM concentrations
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_fixalpha.xs(dset_fixalpha, level="Data", axis=0)
                .xs("100k", level="TCellNumber", axis=0)
                .xs("integral", level="Feature", axis=0), 
            name="Concentration", lvl_slice=["1uM", "10nM"], axis=0), 
    feat_name="LS", maxwidth=1.5, do_leg=False, 
    fontsize=6, handlelength=2.)
# No legend requested here: the figure is 3/4 of the width used for panels drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "sigmoid_fixalpha_integrals_latent_timecourses.pdf"))

In [None]:
# N1 vs N2
# Latent space plot of the "integral" feature for dset_fixalpha, 100k T cells, 
# restricted to peptides N4, Q4, T4 and V4
fig, [ax1, axleg, leg] = latentspace_smallplot(
    custom_pd.xs_slice(df_compare_fixalpha.xs(dset_fixalpha, level="Data", axis=0)
                        .xs("100k", level="TCellNumber", axis=0)
                        .xs("integral", level="Feature", axis=0),
        name="Peptide", lvl_slice=["N4", "Q4", "T4", "V4"], axis=0), 
    feat_name="LS", maxwidth=1.5, do_leg=True, 
    fontsize=6, handlelength=2.)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width, plots_height)
fig.tight_layout()
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "sigmoid_fixalpha_integrals_latent_trajectories.pdf"))

In [None]:
# Parameters
# Fitted parameters of the fixed-alpha model
df_params_fixalpha = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", "df_params_Sigmoid_reg04_selectdata.hdf"))

# Remove concentrations below 1nM
df_params_fixalpha = custom_pd.xs_slice(df_params_fixalpha, name="Concentration", 
                                   lvl_slice=["1uM", "300nM", "100nM", "30nM", "10nM", 
                                             "3nM", "1nM"], axis=0)

# Correct column names so they are formatted to math
# Start from an identity map so that columns without a LaTeX form keep their name
param_name_map = {a:a for a in df_params_fixalpha.columns}
param_name_map.update(param_formatting_dict)
df_params_fixalpha.columns = df_params_fixalpha.columns.map(param_name_map)

In [None]:
# Parameter space of the fixed-alpha model, rows with 80k or 100k T cells only. 
# Only four of the (already renamed) parameter columns are shown.
fig, [axes, axleg, leg] = paramspace_smallplots(
    df_params_fixalpha.loc[df_params_fixalpha.index.isin(["80k", "100k"], level="TCellNumber")][["a_0", r"\tau_0", r"\theta", r"v_{t1}"]],  
    hue_level_name="Peptide", style_level_name=None, size_level_name="Concentration", 
    do_leg=True, maxsize=5., fontsize=6, handlelength=1, ncol=2)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width*1.75, plots_height*2)
fig.tight_layout(w_pad=0.5, h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "param_plots_sigmoid_fixalpha_100k_4paramsshown.pdf"),
#           transparent=True, bbox_inches="tight", bbox_extra_artists=(leg,))
plt.show()
plt.close()

# Force model with matching, free alpha
What we want to show is mostly the parameter space colored per T cell number, because the fits are essentially identical, but the parameter space takes care of T cell number separately from other peptide-related attributes. 

In [None]:
# Spline and model time series for the free-alpha model (file "Sigmoid_freealpha_reg04")
df_compare_freealpha = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", 
                                    "df_compare_Sigmoid_freealpha_reg04_selectdata.hdf"))
# Drop the time points beyond the last measurement of each experiment
df_compare_freealpha = remove_artificial_times(df_compare_freealpha, last_times_dict)
# Dataset shown in the example panels of this model
dset_freealpha = "Activation_Timeseries_1"

In [None]:
print(df_compare_freealpha.index.get_level_values("Data").unique())

In [None]:
# Integrals
# Time courses of the "integral" feature for dset_freealpha, 100k T cells, 
# restricted to the 1uM and 10nM concentrations
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_freealpha.xs(dset_freealpha, level="Data", axis=0)
                .xs("100k", level="TCellNumber", axis=0)
                .xs("integral", level="Feature", axis=0), 
            name="Concentration", lvl_slice=["1uM", "10nM"], axis=0),
    feat_name="LS", maxwidth=1.5, do_leg=False, 
    fontsize=6, handlelength=2.)
# No legend requested here: the figure is 3/4 of the width used for panels drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "sigmoid_freealpha_integrals_latent_timecourses.pdf"))

In [None]:
# N1 vs N2
# Latent space plot of the "integral" feature for dset_freealpha, 100k T cells, 
# restricted to peptides N4, Q4, T4 and V4
fig, [ax1, axleg, leg] = latentspace_smallplot(
    custom_pd.xs_slice(df_compare_freealpha.xs(dset_freealpha, level="Data", axis=0)
                        .xs("100k", level="TCellNumber", axis=0)
                        .xs("integral", level="Feature", axis=0),
        name="Peptide", lvl_slice=["N4", "Q4", "T4", "V4"], axis=0), 
    feat_name="LS", maxwidth=1.5, do_leg=True, 
    fontsize=6, handlelength=2.)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width, plots_height)
fig.tight_layout()
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#    "sigmoid_freealpha_integrals_latent_trajectories.pdf"))

In [None]:
# Parameters
# Fitted parameters of the free-alpha model
df_params_freealpha = pd.read_hdf(os.path.join(main_dir_path, "results", "fits", 
                                    "df_params_Sigmoid_freealpha_reg04_selectdata.hdf"))

# Remove concentrations below 1nM
df_params_freealpha = custom_pd.xs_slice(df_params_freealpha, name="Concentration", 
                                   lvl_slice=["1uM", "300nM", "100nM", "30nM", "10nM", 
                                             "3nM", "1nM"], axis=0)

# Correct column names so they are formatted to math
# Start from an identity map so that columns without a LaTeX form keep their name
param_name_map = {a:a for a in df_params_freealpha.columns}
param_name_map.update(param_formatting_dict)
df_params_freealpha.columns = df_params_freealpha.columns.map(param_name_map)

In [None]:
print(df_params_freealpha.index.get_level_values("Data").unique())

In [None]:
# Parameter space of the free-alpha model, rows with 80k or 100k T cells only. 
# Only four of the (already renamed) parameter columns are shown.
fig, [axes, axleg, leg] = paramspace_smallplots(
    df_params_freealpha.loc[df_params_freealpha.index.isin(["80k", "100k"], level="TCellNumber")][["a_0", r"\tau_0", r"\theta", r"v_{t1}"]], 
    hue_level_name="Peptide", style_level_name=None, size_level_name="Concentration", 
    do_leg=True, maxsize=5., fontsize=6, handlelength=1, ncol=2)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width*1.75, plots_height*2)
fig.tight_layout(w_pad=0.5, h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "param_plots_sigmoid_freealpha_100k_4paramsshown.pdf"), 
#            transparent=True, bbox_inches="tight", bbox_extra_artists=(leg,))
plt.show()
plt.close()

# Comparison of models for concentration
Choose the same dataset for all compared models, but a different dataset from the one used above, to show that the models fit more than a single experiment.

In [None]:
dset_conc = "TCellNumber_OT1_Timeseries_7"

In [None]:
# Constant velocity model
# Time courses of the "concentration" feature for dset_conc, 100k T cells, 
# restricted to the 1uM and 10nM concentrations
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_velo.xs(dset_conc, level="Data", axis=0)
                        .xs("100k", level="TCellNumber", axis=0)
                        .xs("concentration", level="Feature", axis=0),
        name="Concentration", lvl_slice=["1uM", "10nM"], axis=0), 
    feat_name="ls", maxwidth=1.5, do_leg=False, 
    fontsize=6, handlelength=2.)
# leg.get_frame().set_linewidth(0.0)
# No legend requested here: the figure is 3/4 of the width used for the panel drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "constant_velocity_concentrations_latent_timecourses.pdf"))

In [None]:
# Fixed alpha model
# Time courses of the "concentration" feature for dset_conc, 100k T cells, 
# restricted to the 1uM and 10nM concentrations
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_fixalpha.xs(dset_conc, level="Data", axis=0)
                .xs("100k", level="TCellNumber", axis=0)
                .xs("concentration", level="Feature", axis=0),
        name="Concentration", lvl_slice=["1uM", "10nM"], axis=0),
    feat_name="ls", maxwidth=1.5, do_leg=False, 
    fontsize=6, handlelength=2.)
# leg.get_frame().set_linewidth(0.0)
# No legend requested here: the figure is 3/4 of the width used for the panel drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#            "sigmoid_fixalpha_concentrations_latent_timecourses.pdf"))

In [None]:
# Free alpha model
# Time courses of the "concentration" feature for dset_conc, 100k T cells, 
# restricted to the 1uM and 10nM concentrations. This is the only one of the three
# concentration panels drawn with a legend, hence the full plots_width.
fig, [ax1, ax2, axleg, leg] = timecourse_smallplots(
    custom_pd.xs_slice(df_compare_freealpha.xs(dset_conc, level="Data", axis=0)
                .xs("100k", level="TCellNumber", axis=0)
                .xs("concentration", level="Feature", axis=0), 
        name="Concentration", lvl_slice=["1uM", "10nM"], axis=0), 
    feat_name="ls", maxwidth=1.5, do_leg=True, 
    fontsize=6, handlelength=2.)
# Draw the legend frame with zero line width, i.e. no visible border
leg.get_frame().set_linewidth(0.0)
fig.set_size_inches(plots_width, plots_height)
fig.tight_layout(h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#            "sigmoid_freealpha_concentrations_latent_timecourses.pdf"))

## Fit residuals on concentrations and integrals
Per peptide, averaged over all conditions, on each node.  

In [None]:
def compute_residuals(df, gby_extras=()):
    """Mean squared difference between the spline and the fitted time series.

    Parameters
    ----------
    df : pd.DataFrame
        Row index must contain the levels "Processing type" (with the labels
        "Splines" and "Fit") and "Feature", plus every level in gby_extras.
    gby_extras : iterable of str
        Extra index levels to group by in addition to "Feature"; all other
        levels are averaged over.

    Returns
    -------
    The mean squared residuals with "Feature" moved from the row index
    to the innermost column level.
    """
    level_name = "Processing type"
    splines = df.xs("Splines", level=level_name)
    fits = df.xs("Fit", level=level_name)
    squared_diff = (splines - fits) ** 2
    group_levels = ["Feature"] + list(gby_extras)
    mean_squared = squared_diff.groupby(group_levels).mean()
    return mean_squared.unstack("Feature")

In [None]:
# Compute residuals for integrals and concentrations
# Grouping by "Time" (in addition to "Feature") gives one mean squared residual per time point
groupby_extras=("Time",)
# NOTE(review): df_compare_velo was restricted to peptides N4, Q4, T4 and V4 when it was loaded, 
# while df_compare_fixalpha and df_compare_freealpha were not. Confirm that the three models
# are meant to be compared on different sets of peptides.
df_res_velo = compute_residuals(df_compare_velo, gby_extras=groupby_extras)
df_res_fixalpha = compute_residuals(df_compare_fixalpha, gby_extras=groupby_extras)
df_res_freealpha = compute_residuals(df_compare_freealpha, gby_extras=groupby_extras)
print(df_res_velo)

In [None]:
# Stack the residuals of the three models under a new outer index level "Model". 
# The keys are used as plot labels later on, hence the line breaks and math markup.
df_res_all = pd.concat({"Constant \nvelocity":df_res_velo, 
                        "Matching, \nfixed " + r"$\alpha$": df_res_fixalpha, 
                        "Matching, \nfree " + r"$\alpha$": df_res_freealpha}, 
                       axis=0, names=["Model"])

In [None]:
# Plots for integrals
def plot_residuals_feature(df_all, feat="integral", colmap="cubehelix", do_leg=True, **kwargs):
    """Plot the squared residuals of each model against time, one panel per node.

    Parameters
    ----------
    df_all : pd.DataFrame
        Residuals indexed by the levels "Model" and "Time", with two-level
        columns addressed as ("Node 1", feat) and ("Node 2", feat).
    feat : str
        Feature (second column level) to plot. "concentration" labels the
        y axes "ls"; any other value labels them "LS".
    colmap : str or list
        Name of a seaborn palette, or an explicit list of colors. Colors and
        line styles are cycled if there are more models than entries.
    do_leg : bool
        If True, an extra grid column is added and the legend is drawn there.
    **kwargs
        Passed to axes.legend(); they override the default anchor and location.

    Returns
    -------
    fig, axes, legax
        The figure, the list of the two node axes, and the legend axis
        (None when do_leg is False).

    Raises
    ------
    TypeError
        If colmap is neither a str nor a list.
    """
    fig = plt.figure()
    # The legend, when requested, lives in an extra fourth grid column
    gs = fig.add_gridspec(nrows=2, ncols=4 if do_leg else 3)
    axes = [fig.add_subplot(gs[0, :3])]
    axes.append(fig.add_subplot(gs[1, :3], sharex=axes[0]))

    models = df_all.index.get_level_values("Model").unique()
    if isinstance(colmap, str):
        # At least 4 colors, as before, so figures with up to 4 models keep their colors
        n_colors = max(4, len(models))
        colors = [sns.set_hls_values(a, s=1) for a in sns.color_palette(colmap, n_colors)]
    elif isinstance(colmap, list):
        colors = colmap
    else:
        raise TypeError("{} not a supported type for colmap".format(type(colmap)))
    styles = ["-", ":", "--", "-."]
    for i, mod in enumerate(models):
        df_mod = df_all.loc[mod]
        # Use this model's own time points, so x and y always have matching lengths
        times = df_mod.index.get_level_values("Time").map(float)
        color = colors[i % len(colors)]
        style = styles[i % len(styles)]
        for ax, node in zip(axes, ("Node 1", "Node 2")):
            ax.plot(times, df_mod[(node, feat)], label=mod, color=color,
                    ls=style, lw=2.)

    # Adjust size, labels, etc. 
    axes[1].set_xlabel("Time [h]", size=8)
    lbl = "ls" if feat=="concentration" else "LS"
    for i in range(2):
        axes[i].tick_params(axis="both", length=2., width=0.5, labelsize=6.)
        axes[i].set_yscale("log")
        axes[i].set_ylabel(r"Residuals$^2$ ${}_{}$".format(lbl, i+1), size=8)
    # Add a legend
    if do_leg:
        kwargs2 = dict(bbox_to_anchor=(0, 0), loc="lower left")
        kwargs2.update(kwargs)
        legax = fig.add_subplot(gs[:, -1])
        # Both panels show the same models, so the handles of the second one suffice
        legax.legend(*axes[1].get_legend_handles_labels(), **kwargs2)
        legax.set_axis_off()
    else:
        legax = None
        
    return fig, axes, legax

In [None]:
# Three model colors: fully saturated (s=1) cubehelix colors, where [-2::-1] drops 
# the last of the 4 palette colors and reverses the order of the remaining 3
cmap = [sns.set_hls_values(a, s=1) for a in sns.color_palette("cubehelix", 4)][-2::-1]
fig, axes, legax = plot_residuals_feature(df_res_all, feat="integral", colmap=cmap, do_leg=False)
# No legend requested here: the figure is 3/4 of the width used for the panel drawn with a legend
fig.set_size_inches(plots_width*3/4, plots_height)
fig.tight_layout(h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "residuals_integrals.pdf"), transparent=True)
plt.show()
plt.close()

In [None]:
# Same plot for the "concentration" feature, this time with the legend; 
# the extra keyword arguments are forwarded to the legend call
fig, axes, legax = plot_residuals_feature(df_res_all, feat="concentration", colmap=cmap, 
                                          do_leg=True, fontsize=6, handlelength=2., labelspacing=1., 
                                         bbox_to_anchor=(0, -0.12))
fig.set_size_inches(plots_width, plots_height)
fig.tight_layout(h_pad=0.5)
# Draw the legend frame with zero line width, i.e. no visible border
legax.get_legend().get_frame().set_linewidth(0.0)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "residuals_concentrations.pdf"), transparent=True, 
#            bbox_extra_artists=(legax.get_legend(),), bbox_inches="tight")
plt.show()
plt.close()

# Effect of T cell number on fits of different models – removed from paper
We did not discuss this at length, but fitting the $\alpha$ parameter in the constant force model with matching improves fit residuals for T cell numbers different from 100k initial cells. Also, fitting $\alpha$ makes the $a_0$ vs $\tau_0$ correlation collapse onto a single diagonal, while that correlation has different slopes for different T cell numbers when $\alpha$ is fixed. 

In [None]:
# Residuals are now grouped per T cell number instead of per time point
groupby_extras = ("TCellNumber",)
tcnums = ["100k", "30k", "10k", "3k"]
dsets = ['Activation_TCellNumber_1']

# Residuals of the two matching models, computed on the selected datasets and T cell numbers only
df_res_all_tcn = pd.concat({
    "Matching, \nfixed " + r"$\alpha$": 
        compute_residuals(df_compare_fixalpha.loc[(dsets, tcnums), :], groupby_extras), 
    "Matching, \nfree " + r"$\alpha$": 
        compute_residuals(df_compare_freealpha.loc[(dsets, tcnums), :], groupby_extras)
    }, names=["Model"], axis=0)
df_res_all_tcn = df_res_all_tcn.groupby(["TCellNumber", "Model"]).mean()
df_res_all_tcn.columns = df_res_all_tcn.columns.set_names(["Node", "Feature"])
# Stacking then unstacking "Node" moves it to the innermost column level: 
# columns are addressed as (Feature, Node) from here on
df_res_all_tcn = df_res_all_tcn.stack("Node").unstack("Node")

In [None]:
# One fully saturated cubehelix color per model, taken over all three models so that
# each model keeps the same color whichever subset of models is plotted
all_models = df_res_all.index.get_level_values("Model").unique()
colors_models = [sns.set_hls_values(a, s=1) for a in sns.color_palette("cubehelix", len(all_models))]
colormap_models = {all_models[i]:colors_models[i] for i in range(len(all_models))}

# One bar plot panel per node, for the concentration residuals (scaled by 1e5)
node_columns = [("concentration", "Node 1"), ("concentration", "Node 2")]
fig, axes, legax = barplots_levels(
    df_res_all_tcn[node_columns]*1e5, 
    hue_lvl="Model", x_lvl="TCellNumber", groupwidth=0.7, hue_map=colormap_models, hue_reverse=False)

# Rectify the y axis labels and title
axes[0].set_ylabel(r"$10^5 \, \times$ Res${}^2$", size=8)
axes[0].set_xlabel("T cell number", size=8)
# Loop over the remaining node panels. The bound is the number of plotted node columns,
# not the number of models: the two only coincide because both equal 2 here.
for i in range(1, len(node_columns)):
    axes[i].set_ylabel("")
    axes[i].set_xlabel("T cell number", size=8)
axes[0].set_title("Node 1", fontsize=8)
axes[1].set_title("Node 2", fontsize=8)

fig.set_size_inches(4.25, 1.8)  # Smaller nice size: 3.5, 1.6
fig.tight_layout(h_pad=0.5, w_pad=0.5)
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "residuals_tcellnumber_alpha.pdf"), 
#        transparent=True, bbox_inches="tight")
plt.show()
plt.close()

## Parameter pairplots colored per T cell number
Take 1 (maybe 2 very similar) datasets where we can see how fitting alpha makes the a0 vs tau0 curves collapse on top of each other. 

- Show a_0 vs tau_0 for fixed and free alpha; 
    - Add linear or quadratic regression fits for each TCN? 
- Show a KDE plot of the $\alpha$ parameter
- Try to respect the color map used for each model, rather than the color map viridis for T cell number? 

In [None]:
# Prepare colors: fixed alpha
# One color per T cell number, generated from the color assigned to the fixed-alpha model
# colors_fixed_alpha = create_midseeded_clist(colormap_models.get("Matching, \nfixed " + r"$\alpha$"), len(tcnums))
colors_fixed_alpha = create_cmap_seed(colormap_models.get("Matching, \nfixed " + r"$\alpha$"), 
                                      n_colors = len(tcnums), light=False)
#colors_fixed_alpha = sns.light_palette(colormap_models.get("Matching, \nfixed " + r"$\alpha$"), n_colors=len(tcnums)+1)[1:]

In [None]:
# Prepare colors: free alpha
# One color per T cell number, generated from the color assigned to the free-alpha model
#colors_free_alpha = create_midseeded_clist(colormap_models.get("Matching, \nfree " + r"$\alpha$"), len(tcnums), max_l=0.92, min_l=0.5)
colors_free_alpha = create_cmap_seed(colormap_models.get("Matching, \nfree " + r"$\alpha$"), 
                                      n_colors = len(tcnums), light=False)
#colors_free_alpha = sns.light_palette(colormap_models.get("Matching, \nfree " + r"$\alpha$"), n_colors=len(tcnums)+1)[1:]

In [None]:
# Cast all parameter columns to float64 before fitting and plotting
df_params_freealpha = df_params_freealpha.astype(np.float64)
df_params_fixalpha = df_params_fixalpha.astype(np.float64)
# Candidate datasets; note that this overwrites the dsets list used for the residual bar plots
dsets = ['Activation_TCellNumber_1', "TCellNumber_OT1_Timeseries_7", "Activation_TCellNumber_2", 
        "TCellNumber_1", "TCellNumber_2"]
# Keep a one-element list holding only the second candidate, "TCellNumber_OT1_Timeseries_7"
dsets = dsets[1:2]

In [None]:
def pairwise_params_plot_linear_fit(df, dsets, params, tcns, colors, ax):
    """Scatter one fit parameter against another and overlay a quadratic trend
    line for each T cell number.

    Despite the name, the trend is a quadratic polynomial without constant
    term (only the degree 1 and 2 terms are fitted).

    Parameters
    ----------
    df : pd.DataFrame
        Parameter table. Its row index must have the dataset as first level,
        the T cell number as second level, and a level named "Peptide".
    dsets : list
        Dataset labels to include.
    params : sequence of str
        Column names of the x and y parameters, in that order.
    tcns : sequence
        T cell number labels to plot; each gets its own color and trend line.
    colors : sequence
        One color per entry of tcns.
    ax : matplotlib axes
        Axes to draw on.

    Returns
    -------
    ax, coefs
        The axes, and the polynomial coefficients (lowest degree first) of the
        last T cell number plotted, or None if tcns is empty.
    """
    # One marker per peptide. All entries are distinct (the list previously held
    # "o" twice, which made the first and third peptides indistinguishable).
    markers = ["o", "s", "^", "X", "P", "1", "2", "3", "4", "8", "*", "D"]
    parx, pary = params[0], params[1]
    # Peptide order is taken over all selected datasets, so that a peptide keeps
    # the same marker for every T cell number
    peptides = df.loc[dsets].index.get_level_values("Peptide").unique()
    coefs = None
    for i, tcn in enumerate(tcns):
        x, y = df.loc[(dsets, tcn), parx], df.loc[(dsets, tcn), pary]
        peptides_here = set(x.index.get_level_values("Peptide"))
        for j, pep in enumerate(peptides):
            # A peptide may be absent for this T cell number: skip it instead of failing
            if pep not in peptides_here:
                continue
            ax.plot(x.xs(pep, level="Peptide").values, y.xs(pep, level="Peptide").values, 
                    color=colors[i], ls="none", marker=markers[j % len(markers)], ms=4)

        # Quadratic fit to emphasize the collapse or lack thereof. Force through zero
        # (deg=[1, 2] fits only the linear and quadratic terms)
        coefs = np.polynomial.polynomial.polyfit(x.values, y.values, deg=[1, 2], rcond=None, full=False)
        xrange = np.arange(x.min(), x.max()+0.025, 0.05)
        ax.plot(xrange, np.polynomial.polynomial.polyval(xrange, coefs), ls="-", lw=1., 
                color=colors[i], label=tcn)
    return ax, coefs

In [None]:
# First plot: tau_0 vs a_0 for fixed alpha
fig, axes = plt.subplots(2, 2, sharex="col")
fig.set_size_inches(4.25, 3.25)

ax, coefs = pairwise_params_plot_linear_fit(df_params_fixalpha, dsets, 
                        ["a_0", r"\tau_0"], tcnums, colors_fixed_alpha, axes[0, 0])

# Second plot: same thing, for free alpha
ax, coefs = pairwise_params_plot_linear_fit(df_params_freealpha, dsets, 
                        ["a_0", r"\tau_0"], tcnums, colors_free_alpha, axes[1, 0])

# Third plot: KDE of alpha
# hue_order pins each T cell number to the color it has in the scatter panels and in the
# legend; without it, colors are assigned in the order the labels appear in the data
sns.kdeplot(data=df_params_freealpha.loc[dsets].reset_index(), x=r"\alpha", hue="TCellNumber", 
            hue_order=tcnums, palette=colors_free_alpha, 
            ax=axes[1, 1], legend=False, fill=True)

# Label plots properly
for i in range(2):
    axes[i, 0].set_ylabel(r"$\tau_0$ [-]", size=9)
    axes[i, 0].set_xlabel(r"$a_0$ [-]", size=9)
    axes[i, 0].tick_params(axis="both", width=0.5, length=2.5, labelsize=7)
axes[1, 1].set_xlabel(r"$\alpha$ [-]", size=9)
axes[1, 1].set_ylabel("Density [-]", size=9)
axes[1, 1].tick_params(axis="both", width=0.5, length=2.5, labelsize=7)


# Legend
models = list(df_res_all_tcn.index.get_level_values("Model").unique())
# Fixed-alpha model first, free-alpha model second, matching the order of hue_maps below
models.sort(key=lambda x: x.count("free"))
# Strip the line breaks from the model names. str.replace returns a new string, so the
# result has to be stored (it was previously discarded, leaving the names unchanged).
models = [m.replace("\n", "") for m in models]
legd = add_legend_subtitles_huemaps(models, hue_maps=[
            {tcnums[i]:colors_fixed_alpha[i] for i in range(len(tcnums))}, 
            {tcnums[i]:colors_free_alpha[i] for i in range(len(tcnums))}], 
        ax=axes[0, 1], hue_levels_order=tcnums[::-1],
        fontsize=8, ncol=2, borderaxespad=-2.5, loc="upper left", 
        bbox_to_anchor=(0.05, 0.65), frameon=False, columnspacing=-4.)

# The top-right panel only hosts the legend
axes[0, 1].set_axis_off()
fig.tight_layout()
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", "a0-tau0_slope_alphaKDE_{}.pdf".format(dsets[0])), transparent=True)
plt.show()
plt.close()

# Revision figure: more examples of LS1 and LS2 fits
Just show multiple $LS_1$, $LS_2$ plots, fits vs splines. For the force model with matching. 

Use the ensemble of datasets I called "selectdata": these are 14 datasets with time series for naive CD8+ OT-1 T cells with no known major experimental problem. 

Of course, there is variability, we know that some look less like the most reliable ones, but the models fit anyways. Reconstruction would be harder, but even in that case, I tried fitting a decoder (linear regression with quadratic terms tanh integrals included) on those 15 datasets, and the R^2 was > 0.90, which is pretty good considering that variability and some minor flaws exhibited by those decoders when we use them to reconstruct cytokines from model trajectories. 

Also a plot of total reconstruction error over time; use relative error and average over all time series in a dataset to be able to compare. Make a bar graph with one bar per experiment.  
Could compare constant velocity model and force model: two bars per dataset. 

In [None]:
# Check datasets available, should be the same for both models. 
print(df_compare_freealpha.index.get_level_values("Data").unique())
print(df_compare_velo.index.get_level_values("Data").unique())

In [None]:
# To properly rank peptides  according to antigenicity
peptide_ranks = {"N4":13, "Q4":12, "T4":11, "V4":10, "G4":9, "E1":8,
              "A2":7, "Y3":6, "A8":5, "Q7":4}

def choose_traj(df, randomgen, size=1, per_lvl="Peptide", do_not_sample=()):
    """ Group df rows by per_lvl and randomly draw up to <size> rows per group, 
    without replacement. 

    Parameters
    ----------
    df : pd.DataFrame
        Data to sample from; per_lvl and every entry of do_not_sample must be
        names of row index levels.
    randomgen
        Random state passed to the pandas sampling functions.
    size : int
        Number of samples wanted per group. A group with fewer than size
        candidates contributes all of them.
    per_lvl : str
        Index level defining the groups.
    do_not_sample : iterable of str
        Levels that should not be considered as separate samples, e.g. time
        if we want to select a certain number of whole time series.

    Returns
    -------
    pd.DataFrame
        The sampled rows, with the do_not_sample levels stacked back into
        the row index.
    """
    df2 = df
    # Unstack the levels to ignore: they move to the columns, so that one row
    # (e.g. one whole time series) is sampled as a single unit
    for lvl in do_not_sample:
        df2 = df2.unstack(lvl)
    # Select size conditions per group
    try:
        df_sampled = df2.groupby(per_lvl).sample(n=size, replace=False, random_state=randomgen)
    except ValueError:  # Too few samples available for some group
        # Sample each group separately, capping the sample size at the group size.
        # Sampling the group itself (rather than gp.loc[ind]) keeps the full original
        # index, so this works wherever per_lvl sits in the index and yields the same
        # index layout as the branch above.
        df_sampled = pd.concat(
            [gp.sample(n=min(size, gp.shape[0]), replace=False, random_state=randomgen)
             for _, gp in df2.groupby(per_lvl)],
            axis=0)
    # Stack back ignored levels. 
    for lvl in do_not_sample:
        df_sampled = df_sampled.stack(lvl)
    return df_sampled


# Used to pick a replacement TCellNumber label when 100k is not available
def get_closest_tcn(target, choices):
    """Return the entry of choices whose cell count is closest to that of target.

    Labels are strings such as "100k" or "1M": the numeric characters and dots
    form the number, the letters form the unit ("k" = 1000, "M" = 1e6; no unit
    or an unknown unit = 1). On a tie, the first closest choice is returned.
    """
    unit_factors = {"k":1000, "M":int(1e6), "":1}

    def _is_number_char(ch):
        return ch.isnumeric() or ch == "."

    def _label_to_count(label):
        # Split the label into its numeric part and its unit letters
        number = "".join(ch for ch in label if _is_number_char(ch))
        unit = "".join(ch for ch in label if not _is_number_char(ch) and ch.isalpha())
        return float(number) * unit_factors.get(unit, 1)

    # Convert every choice first, then the target
    choice_counts = np.asarray([_label_to_count(label) for label in choices])
    target_count = _label_to_count(target)
    # Position of the choice with the smallest absolute difference to the target
    closest = np.argmin(np.abs(choice_counts - target_count))
    return choices[closest]

In [None]:
# Make one big plot with 12 datasets (showing one HighMI_1 repeat)
# So this involves 3*4*2 plots, times 2 models. 
# Let's do this: randomly select 6 of 12 datasets; if HighMI_1 selected pick one replicate
# Then make four columns with two groups of two, one per dataset, and six rows, three groups of two, on
# Rows alternate between LS1 and LS2 this way. 
# Randomly sample two conditions per peptide for each dataset (else graph is too crowded)

# Add spacer column in the middle
# No need for spacer rows because the title on top of each plot will solve it. 
# But need one thin row at the top for titles of groups of 2 columns.
def example_fits_plots(feat, dfv, dff, seed=249824358):
    """Plot example model fits versus data for randomly selected datasets.

    Datasets are laid out on a grid of 3 rows by 2 columns. For each dataset,
    a 2 x 2 group of panels is drawn: one row per latent space variable
    ("Node 1", "Node 2") and one column per model (velocity model from
    `dfv`, force model from `dff`). A thin row above each group holds the
    dataset name and a last, narrow column holds the legend.

    This function relies on names defined elsewhere in the notebook:
    `dset_freealpha`, `peptide_ranks`, `choose_traj`, `create_labeling` and
    `add_hue_size_style_legend`.

    Args:
        feat (str): entry of the "Feature" index level to plot. The y labels
            use "LS" when feat == "integral" and "ls" otherwise.
        dfv (pd.DataFrame): model vs data comparison for the velocity model,
            with index levels including "Data", "Feature", "TCellNumber" and
            "Peptide". The list of available datasets is taken from it.
        dff (pd.DataFrame): same as `dfv`, for the force model.
        seed (int): seed of the random generator used to select datasets,
            HighMI_1 replicates, and the time series shown per peptide.

    Returns:
        fig (matplotlib.figure.Figure): the figure.
        axes (np.ndarray of Axes): shape [n_datasets, 2, 2], indexed
            [dataset, node, model].
        axleg (matplotlib.axes.Axes): the axis containing the legend.
    """
    nrows = 3  # Number of rows of datasets
    ncols = 2  # Number of columns of datasets
    n_dsets = nrows * ncols    # Number of datasets to show
    n_subrows = 2  # Number of variables
    n_subcols = 2  # Number of models
    fig = plt.figure()
    # Per dataset row: one thin title row, then one row per variable.
    # Columns: two per dataset, a thin spacer in the middle, and an extra
    # column at the end for legends.
    gs = fig.add_gridspec(nrows=nrows*(n_subrows+1), ncols=ncols*n_subcols+1+1,
                          height_ratios=[0.15, 1.15, 1.15]*nrows,
                          width_ratios=[1.2, 1.2, 0.1, 1.2, 1.2, 0.3])
    fig.set_size_inches(ncols*n_subcols*1.25+0.3, nrows*n_subrows*1.15)

    # Legend axis
    axleg = fig.add_subplot(gs[:, -1])
    axleg.set_axis_off()

    # 3D array of plots, indexed data, node, model
    axes = np.zeros([n_dsets, n_subrows, n_subcols], dtype=object)
    for i in range(n_dsets):
        irow, icol = i // ncols, i % ncols
        for j in range(n_subrows):
            for k in range(n_subcols):
                # x axis shared with the top panel of the first dataset in
                # the same column; y axis shared across all panels of node j.
                sharex = None if (j == 0 and i < ncols) else axes[icol, 0, k]
                sharey = None if (k == 0 and i == 0) else axes[0, j, 0]
                # +1 in the row skips the title row of the group; the column
                # offset n_subcols+1 skips the spacer column.
                ax = fig.add_subplot(gs[irow*(n_subrows+1)+j+1, icol*(n_subcols+1) + k],
                                     sharex=sharex, sharey=sharey)
                axes[i, j, k] = ax

    # Select datasets randomly
    rgen = np.random.default_rng(seed=seed)
    dsets_available = dfv.index.get_level_values("Data").unique().to_list()
    # Remove the dataset that we already plotted in the supplementary figure above
    dsets_available.remove(dset_freealpha)
    # Count the four HighMI_1 replicates as a single dataset for the draw
    for i in range(1, 5):
        dsets_available.remove("HighMI_1-{}".format(i))
    dsets_available.append("HighMI_1")
    # Object dtype, so replacing an entry below can never be truncated to
    # the width of a fixed-length numpy string array.
    dataset_selection = rgen.choice(dsets_available, size=n_dsets, replace=False).astype(object)
    # If HighMI_1 is there, randomly select one of the replicates
    for i in range(len(dataset_selection)):
        if dataset_selection[i] == "HighMI_1":
            dataset_selection[i] = "HighMI_1-{}".format(rgen.choice(np.arange(1, 5), size=1)[0])
    print(dataset_selection)

    # Title axes: one invisible axis spanning the columns of each dataset,
    # in the thin row above its panels, annotated with the dataset name.
    title_axes = np.zeros([nrows, ncols], dtype=object)
    for i in range(nrows):
        for k in range(ncols):
            dset_idx = i*ncols + k
            first_col = k*(n_subcols+1)  # +1 skips the spacer column
            title_axes[i, k] = fig.add_subplot(gs[i*(n_subrows+1), first_col:first_col+n_subcols])
            title_axes[i, k].set_axis_off()
            dsetlbl = dataset_selection[dset_idx]
            # Abbreviate long dataset names, keeping the last two characters
            if len(dsetlbl) > 20:
                dsetlbl = dsetlbl[:18] + "..." + dsetlbl[-2:]
            title_axes[i, k].annotate(dsetlbl,  xy=(0.5, 1.0),
                xycoords="axes fraction", va="top", ha="center", fontsize=9)

    # Finally, we are ready to plot the model vs data time series
    for k in range(n_dsets):
        axesk = axes[k]  # 2D array of this dataset's panels, [node, model]
        # Select the dataset, peptides and TCellNumber to plot
        tcn = "100k"
        df_velo_plot = (dfv.xs(dataset_selection[k], level="Data")
                        .xs(feat, level="Feature", axis=0))
        # In case 100k is not available, take closest, e.g. 80k in TCellNumber_1
        try:
            df_velo_plot = df_velo_plot.xs(tcn, level="TCellNumber", axis=0)
        except KeyError:
            tcn = get_closest_tcn(tcn, df_velo_plot.index.get_level_values("TCellNumber").unique().to_list())
            df_velo_plot = df_velo_plot.xs(tcn, level="TCellNumber", axis=0)
        df_velo_plot = df_velo_plot.loc[df_velo_plot.index.isin(["N4", "Q4", "T4", "V4"], level="Peptide")]

        # Same selection for the force model, with the TCellNumber found above
        df_force_plot = (dff.xs(dataset_selection[k], level="Data")
                         .xs(feat, level="Feature", axis=0))
        df_force_plot = df_force_plot.loc[df_force_plot.index.isin(["N4", "Q4", "T4", "V4"], level="Peptide")]
        df_force_plot = df_force_plot.xs(tcn, level="TCellNumber", axis=0)

        # Select only two time series per peptide. The same seed is passed
        # for both models (presumably so the same conditions are drawn;
        # depends on choose_traj -- TODO confirm).
        next_seed = rgen.integers(int(2**32) - 1, size=1)
        df_velo_plot = choose_traj(df_velo_plot, next_seed, size=2,
                            per_lvl="Peptide", do_not_sample=["Time", "Processing type"])
        df_force_plot = choose_traj(df_force_plot, next_seed, size=2,
                            per_lvl="Peptide", do_not_sample=["Time", "Processing type"])

        legend_info = create_labeling(df_velo_plot, maxwidth=1.25)
        hue_level_name, hues = legend_info[0]
        size_level_name, sizes = legend_info[1]
        style_level_name, styles = legend_info[2]

        # Plot each LS variable for each model
        for j in range(n_subcols):
            df_feature = df_velo_plot if j == 0 else df_force_plot
            model = "Velocity model" if j == 0 else "Force model"
            df_feature = df_feature.unstack("Time")
            # Position, in each index key, of the level giving hue/size/style
            index_names = list(df_feature.index.names)
            hue_pos = index_names.index(hue_level_name)
            if size_level_name != "":
                size_pos = index_names.index(size_level_name)
            else:
                size_pos = None
            if style_level_name != "":
                style_pos = index_names.index(style_level_name)
            else:
                style_pos = None
            axesk[0, j].set_title(model, fontsize=8)
            # Plot each line
            for key in df_feature.index:
                times = df_feature.loc[key].index.get_level_values("Time").unique()
                times = sorted(times, key=float)
                times_f = np.asarray(times)
                n1_vals = df_feature.loc[key, "Node 1"].loc[times]
                n2_vals = df_feature.loc[key, "Node 2"].loc[times]
                # Load the color, linestyle, size corresponding to this key
                hue = hues.get(key[hue_pos])
                siz = sizes.get(key[size_pos]) if size_pos is not None else 2.
                sty = styles.get(key[style_pos]) if style_pos is not None else "-"
                axesk[0, j].plot(times_f, n1_vals, color=hue, lw=siz, ls=sty)
                axesk[1, j].plot(times_f, n2_vals, color=hue, lw=siz, ls=sty)

            # Tick formatting of both panels of this model column. This used
            # to be applied to an undefined name `ax1`.
            for ax_node in axesk[:, j]:
                ax_node.tick_params(which="both", length=1.5, width=0.5, labelsize=6.)

            # More formatting, set off tick labels if shared axes, etc.
            # y axes labels and tick labels: only on the leftmost panels
            if j == 0 and (k % ncols) == 0:
                lbl = "LS" if feat == "integral" else "ls"
                axesk[0, j].set_ylabel(r"${}_1(t)$ (a.u.)".format(lbl), size=8)
                plt.setp(axesk[0, j].get_yticklabels(), size=7)
                axesk[1, j].set_ylabel(r"${}_2(t)$ (a.u.)".format(lbl), size=8)
                plt.setp(axesk[1, j].get_yticklabels(), size=7)
            else:
                # Use setp, which loops over labels to set them all to invisible
                plt.setp(axesk[0, j].get_yticklabels(), visible=False)
                plt.setp(axesk[1, j].get_yticklabels(), visible=False)

            # x axes labels and tick labels: hide all except last row of
            # datasets (the row of dataset k is k // ncols)
            if (k // ncols) == nrows - 1:
                axesk[1, j].set_xlabel("Time (h)", size=8)
                axesk[1, j].set_xticks([0, 24, 48, 72])
                axesk[1, j].set_xticklabels([0, 24, 48, 72])
                plt.setp(axesk[1, j].get_xticklabels(), size=7)
                axesk[0, j].set_xlabel("")
                plt.setp(axesk[0, j].get_xticklabels(), visible=False)
            else:
                axesk[0, j].set_xlabel("")
                axesk[1, j].set_xlabel("")
                plt.setp(axesk[1, j].get_xticklabels(), visible=False)
                plt.setp(axesk[0, j].get_xticklabels(), visible=False)

        # Legend built once, from the labeling of the first dataset
        if k == 0:
            add_hue_size_style_legend(
                axleg, hues, sizes, styles, size_level_name,
                style_level_name, hue_sort_key=peptide_ranks.get,
                frameon=False, fontsize=7)
    fig.tight_layout(w_pad=0.2, h_pad=0.2)
    return fig, axes, axleg

In [None]:
# Example fits for the "integral" feature, using the default seed of
# example_fits_plots. The savefig call is left commented out.
fig, axes, axleg = example_fits_plots("integral", df_compare_velo, df_compare_freealpha)
#fig.savefig(os.path.join(main_dir_path, "figures", "fits", "revision_fig_more_model_fits_integrals.pdf"), 
#    transparent=True, bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Example fits for the "concentration" feature, with a different seed than
# the default one. The savefig call is left commented out.
fig, axes, axleg = example_fits_plots("concentration", df_compare_velo, df_compare_freealpha, seed=1009191)
#fig.savefig(os.path.join(main_dir_path, "figures", "fits", "revision_fig_more_model_fits_concentrations.pdf"), 
#    transparent=True, bbox_inches="tight")
plt.show()
plt.close()

### Summary of residuals

Residuals are normalized by the range of the variable ($LS_1$, $LS_2$, $ls_1$, $ls_2$) in each dataset.  This is really the only non-arbitrary measurement of the latent space variables' magnitude. 
Indeed, it is arbitrary whether nodes are centered at zero or off-origin; the latter would artificially reduce the relative error. This is especially true since we work in log scale: the zero only gives an indication of the scale, it is not an absolute reference. So, residuals of uniform magnitude in log scale, no matter the value of the log, are already residuals proportional to the signal. 

We consider signed residuals rather than absolute (RMS or squared) residuals because:
  - We want to show that the model is not biased towards fitting consistently lower or higher values than the data (splines). So, the mean should be close to zero
  - We want to show the typical deviation above and below the spline data; they should also be similar. 

We normalize the residuals of the LS variable by its range (max - min across conditions) in each dataset, because:
  - The distribution of the LS variable itself above or below zero is arbitrary (rotations of the latent space are irrelevant to antigen encoding and the amount of information in the trajectories). 
  - The usual relative error (error divided by the value of the LS variable) blows up when the LS variable crosses zero; the model is not supposed to fit better near zero values of an LS variable than elsewhere. A "small" error on the LS variable values when it is crossing zero is acceptable; intuitively, "small" refers to the full range that this variable will span over time. 
  - The location of the zero is arbitrary, just like, e.g., the choice of the origin of a coordinate system to describe real ballistic motions; it does not represent a null observable quantity. 
  - The LS variables are derived from log-transformed physical quantities (concentrations), so residuals that would be uniform across values of the LS variables are actually proportional to the magnitude of the variable in linear scale, corresponding to uniform relative error. 

In [None]:
def compute_residuals_signed(df):
    """Return the signed residuals, "Splines" minus "Fit".

    Args:
        df (pd.DataFrame): must have an index level "Processing type"
            containing the entries "Splines" and "Fit".

    Returns:
        The difference between the "Splines" and "Fit" cross-sections of
        `df`; the "Processing type" level is dropped by the cross-sections.
    """
    level_name = "Processing type"
    splines, fits = (df.xs(kind, level=level_name) for kind in ("Splines", "Fit"))
    return splines - fits

In [None]:
# Signed residuals (splines minus fit) of every time series, for each model
df_res_velo, df_res_matching = map(
    compute_residuals_signed, (df_compare_velo, df_compare_freealpha))

# Normalization factor of each feature's residuals, per dataset: the range
# (max - min) of each LS variable in the splines of that dataset.
splines_per_dset_feature = (df_compare_velo.xs("Splines", level="Processing type")
                            .groupby(["Data", "Feature"]))
range_lsLS_per_dset = splines_per_dset_feature.max() - splines_per_dset_feature.min()
range_lsLS_per_dset = range_lsLS_per_dset.unstack("Feature")
print(range_lsLS_per_dset.mean(axis=0))

# Residuals normalized per dataset, in % of the range. They are used for
# box plots and scatter plots.
df_res_velo_norm, df_res_matching_norm = (
    df_res.unstack("Feature") / range_lsLS_per_dset * 100  # in %
    for df_res in (df_res_velo, df_res_matching)
)
print(df_res_velo_norm.groupby("Data").mean())

In [None]:
# Nice box plot plus histogram of those residuals. One boxplot per LS variable; this gives four in total
# Put legend to the right (same legend for all four)
# Make one row for LS, one row for ls.

# First, prepare a nicely formatted DataFrame to slice: concatenate the
# normalized residuals of both models, adding an outer index level "Model"
# whose entries are the dictionary keys below.
plotDf = pd.concat({"Constant velocity":df_res_velo_norm, 
                    "Force w/ matching":df_res_matching_norm}, names=["Model"])
def feature_renamer(tup):
    """Build the LaTeX label of a latent space variable.

    Args:
        tup (tuple): pair (node, feat). The last element of `node` gives
            the subscript; `feat` selects "LS" when equal to "integral",
            and "ls" otherwise.

    Returns:
        str: label such as "$LS_{1}$" or "$ls_{2}$".
    """
    node, feat = tup
    if feat == "integral":
        symbol = "LS"
    else:
        symbol = "ls"
    subscript = str(node[-1])
    return "$" + symbol + "_{" + subscript + "}$"
# Remove one dataset from the box plots
plotDf = plotDf.drop(["PeptideComparison_OT1_Timeseries_20"], level="Data")  # Same as NewPeptide... 
# Sort columns in descending order of their first level
plotDf = plotDf.sort_index(axis=1, level=0, ascending=False)
# Stacking then unstacking "Feature" puts that level back in the columns as
# the last column level; columns are then sorted by "Feature", descending.
plotDf = plotDf.stack("Feature").unstack("Feature").sort_index(axis=1, level="Feature", ascending=False)
# Flatten the columns into a single level "Variable" of LaTeX labels
# (feature_renamer unpacks each column tuple as (node, feat)).
plotDf.columns = pd.Index(list(map(feature_renamer, plotDf.columns.to_list())), name="Variable")
# Long format: move "Variable" into the index and name the values "Residuals"
plotDf = plotDf.stack("Variable")
plotDf.name = "Residuals"
# Replace dataset names by integers, numbered in order of first appearance
# in the "Data" level (used as the x axis of the box plots).
datasets_to_int_map = {a:i for i, a in enumerate(plotDf.index.get_level_values("Data").unique())}
#int_to_datasets_map = {i:a for a, i in datasets_to_int_map.items()}  # To write names in the labels
plotDf = plotDf.rename(index=datasets_to_int_map, level="Data")
# Make sure of the order of index levels
plotDf = plotDf.reorder_levels(
    ["Model", "Data", 'TCellNumber', 'Peptide', 'Concentration', 'Time', 'Variable'])

# Prepare figure: 2 x 2 grid of box plot panels, plus a narrow third column
# reserved for the legend.
fig = plt.figure()
fig.set_size_inches(2.5*2 + 0.5, 2.25*2)
gs = fig.add_gridspec(nrows=2, ncols=3, width_ratios=[1.0, 1.0, 0.3])
axes = np.zeros([2, 2], dtype=object)

# First and last, same colors as in supp. fig of residuals
# The slicing below keeps entries 0 and 2 (in that order) of the
# 4-color cubehelix palette, after setting their saturation to 1.
colors_models = [sns.set_hls_values(a, s=1) for a in sns.color_palette("cubehelix", 4)][-2::-1]
colors_models = colors_models[::2][::-1]
models_order = ["Force w/ matching", "Constant velocity"]

# Loop over features
# C-order : LS1, LS2, ls1, ls2
nice_ls_names = [r"$LS_{1}$", r"$LS_{2}$", r"$ls_{1}$", r"$ls_{2}$"]
assert len(nice_ls_names) == len(plotDf.index.get_level_values("Variable").unique())
for i in range(len(nice_ls_names)):
    # Create new axis. Panels of the right column share y with the left
    # panel of the same row; panels of the bottom row share x with the top
    # panel of the same column.
    sharey = axes[i // 2, 0] if i % 2 == 1 else None
    sharex = axes[0, i % 2] if i // 2 == 1 else None
    ax = fig.add_subplot(gs[i // 2, i % 2], sharey=sharey, sharex=sharex)
    axes.flat[i] = ax
    # Use sns.boxplot on this axis: one pair of boxes (one per model) per
    # dataset. Whiskers at the 5th and 95th percentiles; fliers are hidden.
    g = sns.boxplot(data=plotDf.xs(nice_ls_names[i], level="Variable", axis=0).reset_index(), 
                x="Data", y="Residuals", hue="Model", palette=colors_models, ax=ax, 
                whis=(5, 95), hue_order=models_order,
                linewidth=0.75, showfliers=False, width=0.7,
                flierprops=dict(markersize=1., markeredgewidth=0.1))
    # Reference line at zero residual
    ax.axhline(0.0, ls=":", lw=1.0, zorder=0, color="grey")
    # Remove the per-panel legend; a single manual legend is added below
    g.legend_.remove()
    
    # Title: variable
    ax.set_title(nice_ls_names[i], fontsize=10, y=0.98)
    
    # Hide or show tick and axes labels when appropriate
    # y labels and ticks: only on the left column
    if i % 2 == 0:
        ax.set_ylabel(ax.get_ylabel() + " / range (%)", size=10)
        plt.setp(ax.get_yticklabels(), fontsize=8)
    else:
        ax.set_ylabel("")
        plt.setp(ax.get_yticklabels(), visible=False)
    # x labels and ticks: label every other dataset with its 1-based number.
    # The `i` of the comprehension is local to it and does not overwrite
    # the loop index `i`.
    ax.set_xticklabels([str(i+1) if i % 2 == 0 else "" 
                        for i in range(len(ax.get_xticklabels()))])
    # Only the bottom row (last two panels) shows x labels
    if i >= len(nice_ls_names) - 2:
        ax.set_xlabel("Dataset", size=10)
        plt.setp(ax.get_xticklabels(), fontsize=8)
    else:
        ax.set_xlabel("")
        plt.setp(ax.get_xticklabels(), visible=False)

# Manual legend: two Rectangles, one per color
legax = fig.add_subplot(gs[:, -1])
legax.set_axis_off()
# Two-line version of each model name: line break before the last word
models_order2 = []
for lbl in models_order:
    models_order2.append(" ".join(lbl.split(" ")[:-1]) + "\n" + lbl.split(" ")[-1]) 
legax.legend(handles=[mpl.patches.Rectangle((0, 0), 1, 1, color=colors_models[0], label=models_order[0]), 
                      mpl.patches.Rectangle((0, 0), 1, 1, color=colors_models[1], label=models_order[1])], 
             labels=models_order2, frameon=False, loc="upper left", bbox_to_anchor=(-0.4, 0.9), 
             fontsize=8, title="Model", title_fontsize=10, borderaxespad=0.3, borderpad=0.3)

# Final layout adjustments; the savefig call is left commented out
fig.tight_layout(w_pad=0.5, h_pad=0.5)
#fig.savefig(os.path.join(main_dir_path, "figures", "fits", "revision_fig_model_fits_residuals_boxplots.pdf"), 
#    transparent=True, bbox_inches="tight")
plt.show()
plt.close()