# Supplementary figure panels related to mutual information
Other MI-related panels are created by `main_plotting_scripts/peptide_channel_diagrams.py` and `theoretical_antigen_classes_from_capacity_HighMI_13.ipynb` directly. 



In [None]:
import numpy as np
import scipy as sp
from scipy.interpolate import UnivariateSpline, PchipInterpolator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import json

import os, sys
main_dir_path = os.path.abspath('../')
sys.path.insert(0, main_dir_path)

#from utils.distrib_interpolation import (eval_interpolated_means_covs, interpolate_params_vs_logec50, 
#                                         stats_per_levels, compute_cholesky_dataframe)
import utils.custom_pandas as custom_pd
from utils.extra_pairplots import dual_pairplot
from utils.plotting_mi import plot_params_vs_logec50
from utils.statistics import build_symmetric

In [None]:
# IPython magic: render matplotlib figures inline, in the notebook cell outputs
%matplotlib inline

In [None]:
# Matplotlib defaults applied to every panel created below: 7-8 pt text,
# tight axis-label padding and a 160 dpi on-screen resolution.
#plt.rcParams["figure.figsize"] = (2.25, 1.75)
plt.rcParams.update({
    "axes.labelsize": 8.,
    "legend.fontsize": 8.,
    "axes.labelpad": 0.5,
    "xtick.labelsize": 7.,
    "ytick.labelsize": 7.,
    "legend.title_fontsize": 8.,
    "axes.titlesize": 8.,
    "font.size": 8.,
    "figure.dpi": 160,
})

# Figure comparing fitted multivariate normal distributions to data

### Note: this figure is also produced directly in `compute_channel_capacity_HighMI_13.ipynb`


In [None]:
# Fitted parameters, read from the "fits" results folder
foldr = os.path.join(main_dir_path, "results", "fits")
df_params = pd.read_hdf(os.path.join(foldr, "df_params_Sigmoid_freealpha_HighMI_13.hdf"))

# All remaining inputs live in the "highmi13" results folder and share one file suffix
foldr = os.path.join(main_dir_path, "results", "highmi13")
suffix = "highmi13_Sigmoid_freealpha.hdf"


def _read_highmi13(name_template):
    """Read the HDF file named `name_template.format(suffix)` inside `foldr`."""
    return pd.read_hdf(os.path.join(foldr, name_template.format(suffix)))


# Means and covariances of the fitted distributions, each with its "estim_vari" companion table
df_params_means = _read_highmi13("df_params_means_{}")
df_params_means_estim_vari = _read_highmi13("df_params_means_estim_vari_{}")
df_params_covs = _read_highmi13("df_params_covs_{}")
df_params_covs_estim_vari = _read_highmi13("df_params_covs_estim_vari_{}")
ser_npts = _read_highmi13("ser_npts_{}")

# Also, the Cholesky decomposition
df_params_chol = _read_highmi13("df_params_chol_{}")
df_params_chol_estim_vari = _read_highmi13("df_params_chol_estim_vari_{}")

In [None]:
# EC50 table: rows are labelled as peptides, columns as references
ec50s_path = os.path.join(main_dir_path, "data", "misc", "potencies_df_2021.json")
df_ec50s = pd.read_json(ec50s_path)
df_ec50s.columns.name = "Reference"
df_ec50s.index.name = "Peptide"

# Row-wise mean of log10(EC50), i.e. the average over references for each peptide
ser_ec50s_avglog = np.log10(df_ec50s).mean(axis=1)
print(ser_ec50s_avglog)

In [None]:
# Settings for drawing synthetic parameter samples from the fitted gaussians
nsamples = 40
seed = 1357642
params_to_keep = ["a0", "t0", "theta"]
levels_group = ["Peptide"]
rnd_gen = np.random.default_rng(seed=seed)

# Index of the synthetic samples: every entry of the fitted-means index,
# repeated for each sample number 0, ..., nsamples-1.
if len(levels_group) == 1:
    new_index = pd.MultiIndex.from_product(
        [df_params_means.index, range(nsamples)],
        names=[df_params_means.index.name, "Sample"])
else:
    # With several grouping levels, the existing index tuples must be kept as
    # they are (not crossed level by level), and the level names must be
    # flattened into one list so there is one name per level.
    new_index = pd.MultiIndex.from_tuples(
        [(*key, i) for key in df_params_means.index for i in range(nsamples)],
        names=list(df_params_means.index.names) + ["Sample"])
df_params_synth = pd.DataFrame(index=new_index, columns=params_to_keep, dtype=np.float64)
df_params_synth.columns.name = "Parameter"

# Sample from the fitted gaussians
for key in df_params_means.index:
    # NOTE(review): build_symmetric presumably rebuilds the full covariance
    # matrix from the stored elements -- confirm in utils.statistics.
    cov_mat = build_symmetric(df_params_covs.loc[key].values)
    mean_vec = df_params_means.loc[key].values
    df_params_synth.loc[key] = rnd_gen.multivariate_normal(mean_vec, cov_mat, nsamples)

In [None]:
# Index levels of the fits that are not grouping levels and must be dropped.
# "TCellNumber" is taken out of that list because .xs() below already removes it.
params_remove = list(set(df_params.index.names).difference(levels_group))
params_remove.remove("TCellNumber")
df_params_both = df_params.xs("30k", level="TCellNumber", axis=0).droplevel(params_remove).sort_index()

# Keep the columns from the first to the last parameter of interest (label slice)
df_params_both = df_params_both.loc[:, params_to_keep[0]:params_to_keep[-1]]
print(df_params_both.groupby(levels_group).count().values)

# Number the rows 0, 1, ... within each group. cumcount() counts rows, so the
# result always has one entry per row, even if a column contains missing
# values (count() of a column skips NaN and could give a shorter vector).
idx = df_params_both.groupby(levels_group).cumcount().values
df_params_both["Sample"] = idx
df_params_both = df_params_both.set_index("Sample", append=True)

# Put data and synthetic samples side by side, then move the source into the index
df_params_both = pd.concat([df_params_both, df_params_synth], axis=1, keys=["Data", "Synth"], names=["Source", "Parameter"])
df_params_both = df_params_both.stack("Source")
print(df_params_both)

In [None]:
# LaTeX labels replacing the raw parameter names in the figures
rename_dict = {"theta":r"$\theta$", "a0": r"$a_0$", "t0": r"$t_0$"}
params_to_keep2 = [rename_dict[name] for name in params_to_keep]

# Peptides to plot, in plotting order
peptides = ["N4", "Q4", "A2", "Y3", "T4", "V4", "G4", "E1"]

# Fixed peptide -> color assignment, taken from the current seaborn palette
pep_color_order = ["N4", "Q4", "T4", "V4", "G4", "E1", "A2", "Y3", "A8", "Q7"]
base_colors = sns.color_palette()
pep_palette = {pep: base_colors[rank] for rank, pep in enumerate(pep_color_order)}
palette_order = list(map(pep_palette.get, peptides))

In [None]:
## NOTE: this figure is also produced directly in `compute_channel_capacity_HighMI_13.ipynb`
# Relabel the parameters, keep the selected peptides, and turn the index levels into columns
df_params_plot = df_params_both.rename(rename_dict, axis=1, level="Parameter")
df_params_plot = custom_pd.xs_slice(df_params_plot, name="Peptide", lvl_slice=peptides, axis=0)
df_params_plot = df_params_plot.reset_index()

# Pairplot with reflection across the diagonal to compare synthetic vs data distributions
source_colors = [(0.5, 0.5, 0.5), plt.cm.viridis([206])[0]]
fig, axes, leg = dual_pairplot(
    data=df_params_plot, vari=params_to_keep2,
    dual_lvl="Source", dual_labels=["Data", "Synthetic"],
    dual_hues=source_colors, palette=palette_order,
    hue="Peptide", hue_order=peptides, alpha=0.8, s=9, edgecolors=None)

# Clean up layout
fig.set_size_inches(4.75, 4.75)
fig.tight_layout(h_pad=0.5, w_pad=0.65)

# Uncomment to save supplementary figure
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#    "pairplot_synthreal_dual_Sigmoid_freealpha_HighMI_13.pdf"), transparent=True, 
#    bbox_extra_artists=(leg,), bbox_inches='tight', format="pdf")
plt.show()
plt.close()

# Figure showing interpolation of multivariate normal distribution parameters

In [None]:
# Pickled interpolation objects for the means and for the Cholesky elements.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
splines_dir = os.path.join(main_dir_path, "results", "highmi13")
with open(os.path.join(splines_dir, "ser_splines_means_highmi13.pkl"), "rb") as hd:
    ser_splines_means = pd.Series(pickle.load(hd))
with open(os.path.join(splines_dir, "ser_splines_chol_highmi13.pkl"), "rb") as hd:
    ser_splines_chol = pd.Series(pickle.load(hd))

# Name the index of both series so they can be renamed by level later
for ser in (ser_splines_means, ser_splines_chol):
    ser.index.name = "Parameter"

In [None]:
# x-axis label shared by every subplot of this figure
logec50_label = r"$\log_{10}{\mathrm{EC}_{50}}$ [-]"

# Create a full subplots grid so all subplots will line up nicely in the end:
# the first row receives the mean elements, the other two rows the Cholesky elements.
fig, axes = plt.subplots(3, 3, sharey=False)
fig.set_size_inches(4.75, 4.5)

# First part of the plot: interpolation of the means versus the data
df_params_plot = df_params_means.rename(rename_dict, axis=1, level="Mean element")
df_params_vari_plot = df_params_means_estim_vari.rename(rename_dict, axis=1, level="Var[Mean estimator]")
ser_splines_plot = ser_splines_means.rename(rename_dict, level="Parameter")
fig, axes_m = plot_params_vs_logec50(
    df_params_plot, df_params_vari_plot, ser_ec50s_avglog,
    ser_interp=ser_splines_plot, cols_plot=None, x_name="Peptide",
    col_wrap=3, figax=[fig, axes[0]])
for ax in axes_m:
    ax.set_xlabel(logec50_label)

# Second part of the plot: Cholesky elements
rename_dict2 = {"a0*a0":r"Chol[$a_0, a_0$]", "t0*a0":r"Chol[$t_0, a_0$]", "theta*a0":r"Chol[$\theta, a_0$]", 
               "t0*t0":r"Chol[$t_0, t_0$]", "theta*t0":r"Chol[$\theta, t_0$]", 
                "theta*theta":r"Chol[$\theta, \theta$]"}
df_params_plot = df_params_chol.rename(rename_dict2, axis=1, level="Cholesky element")
df_params_vari_plot = df_params_chol_estim_vari.rename(rename_dict2, axis=1, level="Var[Chol estimator]")
ser_splines_plot = ser_splines_chol.rename(rename_dict2, level="Parameter")
fig, axes_c = plot_params_vs_logec50(
    df_params_plot, df_params_vari_plot, ser_ec50s_avglog,
    ser_interp=ser_splines_plot, cols_plot=None, x_name="Peptide",
    col_wrap=3, figax=[fig, axes[1:]])
for ax in axes_c:
    ax.set_xlabel(logec50_label)

fig.tight_layout(w_pad=0.5)

# Uncomment to save figure
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#            "means_cholesky_vs_logec50_Sigmoid_freealpha_HighMI_13.pdf"), 
#            transparent=True, bbox_inches="tight")
plt.show()
plt.close()

# Optimal distribution from CCE algorithm

#### Need to run `more_main_scripts/estimate_channel_capacity_cce.ipynb` and save the results before creating this panel. 

In [None]:
## CHANGE THE DATE to when you executed that code and saved the result. 
cce_log_name = "cce_run_log_8ins_rtol4e-02_TCN30k_25-jul-2021.json"
cce_log_path = os.path.join(main_dir_path, "results", "highmi13", cce_log_name)

# Log of the CCE run, loaded as a plain dictionary
with open(cce_log_path, "r") as hd:
    cce_run = json.load(hd)

In [None]:
# Bar plot of the optimal input distribution found by the CCE run
cce_inputs = cce_run["input_values"]
cce_capacity = cce_run["capacity_bits"]

fig, ax = plt.subplots()
# Bars half as wide as the smallest spacing between two input values
width = np.amin(np.abs(np.diff(cce_inputs)))/2
ax.bar(cce_inputs, cce_run["optim_input_distrib"], width=width,
       color="xkcd:grey blue", edgecolor="k", linewidth=0.8)

# Labeling axes: one tick per input value, labelled with the peptide name
ax.set_xticks(cce_inputs)
ax.set_xticklabels(cce_run["input_peptides"])
ax.set_xlabel(r"$\log_{10}{(\mathrm{EC}_{50})}$ [-]", size=8)
ax.set_ylabel("P(EC$_{50}$)", size=8)
ax.tick_params(which="both", labelsize=6)

# Capacity annotation; the quoted uncertainty is 4 % of the capacity
capacity_label = r"$C = ({:.1f} \pm {:.1f})$ bits".format(cce_capacity, 0.04*cce_capacity)
ax.annotate(capacity_label, xy=(0.12, 0.85), xycoords="axes fraction", size=7)

# Invert EC50 axis to have N4 on the right
ax.invert_xaxis()

fig.set_size_inches(2.25, 2.)
fig.tight_layout()

#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "cce_optimal_logec50_distrib_HighMI_13.pdf"), 
#            transparent=True, bbox_inches="tight")
plt.show()
plt.close()

# Channel capacity optimal distribution plot – Moved to main text
Keeping the code for separate panels here. 

In [None]:
# Capacity bootstrap results, loaded as a plain dictionary
run_log_path = os.path.join(main_dir_path, "results", "highmi13",
                            "capacity_bootstrap_results_HighMI_13.json")
with open(run_log_path, "r") as hd:
    run_log = json.load(hd)

In [None]:
# Display the entries available in the loaded results dictionary
run_log.keys()

In [None]:
# Pull out of the results dictionary the entries used by the plots below
sampled_logec50, optim_input_distrib = run_log["input_values"], run_log["optimal_distribution"]
capacity_bits, reltol = run_log["average_capacity_bits"], run_log["relative_tolerance"]

In [None]:
# Make a histogram (bar plot) of the optimal input distribution
fig, ax = plt.subplots()
bar_centers = np.around(sampled_logec50, 2)
bar_width = np.diff(sampled_logec50)[0]  # spacing between the first two inputs
ax.bar(bar_centers, optim_input_distrib, width=bar_width,
       color="w", edgecolor="k", linewidth=0.8)
ax.set_xlabel(r"$\log_{10}{(\mathrm{EC}_{50})}$ [-]", size=8)
ax.set_ylabel("Probability [-]", size=8)
ax.tick_params(which="both", labelsize=6)

# Capacity, with the square root of its variance as uncertainty
capacity_std = run_log["variance_capacity_bits"]**0.5
ax.annotate(r"$C = ({:.2f} \pm {:.2f})$ bits".format(capacity_bits, capacity_std),
            xy=(0.2, 0.7), xycoords="axes fraction", size=7, backgroundcolor=(1, 1, 1, 0.5))
ax.invert_xaxis()

# Annotate peptides: only those present in the fitted-means table are marked
maxprob = np.amax(optim_input_distrib)
fitted_peptides = df_params_means.index
for pep in ser_ec50s_avglog.index:
    if pep in fitted_peptides:
        pep_logec50 = ser_ec50s_avglog[pep]
        ha = "center" if pep != "Y3" else "right"
        ax.annotate(pep, xy=(pep_logec50, maxprob), fontsize=6, ha=ha, va="top", color="grey")
        ax.axvline(pep_logec50, ls=":", lw=0.8, color="grey", ymax=0.85)

fig.set_size_inches(2.25, 1.75)
fig.tight_layout()
n_inputs = len(sampled_logec50)

#fig.savefig("panels_mi/optimal_logec50_distrib_{}inputs_HighMI_13.pdf".format(n_inputs), 
#            transparent=True, bbox_inches="tight")
plt.show()
plt.close()

# Ideal peptide EC50 selection, CMF plot – Moved to main text
Keeping the code for separate panels here. 

In [None]:
# Cumulate starting at E1, so reverse the ec50 axis. 
pmf = run_log["optimal_distribution"][::-1]

# Number of selected inputs: 2**capacity, rounded to the nearest integer.
# Slot 0 stays at the first bin and the last slot points at the last bin.
nsep = int(round(2**run_log["average_capacity_bits"]))
indices = np.zeros(nsep, dtype=int)
indices[-1] = len(pmf) - 1

cumul_prob = np.cumsum(pmf)

# Split the probability of the inner bins (all but first and last) into
# nsep - 1 equal parts; the nsep - 2 separators are evenly spaced levels.
first_prob, last_prob = pmf[0], pmf[-1]
inner_prob = np.sum(pmf[1:-1])
binwidth = inner_prob / (nsep - 1)
binseps = np.linspace(first_prob+binwidth, 1.0 - binwidth - last_prob, nsep - 2)

# Inner slots: position of each separator within the cumulative distribution
indices[1:-1] = np.searchsorted(cumul_prob, binseps)

In [None]:
# Make a bar plot of the cumulative optimal input distribution
# (x values are reversed to match cumul_prob, which was cumulated on the reversed axis)
fig, ax = plt.subplots()
bars = ax.bar(np.around(sampled_logec50, 2)[::-1], cumul_prob, width=np.diff(sampled_logec50)[0], 
      color="w", edgecolor="k", linewidth=0.8)

# Highlight the selected ideal EC50s and annotate their EC50
for i in indices:
    if i == 0:
        ec50_i = sampled_logec50[-1] + sampled_logec50[0]
        ha="center"
    elif i == indices[-1]:
        ec50_i = 0.0
        ha="center"
    else:
        ec50_i = sampled_logec50[::-1][i]
        ha="right"
    bars[i].set_facecolor("xkcd:light blue")
    ax.annotate("{:.1f}".format(ec50_i), xy=(ec50_i, bars[i].get_height()+0.03),
                xycoords="data", fontsize=6, ha=ha, va="bottom")

# Horizontal lines at the bin separators: one slightly offset line near the
# top and near the bottom, then one line per inner separator.
hlines_props = dict(ls="--", lw=1., color="grey")
ax.axhline(1.025-binwidth, **hlines_props)
ax.axhline(-0.025+binwidth, **hlines_props)
for sep in binseps[::-1]:
    ax.axhline(sep, **hlines_props)

# Arrows to show how we are evenly spaced in probability.
# They span the gaps between the last four separators, plus the gap of one
# bin width above the last separator. Fewer arrows are drawn when fewer
# separators exist, instead of indexing past the start of binseps.
arrowprops = dict(arrowstyle="<->", shrinkA=0.5, shrinkB=0.5, color="grey")
ec50_i = sampled_logec50[-1]
if len(binseps) > 0:
    arrow_levels = np.append(binseps, binseps[-1] + binwidth)[-5:]
    for lower, upper in zip(arrow_levels[:-1], arrow_levels[1:]):
        ax.annotate("", xy=(ec50_i, lower), xytext=(ec50_i, upper), arrowprops=arrowprops)


# Tick formatter to have two decimals and align with previous plot
def major_formatter(x, pos):
    """Return the tick value `x` as text with two decimals; `pos` is unused."""
    return format(x, ".2f")

# Two-decimal y tick labels, then axis labels and tick size
ax.yaxis.set_major_formatter(major_formatter)
cmf_xlabel = r"$\log_{10}{(\mathrm{EC}_{50})}$ [-]"
cmf_ylabel = "Cumulative prob. [-]"
ax.set_xlabel(cmf_xlabel, size=8)
ax.set_ylabel(cmf_ylabel, size=8)
ax.tick_params(which="both", labelsize=6)

# Flip the direction of the x axis
ax.invert_xaxis()

# Final panel size and layout
fig.set_size_inches(2.25, 1.75)
fig.tight_layout()
n_inputs = len(sampled_logec50)

#fig.savefig("panels_mi/ideal_peptides_determination_cmf_HighMI_13.pdf", 
#            transparent=True, bbox_inches="tight")
plt.show()
plt.close()