## Supplemental information inline plot on MSKCC original EC50s
Supplemental information inline plot illustrating that all of the weakest CMV antigens lack a full dose response curve, and therefore an EC50, in the fit parameters provided with the original (MSKCC) paper. This is why we had to perform our own EC50 fits and processing.  

In [None]:
# Standard library
import os
import json,math
# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.colors as clr
# Numerics and data frames
import numpy as np
import pandas as pd
import seaborn as sns
# Global seaborn appearance: 'talk' scaling context and the 'ticks' axes style
sns.set_context('talk')
sns.set_style('ticks')
import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# pandas indexing/performance warnings; comment it out when debugging.
warnings.filterwarnings('ignore')
# Shorthands used throughout the notebook
idx = pd.IndexSlice  # helper for slicing MultiIndex objects
pj = os.path.join    # short alias for building file paths

In [None]:
# --- Paths ---
# Repository root, relative to the directory the notebook is run from
root_dir = ".."
# Folder holding the dose response HDF files read further down
data_dir = pj(root_dir, "data", "dose_response")
# Folder (relative to the working directory) where figure panels are written
fig_dir = "panels_misc"

# --- Switches ---
# Set to True to write the figures to fig_dir in addition to displaying them
do_save_plots = False

In [None]:
# Renaming convention for TCRs: numeric labels (stored as strings) are mapped
# to short display names when relabelling the "TCR" index level for plots.
# Mind the order of the G names: "5" maps to G3 and "6" maps to G2.
tcr_rename = dict(zip(
    "1234567",
    ("C1", "C2", "C3", "G1", "G3", "G2", "N1"),
))

In [None]:
def clean_nosub_duplicates(df):
    for tcr in df.index.get_level_values("TCR").unique():
        params_tcr = df.xs(tcr, level="TCR")
        antigen = params_tcr.index.get_level_values("Antigen").unique()[0]
        # Find all false substitutions
        wt_duplicates = {}
        for pep in params_tcr.index.get_level_values("Peptide").unique():
            if pep[0] == pep[2]:
                wt_duplicates[pep] = params_tcr.loc[(antigen, pep)]
        # Check they were all identical
        #print(wt_duplicates)
        # Replace them all by one WT row
        df = df.drop([(antigen, tcr, pep) for pep in wt_duplicates.keys()])
        df.loc[(antigen, tcr, "WT")] = list(wt_duplicates.values())[0]
    return df

In [None]:
#@title Load MSKCC data
# Import MSKCC dose response data, curve fitting parameters, and ec50 data.
# All three HDF files are read from data_dir.
results_dir = data_dir  # alias of data_dir; not used again in the cells visible here
mskcc_data = pd.read_hdf(pj(data_dir, "MSKCC_rawDf.hdf")).sort_index()
mskcc_params = pd.read_hdf(pj(data_dir, "MSKCC_fitCurvesEC50.hdf"))
# Relabel "Neopeptide" as "Neoantigen" in the Antigen index level
# (presumably to match the labels of the other two frames -- see printout below)
mskcc_params = mskcc_params.rename({"Neopeptide":"Neoantigen"}, level="Antigen")
mskcc_ec50s = pd.read_hdf(pj(data_dir, "MSKCC_originalEC50df.hdf"))

# Sanity check: print the antigen labels found in each of the three frames
for df in [mskcc_data, mskcc_params, mskcc_ec50s]:
    print(df.index.get_level_values("Antigen").unique())

# In the parameters dataframe, drop all false substitutions, e.g. A7A:
# these are all copies of the WT, duplicated for heatmap plotting convenience
mskcc_params = clean_nosub_duplicates(mskcc_params)

# Rename CD137 to 4-1BB
resp_name = "Response (4-1BB+ %)"
mskcc_data = mskcc_data.rename({"Response (CD137+ %)":resp_name}, axis=1)

# Overwrite K_a with the original "EC50 (M)" values (intended as a change of
# units from ug/ml to mol/l). No arithmetic conversion happens here: the column
# is replaced, with values matched to rows through the index.
# NOTE(review): rows of mskcc_params whose index entry is absent from
# mskcc_ec50s would get NaN -- check the printout below.
print(mskcc_data)
print(mskcc_ec50s)
mskcc_params["K_a"] = mskcc_ec50s["EC50 (M)"]
print(mskcc_params)

In [None]:
# Flag the peptides whose original EC50 is infinite, i.e. no EC50 was obtained
peps_without_ec50 = mskcc_ec50s["EC50 (M)"].eq(np.inf)
# Attach that flag to a copy of the raw data (matched through the index), move
# the flag and the dose into the index, then keep only the response measured
# at the largest dose (100 ug/mL)
df_response_inf = (
    mskcc_data.assign(INF=peps_without_ec50)
    .set_index(["INF", "Dose (ug/mL)"], append=True)
    .xs(100.0, level="Dose (ug/mL)")[resp_name]
)

In [None]:
# Plot the cumulative distribution of the response at the largest dose for CMV
# peptides with ("Complete") and without ("Incomplete") an original EC50.
# Find the limit between the two kinds: midpoint between the strongest response
# among peptides without EC50 and the weakest response among peptides with one
df_inf_cmv = df_response_inf.xs("CMV", level="Antigen")
inf_lim = 0.5*(df_inf_cmv.xs(True, level="INF").max() + df_inf_cmv.xs(False, level="INF").min())
# Rename stuff for nicer plots
df_plot = df_inf_cmv.copy()
df_plot = (df_plot.rename(tcr_rename, level="TCR")
            .rename({True:"Incomplete", False:"Complete"}, level="INF"))
# Relies on the remaining index levels being (TCR, Peptide, INF), in that order
df_plot.index = df_plot.index.set_names(["TCR", "Peptide", "Dose response"])
# One ECDF panel per TCR, one curve per dose response category
g = sns.displot(data=df_plot.reset_index(),
                hue="Dose response", x=resp_name, kind="ecdf", col="TCR")
for ax in g.axes.flat:
    # Mark the separation between incomplete and complete dose responses
    ax.axvline(inf_lim, color="k", ls="--")
g.axes.flat[0].set_ylabel("Proportion of peptides")
if do_save_plots:
    # savefig does not create missing folders: make sure the target exists
    os.makedirs(fig_dir, exist_ok=True)
    g.figure.savefig(pj(fig_dir, "ecdf_cmv_peptides.pdf"), transparent=True,
              bbox_inches="tight", bbox_extra_artists=(g.legend,))
plt.show()
plt.close()

# The full plots below (commented out) would show that the other TCRs only have responsive peptides.
# sns.displot(data=temp[temp['Dose (ug/mL)'] == 100],hue='INF',x=resp_name,kind='ecdf',col='TCR')
#sns.displot(data=temp[temp['Dose (ug/mL)'] == 100],x=resp_name,kind='ecdf',col='TCR',color='k')
#sns.displot(data=temp[temp['Dose (ug/mL)'] == 100],x=resp_name,kind='ecdf',hue='TCR')