# Noise distribution of cytokine time series


In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import psutil, pickle
import os, sys
main_dir_path = os.path.abspath('../')
sys.path.insert(0, main_dir_path)

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

# Custom scripts
import utils.custom_pandas as cpd
from utils.process_raw_data_choices import process_file_choices, select_naive_data
from utils.multiprocess_training import process_train_dsets, process_test_dsets, init_peps_cytos_concs

In [None]:
%matplotlib inline

In [None]:
# Plot parameters for Science, applied in a single batch
plt.rcParams.update({
    "figure.figsize": (2.5, 2.),
    "axes.labelsize": 8.,
    "legend.fontsize": 8.,
    "axes.labelpad": 0.5,
    "xtick.labelsize": 7.,
    "ytick.labelsize": 7.,
    "legend.title_fontsize": 8.,
    "axes.titlesize": 8.,
    "font.size": 8.,
    # For larger display of small graphs in the notebook
    "figure.dpi": 120,
})

In [None]:
# Five blue-green colors for the main cytokines, plus pink and purple for IL-4 and IL-10
cyt_palette = sns.cubehelix_palette(5, start=.5, rot=-.75)
cyt_palette.extend(mpl.colors.to_rgba(name) for name in ("xkcd:pink", "xkcd:purple"))

# Main cytokines are listed from lowest to highest MI (original note: highest MI is darkest),
# followed by the two extra cytokines.
cyt_order = ["TNFa", "IL-6", "IL-17A", "IFNg", "IL-2", "IL-4", "IL-10"]

# Preview the color list before it is turned into a {cytokine: color} mapping
sns.palplot(cyt_palette)
cyt_palette = dict(zip(cyt_order, cyt_palette))

# Display names used for tick labels and legends
nice_cyto_labels = {
    "IL-2": "IL-2",
    "IFNg": r"IFN-$\gamma$",
    "IL-17A": "IL-17A",
    "IL-6": "IL-6",
    "TNFa": "TNF",
    "IL-4": "IL-4",
    "IL-10": "IL-10",
}

In [None]:
# File names of the pickled cytokine-concentration datasets used as training data.
# They are passed to the processing calls later in this notebook, which look
# them up in the data/final folder.
train_data_files = [
    'cytokineConcentrationPickleFile-20190412-PeptideComparison_OT1_Timeseries_18-final.pkl',
    'cytokineConcentrationPickleFile-20190608-PeptideComparison_OT1_Timeseries_19-final.pkl',
    'cytokineConcentrationPickleFile-20190718-NewPeptideComparison_OT1_Timeseries_20-final.pkl',
    'cytokineConcentrationPickleFile-20190725-PeptideTumorComparison_OT1_Timeseries_1-final.pkl',
    'cytokineConcentrationPickleFile-20190802-TCellNumber_OT1_Timeseries_7-final.pkl', 
    'cytokineConcentrationPickleFile-20190812-Activation_Timeseries_1-final.pkl'
]

# File names of the test datasets.
# NOTE(review): this list is not referenced anywhere in the visible part of the
# notebook; confirm whether it is still needed.
test_data_files = [
    'cytokineConcentrationPickleFile-20200220-Activation_TCellNumber_1-final.pkl', 
    'cytokineConcentrationPickleFile-20190404-CD25MutantTimeSeries_OT1_Timeseries_2-final.pkl',
    'cytokineConcentrationPickleFile-20191029-PeptideComparison_OT1_Timeseries_22-final.pkl',
    'cytokineConcentrationPickleFile-20191106-PeptideComparison_OT1_Timeseries_23-final.pkl',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-1-final.pkl',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-2-final.pkl',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-3-final.pkl',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-4-final.pkl'
]

# Import training and test data with different processings
Log scale and linear scale, smoothing or no smoothing (to compare splines to raw data). Never compute time integrals: we want to look at concentrations here. 

In [None]:
# Processing options shared by all four imports; only "do_smooth" and "take_log"
# are toggled between calls. Time integrals are never computed.
process_kwargs = {
    "take_log": True,
    "rescale_max": False,
    "max_time": 72,
    "do_integrate": False,
    "do_smooth": False,
    "subtract_min": True,
}
dummy_minmax = {"min": 0, "max": 1}
final_data_folder = os.path.join(main_dir_path, "data", "final")


def _load_train_frame():
    """Process the training files with the current ``process_kwargs``, sorted by index."""
    frame = process_test_dsets(
        train_data_files, process_kwargs, dummy_minmax,
        folder=final_data_folder,
        extra_cytos=["IL-4", "IL-10"], tslice=slice(0, 72),
    )
    return frame.sort_index()


# Raw, log scale
df_train_raw = _load_train_frame()

# Splines, log scale
process_kwargs["do_smooth"] = True
df_train_splines = _load_train_frame()

# Raw, linear scale ("subtract_min" is left enabled; the variant turning it off is disabled)
process_kwargs.update(do_smooth=False, take_log=False)
df_train_raw_linear = _load_train_frame()

# Splines, linear scale
process_kwargs["do_smooth"] = True
df_train_splines_linear = _load_train_frame()

### Experimental time points at half-hours
The processing code is written to evaluate splines at every hour, while a few experimental time points are at half-hours. Instead of rewriting this code to evaluate splines at the exact times we want, I will use linear interpolation between the available spline time points.

In [None]:
# Time points present in the raw data, in the splines, and their sorted union
data_times = df_train_raw.index.get_level_values("Time").unique().to_list()
splines_times = df_train_splines.index.get_level_values("Time").unique().to_list()
all_times = sorted(set(data_times) | set(splines_times))


def _splines_at_data_times(df_splines):
    """Evaluate a spline frame at the experimental time points by linear interpolation."""
    # Put time along the columns so that interpolation runs across time
    wide = df_splines.unstack("Time").stack("Cytokine").stack("Feature")
    # Reindexing introduces NaNs at the missing experimental times...
    wide = wide.reindex(all_times, axis=1)
    # ...which interpolation then fills in.
    # NOTE(review): pandas documents method="linear" as treating values as equally
    # spaced, so this is exact only when an inserted time lies midway between its
    # neighbours (the half-hour case) -- confirm if other offsets can occur.
    wide = wide.interpolate(axis=1, method="linear")
    # Keep only the data times now that interpolation is over
    wide = wide.reindex(data_times, axis=1)
    return wide.unstack("Feature").unstack("Cytokine").stack("Time").sort_index()


df_train_splines2 = _splines_at_data_times(df_train_splines)
df_train_splines_linear2 = _splines_at_data_times(df_train_splines_linear)

In [None]:
# Remove large final artifacts: no cyto spans more than 4.5 decades,
# so the log-scale splines are bounded to the interval [0, 4.5].
max_decades = 4.5
df_train_splines2 = df_train_splines2.clip(lower=0, upper=max_decades)
df_train_splines2

# Compute residuals and signal-to-noise ratio
Now that we have data and splines evaluated at the same time points, we can compute residuals. These residuals are then useful to compute the signal-to-noise ratio:

$$ \mathrm{SNR} = \frac{\langle c \rangle}{\sigma_c} $$

A high signal-to-noise means that the noise is small, the signal is strong. A low SNR means the data is very noisy. We can use this metric to explain why we discarded IL-4 and IL-10.

We compute SNR with two different statistical ensembles. 
 1. We compute the "overall" SNR, across all time points and conditions in the training datasets. This gives a rough estimate of which cytokines contain a consistent signal in multiple experiments, and which are mostly background noise. 
 2. We find the quantiles of the distribution of measured values of each cytokine (across time and conditions), and we compute the SNR inside each interval between consecutive quantiles. In other words, we compute the SNR for the cytokine data in the first quantile interval, the second, and so on. Of course, the SNR increases with the quantile interval considered (because the numerator, the signal, increases). The point is to show that IL-4 and IL-10, even in the top tier, have very low SNR. A low overall SNR could have hidden a few elevated, non-noisy trajectories for those cytokines; by subdividing the data at quantiles, we isolate the highest values of each cytokine and show that they are still mostly noise for IL-4 and IL-10.
 
We elected to use five quantiles, i.e. quintiles, instead of quartiles, to have finer subdivisions without thinning too much the amount of data available in each interval. 

In [None]:
# Residuals (raw data minus splines) for the log-transformed and the linear data.
# The spline frames were reindexed with every experimental time point, not only the
# time points of each individual dataset, so the subtraction yields NaN rows for
# times a dataset does not have; dropping rows with NaN removes them.
df_train_res = df_train_raw.sub(df_train_splines2).dropna(axis="index")
df_train_res_linear = df_train_raw_linear.sub(df_train_splines_linear2).dropna(axis="index")

# Column-wise range of the raw linear-scale data: maxima first, then minima
for extremes in (df_train_raw_linear.max(), df_train_raw_linear.min()):
    print(extremes)

## Overall signal-to-noise ratio
Without sub-dividing the data points into quantile intervals. 

In [None]:
# Overall SNR per cytokine: mean spline value divided by the root-mean-square residual,
# both taken over all rows.
rms_res_log = np.sqrt(df_train_res.pow(2).mean(axis=0))
rms_res_lin = np.sqrt(df_train_res_linear.pow(2).mean(axis=0))

# Sort by increasing log-scale SNR and give the linear-scale values the same ordering
overall_snrs_log = (df_train_splines2.mean(axis=0) / rms_res_log).sort_values()
overall_snrs_lin = (df_train_splines_linear2.mean(axis=0) / rms_res_lin).reindex(overall_snrs_log.index)

In [None]:
# Joint bar plot of the overall SNR: log-transformed data (left), linear-scale data (right)
fig, axes = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(2.5*2, 2.)
xbars = np.arange(len(df_train_splines2.columns))
colors = [cyt_palette[k] for k in overall_snrs_log.index.get_level_values("Cytokine")]
panels = list(zip(axes, (overall_snrs_log, overall_snrs_lin)))

# One bar per cytokine, with its SNR value written just above the bar
for ax, snrs in panels:
    for j, bar_color in enumerate(colors):
        snr_value = snrs.iloc[j]
        ax.bar(xbars[j], width=0.5, height=snr_value,
               facecolor=bar_color, edgecolor="k")
        ax.annotate(f"{snr_value:.1f}", xy=(xbars[j], snr_value+0.5),
                    fontsize=7, ha="center")

# Leave headroom above the bars for the annotations and label the ticks by cytokine
ylims = axes[0].get_ylim()
for ax, snrs in panels:
    ax.set_xticks(xbars)
    ax.set_ylim(ylims[0], ylims[1]+1.5)
    ticklabels = snrs.index.get_level_values("Cytokine").map(nice_cyto_labels)
    ax.set_xticklabels(ticklabels, rotation=-30)

axes[0].set_ylabel("Signal-to-noise ratio")
for ax, panel_title in zip(axes, ("Log-transformed data", "Linear-scale data")):
    ax.set_title(panel_title)

fig.tight_layout(w_pad=6.)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#    "il4_il10_overall_signal-to-noise-ratio_log-linear.pdf"), transparent=True)

plt.show()
plt.close()

## Residuals per range of cytokine quantiles
We find quintiles of the distribution of each cytokine, in linear and log scale, and we split the data points at those quintile values. By definition, there will be 1/5th of the values in each inter-quintile interval. 
Then, we compute the signal-to-noise ratio in each subinterval, for each cytokine. 

We only do this for log-transformed cytokine data, because the plot above shows how poor the signal is for linear-scale cytokine concentrations. 

In [None]:
# This function distributes residuals in intervals of the spline value and computes SNR in each.
# We are mostly interested in the latter part (SNR).
# Arguments are Series (samples for one cytokine, for instance)
def residuals_distrib_per_range(ser_res, ser_spl, nbins=5, kind="equal"):
    """Bin residuals by spline value and compute the signal-to-noise ratio per bin.

    Args:
        ser_res (pd.Series): residuals, indexed like (or by a subset of) ser_spl.
        ser_spl (pd.Series): spline values; they define the bin edges and decide
            which bin each sample falls into.
        nbins (int): number of bins.
        kind (str): "equal" for equal-width bins between the minimum and the
            maximum of ser_spl, "quantile" for edges at evenly spaced quantiles
            of ser_spl.

    Returns:
        dict: keys are (lower edge, upper edge) tuples, one per bin. Each value
            is a tuple (histo, seps, snr, mean_within): the histogram counts and
            bin separators of the residuals in that bin (np.histogram defaults),
            the SNR (mean spline value over root-mean-square residual), and the
            mean spline value in that bin.

    Raises:
        ValueError: if kind is neither "equal" nor "quantile".
    """
    if kind == "equal":
        binseps = np.linspace(ser_spl.min(), ser_spl.max(), nbins + 1)
    elif kind == "quantile":
        # Work on a float copy so the adjustment below never touches pandas data
        binseps = np.array(ser_spl.quantile(np.linspace(0.0, 1.0, nbins + 1)).values,
                           dtype=float)
    else:
        raise ValueError("'kind' should be either 'equal' or 'quantile'")

    # Repeated values (e.g. many samples at a clipping bound) can give identical
    # consecutive edges. Bump each such edge by something small to create a
    # non-zero interval. Comparing with <= keeps the edges strictly increasing
    # even after an earlier edge was bumped, so no two bins share the same key.
    bump = 0.00001 * abs(binseps[-1])
    if bump == 0:
        bump = 0.00001  # Fallback when the largest edge is exactly zero
    for i in range(nbins):
        if binseps[i+1] <= binseps[i]:
            binseps[i+1] = binseps[i] + bump

    dists = {}
    last_bin = nbins - 1
    for i in range(nbins):
        rg = (binseps[i], binseps[i+1])
        # Bins are half-open [lower, upper), except the last one, which is closed
        # so that samples equal to the maximum are not dropped.
        if i == last_bin:
            below_upper = ser_spl <= rg[1]
        else:
            below_upper = ser_spl < rg[1]
        wh = np.logical_and(below_upper, ser_spl >= rg[0])
        samples = ser_res[wh]
        spline_samples = ser_spl[wh]
        histo, seps = np.histogram(samples)
        mean_within = np.mean(spline_samples)
        # Signal is the mean spline value, noise is the RMS residual (NaN for an empty bin)
        snr = mean_within / np.sqrt(np.mean(samples**2))
        dists[rg] = (histo, seps, snr, mean_within)
    return dists

In [None]:
# SNR and mean spline value inside each quantile interval, for each cytokine,
# computed on the log-transformed values.
n_subdivs = 5  # quintiles
# Seven rows: one per entry of cyt_order; rows follow the cyt_order ordering
quantile_snrs_grid = np.zeros([7, n_subdivs])
quantile_means_grid = np.zeros([7, n_subdivs])

for col in df_train_res.columns:
    # The second level of the column label is the cytokine name
    row = cyt_order.index(col[1])
    cy_dists = residuals_distrib_per_range(
        df_train_res[col], df_train_splines2[col],
        nbins=n_subdivs, kind="quantile",
    )
    # Order the intervals by their lower bound
    ranges = sorted(cy_dists, key=lambda bounds: bounds[0])
    for j in range(n_subdivs):
        _, _, snr, mn_within = cy_dists[ranges[j]]
        quantile_snrs_grid[row, j] = snr  # Store SNR
        quantile_means_grid[row, j] = mn_within

In [None]:
# Plot SNR as a function of quantile interval, one line per cytokine
# Maybe: plot as a function of mean cytokine value within quantile?
fig, ax = plt.subplots()
quintile_midpoints = np.arange(10, 100, 20)
quintile_boundaries = np.arange(0, 101, 20).astype(int)

# Sort cytokines by decreasing SNR in the last interval, which will be right next to the legend
argsort_cytos_per_snr = np.argsort(quantile_snrs_grid[:, -1])[::-1]
cyt_order_snr = np.asarray(cyt_order)[argsort_cytos_per_snr]
quantile_snrs_grid2 = quantile_snrs_grid[argsort_cytos_per_snr, :]

markers = ["o", "s", "^", "*", "d", "x", "+"]
# IL-4 and IL-10 get their own line styles; every other cytokine is drawn solid
special_linestyles = {"IL-4": "--", "IL-10": "-."}
for marker, cyt, snr_row in zip(markers, cyt_order_snr, quantile_snrs_grid2):
    ax.plot(quintile_midpoints, snr_row, color=cyt_palette[cyt],
            ls=special_linestyles.get(cyt, "-"), lw=2.,
            label=nice_cyto_labels[cyt], marker=marker, ms=5)

ax.set(xlabel="Quintile (#)", ylabel="Signal-to-noise ratio")
ax.set_xticks(quintile_boundaries)
# Number only the interior boundaries; the two outer ticks stay unlabeled
inner_labels = [str(k) for k in np.arange(1, len(quintile_boundaries)-1)]
ax.set_xticklabels([""] + inner_labels + [""])

leg = ax.legend(loc="upper left", bbox_to_anchor=(1, 1), frameon=False)
for legend_line in leg.get_lines():
    legend_line.set_linewidth(1.5)
fig.tight_layout()
fig.subplots_adjust(right=0.7)
#fig.savefig(os.path.join(main_dir_path, "figures", "supp", "il4_il10_signal-snr_per_quantile.pdf"), 
#    transparent=True, bbox_extra_artists=(leg,), bbox_inches="tight")
plt.show()
plt.close()

In [None]:
# Ratio of cytokine SNRs in the top 20 % of values (i.e. for measurements above last quintile)
# Original finding: IL-4 and IL-10 have a SNR at least four times lower than any other cytokine.
# Actually, it's 4 times for IL-4 and 6 times for IL-10. 
# The reference rows are looked up by cytokine name in the SNR-sorted order rather than
# assumed to be the last two rows, so the ratios stay correct if the ranking changes.
top_quintile_snrs = quantile_snrs_grid2[:, -1]
cytos_sorted_by_snr = list(cyt_order_snr)
print("IL-4:", top_quintile_snrs / top_quintile_snrs[cytos_sorted_by_snr.index("IL-4")])
print("IL-10", top_quintile_snrs / top_quintile_snrs[cytos_sorted_by_snr.index("IL-10")])

# Distribution of cytokines themselves
The point of this figure is to show that cytokines distribute more evenly in logarithmic scale (i.e. across decades) than in linear scale. 

We only show the five cytokines that we did not discard. We sort cytokines in decreasing order of signal-to-noise ratio.

In [None]:
# Histograms of each retained cytokine: linear scale (top row), log-transformed (bottom row)
fig, axes = plt.subplots(2, 5, sharey=False)
fig.set_size_inches(5*1.25, 2*1.5)
df_plot_log = df_train_raw
df_plot_lin = df_train_raw_linear

# Cytokines in decreasing order of log-scale SNR, without the two discarded ones
cytos_ordered_logsnr = overall_snrs_log.index.get_level_values("Cytokine").to_list()
for discarded in ("IL-10", "IL-4"):
    cytos_ordered_logsnr.remove(discarded)

hist_style = dict(bins=20, edgecolor="k", linewidth=0.5)
for i, cy in enumerate(reversed(cytos_ordered_logsnr)):
    col = ("concentration", cy)
    lin_ax, log_ax = axes[0, i], axes[1, i]
    lin_ax.hist(df_plot_lin[col], color=cyt_palette[cy], **hist_style)
    log_ax.hist(df_plot_log[col], color=cyt_palette[cy], **hist_style)

    # Only the leftmost column carries the y-axis label
    if i == 0:
        for first_ax in (axes[0, 0], axes[1, 0]):
            first_ax.set_ylabel("Counts (#)")

    pretty = nice_cyto_labels[cy]
    lin_ax.set_xlabel("".join(["[", pretty, "] (nM)"]))
    log_ax.set_xlabel("".join([r"log$_{10}$(", pretty, "/", pretty, r"$_{min}$)"]))

    # Row titles are written vertically to the right of the last column
    if i == (5-1):
        for row_ax, row_title in ((lin_ax, "Linear-scale"), (log_ax, "Log-transformed")):
            row_ax.annotate(row_title, xy=(1, 0.5), xytext=(1.2, 0.5),
                            xycoords="axes fraction", rotation=90, va="center", ha="center")

fig.tight_layout(w_pad=0.5, h_pad=2.)
# fig.savefig(os.path.join(main_dir_path, "figures", "supp", 
#    "histogram_cytokine_data_linear_log_scales.pdf"), transparent=True)
plt.show()
plt.close()