# Mutual information per time point
Here, we apply the mutual information estimator defined in (Kraskov et al., 2004) and (Ross, 2014) to compute the mutual information (MI) between peptide quality $Q$ and cytokines $\mathbf{X}$ as a function of time. The MI between peptide quality and cytokines is computed at each time point by aggregating the time points within a sliding time window of 3 hours, for better statistics. We use various quantities for the (vector) random variable $\mathbf{X}$: each individual cytokine, the vector of 5 cytokines (IFN-$\gamma$, IL-2, IL-17A, IL-6, TNF), each latent space variable (LS$_1$ or LS$_2$), and the two latent space variables combined in a vector (LS$_1$, LS$_2$). 

Then, we compare to the mutual information between antigen quality and parameters of the constant velocity model, fitted on latent space time courses as a way to summarize the entire time kinetics of cytokines with a single vector of three real numbers ($v_0, t_0, \theta$). 

We use a dataset (HighMI_1) which contains 4 replicates of the cytokine time series for each peptide at each concentration. This is the dataset shown in main figure 1. 

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os

# Our own Python implementation of the MI algorithm
from utils.discrete_continuous_info import discrete_continuous_info_fast, discrete_continuous_info_ref
import utils.custom_pandas as cpd

In [None]:
%matplotlib inline

In [None]:
# Figure and font sizes applied to every plot of this notebook
plt.rcParams.update({
    "figure.figsize": (2.25, 1.75),
    "axes.labelsize": 8.,
    "legend.fontsize": 8.,
    "axes.labelpad": 0.5,
    "xtick.labelsize": 7.,
    "ytick.labelsize": 7.,
    "legend.title_fontsize": 8.,
    "axes.titlesize": 8.,
    "font.size": 8.,
})

# Import data and project to latent space
The HighMI_1 replicates were split into four separate files by our processing pipeline, for compatibility with most former experiments, which had only one replicate per condition. Here, we recombine those files to rebuild the whole dataset. 

In [None]:
# Cytokine data: read every HighMI_1 replicate file of the processed data folder
processed_dir = os.path.join("data", "processed")
df_dict = {}
for fi in os.listdir(processed_dir):
    if not (fi.startswith("HighMI_1-") and fi.endswith(".hdf")):
        continue
    # Each replicate is keyed by its file name without the ".hdf" extension
    df_dict[fi[:-4]] = pd.read_hdf(os.path.join(processed_dir, fi))

# Concatenate the replicates under a new outer index level "Data",
# then keep only the rows where TCellNumber is "100k" (the level is kept)
df_wt = pd.concat(df_dict, names=["Data"]).xs("100k", level="TCellNumber", drop_level=False)

In [None]:
# Peptide names; after filtering to the fitted peptides, their order in this
# list defines the integer labels used for the MI of the fit parameters.
peptides = ["N4", "Q4", "T4", "V4", "G4", "E1", "A2", "Y3", "A8"]
# Antigen concentrations (not referenced again in the visible part of the notebook)
concentrations = ["1uM","100nM","10nM","1nM"]

In [None]:
# Load the (min, max) pair saved with the trained network.
# NOTE(review): df_min and df_max are reassigned from the data itself in the
# normalization cell below, before being used, so the values loaded here are
# overwritten when the notebook runs top to bottom -- confirm this is intended.
df_min, df_max = pd.read_pickle(os.path.join("data", "trained-networks", "min_max-thomasRecommendedTraining.pkl"))

In [None]:
# Cytokines selected from the data columns
cytokines = ["IFNg", "IL-17A", "IL-2", "IL-4", "IL-6", "IL-10", "TNFa"]
# Time points to keep: integers from 1 to 72 inclusive
times = np.arange(1, 73)
print(cytokines)

In [None]:
# Select the "integral" and "concentration" features of the chosen cytokines
# and time points, with Time back in the index and the Feature column level dropped
df = (
    df_wt.unstack("Time")
    .loc[:, ("integral", cytokines, times)]
    .stack("Time")
    .droplevel("Feature", axis=1)
)
df_conc = (
    df_wt.unstack("Time")
    .loc[:, ("concentration", cytokines, times)]
    .stack("Time")
    .droplevel("Feature", axis=1)
)

# Normalize: min-max scaling of each column of the integrals
df_min, df_max = df.min(), df.max()
df = df.sub(df_min, axis="columns").div(df_max - df_min, axis="columns")

## Add E1 from other datasets
This peptide was not included in the HighMI_1 dataset because it consistently gives zero cytokine response, only measurement noise. Therefore, we import it from a few other datasets, since this null peptide category is important to get a proper estimate of mutual information. 

In [None]:
# Import a few datasets containing E1.
# Each file name prefix maps to the extra cross-section (key, level) to take
# after selecting E1, or to None when no extra selection is needed.
e1_sources = {
    "Activation_2": ("Naive", "ActivationType"),
    "CD25MutantTimeSeries": ("WT", "Genotype"),   # keep only OT-1 cell data
    "PeptideComparison_OT1_Timeseries_18": None,
}
processed_dir = os.path.join("data", "processed")
df_dict = {}
for fi in os.listdir(processed_dir):
    if not fi.endswith(".hdf"):
        continue
    for prefix, extra in e1_sources.items():
        if not fi.startswith(prefix):
            continue
        # Keep the E1 rows (the Peptide level stays in the index)
        df_e1 = pd.read_hdf(os.path.join(processed_dir, fi)).xs("E1", level="Peptide", drop_level=False)
        if extra is not None:
            df_e1 = df_e1.xs(extra[0], level=extra[1], drop_level=True)
        df_dict[fi[:-4]] = df_e1
        break

# Concatenate under an outer index level "Data" and keep the chosen cytokines
df_data_e1 = pd.concat(df_dict, names=["Data"]).loc[:, (slice(None), cytokines)]

In [None]:
# Separate the concentration and integral features of the E1 data
df_conc_e1, df_integ_e1 = (
    df_data_e1.xs(feature, level="Feature", axis=1)
    for feature in ("concentration", "integral")
)
# Scale the E1 integrals with the current df_min and df_max
df_integ_e1 = df_integ_e1.sub(df_min, axis="columns").div(df_max - df_min, axis="columns")

In [None]:
# Add the E1 concentrations to the HighMI_1 concentrations and sort the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_conc = pd.concat([df_conc, df_conc_e1]).sort_index()

# Functions to compute MI over a sliding time window

In [None]:
def compute_mi_slice(df, q, t, window, knn=3, speed="fast"):
    """ Compute the mutual information between the variable X in the columns of df
    and the discrete variable Q, a level in the index, using the samples in a
    time window centered at time t.

    The values of all columns of df are passed together, as one array with one
    row per sample, to the estimator; a pd.Series is treated as one column.
    NOTE(review): a previous version of this docstring stated that the MI was
    computed for each feature separately (sklearn implementation); the code
    now calls the project's own estimator on the whole array -- confirm how
    that estimator treats multi-column input.

    Args:
        df (pd.DataFrame or pd.Series): should have "Time" and q among
            its index levels. The Time values are assumed to be sorted.
        q (str): the name of the index level to use as labels specifying
            the values of the discrete RV.
        t (float): the time on which the time window is centered.
        window (float): duration of the time interval over which to
            keep the samples. If a single time point is desired, use 0.
        knn (int): number of nearest neighbors, passed as k to the estimator.
        speed (str): either "fast" or "slow" ("slow" is ref. code)

    Returns:
        mi (float): mutual information (the estimator is called with base=2)

    Raises:
        ValueError: if speed is neither "fast" nor "slow".
        KeyError: if q cannot be used as a level of the index.
    """
    estimators = {
        "fast": discrete_continuous_info_fast,
        "slow": discrete_continuous_info_ref,
    }
    if speed not in estimators:
        raise ValueError("Speed '{}' not available".format(speed))

    # Define the time window and slicer. Assuming the Time index is sorted.
    tpts = np.array(df.index.get_level_values("Time").unique())
    tlo = tpts[np.searchsorted(tpts, t - window/2, side="left")]
    # searchsorted can return the index next to the last; in that case, take the last.
    # NOTE(review): with side="right", thi is the first time point strictly
    # greater than t + window/2; whether it is included in the selection
    # depends on cpd.xs_slice -- confirm.
    ihi = np.searchsorted(tpts, t + window/2, side="right")
    thi = tpts[min(ihi, len(tpts) - 1)]
    time_slice = slice(tlo, thi)

    # Slice the df with our custom function
    df_t = cpd.xs_slice(df, "Time", time_slice, axis=0)
    if isinstance(df_t, pd.Series):
        df_t = df_t.to_frame()

    # Extract the labels about which the data should inform. Fail right here
    # with an explicit message, instead of printing and then hitting an
    # undefined variable further down.
    try:
        labels = df_t.index.get_level_values(q)
    except (KeyError, ValueError) as err:
        raise KeyError("{} not in {}".format(q, list(df_t.index.names))) from err
    # Integer code of each label, numbered in order of first appearance
    mapping = {a: i for i, a in enumerate(labels.unique())}
    target = np.asarray(labels.map(mapping))

    # Compute the MI!
    return estimators[speed](target, df_t.values, k=knn, base=2)
    

In [None]:
# Upper bound on the MI: the entropy of the discrete variable.
# Function taken from the code for phi-evo simulations with cytokines.
# The Peptide level values can be given directly as input; the number of
# occurrences of each label is counted.
def entropy(xvals):
    """ Entropy, in bits, of the empirical distribution of a sample
    of some discrete random variable X.

    Args:
        xvals (list of int): list of sampled values of the X variable
    Returns:
        info (float): the entropy of the distribution from the sample, in bits
    """
    # Count how many times each distinct value occurs
    counts = dict.fromkeys(set(xvals), 0)
    for x in xvals:
        counts[x] += 1

    # Normalize the counts into frequencies
    probs = np.array(list(counts.values()), dtype=float)
    probs = probs / np.sum(probs)

    # Only observed values are counted, so no frequency is zero and the log is finite
    entropy_nats = np.sum(-probs * np.log(probs))
    return entropy_nats / np.log(2)  # convert to bits

In [None]:
def find_nearest(my_array, target, condition):
    """ Find the index of the element of my_array that is nearest to target
    while being strictly above or strictly below it.

    Adapted from a nice function by Akavall on StackOverflow:
    https://stackoverflow.com/questions/17118350/how-to-find-nearest-value-that-is-greater-in-numpy-array
    Page consulted Feb 10, 2019.

    Args:
        my_array (array-like): numerical values to search; need not be sorted.
        target (float): the reference value.
        condition (str): "above" to search among the elements strictly greater
            than target, "below" to search among those strictly smaller.

    Returns:
        int or None: position in my_array of the nearest element satisfying
            the condition, or None if no element satisfies it.

    Raises:
        ValueError: if condition is neither "above" nor "below".
    """
    diff = np.asarray(my_array) - target
    if condition == "above":
        # We need to mask the negative differences and zero
        # since we are looking for values above
        mask = (diff <= 0)
    elif condition == "below":
        # We need to mask positive differences and zero
        mask = (diff >= 0)
    else:
        raise ValueError("condition should be 'above' or 'below', not '{}'".format(condition))
    if np.all(mask):
        return None  # no element on the requested side of target
    masked_diff = np.ma.masked_array(diff, mask)
    if condition == "above":
        # Remaining differences are positive: the nearest is the smallest one
        return masked_diff.argmin()
    # Remaining differences are negative: the nearest is the one closest
    # to zero, i.e. the largest (argmin would give the farthest element)
    return masked_diff.argmax()

def compute_mi_timecourse(dfs, q, overlap, window, knn=3, names=None, speed="fast"):
    """ Compute the MI over time between the variable in each df and
    the discrete variable q.

    Args:
        dfs (dict of pd.DataFrames): each df has one feature per column, and
            q and "Time" in its index.
        q (str): name of the index level to use as the discrete RV
        overlap (bool): whether to allow the use of overlapping time windows
            when computing the MI at successive time points.
        window (float): the time duration over which to take the samples to compute the
            MI at one time point. Set to 0 if the distribution is to be based on single time points.
        knn (int): number of nearest-neighbors to use.
        names (list of str): keys of dfs to process, in that order.
            If None (default), all keys of dfs, in the order of the dict.
        speed (str): "fast" or "slow" (where "slow" is the reference code)

    Returns:
        df_mi_courses (pd.DataFrame): MI values, indexed by "Time",
            with one column per processed key of dfs.
        maximum_mi (float): upper bound on the MI, computed as the entropy (in
            bits) of the values of level q in the first processed df.

    Raises:
        ValueError: if non-overlapping windows are requested but no time point
            of a df lies above its first time point plus window/2.
    """
    # Set an order in which the dfs will be processed
    names = list(dfs.keys()) if names is None else list(names)

    # Determine the upper bound on the MI: the entropy of the discrete variable q
    maximum_mi = entropy(dfs[names[0]].index.get_level_values(q))

    # Determine the times available in each df
    time_values = [dfs[a].index.get_level_values("Time").unique() for a in names]
    # Define the central times to consider
    if overlap or window == 0:  # we allow all times
        times = time_values
    else:  # we want different times at each evaluation
        times = []
        for name, t_array in zip(names, time_values):
            istart = find_nearest(t_array, t_array[0] + window/2, "above")
            if istart is None:
                raise ValueError(
                    "Window {} is too wide for the time points of '{}'".format(window, name))
            # Central times spaced by one window; np.arange stops before the last time point
            times.append(np.arange(t_array[istart], t_array[-1], window))

    # Compute the MI at each selected time point, for each df
    mi_courses = []  # Contains the MI time course of each df variable
    for name, central_times in zip(names, times):
        df = dfs[name]
        current_course = []
        for t in central_times:
            current_course.append(compute_mi_slice(df, q, t=t, window=window, knn=knn, speed=speed))
            if speed == "slow":
                print("Time {} h done".format(t))
        mi_courses.append(pd.Series(np.array(current_course), index=central_times))

    # One column per variable, one row per central time
    df_mi_courses = pd.concat(mi_courses, keys=names, names=["Variable", "Time"])
    df_mi_courses = df_mi_courses.unstack("Variable")

    return df_mi_courses, maximum_mi

## Compute MI for individual cytokines and the vector of five cytokines

In [None]:
# Variables for which the MI is computed: first the dataframe with all
# cytokine columns together, then each cytokine column on its own
all_variables_dfs = {
    "all cytokines": df_conc,
    **{c: df_conc[c] for c in cytokines},
}

In [None]:
# Display the names of the variables for which the MI will be computed
all_variables_dfs.keys()

In [None]:
# MI with the Peptide label over time, using non-overlapping windows (overlap=False).
# Number of NN: 3 neighbors times length of time window.  
df_mi_time, max_mi = compute_mi_timecourse(all_variables_dfs, q="Peptide", overlap=False, 
                      window=3, knn=3*3, speed="fast")

In [None]:
# Plot the MI time course of each variable. After stack + reset_index, the
# MI values are in the column named 0.
sns.relplot(data=df_mi_time.stack("Variable").reset_index(), x="Time", y=0, hue="Variable", kind="line", height=3)

## Save all results for further plotting

In [None]:
# Append the theoretical maximal MI (entropy of Q) to the dataframe, for reference.
# This is log_2(number of peptides) when all peptides have the same number of samples.
# The value is stored in the last row only; all other rows of the column are NaN.
df_mi_time["MaxMI"] = np.nan
# Single positional assignment on the dataframe itself: the chained form
# df["MaxMI"].iloc[-1] = ... may write to a temporary copy and leave df unchanged.
df_mi_time.iloc[-1, df_mi_time.columns.get_loc("MaxMI")] = max_mi
df_mi_time

In [None]:
# Uncomment to save; data used for main figure 1E
# df_mi_time.to_hdf("output/miStatistics-HighMI_1-all-cytokines.hdf", key="df")

# MI estimation for latent space variables
LS$_1$ and LS$_2$ taken together preserve all information available in the five cytokines. 

In [None]:
# Load the trained network weights and the (min, max) pair saved with them
mlp_coefs = np.load(os.path.join("data", "trained-networks", "mlp_coefs-thomasRecommendedTraining.npy"))
df_min, df_max = pd.read_pickle(os.path.join("data", "trained-networks", "min_max-thomasRecommendedTraining.pkl"))
# Add the scaled E1 integrals to the scaled HighMI_1 integrals.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df2 = pd.concat([df, df_integ_e1])
# Keep only the cytokines present in the index of the loaded df_min
df2 = cpd.xs_slice(df2, "Cytokine", df_min.index.get_level_values("Cytokine").unique().tolist(), axis=1)
# Rename Data to Replicate, add Data level
lvl_names = df2.index.names
df2.index = df2.index.set_names(["Replicate"]+lvl_names[1:])
# Replace each replicate label by its rank of appearance, as a string
df2 = df2.rename({a:str(i) for i, a in enumerate(df2.index.get_level_values("Replicate").unique())})
df2 = pd.concat({"HighMI_1": df2}, names=["Data"])
df2 = df2.sort_index()
# Project onto the two latent space nodes with the network weights
df_proj = pd.DataFrame(np.dot(df2, mlp_coefs), index=df2.index, columns=["Node 1", "Node 2"])

In [None]:
# Latent space variables: each node on its own, then both nodes together
all_variables_dfs_latent = {
    "LS1": df_proj["Node 1"],
    "LS2": df_proj["Node 2"],
    "2 LS": df_proj
}
# Same settings as for the cytokines (window=3, knn=3*3, non-overlapping windows)
df_mi_latent, max_mi_latent = compute_mi_timecourse(all_variables_dfs_latent, q="Peptide",
                       overlap=False, window=3, knn=3*3, speed="fast")
# Largest MI over time for each latent variable
print(df_mi_latent.max(axis=0))

In [None]:
# Plot the MI time course of each latent variable. After stack + reset_index,
# the MI values are in the column named 0.
sns.relplot(data=df_mi_latent.stack("Variable").reset_index(), x="Time", y=0, hue="Variable", kind="line", height=2.)

# MI estimation for $v_0$, $t_0$, $\theta$
Fit the constant velocity parameters on each time series, then compute MI between that description of cytokine time kinetics and antigen quality. 

In [None]:
from ltspcyt.scripts.sigmoid_ballistic import return_param_and_fitted_latentspace_dfs
# Parameter names of each model, keyed by model name
# (presumably matching the columns of the fitted parameter dataframe -- confirm)
fit_vars = {"Constant velocity":["v0", "t0", "theta", "vt"], 
           "Sigmoid_freealpha":["a0", "t0", "theta", "v1", "alpha", "beta"]}

In [None]:
# Add Gaussian noise (standard normal samples multiplied by 0.01) to the E1 rows of df2.
# Assumes that Peptide is the fourth level of the df2 index -- TODO confirm.
# NOTE(review): df_proj was computed from df2 before this cell, and the fit
# below is given df_proj, so when the notebook runs top to bottom this noise
# does not reach the fitted data -- confirm whether df_proj should be recomputed.
# NOTE(review): the global numpy RNG is used without a seed here, so the
# noise differs between runs.
e1_key = (slice(None), slice(None), slice(None), "E1")
df2.loc[e1_key, :] = df2.loc[e1_key, :] + 0.01*np.random.normal(size=df2.loc[e1_key].size).reshape(df2.loc[e1_key].shape)

In [None]:
# Fitting
# Model to fit (a key of fit_vars) and value passed as reg_rate to the fit
choice_model = "Constant velocity"
regul_rate = 1.0

# Here, we need to reject negative v2v1 slopes, this improves the constant velocity fit for mouse1-replicate4
ret = return_param_and_fitted_latentspace_dfs(df_proj, choice_model, reg_rate=regul_rate, reject_neg_slope=True)
# The function returns four objects; only df_params is used in the cells below
df_params, df_compare, df_hess, df_v2v1 = ret

nparameters = len(fit_vars[choice_model])
# Keep only the peptides that are present in the fitted parameters, in their original order
peptides = [a for a in peptides if a in df_params.index.get_level_values("Peptide").unique()]

In [None]:
# Display the fitted parameters
df_params

In [None]:
# Pair plot of the first three parameters of the chosen model, colored by peptide
var_choice = fit_vars[choice_model][:3]

# One fixed color per peptide.
# NOTE(review): "A8" is in the peptides list defined earlier but not in this
# palette order, so it is left out of hue_order -- confirm this is intended.
pep_palette_order = ["N4", "Q4", "T4", "V4", "G4", "E1", "A2", "Y3"]
palette = sns.color_palette(n_colors=len(pep_palette_order))
pep_palette = dict(zip(pep_palette_order, palette))

# Restrict the hue order to the peptides present in the fitted parameters
hue_order = [pep for pep in pep_palette_order
             if pep in df_params.index.get_level_values("Peptide").unique()]
sns.pairplot(
    data=df_params.iloc[:, :4].reset_index(),
    hue="Peptide",
    hue_order=hue_order,
    palette=list(map(pep_palette.get, hue_order)),
    vars=var_choice,
)

In [None]:
# Remove clear outliers: for V4, then for E1, drop the rows of that peptide
# whose theta is not below pi/2.
# The product is evaluated before the comparison: it is theta for rows of the
# peptide and 0 * theta for all other rows, so rows of other peptides with a
# finite theta always pass the test.
for pep in ("V4", "E1"):
    is_pep = df_params.index.isin([pep], level="Peptide")
    df_params = df_params.loc[(is_pep * df_params["theta"]) < np.pi/2]

In [None]:
# Samples: the first three fit parameters of each fitted time series
var_choice = fit_vars[choice_model][:3]
vals = df_params[var_choice].values
# Labels: position of each peptide in the peptides list
pep_map = {pep: i for i, pep in enumerate(peptides)}
target = df_params.index.get_level_values("Peptide").map(pep_map)
# Number of knn: equals to number used before (3) per time point, for fair comparison
mi_v0t0theta = discrete_continuous_info_fast(target, vals, k=3, base=2)

In [None]:
# Result: MI between the fit parameters and the peptide label
print(mi_v0t0theta)  # bits