# Channel capacity for parameter space with the Kraskov MI estimator
This notebook uses a package called [cce](https://github.com/pawel-czyz/channel-capacity-estimator), which was developed for the paper [Grabowski et al., 2019](https://dx.doi.org/10.1098/rsif.2018.0792):
> Grabowski, Czyz, Kochanczyk, and Lipniacki. "Limits to the rate of information transmission through the MAPK pathway" J. R. Soc., Interface, 2019. 

It combines the bin-less MI estimator of
> Kraskov, Stögbauer, and Grassberger. "Estimating mutual information", *Phys. Rev. E*, 2004. 

with stochastic gradient descent as implemented in TensorFlow. It takes sample points (possibly multidimensional) with different labels and maximizes the MI between the points and the labels. It is much faster than our Blahut-Arimoto (B-A) algorithm with Monte Carlo integration, but that is because it only uses the 36 sample points per peptide that we have. Our B-A algorithm is instead based on multivariate normal distributions that are fitted and interpolated to give access to more closely spaced EC$_{50}$ values; the integrals involved take a lot longer to compute. 

**To run this notebook, you need to install the cce package on your computer, following the instructions on its GitHub page**: https://github.com/pawel-czyz/channel-capacity-estimator

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sp
import seaborn as sns

from datetime import date
import json
from time import perf_counter
import os

# YOU NEED TO INSTALL CCE: https://github.com/pawel-czyz/channel-capacity-estimator
import cce

## Import the data and prepare it for cce

In [None]:
# Dataset selection. Values listed by the original author for `folder`:
# "100ktcn", "alltcn", "highmi" (and the current "highmi13").
folder = "highmi13"
tcn = "30k"
# Suffix appended to output file names: "_TCNall" for the "alltcn" folder,
# otherwise "_TCN" followed by the `tcn` tag.
if folder == "alltcn":
    suffix = "_TCNall"
else:
    suffix = "_TCN" + tcn

In [None]:
# Load the DataFrame of fitted model parameters saved by the
# fit_ballistic_model.ipynb notebook. If you fit model parameters (any model)
# on another dataset, point this path to the corresponding file instead.
params_path = os.path.join("results", "fits", "df_params_Sigmoid_freealpha_HighMI_13.hdf")
# Keep only one copy of any duplicated rows.
df_params = pd.read_hdf(params_path).drop_duplicates()

In [None]:
# Label of each row: its value in the "Peptide" index level.
peptide_labels = df_params.index.get_level_values("Peptide")
# Sample point of each row: all columns whose name does not start with "var".
good_cols = [name for name in df_params.columns if not name.startswith("var")]
sample_points = df_params.loc[:, good_cols].values
# List of (label, sample point) pairs; this is the object handed to cce later.
data_cce = list(zip(peptide_labels, sample_points))

In [None]:
# Quick look at the prepared data: print every 100th (label, point) pair,
# one per line.
print(*data_cce[::100], sep="\n")

In [None]:
# Read the potencies table from its JSON file and display it as the cell output.
potencies_path = os.path.join("data", "potencies_df_2021.json")
potencies = pd.read_json(potencies_path)
potencies

In [None]:
# Unique peptide labels, in their order of first appearance in df_params.
peptide_order = df_params.index.get_level_values("Peptide").unique()
# Row-wise mean of log10(potencies), reordered to follow peptide_order and
# rounded to two decimals.
mean_log_potencies = np.log10(potencies).mean(axis=1)
ec50s_logs = np.around(mean_log_potencies.loc[peptide_order].values, decimals=2)

## Run the cce algorithm

In [None]:
# Number of nearest neighbors for the Kraskov estimator; this seems an
# appropriate value for 36 replicates per peptide.
knn = 6

# Time the whole estimation: estimator construction plus MI maximization.
start_t = perf_counter()
estimator = cce.WeightedKraskovEstimator(data_cce)
res = estimator.calculate_maximized_mi(k=knn)
run_duration = perf_counter() - start_t

print(res)

In [None]:
# Collect information about the run so it can be saved in a JSON file.
# res[0] is used as the capacity (in bits) and res[1] as a mapping from each
# peptide label to its probability in the optimal input distribution.
capacity_bits = float(res[0])
optim_input_distrib = [float(res[1][peptide]) for peptide in peptide_order]
n_inputs = len(optim_input_distrib)
input_values = ec50s_logs
reltol = 0.04
# Date stamp formatted as day-month abbreviation-year, lower-cased.
today = f"{date.today():%d-%b-%Y}".lower()

run_info = {
    "date": today,
    "capacity_bits": capacity_bits,
    "input_values": list(input_values.astype(float)),
    "input_peptides": list(peptide_order),
    "optim_input_distrib": optim_input_distrib,
    "run_duration (s)": run_duration,
    "relative tolerance": reltol,
    "absolute_error": reltol*capacity_bits,
    "n_inputs": n_inputs,
    "k_nearest_neighbors": knn,
}

# Log file name encodes the number of inputs, the tolerance, the dataset
# suffix and the date.
log_name = f"cce_run_log_{n_inputs}ins_rtol{reltol:.0e}{suffix}_{today}.json"
filename = os.path.join("results", folder, log_name)

# Uncomment to save the result of the CCE calculation
#with open(filename, "w") as hand:
#    json.dump(run_info, hand)

## Analyze the results

In [None]:
# Bar plot (histogram-like) of the optimal input distribution
fig, ax = plt.subplots()

# Put the inputs in increasing order of log10(EC50) and reorder the
# probabilities to match.
sortxpos = np.argsort(ec50s_logs)
xpos = np.asarray(ec50s_logs)[sortxpos]
distrib = np.asarray(optim_input_distrib)[sortxpos]

# Bar edges: midpoints between neighbouring inputs, plus placeholder outer
# edges that are overwritten right below.
midpoints = 0.5 * (xpos[1:] + xpos[:-1])
separators = np.concatenate([[0.], midpoints, [1.]])
# The first and last bars extend equally on both sides of their input value;
# the leftmost edge is additionally clamped so it is never below 0.
separators[0] = max(0., xpos[0] - (separators[1] - xpos[0]))
separators[-1] = xpos[-1] + (xpos[-1] - separators[-2])
widths = np.diff(separators)

ax.bar(separators[:-1], height=distrib, width=widths, align="edge",
       edgecolor="k", linewidth=1.)

# Axis labels
ax.set_xlabel(r"$\log_{10}{(\mathrm{EC}_{50})}$ [-]", size=14)
ax.set_ylabel("Probability [-]", size=14)

# Custom x ticks: one per peptide, labelled with its name and rounded value
tick_labels = [peptide + "\n" + str(round(log_ec50, 1))
               for peptide, log_ec50 in zip(peptide_order, ec50s_logs)]
ax.tick_params(which="both", labelsize=12)
ax.set_xticks(ec50s_logs)
ax.set_xticklabels(tick_labels)

# Report the capacity and the relative tolerance (in percent) on the plot
capacity_text = rf"C = {res[0]:.4f} bits $\pm$ {reltol*100:.2f} %"
ax.annotate(capacity_text, xy=(0.2, 0.9), xycoords="axes fraction", size=12)

#fig.savefig(os.path.join("figures", folder, 
#    "cce_optimal_logec50_distrib_{}inputs_rtol{:.0e}{}.pdf".format(n_inputs, reltol, suffix)))
plt.show()
plt.close()