# Training classifiers with IL-4 and IL-10 included
We train on the time integrals of the logarithms of the normalized cytokine concentrations, as we do by default. 
We now include cytokines IL-4 and IL-10. The goal is to show that:
 1. The structure of the latent space is essentially unchanged
 2. The classifier's accuracy is not significantly higher even on training data
 3. These cytokines are highly variable from experiment to experiment

Either the variability makes the prediction very poor on some datasets, or the classifier learns to ignore these cytokines (by giving them low weights). In either case, this gives a way to explain why they are not important. 

It remains to be seen if those hypotheses are correct. The code below checks the answer. 

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import psutil, pickle
import itertools
import os, sys
main_dir_path = os.path.abspath('../')
if main_dir_path not in sys.path:
    sys.path.insert(0, main_dir_path)

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl


# Processing: min-max scaling of the data
from sklearn.preprocessing import PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

# Custom scripts
import utils.custom_pandas as cpd
from utils.mi_time_window import compute_mi_timecourse
from utils.discrete_continuous_info import discrete_continuous_info_fast
from utils.process_raw_data_choices import process_file_choices, select_naive_data
from utils.multiprocess_training import slice_level

In [None]:
# CPU count for multiprocessing with the right number of jobs
# logical=False asks psutil for physical cores only, not logical
# (hyper-threaded) ones.
# NOTE(review): cpu_count is not referenced in the cells visible here.
cpu_count = psutil.cpu_count(logical=False)

In [None]:
%matplotlib inline

# Plot parameters for Science
plt.rcParams["figure.figsize"] = (2.5, 2.)
plt.rcParams["axes.labelsize"] = 8.
plt.rcParams["legend.fontsize"] = 8.
plt.rcParams["axes.labelpad"] = 0.5
plt.rcParams["xtick.labelsize"] = 7.
plt.rcParams["ytick.labelsize"] = 7.
plt.rcParams["legend.title_fontsize"] = 8.
plt.rcParams["axes.titlesize"] = 8.
plt.rcParams["font.size"] = 8.

# For larger display of small graphs in the notebook
plt.rcParams['figure.dpi'] = 120

In [None]:
# Import and process cytokine data
def _load_integral_features(dataset_names, peptides):
    """Load processed datasets and keep the integral features of some peptides.

    Each dataset is read from <main_dir_path>/data/processed/<name>.hdf and
    passed through select_naive_data. The datasets are concatenated under a
    new outer index level called "Data". Only the "integral" entry of the
    "Feature" column level is kept, and only the rows whose "Peptide" index
    level is in `peptides`.
    """
    loaded = {}
    for name in dataset_names:
        path = os.path.join(main_dir_path, "data", "processed", name+".hdf")
        loaded[name] = select_naive_data(pd.read_hdf(path))
    stacked = pd.concat(loaded, names=["Data"])
    integrals = stacked.xs("integral", level="Feature", axis=1)
    return integrals.loc[integrals.index.isin(peptides, level="Peptide")]


# Peptides used for training; the test data are restricted to them as well
training_peps = ["N4", "Q4", "T4", "V4", "G4", "E1"]

train_sets = [
    'PeptideComparison_OT1_Timeseries_18',
    'PeptideComparison_OT1_Timeseries_19',
    'NewPeptideComparison_OT1_Timeseries_20',
    'PeptideTumorComparison_OT1_Timeseries_1',
    'TCellNumber_OT1_Timeseries_7',
    'Activation_Timeseries_1'
]
df_train = _load_integral_features(train_sets, training_peps)

test_sets = [
    #'Activation_2',  # Does not line up with usual data
    'Activation_TCellNumber_1',
    'CD25MutantTimeSeries_OT1_Timeseries_2',
    #'PeptideComparison_OT1_Timeseries_21',  # Does not line up
    'PeptideComparison_OT1_Timeseries_22',
    'PeptideComparison_OT1_Timeseries_23',
    'HighMI_1-1', 'HighMI_1-2',
    'HighMI_1-3', 'HighMI_1-4'
]
df_test = _load_integral_features(test_sets, training_peps)

# Longer peptide list, used further down to order the hues in plots
allpeps_decreasing_qual = ["N4", "A2", "Y3", "Q4", "T4", "Q7", "A8", "V4", "G4", "E1"]

In [None]:
# Global variables common to the two functions below
# Put them in one initialization function called by each other function
# to get a consistent set of peptides, concentrations and cytokines. 
def init_peps_cytos_concs():
    """Return the peptides, cytokines and concentrations shared by the helpers.

    Returns
    -------
    train_peptides : list of str
        Training peptides in the order kept from the original code: the
        N4-to-E1 listing reversed, so that E1, the weakest, comes first.
    keep_cytokines : list of str
        The seven cytokines used in this notebook (IL-4 and IL-10 included),
        sorted with Python's default string ordering.
    keep_conc : list of str
        Concentrations to keep, listed from "1uM" down to "1nM".
    """
    train_peptides = list(reversed(["N4", "Q4", "T4", "V4", "G4", "E1"]))
    keep_cytokines = sorted(
        ["IFNg", "IL-17A", "IL-2", "IL-6", "TNFa", "IL-4", "IL-10"]
    )
    keep_conc = ["1uM", "100nM", "10nM", "1nM"]
    return train_peptides, keep_cytokines, keep_conc

def train_classifier(data, hidden_sizes=(2,), seed=None, activ="tanh"):
    """Fit an MLP classifier that predicts the peptide of each row of `data`.

    The class label of a peptide is its position in the peptide list returned
    by init_peps_cytos_concs; only peptides present in the "Peptide" index
    level of `data` receive a label.

    Parameters
    ----------
    data : DataFrame whose index has a "Peptide" level; used as features.
    hidden_sizes : tuple passed as MLPClassifier's hidden_layer_sizes.
    seed : passed as MLPClassifier's random_state.
    activ : passed as MLPClassifier's activation.

    Returns
    -------
    (mlp, score) : the fitted classifier and mlp.score on the training data.
    """
    ordered_peptides = init_peps_cytos_concs()[0]
    peptide_level = data.index.get_level_values("Peptide")
    present = peptide_level.unique()
    label_of = {}
    for label, pep in enumerate(ordered_peptides):
        if pep in present:
            label_of[pep] = label

    # Integer class of every row
    y = peptide_level.map(label_of)

    mlp = MLPClassifier(
        hidden_layer_sizes=hidden_sizes,
        activation=activ,
        solver="adam",
        alpha=0.01,
        learning_rate="adaptive",
        max_iter=5000,
        random_state=seed,
    )
    mlp.fit(data, y)
    return mlp, mlp.score(data, y)


def test_classifier(mlp, data):
    """Score a fitted classifier on `data`.

    Rows are labelled as in training: each peptide present in the "Peptide"
    index level of `data` gets its position in the peptide list returned by
    init_peps_cytos_concs.

    Returns
    -------
    The value of mlp.score(data, labels).
    """
    ordered_peptides = init_peps_cytos_concs()[0]
    peptide_level = data.index.get_level_values("Peptide")
    present = peptide_level.unique()
    label_of = {
        pep: label
        for label, pep in enumerate(ordered_peptides)
        if pep in present
    }
    return mlp.score(data, peptide_level.map(label_of))


def test_classifier_bootstrap(mlp, data, n_rep=100, seed=None):
    """Bootstrap the score of a fitted classifier on `data`.

    Every replicate draws len(data) row positions uniformly *with
    replacement* (a full-size bootstrap resample, not an 80 % subsample) and
    evaluates mlp.score on the resampled rows and labels.

    Parameters
    ----------
    mlp : fitted classifier exposing score(X, y).
    data : DataFrame whose index has a "Peptide" level; used as features.
    n_rep : int
        Number of bootstrap replicates.
    seed : int or None
        With None (default) the positions are drawn from NumPy's global
        random state, exactly as before this argument existed. Otherwise a
        dedicated np.random.RandomState(seed) is used, which makes the
        replicates reproducible.

    Returns
    -------
    np.ndarray holding one score per replicate.
    """
    train_peptides = init_peps_cytos_concs()[0]
    present = data.index.get_level_values("Peptide").unique()
    peptide_dict = {k: v for v, k in enumerate(train_peptides) if k in present}

    # Labels as a Series so that they can be resampled by position
    y = data.index.get_level_values("Peptide").map(peptide_dict).to_series()

    rng = np.random if seed is None else np.random.RandomState(seed)
    positions = np.arange(len(y))
    scores = []
    for _ in range(n_rep):
        chosen = rng.choice(positions, replace=True, size=len(positions))
        scores.append(mlp.score(data.iloc[chosen], y.iloc[chosen]))
    return np.asarray(scores)

def crossvalidate_classifier(data, hidden_sizes=(2,), seed=None, activ="tanh"):
    """Cross-validate (cv=5) an MLP classifier predicting the peptide of each row.

    Labels are built as in train_classifier: each peptide present in the
    "Peptide" index level of `data` gets its position in the peptide list
    returned by init_peps_cytos_concs. The estimator has the same
    hyperparameters as in train_classifier.

    Returns
    -------
    The dictionary returned by sklearn's cross_validate, including train
    scores (return_train_score=True) but not the fitted estimators.
    """
    ordered_peptides = init_peps_cytos_concs()[0]
    peptide_level = data.index.get_level_values("Peptide")
    present = peptide_level.unique()
    label_of = {
        pep: label
        for label, pep in enumerate(ordered_peptides)
        if pep in present
    }
    y = peptide_level.map(label_of)

    estimator = MLPClassifier(
        hidden_layer_sizes=hidden_sizes,
        activation=activ,
        solver="adam",
        alpha=0.01,
        learning_rate="adaptive",
        max_iter=5000,
        random_state=seed,
    )
    return cross_validate(
        estimator, data.values, y,
        cv=5, return_train_score=True, return_estimator=False,
    )


def process_train_dsets(df, tslice=slice(1, 71)):
    """Filter the training data and min-max scale every column.

    Keeps the cytokines, peptides and concentrations returned by
    init_peps_cytos_concs, the "100k" entries of the "TCellNumber" level
    (that level is kept in the index), and applies slice_level with `tslice`
    on the "Time" level.

    Returns
    -------
    (scaled, extrema) : the scaled DataFrame, and a DataFrame with columns
    "min" and "max" (columns level named "Extremum") holding the per-column
    extrema used for scaling, so the scaling can be reversed or re-applied.
    """
    peptides, cytokines, concs = init_peps_cytos_concs()

    # Columns: relevant cytokines only
    kept = df.loc[:, df.columns.isin(cytokines, level="Cytokine")]
    # Rows: training peptides only
    kept = kept.loc[kept.index.isin(peptides, level="Peptide")]
    # Rows: 100k T cells only, which was done manually in the original
    # network's training
    kept = kept.xs("100k", level="TCellNumber", drop_level=False)
    # Rows: typical concentrations
    kept = kept.loc[kept.index.isin(concs, level="Concentration")]
    kept = slice_level(kept, tslice, target_lvl="Time")

    # Column-wise min-max normalization
    lower = kept.min(axis=0)
    upper = kept.max(axis=0)
    scaled = (kept - lower) / (upper - lower)
    extrema = pd.concat({"min": lower, "max": upper}, names=["Extremum"], axis=1)
    return scaled, extrema

def process_test_dsets(df, dfminmax, tslice=slice(1, 71)):
    """Filter the test data like the training data and scale with given extrema.

    The same selections as in process_train_dsets are applied (cytokines,
    peptides, "100k" T cells, concentrations, slice_level on "Time"), but the
    scaling uses the "min" and "max" columns of `dfminmax` instead of the
    extrema of `df` itself.

    Returns
    -------
    The filtered and scaled DataFrame.
    """
    peptides, cytokines, concs = init_peps_cytos_concs()

    # Columns: relevant cytokines only
    kept = df.loc[:, df.columns.isin(cytokines, level="Cytokine")]
    # Rows: training peptides only
    kept = kept.loc[kept.index.isin(peptides, level="Peptide")]
    # Rows: 100k T cells only, which was done manually in the original
    # network's training
    kept = kept.xs("100k", level="TCellNumber", drop_level=False)
    # Rows: typical concentrations
    kept = kept.loc[kept.index.isin(concs, level="Concentration")]
    kept = slice_level(kept, tslice, target_lvl="Time")

    # Normalize with the training min and max
    lower = dfminmax["min"]
    upper = dfminmax["max"]
    return (kept - lower) / (upper - lower)

In [None]:
# Training
# Slice forwarded to slice_level on the "Time" level (equal to the default
# tslice of process_train_dsets).
train_time_slice = slice(1, 71)
# df_minmax keeps the per-column extrema of the training data so that the
# test data can be scaled with the same factors.
df_train_norm, df_minmax = process_train_dsets(df_train, tslice=train_time_slice)

# Now train and cross-validate a classifier
# Seed of the single reference fit on the whole training set
sd = 998243
print("Starting training...")
classif, train_score = train_classifier(df_train_norm, hidden_sizes=(2,), seed=sd, activ="tanh")
# This score is printed multiplied by 100, whereas the cross-validation
# scores below are printed as returned by cross_validate.
print("Finished training once; training score: {}".format(100*train_score))
print("Starting cross-validation...")
# seed=None: the classifiers fitted during cross-validation are not seeded.
cv_scores = crossvalidate_classifier(df_train_norm, hidden_sizes=(2,), seed=None, activ="tanh")
print("Cross-validation scores:", cv_scores["test_score"].mean(), "pm", cv_scores["test_score"].std())
print("Train scores during crossval:", cv_scores["train_score"].mean(), "pm", cv_scores["train_score"].std())

In [None]:
# Testing
# Scale the test data with the training extrema; the default time slice of
# process_test_dsets, slice(1, 71), is used.
df_test_norm = process_test_dsets(df_test, df_minmax)
# Scores of the reference classifier over bootstrap resamples of the test
# rows (default n_rep=100); mean and standard deviation are reported.
test_scores = test_classifier_bootstrap(classif, df_test_norm)
print("Test score:", test_scores.mean(), "pm", test_scores.std())


As we see, the test score is equal to the usual test score obtained when IL-4 and IL-10 are not included. 
For comparison, see the heatmap cube showing scores for log-smooth-integral-processed data, which gives $\sim$65 % for the test score, although the training score there ($\sim$71 %) was slightly lower. 
This clearly means the classifier overfits on IL-4/IL-10 noise! 
Even the cross-validation score above indicates that there is overfitting. 

# Compare to not including IL-4 and IL-10
Re-run this training here to make a plot summarizing the two types of training. 

In [None]:
# Training
train_time_slice = slice(1, 71)
# Drop IL-4 and IL-10 by selecting the five other cytokine columns before
# the usual processing.
df_train_5 = df_train.loc[:, ["IFNg", "IL-17A", "IL-2", "IL-6", "TNFa"]]
# Extrema are recomputed for this reduced set of columns
df_train_norm_5, df_minmax_5 = process_train_dsets(df_train_5, tslice=train_time_slice)

# Now train and cross-validate a classifier
# NOTE(review): this seed differs from the one used for the fit with seven
# cytokines (998243) -- confirm that this is intended for the comparison.
sd = 90
print("Starting training...")
classif_5, train_score_5 = train_classifier(df_train_norm_5, hidden_sizes=(2,), seed=sd, activ="tanh")
print("Finished training once; training score: {}".format(100*train_score_5))
print("Starting cross-validation...")
# seed=None: the classifiers fitted during cross-validation are not seeded.
cv_scores_5 = crossvalidate_classifier(df_train_norm_5, hidden_sizes=(2,), seed=None, activ="tanh")
print("Cross-validation scores:", cv_scores_5["test_score"].mean(), "pm", cv_scores_5["test_score"].std())
print("Train scores during crossval:", cv_scores_5["train_score"].mean(), "pm", cv_scores_5["train_score"].std())

In [None]:
# Testing
# Same five cytokine columns as for the training data
df_test_5 = df_test.loc[:, ["IFNg", "IL-17A", "IL-2", "IL-6", "TNFa"]]
# Scale with the extrema of the five-cytokine training data
df_test_norm_5 = process_test_dsets(df_test_5, df_minmax_5)
# Bootstrap scores of the five-cytokine classifier on the test data
test_scores_5 = test_classifier_bootstrap(classif_5, df_test_norm_5)
print("Test score:", test_scores_5.mean(), "pm", test_scores_5.std())

## Plot for scores comparison

In [None]:
# Nomograph with train, cross-validation, and test scores
def _summarize_scores(cv, full_train_score, bootstrap_scores):
    """Return (means, stds) of the train, cross-validation and test scores.

    The train entry pools the cross-validation train scores with the score of
    the single fit on the whole training set; the pooled array is also stored
    in cv["all_train_score"].
    """
    cv["all_train_score"] = np.concatenate(
        [cv["train_score"], np.asarray([full_train_score])]
    )
    groups = (cv["all_train_score"], cv["test_score"], bootstrap_scores)
    means = np.asarray([grp.mean() for grp in groups])
    stds = np.asarray([grp.std() for grp in groups])
    return means, stds


all_scores_7, errors_scores_7 = _summarize_scores(cv_scores, train_score, test_scores)
all_scores_5, errors_scores_5 = _summarize_scores(cv_scores_5, train_score_5, test_scores_5)

In [None]:
# Train / cross-validation / test scores (in %) of the two classifiers, with
# the standard deviations computed in the previous cell as error bars.
fig, ax = plt.subplots()
color1, color2 = "xkcd:coral", "xkcd:royal blue"
xticks = np.arange(len(all_scores_7))
ax.errorbar(xticks, all_scores_7*100, yerr=errors_scores_7*100, marker="o", ms=6, 
            color=color1, label="With IL-4 & IL-10", mfc=color1, mec=color1)
# The second series is shifted by 0.02 at the last two ticks, presumably to
# keep overlapping markers and error bars distinguishable.
ax.errorbar([xticks[0], xticks[1]+0.02, xticks[2]+0.02], all_scores_5*100, yerr=errors_scores_5*100, marker="o", ms=6,
            color=color2, mfc=color2, mec=color2, label="No IL-4 & IL-10")
ax.set_xticks(xticks)
ax.set_xticklabels(["Train", "Cross-validation", "Test"])
ax.set_ylabel("Score (%)")
ax.legend(frameon=True, edgecolor=(1, 1, 1, 0), framealpha=0.9)

fig.tight_layout()
# NOTE(review): the file name below starts with "l4_il10", while the latent
# space figure further down uses "il4_il10" -- confirm which is intended.
# fig.savefig(os.path.join(main_dir_path, "figures", "latentspaces", 
# "l4_il10_train_crossval_test_scores.pdf"), transparent=True)
plt.show()
plt.close()

## Plot the training results

In [None]:
def project_to_latent(df_cy, classif, apply_offsets=False, apply_tanh=False):
    """Project features onto the first hidden layer of a fitted classifier.

    Parameters
    ----------
    df_cy : DataFrame with one column per input feature of the classifier.
    classif : fitted classifier exposing coefs_ and intercepts_; only the
        first entry of each (input-to-hidden weights and biases) is used.
    apply_offsets : bool
        If True, add the hidden-layer biases to the projection.
    apply_tanh : bool
        If True, apply tanh to the (optionally offset) projection.

    Returns
    -------
    DataFrame with the index of `df_cy` and one column per hidden node,
    labelled "LS1", "LS2", ... in a columns index named "Latent variable".
    """
    # (n_features, n_hidden) weights, so dot on the right of data
    projmat = np.asarray(classif.coefs_[0])

    df_ls = df_cy.dot(projmat)
    # Label by hidden-layer width: ["LS1", "LS2"] for the two-node networks
    # used here, and no length mismatch for other widths.
    labels = ["LS{}".format(i) for i in range(1, projmat.shape[1] + 1)]
    df_ls.columns = pd.Index(labels, name="Latent variable")
    if apply_offsets:
        df_ls += np.asarray(classif.intercepts_[0]).reshape(1, -1)
    if apply_tanh:
        df_ls = np.tanh(df_ls)
    return df_ls

In [None]:
# With one flip and a rotation we can always get the desired orientation. 
# Flip if the initial angle for N4 is smaller (counterclockwise) than for other peptides
# Then rotate by 0, 90, 180 or 270 until N4 trajectories have positive LS1 and LS2 initial slope. 
# Return the transformed LS and the transformations. 
def determine_flip_rotate_latentspace(df_ls):
    """Find the LS1 flip and number of quarter turns orienting a latent space.

    The polar angle of each peptide is computed from its mean (LS1, LS2)
    position. LS1 keeps its sign (flip 1) if the N4 angle minus the Q4 angle,
    wrapped to [-pi, pi], is positive; otherwise LS1 is flipped (flip -1).
    The number of rotations is the index of the quadrant in which N4 lies
    after that flip (0 for the upper-right quadrant).

    Parameters
    ----------
    df_ls : DataFrame with columns "LS1" and "LS2", groupable by "Peptide",
        containing at least the peptides "N4" and "Q4".

    Returns
    -------
    (flip1, number_rots_minus90) : flip factor (1 or -1) and an int number
    of 90-degree rotations.
    """
    full_turn = 2*np.pi
    mean_ls = df_ls.groupby(["Peptide"]).mean()

    polar = np.arctan2(mean_ls["LS2"], mean_ls["LS1"]) % full_turn
    n4_minus_q4 = polar["N4"] - polar["Q4"]
    # Wrap the angle difference back into [-pi, pi]
    if n4_minus_q4 < -np.pi:
        n4_minus_q4 += full_turn
    elif n4_minus_q4 > np.pi:
        n4_minus_q4 -= full_turn
    flip1 = 1 if n4_minus_q4 > 0 else -1

    # Flipping LS1 changes the angles: count quarter turns AFTER the flip
    polar_flipped = np.arctan2(mean_ls["LS2"], mean_ls["LS1"]*flip1) % full_turn
    number_rots_minus90 = int(polar_flipped["N4"] // (np.pi/2))

    return (flip1, number_rots_minus90)

def apply_flip_rotate_latentspace(df_ls, flip1, n_rotations90):
    """Flip LS1 and rotate a latent space by a number of quarter turns.

    Parameters
    ----------
    df_ls : DataFrame with two columns, the first one labelled "LS1".
    flip1 : factor multiplying the "LS1" column (1 or -1).
    n_rotations90 : int
        Number of 90-degree clockwise rotations applied after the flip.

    Returns
    -------
    A new DataFrame with the columns of `df_ls`; `df_ls` is not modified.
    """
    # Multiplying row vectors on the right by this matrix rotates them
    # clockwise by 90 degrees: (x, y) -> (y, -x)
    quarter_turn = np.asarray([[0, -1],
                               [1,  0]])
    transform = np.eye(2, dtype=int)
    for _ in range(n_rotations90):
        transform = transform @ quarter_turn

    flipped = df_ls.copy()
    flipped["LS1"] = flipped["LS1"] * flip1
    rotated = flipped.dot(transform)
    rotated.columns = df_ls.columns
    return rotated

In [None]:
# Compute training and test latent spaces
# The flip and rotation are determined from the training projection only
ls_flips = determine_flip_rotate_latentspace(
    project_to_latent(df_train_norm, classif, apply_offsets=False, apply_tanh=False)
)

# Project training and test data, then apply that same flip and rotation
df_ls_train, df_ls_test = (
    apply_flip_rotate_latentspace(
        project_to_latent(df_norm, classif, apply_offsets=False, apply_tanh=False),
        *ls_flips
    )
    for df_norm in (df_train_norm, df_test_norm)
)

In [None]:
# Same for the classifier using only 5 cytokines
# The flip and rotation are determined from the training projection only
ls_flips_5 = determine_flip_rotate_latentspace(
    project_to_latent(df_train_norm_5, classif_5, apply_offsets=False, apply_tanh=False)
)

# Project training and test data, then apply that same flip and rotation
df_ls_train_5, df_ls_test_5 = (
    apply_flip_rotate_latentspace(
        project_to_latent(df_norm, classif_5, apply_offsets=False, apply_tanh=False),
        *ls_flips_5
    )
    for df_norm in (df_train_norm_5, df_test_norm_5)
)

In [None]:
# Training latent spaces of the two classifiers, side by side. The projections
# plotted here were flipped and rotated in the cells above; no offsets or tanh
# were applied.
fig, axes = plt.subplots(1, 2)
axes = axes.flatten()
fig.set_size_inches(2.25*2, 2.25)

# Peptides present in the training latent space, by decreasing quality
present_peps = set(df_ls_train.index.get_level_values("Peptide").unique())
peps_in_df = [p for p in allpeps_decreasing_qual if p in present_peps]

panels = [
    (df_ls_train, "With IL-4 & IL-10"),
    (df_ls_train_5, "Without IL-4 & IL-10"),
]
for i, (df_panel, panel_title) in enumerate(panels):
    g = sns.lineplot(data=df_panel.reset_index(), x="LS1", y="LS2",
               hue="Peptide", hue_order=peps_in_df, ax=axes[i], sort=False, sizes=[1.5, 1., 0.75, 0.5],
               size="Concentration", size_order=["1uM", "100nM", "10nM", "1nM"],
               style="Data", legend=False)
    # Arbitrary units: hide ticks, keep only the left and bottom spines
    axes[i].set(xlabel=r"LS$_1$ (a.u.)", ylabel=r"LS$_2$ (a.u.)",
           xticks=[], xticklabels=[], yticks=[], yticklabels=[])
    for side in ("top", "right"):
        axes[i].spines[side].set_visible(False)
    axes[i].set_title(panel_title)

fig.tight_layout()
# fig.savefig(os.path.join(main_dir_path, "figures", "latentspaces", 
#    "il4_il10_latent_spaces_trained.pdf"), transparent=True)
plt.show()
plt.close()

# Signal-to-noise ratio
We compute the signal-to-noise ratio of each cytokine in another Jupyter notebook, ``cytokine_noise_distribution.ipynb``, where we also study the distribution of cytokines themselves and of the noise (variability around fitted splines) in linear and log scale. From those distributions, it is very easy to compute the SNR. 