# Training classifiers with different input processings
The goal is to check whether log transform, smoothing, and time integration are important for more accurate predictions. So here we systematically check the $2^3 = 8$ possible combinations of log, smoothing (and interpolation), and integration in processing cytokine data before classification. 

We stick to the same training data as with the original latent space, for better comparison. We even use the same seed in the case where all processings are on, to recover the same neural network. 

We always normalize the data with the min and max of the training set (min-max scaling). The min and max of course change depending on whether we take the log or not, and whether we time-integrate or not. 

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import psutil, pickle, json
import itertools
import multiprocessing
import os, sys
main_dir_path = os.path.abspath('../')
if main_dir_path not in sys.path:
    sys.path.insert(0, main_dir_path)

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D


from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

# Custom scripts
import utils.custom_pandas as cpd
from metrics.mi_time_window import compute_mi_timecourse
from metrics.discrete_continuous_info import discrete_continuous_info_fast
from utils.process_raw_data_choices import process_file_choices, select_naive_data
from utils.multiprocess_training import (slice_level, process_train_dsets, train_classifier,  process_test_dsets, 
                                crossvalidate_classifier, test_classifier, init_peps_cytos_concs, process_train)

In [None]:
# Number of physical cores (logical=False), used below to size the worker pool
cpu_count = psutil.cpu_count(logical=False)
# force=True keeps this cell re-runnable: without it, a second call in the
# same kernel raises RuntimeError("context has already been set").
multiprocessing.set_start_method("spawn", force=True)

In [None]:
# Figure styling for Science format (figure width 2.25 inches or 4.75 inches)
# Squeezing three subplots in a row: 4.75/3 = 1.583333
sns.reset_orig()  # start from matplotlib's original settings before customizing
plt.rcParams.update({
    "figure.figsize": (1.55, 1.65),
    "font.size": 8,
    "axes.labelsize": 7,
    "legend.fontsize": 7,
    "xtick.labelsize": 6,
    "ytick.labelsize": 6,
    "xtick.major.pad": 2.,  # distance to major tick label in points
    "xtick.minor.pad": 2.,
    "axes.labelpad": 1.,
    "axes.linewidth": 0.8,
    # "axes.spines.top": False,
    # "axes.spines.right": False,
    "figure.dpi": 150,  # default for me was 75
})
fs = 10

# Tick properties stored as JSON files in data/misc
with open(os.path.join(main_dir_path, "data", "misc", "major_ticks_props.json"), "r") as hd:
    props_majorticks = json.load(hd)
with open(os.path.join(main_dir_path, "data", "misc", "minor_ticks_props.json"), "r") as hd:
    props_minorticks = json.load(hd)

# Define functions for the different processing combinations
The basic code in those functions is copied from our processing pipeline, with options added here for cases where one or more processing steps are turned off. 

The code customized for this application is in utils/process_raw_data_choices.py. The main function to process a dataframe is ``process_file_choices``. 

Some processing combinations are tricky:
- Time integral without smoothing
- Smoothing without log transform (because trajectories may look funny and be hard to de-noise)

We pay special attention to them in ``process_file_choices``. 

In [None]:
# Peptide labels listed from highest to lowest quality (per the variable name).
# The position in this list is used as a quality rank in find_flip_latentspace,
# and the list fixes the hue order when plotting latent spaces.
allpeps_decreasing_qual = ["N4", "A2", "Y3", "Q4", "T4", "Q7", "A8", "V4", "G4", "E1"]

In [None]:
# Sanity check of the most complicated case: time integrals without smoothing
dset_to_test = 'cytokineConcentrationPickleFile-20190718-PeptideComparison_4-final.hdf'
process_kwargs = dict(take_log=True, rescale_max=False, max_time=96,
                      do_integrate=True, do_smooth=False)

df_processed = process_file_choices(
    os.path.join(main_dir_path, "data", "final"), dset_to_test, **process_kwargs
)

# Visual check of the integrated IFNg trajectories for every condition
sns.relplot(data=df_processed["integral"].reset_index(), x="Time", y="IFNg",
            hue="Peptide", style="TCellNumber", size="Concentration", kind="line")

In [None]:
# Check the smoothing+splines when in linear scale (no log transform).
# Integrals and concentrations can then be tried manually on these frames.
process_kwargs = dict(take_log=False, rescale_max=False, max_time=96,
                      do_integrate=False, do_smooth=False)
# Linear-scale processing with smoothing off
df_linear_conc = process_file_choices(
    os.path.join(main_dir_path, "data", "final"), dset_to_test, **process_kwargs
)

# Same dataset and options, with smoothing switched on
process_kwargs.update(do_smooth=True)
df_linear_conc_smooth = process_file_choices(
    os.path.join(main_dir_path, "data", "final"), dset_to_test, **process_kwargs
)

## Train classifiers on each of the eight possible processing combinations
We import the raw training datasets and clear any data that isn't naive OT-1 CD8+ T cells, for each of the eight processing combinations. Then, we train an MLP classifier with one hidden layer on each. 

In [None]:
# HDF files making up the training set
train_data_files = [
    'cytokineConcentrationPickleFile-20190412-PeptideComparison_2-final.hdf',
    'cytokineConcentrationPickleFile-20190608-PeptideComparison_3-final.hdf',
    'cytokineConcentrationPickleFile-20190718-PeptideComparison_4-final.hdf',
    'cytokineConcentrationPickleFile-20190725-PeptideComparison_5-final.hdf',
    'cytokineConcentrationPickleFile-20190802-TCellNumber_1-final.hdf',
    'cytokineConcentrationPickleFile-20190812-Activation_1-final.hdf',
]
# Short experiment names: drop the 41-character
# "cytokineConcentrationPickleFile-YYYYMMDD-" prefix and the 10-character
# "-final.hdf" suffix from each file name.
train_data_names = [fname[41:-10] for fname in train_data_files]

In [None]:
# Train one classifier per on/off combination of the three processing steps.
# Each key is a 3-tuple of booleans; later cells read it as
# (log, integral, smooth) -- presumably process_train uses the same order,
# TODO confirm against utils.multiprocess_training.
all_classifiers = {}
all_train_dfs = {}
all_minmax_dfs = {}
all_train_scores_long = {}
keys_lis = list(itertools.product([False, True], repeat=3))

# Multiprocessing: one asynchronous job per combination.
# multiprocessing.Pool is used rather than multiprocessing.pool.Pool because
# the latter only resolves if the submodule was already imported elsewhere.
# The context manager terminates the workers if a job raises in .get().
with multiprocessing.Pool(processes=cpu_count) as pool:
    all_return_objs = [
        pool.apply_async(
            process_train,
            args=(lis, train_data_files, os.path.join(main_dir_path, "data", "final")),
        )
        for lis in keys_lis
    ]
    # Block until every job is done; a worker exception is re-raised here
    all_returns = [res.get() for res in all_return_objs]

    # Orderly shutdown once all results have been collected
    pool.close()
    pool.join()

# Unpack each job's return value, which is indexed as:
# 0: classifier, 1: training scores, 2: training dataframe, 3: min-max dataframe
for key, ret in zip(keys_lis, all_returns):
    all_classifiers[key] = ret[0]
    all_train_scores_long[key] = ret[1]
    all_train_dfs[key] = ret[2]
    all_minmax_dfs[key] = ret[3]

In [None]:
# Report, for each processing combination, the mean "pm" standard deviation
# of the "train_score" and "test_score" entries, then the "whole_score" entry.
for combo, cv_scores in all_train_scores_long.items():
    print(combo)
    for split in ("train_score", "test_score"):
        print(cv_scores[split].mean(), "pm", cv_scores[split].std())
    print(cv_scores["whole_score"])

In [None]:
# Keep only the "whole_score" entry of each combination, for the heatmaps
all_train_scores = {combo: scores["whole_score"] for combo, scores in all_train_scores_long.items()}

In [None]:
# Test scores: process the test datasets with each processing combination,
# scale them with the min-max of the matching training set, and score the
# matching classifier.
train_time_slice = slice(1, 71)
test_data_files = [
    'cytokineConcentrationPickleFile-20200220-TCellNumber_3-final.hdf', 
    'cytokineConcentrationPickleFile-20190404-PeptideComparison_1-final.hdf',
    'cytokineConcentrationPickleFile-20191029-PeptideComparison_8-final.hdf',
    'cytokineConcentrationPickleFile-20191106-PeptideComparison_9-final.hdf', 
    'cytokineConcentrationPickleFile-20200624-HighMI_1-1-final.hdf',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-2-final.hdf',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-3-final.hdf',
    'cytokineConcentrationPickleFile-20200624-HighMI_1-4-final.hdf'
]
# Short names: strip the 41-character prefix and the "-final.hdf" suffix
test_data_names = [a[41:-10] for a in test_data_files]

# Use the training normalization coefficients!
all_test_dfs = {}
all_test_scores = {}
for lis in itertools.product([False, True], repeat=3):  # lis = (log, integral, smooth)
    # Build the processing options from scratch for this combination, so the
    # cell does not depend on whatever an earlier exploratory cell left in
    # process_kwargs. rescale_max and max_time are the values used there.
    process_kwargs = {
        "take_log": lis[0],
        "rescale_max": False,
        "max_time": 96,
        "do_integrate": lis[1],
        "do_smooth": lis[2],
    }
    # Get the min-max scaling computed on the training data
    df_minmax = all_minmax_dfs[tuple(lis)]
    df_test = process_test_dsets(test_data_files, process_kwargs, df_minmax, 
                folder=os.path.join(main_dir_path, "data", "final"), tslice=train_time_slice)
    print(lis)
    print("Number of points:", df_test.shape)  # For interpolated data, should be (5680, 5)
    
    # Use only time points > 12 hours
    # df_test = df_test.loc[df_test.index.get_level_values("Time") >= 12]
    
    # Get the appropriate MLP and test it
    mlp = all_classifiers[tuple(lis)]
    score = test_classifier(mlp, df_test)

    print("Test score: {}".format(100*score))
    print()
    
    # Store the test score and the processed test data
    all_test_scores[tuple(lis)] = score
    all_test_dfs[tuple(lis)] = df_test

## Plot the training results

In [None]:
# Plot the scores with the eight combinations
# Stack of two heatmaps, based on this solution:
# https://stackoverflow.com/questions/57502763/how-to-plot-horizontal-stack-of-heatmaps-or-a-stack-of-grid
def heatmap_stack(scores_dict, cbar_label="Training score", cmap_choice="plasma", score_range=None, 
                 figax=None, add_cbar=True, lblsize=7):
    """ Draw classification scores as two stacked 2x2 heatmaps on a 3D axis.

    One heatmap is drawn per value of the log-transform switch (y axis);
    within each heatmap, x is smoothing and z is integration.

    Args:
        scores_dict (dict): maps (log, integral, smooth) tuples of
            booleans or 0/1 to a score. Scores are multiplied by 100 and
            shown as percentages, so they are treated as fractions. The
            layout drawn here assumes two values per switch (8 keys).
        cbar_label (str): colorbar label, " (%)" is appended.
        cmap_choice (str): name of the matplotlib colormap.
        score_range (tuple): (min, max) scores for the color scale;
            defaults to the range of the scores in scores_dict.
        figax (sequence): optional (fig, ax) to draw on; ax must be a
            3D axis. A new figure is created if None.
        add_cbar (bool): whether to attach a colorbar to the axis.
        lblsize (float): font size of the score annotations.

    Returns:
        fig, ax: the figure and 3D axis drawn on.
        cube_scores (np.ndarray): scores indexed [log, integral, smooth].
    """
    # Create a cube array of scores, indexed [log, integral, smooth]
    n = int(round(len(scores_dict.keys())**(1/3), 0))
    cube_scores = np.zeros([n, n, n])
    for lis, score in scores_dict.items():
        l, i, s = map(int, lis)
        cube_scores[l, i, s] = score
    print(cube_scores)
    
    if figax is None:
        fig = plt.figure()
        ax = fig.add_subplot(projection="3d")
    else:
        fig, ax = figax
    x_smooth, z_int = np.meshgrid([0, 1], [0, 1])
    x_edges, z_edges = np.meshgrid(np.linspace(-0.5, 1.5, 3), np.linspace(-0.5, 1.5, 3))
    y_log = np.asarray([0, 1])  # Index for stacks: log (first in lis)

    # plt.get_cmap instead of mpl.cm.get_cmap: the latter was deprecated in
    # matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap(cmap_choice)
    if score_range is None:
        score_range = (np.amin(cube_scores), np.amax(cube_scores))
    norm = mpl.colors.Normalize(vmin=score_range[0]*100, vmax=score_range[1]*100)

    ## TODO: make custom cmap with 8 values for the eight scores? Easier to read? 
    for y in y_log:
        # cube_scores[y] is indexed [integral, smooth]. The default meshgrid
        # layout puts z (integral) along rows and x (smoothing) along
        # columns, so the facecolors already line up without a transpose.
        colors = cmap(norm(cube_scores[y]*100))
        ax.plot_surface(x_edges, np.zeros(shape=x_edges.shape)+y, z_edges, facecolors=colors, linewidth=0)

    # Annotate the actual scores in each square, at (x=smooth, y=log, z=integral)
    for lis, score in scores_dict.items():
        xyz = (int(lis[2]), int(lis[0]), int(lis[1]))
        ax.text(*xyz, "{:.1f} %".format(score*100), ha="center", va="center", color="w", zorder=100, fontsize=lblsize)

    # Connect each square on one heatmap to the analog square on the other?
    # Give a zorder in between the two surfaces. Doesn't work now, so just ignore. 
    #for si in itertools.product([0, 1], repeat=2):
    #    ax.plot(xs=[si[0]]*2, ys=[0, 1], zs=[si[1]]*2, ls="-", color=(0., 0., 0., 0.7), lw=3, zorder=100)


    # Colorbar
    # fig.subplots_adjust(right=0.6)
    if add_cbar:
        fig.colorbar(mpl.cm.ScalarMappable(norm, cmap=cmap_choice), ax=[ax], location='left', 
                anchor=(1.0, 0.25), shrink=0.7, aspect=10, label=cbar_label + " (%)")

    # Labeling axes and ticks
    lblnoyes = ["No"," Yes"]
    ax.set_xlabel("Smoothing", labelpad=-10)
    ax.set_xticks(x_smooth[0])
    ax.set_xticklabels(lblnoyes)

    ax.set_ylabel("Log transform", labelpad=-10)
    ax.set_yticks(y_log)
    ax.set_yticklabels(lblnoyes)

    ax.set_zlabel("Integral", labelpad=-10)
    ax.set_zticks(z_int[:, 0])
    ax.set_zticklabels(lblnoyes)
    ax.tick_params(axis="both", which="major", pad=-4)
    
    ax.view_init(azim=130, elev=30)
    return fig, ax, cube_scores

In [None]:
# Helper to attach a colorbar in its own narrow axis
from mpl_toolkits.axes_grid1 import make_axes_locatable

def add_colorbar(mappable, ax, **kwargs):
    """ Add a colorbar for mappable in a new axis to the left of ax.

    The new axis takes 3% of the width of ax, with a 0.05 pad.
    Extra keyword arguments are passed on to fig.colorbar.
    Taken from https://joseph-long.com/writing/colorbars/

    Returns:
        The colorbar returned by fig.colorbar.
    """
    parent_fig = ax.figure
    cax = make_axes_locatable(ax).append_axes("left", size="3%", pad=0.05)
    return parent_fig.colorbar(mappable, cax=cax, **kwargs)

In [None]:
# Combined plot: training and testing score cubes side by side, one colorbar
# Common color scale across both panels
score_max = max(*all_train_scores.values(), *all_test_scores.values())
score_min = min(*all_train_scores.values(), *all_test_scores.values())

fig = plt.figure()
fig.set_size_inches(1.5*4, 2.75)
gs = fig.add_gridspec(1, 9)  # Give first ninth of the plot to color bar
ax_cbar = fig.add_subplot(gs[0:1])
ax_cbar.set_axis_off()
ax_train = fig.add_subplot(gs[1:5], projection="3d")
ax_test = fig.add_subplot(gs[5:9], projection="3d")
fig.subplots_adjust(wspace=2.5)

ax_train.set_title("Training", fontsize=8)
ax_test.set_title("Testing", fontsize=8)

cmap_chosen = "plasma"
fig, ax_train, cube_train_scores = heatmap_stack(all_train_scores, cbar_label="Score", cmap_choice=cmap_chosen,
                        score_range=(score_min, score_max), figax=[fig, ax_train], add_cbar=False, lblsize=7)
fig, ax_test, cube_test_scores = heatmap_stack(all_test_scores, cbar_label="Score", cmap_choice=cmap_chosen,
                        score_range=(score_min, score_max), figax=[fig, ax_test], add_cbar=False, lblsize=7)

# Add colorbar at the end, in a separate axes. Otherwise one plot is smaller
norm = mpl.colors.Normalize(vmin=score_min*100, vmax=score_max*100)
cbar = fig.colorbar(mpl.cm.ScalarMappable(norm, cmap=cmap_chosen), ax=ax_cbar, fraction=1, aspect=10, 
                   label="Score (%)", shrink=0.6)
# Call the method: assigning to cbar.set_label would replace it with a string
cbar.set_label("Score (%)")

#fig.tight_layout()

#fig.savefig(os.path.join(main_dir_path, "figures", "latentspaces", 
#   "train_test_heatcube_log-integral-smooth.pdf"), transparent=True)
plt.show()
plt.close()

## Latent spaces depending on processing
Look at the latent spaces reached with each of the processing combinations. 
Organize plots on two rows of four plots. 
Make one row for linear-scale (no log) data, one for log-transformed data. 

Make sure to flip the latent spaces for each space until LS1 is increasing and LS2 is highest for N4. 

Define a couple of functions to order the eight combinations of processing in a logical way on a 2D grid. I use degree order with log being the most important, then integral, then smoothing (L-I-S)

In [None]:
from functools import cmp_to_key  # Convert a comparison function to a key class 
# (the key is applied to each element before sorting to redefine how they are compared)

# Define a degree order relation between processing choices. 
# Cover one plane of constant sum at a time, then within that plane, 
# compare element by element, the first element being the most important.
# With (log, integral, smooth) tuples this gives log > integral > smoothing.
# Basically: swipe over triangles in the (+, +, +) octant
# So we are assuming that the tuple is ordered in decreasing order of importance
def degree_order(x, y):
    """ Compare two tuples in degree order, for use with cmp_to_key.

    The tuple with the larger sum is the larger one. When sums are equal,
    the first position where the tuples differ decides.

    Args:
        x, y (tuple): tuples of numbers or booleans. They must have the
            same length whenever their sums are equal.

    Returns:
        Negative if x < y, zero if they are equal, positive if x > y.

    Raises:
        ValueError: if the sums are equal but the lengths differ.
    """
    # If the sum is different, the largest one wins
    sx = sum(x)
    sy = sum(y)
    if sx != sy:
        return sx - sy
    # An element-wise tie-break needs tuples of the same length
    if len(x) != len(y):
        raise ValueError("Recursive comparison failed")
    # The first differing element decides; empty tuples compare equal
    for xi, yi in zip(x, y):
        if xi != yi:
            return xi - yi
    return 0

key_degree_order = cmp_to_key(degree_order)

In [None]:
# Check the resulting order on our eight (log, integral, smooth) keys
sorted(all_test_scores, key=key_degree_order)

In [None]:
def project_to_latent(df_cy, classif, apply_offsets=False, apply_tanh=False):
    """ Project data onto the first-layer nodes of a trained classifier.

    Args:
        df_cy (pd.DataFrame): data to project, one column per input
            feature of the classifier.
        classif: object with coefs_ and intercepts_ lists (presumably a
            fitted sklearn MLPClassifier); coefs_[0] is the matrix that
            multiplies the data from the right.
        apply_offsets (bool): add intercepts_[0] to the projection.
        apply_tanh (bool): apply tanh after the optional offsets.

    Returns:
        pd.DataFrame: same index as df_cy, with columns "LS1", "LS2", ...
            (one per column of coefs_[0]) in an index named
            "Latent variable".
    """
    projmat = classif.coefs_[0]  # (n_features, n_latent) so dot on the right of data

    df_ls = df_cy.dot(projmat)
    # Name one latent variable per projected column rather than assuming two
    n_latent = df_ls.shape[1]
    df_ls.columns = pd.Index(["LS{}".format(i + 1) for i in range(n_latent)],
                             name="Latent variable")
    if apply_offsets:
        # Intercepts are only read when requested; broadcast over all rows
        offsets = np.asarray(classif.intercepts_[0])
        df_ls += offsets.reshape(1, -1)
    if apply_tanh:
        df_ls = np.tanh(df_ls)
    return df_ls

In [None]:
# Legacy heuristic deciding whether each latent axis should be sign-flipped.
# It doesn't perfectly work; see better version below. 
def find_flip_latentspace(df_ls):
    """ Return the signs (+1 or -1) to apply to the LS1 and LS2 columns.

    Args:
        df_ls (pd.DataFrame): latent space data with "LS1" and "LS2"
            columns and index levels including "Peptide" and "Time".
            Must contain peptide "N4" and at least two peptides.

    Returns:
        np.ndarray of shape (1, 2): [[flip_ls1, flip_ls2]], to be
            multiplied with the latent space (broadcasts over rows).
    """
    # Rank of each peptide = its position in the decreasing-quality list
    quality_rank = {pep: rank for rank, pep in enumerate(allpeps_decreasing_qual)}
    unknown_rank = len(quality_rank) + 1  # peptides not in the list sort last

    # LS1: fit a line through the time course of the N4 average;
    # keep the sign if the slope is non-negative, flip it otherwise.
    n4_ls1_by_time = df_ls.xs("N4", level="Peptide")["LS1"].groupby("Time").mean().sort_index()
    times = n4_ls1_by_time.index.get_level_values("Time").astype(float)
    fit = sp.stats.linregress(times, n4_ls1_by_time.values)
    flip_ls1 = 1 if fit.slope >= 0 else -1

    # LS2: compare the maximum LS2 of the two best-ranked peptides present.
    # NOTE(review): the original comment said "strongest vs lowest" peptide,
    # but the comparison is with the second-ranked one -- confirm intent.
    ls2_peak = df_ls["LS2"].groupby("Peptide").max()
    ranked_peps = sorted(ls2_peak.index.get_level_values("Peptide"),
                         key=lambda pep: quality_rank.get(pep, unknown_rank))
    best, runner_up = ranked_peps[0], ranked_peps[1]
    flip_ls2 = 1 if ls2_peak[best] >= ls2_peak[runner_up] else -1

    return np.asarray([[flip_ls1, flip_ls2]])  # Will broadcast to cover every row
    

In [None]:
# With one flip and a rotation we can always get the desired orientation. 
# Flip LS1 unless the angle of the N4 average is ahead (counterclockwise) of the Q4 one.
# Then count the 90-degree rotations that bring N4 back to the upper right quadrant. 
# Only the transformations are returned; apply them with apply_flip_rotate_latentspace. 
def determine_flip_rotate_latentspace(df_ls):
    """ Find the LS1 flip and the number of quarter turns to orient a latent space.

    Angles are those of the per-peptide mean (LS1, LS2) point, measured
    from the LS1 axis and taken modulo 2 pi.

    Args:
        df_ls (pd.DataFrame): latent space data with "LS1" and "LS2"
            columns and a "Peptide" index level containing "N4" and "Q4".

    Returns:
        tuple (flip1, number_rots_minus90): flip1 is +1 or -1, the factor
            for LS1; number_rots_minus90 is an int, the quadrant (0 to 3)
            holding the N4 mean after the flip.
    """
    full_turn = 2 * np.pi
    pep_means = df_ls.groupby(["Peptide"]).mean()
    ls1_mean = pep_means["LS1"]
    ls2_mean = pep_means["LS2"]

    # Signed angle from Q4 to N4, wrapped into [-pi, pi]
    angles = np.arctan2(ls2_mean, ls1_mean) % full_turn
    delta_angles = angles["N4"] - angles["Q4"]
    if delta_angles < -np.pi:
        delta_angles += full_turn
    elif delta_angles > np.pi:
        delta_angles -= full_turn
    flip1 = 1 if delta_angles > 0 else -1

    # The flip changes the angles: recompute the N4 angle AFTER flipping
    # LS1, and count the quarter turns separating it from the 0-pi/2 quadrant.
    flipped_angles = np.arctan2(ls2_mean, ls1_mean * flip1) % full_turn
    number_rots_minus90 = int(flipped_angles["N4"] // (np.pi / 2))

    return (flip1, number_rots_minus90)

def apply_flip_rotate_latentspace(df_ls, flip1, n_rotations90):
    """ Flip LS1, then rotate the latent space clockwise by quarter turns.

    Args:
        df_ls (pd.DataFrame): latent space data with columns "LS1" and
            "LS2"; it is not modified.
        flip1 (int): factor multiplying LS1 (+1 or -1).
        n_rotations90 (int): number of clockwise 90-degree rotations
            applied after the flip.

    Returns:
        pd.DataFrame: transformed copy with the same columns as df_ls.
    """
    # Multiplying row vectors (x, y) on the right by this matrix gives
    # (y, -x), i.e. a clockwise rotation by 90 degrees.
    quarter_turn = np.asarray([[0, -1],
                               [1,  0]])
    # Accumulate the requested number of quarter turns, from the identity
    rotmat = np.asarray([[1, 0],
                         [0, 1]])
    for _ in range(n_rotations90):
        rotmat = rotmat.dot(quarter_turn)

    # Flip on a copy, then rotate; restore the original column labels
    transformed = df_ls.copy()
    transformed["LS1"] *= flip1
    transformed = transformed.dot(rotmat)
    transformed.columns = df_ls.columns
    return transformed

In [None]:
# Project the training and test data of every processing combination onto
# the latent space of its classifier, then put each space in a common orientation.
all_classif_flips = {}
all_train_latentspaces = {}
all_test_latentspaces = {}
for k, clf in all_classifiers.items():
    # Original latent coordinates: no offsets, no tanh
    df_ls_train = project_to_latent(all_train_dfs[k], clf, apply_offsets=False, apply_tanh=False)
    df_ls_test = project_to_latent(all_test_dfs[k], clf, apply_offsets=False, apply_tanh=False)

    # The flip and rotation are determined on the training data only
    # (this replaces the older find_flip_latentspace approach) ...
    all_classif_flips[k] = determine_flip_rotate_latentspace(df_ls_train)

    # ... and the same transformation is applied to both data sets
    all_train_latentspaces[k] = apply_flip_rotate_latentspace(df_ls_train, *all_classif_flips[k])
    all_test_latentspaces[k] = apply_flip_rotate_latentspace(df_ls_test, *all_classif_flips[k])

In [None]:
# Plot the training latent spaces of the eight processing combinations.
# These are the flipped/rotated spaces computed above, without offsets or tanh.
# Layout: top row for linear-scale data, bottom row for log-transformed data,
# filled left to right following the degree order of the keys.
fig, axes = plt.subplots(2, 4)
fig.set_size_inches(1.5*4, 1.8*2)
next_col = [0, 0]  # Next free column in row 0 (linear) and in row 1 (log)
for k in sorted(all_test_scores, key=key_degree_order):
    row = 1 if k[0] else 0  # First row for no log, second row for log
    ax = axes[row, next_col[row]]
    next_col[row] += 1

    # Peptides present in this data set, kept in decreasing quality order
    df_ls = all_train_latentspaces[k]
    peps_present = set(df_ls.index.get_level_values("Peptide").unique())
    peps_in_df = [p for p in allpeps_decreasing_qual if p in peps_present]

    # sort=False keeps the points in data order, so that lines follow the
    # trajectories instead of being ordered by LS1
    sns.lineplot(data=df_ls.reset_index(), x="LS1", y="LS2", 
               hue="Peptide", hue_order=peps_in_df, ax=ax, sort=False, sizes=[1.5, 1., 0.75, 0.5],
               size="Concentration", size_order=["1uM", "100nM", "10nM", "1nM"],
               style="Data", legend=False)
    ax.set(xlabel=r"LS$_1$ (a.u.)", ylabel=r"LS$_2$ (a.u.)", 
           xticks=[], xticklabels=[], yticks=[], yticklabels=[])
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Three-line title describing the combination; bold when all steps are on
    plot_title = "\n".join([
        "Log" if k[0] else "Linear",
        "Integral" if k[1] else "No integral",
        "Smooth" if k[2] else "No smooth",
    ])
    fontweight = "bold" if all(k) else None
    ax.set_title(plot_title, y=0.9, fontweight=fontweight, fontsize=8)
fig.tight_layout()
#fig.savefig(os.path.join(main_dir_path, "figures", "latentspaces", 
#   "latent_spaces_trained_log-integral-smooth.pdf"), transparent=True, dpi=300)
plt.show()
plt.close()

#### Credits
Script and imported custom functions written by frbourassa