In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
# Resolution used by scanpy for every figure rendered in this notebook.
figure_dpi = 100
sc.settings.set_figure_params(dpi=figure_dpi)
# Record the scanpy version this notebook was executed with.
print(sc.__version__)

In [None]:
import blosum as bl
# Residues are encoded three ways in this cell: direct (one-hot), BCP, BLOSUM.
# One-letter codes of the 20 amino acids, in alphabetical order.
vocab = list('ACDEFGHIKLMNPQRSTVWY')
# direct encoding: each residue maps to a 0/1 indicator vector over `vocab`
map_direct = {
    residue: [int(residue == other) for other in vocab]
    for residue in vocab
}
# bcp encoding
aa_hydrophobicity = {
    'A': 1.8,  # Alanine
    'R': -4.5,  # Arginine
    'N': -3.5,  # Asparagine
    'D': -3.5,  # Aspartic Acid
    'C': 2.5,  # Cysteine
    'E': -3.5,  # Glutamic Acid
    'Q': -3.5,  # Glutamine
    'G': -0.4,  # Glycine
    'H': -3.2,  # Histidine
    'I': 4.5,  # Isoleucine
    'L': 3.8,  # Leucine
    'K': -3.9,  # Lysine
    'M': 1.9,  # Methionine
    'F': 2.8,  # Phenylalanine
    'P': -1.6,  # Proline
    'S': -0.8,  # Serine
    'T': -0.7,  # Threonine
    'W': -0.9,  # Tryptophan
    'Y': -1.3,  # Tyrosine
    'V': 4.2,  # Valine
}
# https://www.imgt.org/IMGTeducation/Aide-memoire/_UK/aminoacids/IMGTclasses.html
aa_volume = {
    'A': 88.6,   # Alanine
    'R': 173.4,  # Arginine
    'N': 114.1,  # Asparagine
    'D': 111.1,  # Aspartic Acid
    'C': 108.5,  # Cysteine
    'E': 138.4,  # Glutamic Acid
    'Q': 143.8,  # Glutamine
    'G': 60.1,   # Glycine
    'H': 153.2,  # Histidine
    'I': 166.7,  # Isoleucine
    'L': 166.7,  # Leucine
    'K': 168.6,  # Lysine
    'M': 162.9,  # Methionine
    'F': 189.9,  # Phenylalanine
    'P': 112.7,  # Proline
    'S': 89.0,   # Serine
    'T': 116.1,  # Threonine
    'W': 227.8,  # Tryptophan
    'Y': 193.6,  # Tyrosine
    'V': 140.0,  # Valine
}
# 1 = donor and acceptor, 0.5 = only donor or acceptor
aa_hbond = {
    'A': 0,    # Alanine
    'R': 0.5,  # Arginine
    'N': 1,    # Asparagine
    'D': 0.5,  # Aspartic Acid
    'C': 0,    # Cysteine
    'E': 0.5,  # Glutamic Acid
    'Q': 1,    # Glutamine
    'G': 0,    # Glycine
    'H': 1,    # Histidine
    'I': 0,    # Isoleucine
    'L': 0,    # Leucine
    'K': 0.5,  # Lysine
    'M': 0,    # Methionine
    'F': 0,    # Phenylalanine
    'P': 0,    # Proline
    'S': 1,    # Serine
    'T': 1,    # Threonine
    'W': 0.5,  # Tryptophan
    'Y': 1,    # Tyrosine
    'V': 0,    # Valine
}
has_sulfur = ['C','M']
is_aromatic = ['F','Y','W']
is_aliphatic = ['A','G','I','L','P','V']
is_basic = ['R','H','K']
is_acidic = ['D','E']
has_amide = ['N','Q']
vocab_bcp = ['hydrophobicity','volume','hbond','has_sulfur','is_aromatic',
             'is_aliphatic','is_basic','is_acidic','has_amide']
# > define a method to return the embedding for a given amino acid in BCP space
def bcp_translation(aa):
    """Return the BCP embedding of one amino acid.

    Parameters
    ----------
    aa : str
        One-letter residue code. It must be a key of ``aa_hydrophobicity``,
        ``aa_volume`` and ``aa_hbond``; otherwise a ``KeyError`` is raised.

    Returns
    -------
    list
        Nine values in the order of ``vocab_bcp``: the three table lookups
        (hydrophobicity, volume, hbond) followed by six 0/1 flags for
        membership in has_sulfur, is_aromatic, is_aliphatic, is_basic,
        is_acidic and has_amide.
    """
    table_values = [aa_hydrophobicity[aa], aa_volume[aa], aa_hbond[aa]]
    residue_classes = (has_sulfur, is_aromatic, is_aliphatic,
                       is_basic, is_acidic, has_amide)
    class_flags = [int(aa in members) for members in residue_classes]
    return table_values + class_flags
map_bcp = {x:bcp_translation(x) for x in vocab}
# blosum encoding
map_blosum = {x:[bl.BLOSUM(62)[x][y] for y in vocab] for x in vocab}

### Report Latent Dimension Correlation Data

In [None]:
# load the TRB AnnData object
a_trb = sc.read_h5ad('../outs/adata.trb.h5ad')


def get_p(xs, ys):
    """Return the p-value (second element) of ``scipy.stats.pearsonr(xs, ys)``."""
    pearson_result = ss.pearsonr(xs, ys)
    return pearson_result[1]


# pairwise Pearson correlations between the columns of a_trb.X
latent_frame = pd.DataFrame(a_trb.X)
df_corr = latent_frame.corr()
# same layout, but each cell holds the p-value returned by get_p for that pair
df_pval = latent_frame.corr(method=get_p)
# set the diagonal (each dimension against itself) to 0
for idx in df_pval.index:
    df_pval.at[idx, idx] = 0

In [None]:
# largest absolute correlation, ignoring entries equal to 1
abs_corr = df_corr.abs().values.flatten()
values = abs_corr[abs_corr < 1]
print(values.max())
# gather each unordered pair once: the strict upper triangle, row by row
upper_rows, upper_cols = np.triu_indices_from(df_corr.values, k=1)
rhos = df_corr.values[upper_rows, upper_cols].tolist()
# 1 - (fraction of pairs with a correlation <= that of dimensions 10 and 12)
rho_10_12 = df_corr.loc[10, 12]
print(1 - np.mean(np.asarray(rhos) <= rho_10_12))

In [None]:
# density of all pairwise correlations, with the 10-vs-12 pair marked
rho_10_12 = df_corr.loc[10, 12]
# percentage of pairs whose correlation is <= the 10-vs-12 value;
# np.asarray makes the element-wise comparison explicit (rhos is a plain list)
pct_at_or_below = np.mean(np.asarray(rhos) <= rho_10_12) * 100
fig, ax = plt.subplots(figsize=[6, 3]); ax.grid(False)
sns.kdeplot(rhos, fill=True, lw=1.5, color='skyblue', label='All Pairs', ax=ax)
ax.axvline(rho_10_12, color='dodgerblue', lw=1.5, linestyle='--', label='10 vs. 12')
# Raw string: '\l' is not a valid Python escape sequence, so the non-raw
# literal triggers an invalid-escape warning on recent Python versions.
# The rendered text is unchanged. The y position (20*0.98) is in data units.
ax.text(rho_10_12*1.05, 20*0.98, r'$\leftarrow$' + '%.2f' % pct_at_or_below + '%', ha='right')
ax.set(xlabel='Pearson Correlation Coefficient\nBetween Latent Dimensions')
ax.legend(frameon=False)

In [None]:
# Ward-clustered heatmap of the correlation table; cells with p >= 0.05 are masked
not_significant = df_pval >= 0.05
corr_clustermap_kwargs = dict(
    figsize=[8, 8],
    xticklabels=1,
    yticklabels=1,
    cmap='GnBu',
    vmin=-0.20,
    vmax=0.20,
    method='ward',
    cbar_pos=(-0.04, .83, .01, .08),
)
g = sns.clustermap(df_corr, mask=not_significant, **corr_clustermap_kwargs)
heatmap_ax = g.ax_heatmap
heatmap_ax.grid(False)
# grey axes background behind the heatmap cells
heatmap_ax.set_facecolor('lightgrey')
g.ax_cbar.set_ylabel('Pearson\nCorrelation\nCoefficient', rotation=0, ha='left', y=1)

### Visualize Latent Dimensions, TCR Lengths

In [None]:
# UMAPs coloured by the '12' and '10' keys, both on a [-2, 2] colour scale
for latent_key in ['12', '10']:
    sc.pl.umap(a_trb, color=[latent_key], cmap='plasma', vmin=-2, vmax=2)
# UMAP coloured by the 'LEN' key, colour scale limited to [10, 20]
sc.pl.umap(a_trb, color=['LEN'], cmap='Blues', vmin=10, vmax=20)

### Visualize Physicochemical Characteristics

In [None]:
from matplotlib.colors import ListedColormap
# create our own colormap: every RGB channel ramps linearly from 0 to a
# mid value over the first third of the entries, then on to its end value
N = 256
n_first = N // 3
n_rest = N - n_first


def _two_stage_ramp(mid, end):
    """Return N values: 0 -> mid over n_first steps, then mid -> end over n_rest."""
    return np.concatenate([np.linspace(0, mid, n_first),
                           np.linspace(mid, end, n_rest)])


# RGBA table; the fourth (alpha) column keeps its initial value of 1
vals = np.ones((N, 4))
vals[:, 0] = _two_stage_ramp(112/256, 69/256)
vals[:, 1] = _two_stage_ramp(5/256, 227/256)
vals[:, 2] = _two_stage_ramp(150/256, 255/256)
cmap = ListedColormap(vals)

In [None]:
# examine BCP characteristics: for every TRB sequence (the obs index) store the
# per-sequence total and mean of each biochemical property
og_cols = a_trb.obs.columns
trbs = a_trb.obs.index


def _membership(group):
    """Return a scorer that gives 1 for residues in `group` and 0 otherwise."""
    return lambda residue: 1 * (residue in group)


# column-name stem -> per-residue score; insertion order fixes the column order
bcp_scorers = {
    'hydrophob': aa_hydrophobicity.__getitem__,
    'volume': aa_volume.__getitem__,
    'hbond': aa_hbond.__getitem__,
    'has_sulfur': _membership(has_sulfur),
    'is_aromatic': _membership(is_aromatic),
    'is_aliphatic': _membership(is_aliphatic),
    'is_basic': _membership(is_basic),
    'is_acidic': _membership(is_acidic),
    'has_amide': _membership(has_amide),
}
# bcp_cols is built explicitly rather than as "columns not in og_cols".
# The column diff came back empty whenever this cell was re-run, because
# the BCP columns already existed at that point.
bcp_cols = []
for stem, score in bcp_scorers.items():
    a_trb.obs[f'{stem}_total'] = [sum([score(y) for y in x]) for x in trbs]
    a_trb.obs[f'{stem}_mean'] = [np.mean([score(y) for y in x]) for x in trbs]
    bcp_cols += [f'{stem}_total', f'{stem}_mean']

In [None]:
# one UMAP per BCP "total" column, colour scale limited to its 1st-99th percentiles
total_cols = [name for name in bcp_cols if '_total' in name]
for col in total_cols:
    column_values = a_trb.obs[col]
    vmin, vmax = (np.percentile(column_values, q) for q in (1, 99))
    print(col, vmin, vmax)
    sc.pl.umap(a_trb, color=[col], cmap=cmap, vmin=vmin, vmax=vmax)

In [None]:
# mean BCP total per 'leiden' group, shown as a Ward-clustered heatmap
bcp_total_cols = [name for name in bcp_cols if '_total' in name]
totals_with_group = a_trb.obs[bcp_total_cols + ['leiden']]
df_bcp = totals_with_group.groupby('leiden').mean()
g = sns.clustermap(
    df_bcp,
    method='ward',
    figsize=[4.3, 8],
    cmap=cmap,
    standard_scale=1,
    cbar_pos=(0, 1, .01, .08),
    dendrogram_ratio=(.2, .1),
)
heatmap_ax = g.ax_heatmap
heatmap_ax.grid(False)
# keep the row labels horizontal
heatmap_ax.tick_params(axis='y', labelrotation=0)

### Evaluate YLQ-Specific Physicochemical Characteristics and Latent Dimensions

In [None]:
# load the interaction table (first CSV column becomes the index)
df = pd.read_csv('../outs/df.int.clean.csv', index_col=0)
# TRBs listed with antigen 'YLQPRTFLL' that are also present in a_trb
is_ylq = df['AG'] == 'YLQPRTFLL'
ylq_trbs = df.loc[is_ylq, 'TRB']
in_adata = ylq_trbs.isin(a_trb.obs.index)
trbs = ylq_trbs[in_adata]
# the a_trb.X rows for those TRBs
targ = a_trb[trbs].X

In [None]:
# plot the densities for latent dimensions: YLQ-specific rows (blue) against a
# size-matched random background sample (grey), one panel per dimension
n_dims = 32
pvals = []
fig, axs = plt.subplots(n_dims, 1, figsize=[3, 1.5*n_dims])
for dim in range(n_dims):
    # reseed per dimension so every panel's background draw is reproducible
    np.random.seed(0)
    ax = axs[dim]
    ax.grid(False)
    # Draw the background sample once and use it for both the KDE and the test.
    # The original drew a second sample for mannwhitneyu, so the reported
    # p-value did not describe the distribution shown in the panel.
    background = np.random.choice(a_trb.X[:, dim], size=len(targ), replace=False)
    sns.kdeplot(background, color='lightgrey', lw=1.5, fill=True, alpha=0.5, ax=ax)
    sns.kdeplot(targ[:, dim], color='dodgerblue', lw=1.5, fill=True, alpha=0.5, ax=ax)
    ax.set_xticks([]); ax.set_yticks([])
    ax.set_xlabel(f'Latent Dimension {dim+1}')
    # Mann-Whitney U p-value: background sample vs. YLQ-specific values
    pvals.append(ss.mannwhitneyu(background, targ[:, dim])[1])
fig.tight_layout()
# copy over pvals
pvals_dim = pvals

In [None]:
# TRBs listed with antigen 'YLQPRTFLL' that are also present in a_trb
is_ylq = df['AG'] == 'YLQPRTFLL'
ylq_trbs = df.loc[is_ylq, 'TRB']
trbs = ylq_trbs[ylq_trbs.isin(a_trb.obs.index)]
# their BCP columns (totals and means)
targ = a_trb.obs.loc[trbs, bcp_cols]
# only the "total" columns are plotted; `labels` gives their display names
cols = [name for name in bcp_cols if '_total' in name]
labels = [
    'Hydrophobic', 'Volume', 'H-Bond',
    'Sulfur', 'Aromatic', 'Aliphatic',
    'Basic', 'Acidic', 'Amide',
]

In [None]:
# plot the densities: YLQ-specific BCP totals (blue) against a size-matched
# random background sample (grey), one panel per property
n_panels = len(labels)
fig, axs = plt.subplots(n_panels, 1, figsize=[3, 1.5*n_panels])
pvals = []
for idx, (label, dim) in enumerate(zip(labels, cols)):
    # reseed per property so every panel's background draw is reproducible
    np.random.seed(0)
    ax = axs[idx]
    ax.grid(False)
    # Draw the background sample once and use it for both the KDE and the test.
    # The original drew a second sample for mannwhitneyu, so the reported
    # p-value did not describe the distribution shown in the panel.
    background = np.random.choice(a_trb.obs[dim], size=len(targ), replace=False)
    sns.kdeplot(background, color='lightgrey', lw=1.5, fill=True, alpha=0.5, ax=ax)
    sns.kdeplot(targ[dim], color='dodgerblue', lw=1.5, fill=True, alpha=0.5, ax=ax)
    ax.set_xticks([]); ax.set_yticks([])
    ax.set_xlabel(label)
    # Mann-Whitney U p-value: background sample vs. YLQ-specific values
    pvals.append(ss.mannwhitneyu(background, targ[dim])[1])
fig.tight_layout()
# copy over pvals
pvals_bcp = pvals

In [None]:
def _neglog_pvalue_bars(names, pvalues, figsize):
    """Draw one bar of -log10(p-value) per name and return ``(fig, ax)``."""
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(False)
    ax.bar(names, -np.log10(pvalues), edgecolor='dodgerblue', lw=1.5, color='skyblue')
    ax.set(ylabel='-log$_{10}$(p-value)\nYLQ-specific vs. Random')
    ax.tick_params(axis='x', labelrotation=90)
    return fig, ax


# p-values of the 32 latent dimensions, labelled 1..32
latent_names = [str(x) for x in range(1, 33)]
fig, ax = _neglog_pvalue_bars(latent_names, pvals_dim[:32], figsize=[8, 4])
# and the same chart for the BCPs
fig, ax = _neglog_pvalue_bars(labels, pvals_bcp, figsize=[3, 4])

### Repeat Exercise for Multiple Epitopes

In [None]:
# epitope sequences to study (matched against the 'AG' column of the table)
epitopes = """
    YLQPRTFLL NLVPMVATV TPRVTGGGAM GILGFVFTL GLCTLVAML YVLDHLIVV
    ELAGIGILTV EAAGIGILTV SLLMWITQC KLGGALQAK AVFDRKSDAK RAKFKQLL
    IVTDFSVIK LLWNGPMAV SPRWYFYYL TTDPSFLGRY RLRAEAQVK LLLDRLNQL
    LTDEMIAQY CINGVCWTV KTFPPTEPK QYIKWPWYI VMTTVLATL DATYQRTRALVR
    NQKLIANQF FLCMKALLL
""".split()

In [None]:
# for every epitope: mean of each BCP total over its TRBs present in a_trb
targs = []
for epitope in epitopes:
    matches_epitope = df['AG'] == epitope
    trbs = df.loc[matches_epitope, 'TRB']
    trbs = trbs[trbs.isin(a_trb.obs.index)]
    epitope_mean = a_trb.obs.loc[trbs, cols].mean(0)
    targs.append(epitope_mean.rename(epitope))
# rows = BCPs (display names), columns = epitopes
targ = pd.concat(targs, axis=1)
targ.index = labels

In [None]:
# Ward-clustered heatmap of the per-epitope mean BCP totals;
# standard_scale=0 rescales within each row (one row per BCP)
bcp_clustermap_kwargs = dict(
    standard_scale=0,
    method='ward',
    cmap=cmap,
    xticklabels=1,
    cbar_pos=(0, 1, .01, .08),
    figsize=[8, 5],
    dendrogram_ratio=(.1, .35),
)
g = sns.clustermap(targ, **bcp_clustermap_kwargs)
g.ax_heatmap.grid(False)

In [None]:
# per-epitope mean BCP totals as z-scores against all TRBs in a_trb
background_totals = a_trb.obs[cols]
background_mean = background_totals.mean(0).values
background_std = background_totals.std(0).values
targ_z = ((targ.T - background_mean) / background_std).T
g = sns.clustermap(
    targ_z,
    method='ward',
    cmap='seismic',
    xticklabels=1,
    yticklabels=1,
    cbar_pos=(0, 1, .01, .08),
    figsize=[8, 5],
    dendrogram_ratio=(.1, .35),
    vmin=-1,
    vmax=1,
)
g.ax_heatmap.grid(False)

In [None]:
# compute and color on the deviations: z-score of each epitope's mean BCP
# total against all TRBs in a_trb, melted to long form ('index' = BCP label,
# 'variable' = epitope, 'value' = z-score)
# Stored as `df_dev_bcp` rather than `df`: reusing `df` here replaced the
# interaction table, so earlier cells that filter on df['AG'] could not be
# re-run without reloading the CSV.
background_mean = a_trb.obs[cols].mean(0).values
background_std = a_trb.obs[cols].std(0).values
df_dev_bcp = ((targ.T - background_mean)/background_std).T.reset_index().melt(id_vars='index')
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
# one bar per BCP: mean z-score across epitopes with a 95% interval
# NOTE(review): `ci`, `errwidth` and `errcolor` appear to be deprecated in
# newer seaborn releases. Confirm against the installed version.
sns.barplot(x='index', y='value', data=df_dev_bcp, ax=ax, ci=95, errwidth=1.5, capsize=0.3, errcolor='dodgerblue',
            edgecolor='dodgerblue', saturation=1, color='skyblue', linewidth=1.5)
ax.tick_params(axis='x', labelrotation=90)
ax.axhline(0, linestyle='-', color='k')
ax.set_ylabel('Deviation from Background')

In [None]:
# (re)load the interaction table (first CSV column becomes the index)
df = pd.read_csv('../outs/df.int.clean.csv', index_col=0)
# The bare `df.shape` that followed was removed: it sat mid-cell, so it was
# never displayed and had no effect.
# for every epitope: column-wise mean of a_trb.X over its TRBs present in a_trb
targs = []
for epitope in epitopes:
    trbs = df.loc[df['AG'] == epitope, 'TRB']
    trbs = trbs[trbs.isin(a_trb.obs.index)]
    targs.append(pd.Series(a_trb[trbs].X.mean(0), name=epitope))
# rows = latent dimensions, columns = epitopes
targ = pd.concat(targs, axis=1)
# label the dimensions 1..n instead of 0..n-1
targ.index = targ.index + 1

In [None]:
# Ward-clustered heatmap of the per-epitope mean a_trb.X values;
# standard_scale=0 rescales within each row (one row per latent dimension)
latent_clustermap_kwargs = dict(
    standard_scale=0,
    method='ward',
    cmap=cmap,
    xticklabels=1,
    yticklabels=1,
    cbar_pos=(0, 1, .01, .08),
    figsize=[6, 9],
    dendrogram_ratio=(.1, .15),
)
g = sns.clustermap(targ, **latent_clustermap_kwargs)
g.ax_heatmap.grid(False)

In [None]:
# per-epitope mean a_trb.X values as z-scores against all rows of a_trb.X
latent_mean = a_trb.X.mean(0)
latent_std = a_trb.X.std(0)
targ_z = ((targ.T - latent_mean) / latent_std).T
g = sns.clustermap(
    targ_z,
    method='ward',
    cmap='seismic',
    xticklabels=1,
    yticklabels=1,
    cbar_pos=(0, 1, .01, .08),
    figsize=[6, 9],
    dendrogram_ratio=(.1, .15),
    vmin=-1,
    vmax=1,
)
g.ax_heatmap.grid(False)

In [None]:
# compute and color on the deviations: z-score of each epitope's mean latent
# value against all rows of a_trb.X, melted to long form ('index' = latent
# dimension, 'variable' = epitope, 'value' = z-score)
# Stored as `df_dev_latent` rather than `df`: reusing `df` here replaced the
# interaction table, so cells that filter on df['AG'] could not be re-run
# without reloading the CSV.
latent_mean = a_trb.X.mean(0)
latent_std = a_trb.X.std(0)
df_dev_latent = ((targ.T - latent_mean)/latent_std).T.reset_index().melt(id_vars='index')
fig, ax = plt.subplots(figsize=[8, 4]); ax.grid(False)
# one bar per latent dimension: mean z-score across epitopes with a 95% interval
# NOTE(review): `ci`, `errwidth` and `errcolor` appear to be deprecated in
# newer seaborn releases. Confirm against the installed version.
sns.barplot(x='index', y='value', data=df_dev_latent, ax=ax, ci=95, errwidth=1.5, capsize=0.3, errcolor='dodgerblue',
            edgecolor='dodgerblue', saturation=1, color='skyblue', linewidth=1.5)
ax.tick_params(axis='x', labelrotation=90)
ax.axhline(0, linestyle='-', color='k')
ax.set_ylabel('Deviation from Background')
ax.set_xlabel('Tarpon Latent Dimension')

### Visualize BCP Continuum Across Latent Dimensions

In [None]:
from tqdm import tqdm
# compile the data: mean BCP totals inside consecutive percentile bins of the
# '10' and '12' latent values
datas = []; step = 1
cols = [x for x in bcp_cols if '_total' in x]
data = pd.concat([a_trb.obs[cols], sc.get.obs_df(a_trb, keys=['10','12'])], axis=1)
for col in ['10','12']:
    # Bins are [p_idx, p_idx+step) for idx = step, 2*step, ... < 100.
    # NOTE(review): values below the first `step` percentile and the maximum
    # value fall in no bin. Confirm this is intended.
    lower = np.percentile(data[col], step)
    # tqdm infers the length from the range; the original total=100 was wrong
    # for 99 iterations, so the bar never completed
    for idx in tqdm(range(step, 100, step)):
        # each upper edge becomes the next lower edge, so every percentile
        # is computed once instead of twice
        upper = np.percentile(data[col], idx+step)
        mask = (data[col] >= lower) & (data[col] < upper)
        # bin means of every column, tagged with the latent key that was binned
        datas.append(pd.concat([data.loc[mask].mean(0), pd.Series(col, index=['latent'])], axis=0))
        lower = upper
# one row per (latent key, bin)
data_ = pd.concat(datas, axis=1).T

In [None]:
# derive the continuous spectra: z-scored BCP bin means along latent key '12'
col = '12'
mask = data_['latent'] == col
bin_means = data_.loc[mask, cols].astype(float)
# standardise every BCP column across the bins
centered = bin_means - bin_means.mean()
data_plot = centered / centered.std()
data_plot.columns = labels
fig, ax = plt.subplots(figsize=[3, 4])
ax.grid(False)
# rows = BCPs, columns = percentile bins (x tick labels hidden)
sns.heatmap(data_plot.T, vmin=-2, vmax=2, cmap=cmap, xticklabels=0, yticklabels=1, ax=ax)

In [None]:
# derive the continuous spectra: z-scored BCP bin means along latent key '10'
col = '10'
mask = data_['latent'] == col
bin_means = data_.loc[mask, cols].astype(float)
# standardise every BCP column across the bins
centered = bin_means - bin_means.mean()
data_plot = centered / centered.std()
data_plot.columns = labels
fig, ax = plt.subplots(figsize=[3, 4])
ax.grid(False)
# rows = BCPs, columns = percentile bins (x tick labels hidden)
sns.heatmap(data_plot.T, vmin=-2, vmax=2, cmap=cmap, xticklabels=0, yticklabels=1, ax=ax)