In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
# configure scanpy's figure defaults, requesting a resolution of 100 dpi
sc.settings.set_figure_params(dpi=100)

### Clean TCRs and Ags

In [None]:
import pickle as pkl

# load the cleaned interaction table; the first CSV column is used as the index
df = pd.read_csv('../outs/df.int.clean.csv', index_col=0)

# unique antigens and TRA / TRB chains found in the table, printing how many of each
ags = pd.Series(df['AG'].unique())
print(len(ags))
tras = pd.Series(df['TRA'].unique())
print(len(tras))
trbs = pd.Series(df['TRB'].unique())
print(len(trbs))

# external TCR and antigen resources, stored as pickles
# NOTE(review): unpickling can run arbitrary code, so these files must come from a trusted source
with open('../external_data/results.tcr.pkl', 'rb') as handle:
    results_tcr = pkl.load(handle)
with open('../external_data/results.ag.pkl', 'rb') as handle:
    results_ag = pkl.load(handle)

In [None]:
# columns of each external study that are read as candidate antigen sequences
ag_source_cols = {
    'CAUSHI_NATURE2021_NSCLC': ['Predicted MANA sequence', 'WT sequence', 'MANA tested in culture'],
    'MILLER_SCITRANSMED2024_PANCAN': ['mut peptide', 'ref peptide'],
    'ROJAS_NATURE2023_PDAC': ['Mutant Neoantigen Sequence', 'WT Neoantigen Sequence',
                              'MHC-I Mutant Epitope (Best Prediction)', 'MHC-I WT Epitope'],
}
# track the antigens to load in: every non-missing entry, in study / column order
ags_ = []
for study, cols in ag_source_cols.items():
    for col in cols:
        ags_.extend(results_ag[study][col].dropna().tolist())
# merge with the antigens already collected, keeping each sequence once.
# Series.unique() returns values in order of first appearance, so the result is the
# same on every run. Going through set() made the order depend on Python's
# per-process string hashing, which changes what a seeded np.random.choice over
# `ags` picks and the order written to disk.
ags = pd.Series(ags.tolist() + ags_).unique()

In [None]:
from tqdm import tqdm
# add on resources: pool the TRA / TRB chains of every external TCR table and append
# each table's paired (TRA, TRB) rows to `df`.
# NOTE: `df` is extended in place of the original name, so re-running this cell
# without reloading `df` appends the same rows again.
tras_, trbs_ = [], []
paired_frames = []
for key in tqdm(results_tcr):
    tcr_table = results_tcr[key]
    has_tra = 'TRA' in tcr_table.columns
    has_trb = 'TRB' in tcr_table.columns
    for chain, present, pool in (('TRA', has_tra, tras_), ('TRB', has_trb, trbs_)):
        if not present:
            continue
        # entries whose string form is 'nan' or 'Cnan' are treated as missing.
        # A single .loc assignment writes into the table itself; the previous chained
        # form (table[col][mask] = ...) may assign to a temporary copy instead.
        mask = tcr_table[chain].astype(str).isin(['Cnan', 'nan'])
        tcr_table.loc[mask, chain] = np.nan
        pool.extend(tcr_table[chain].dropna().tolist())
    # try to add on paired
    if has_tra and has_trb:
        # positional index so TcellType can be looked up for the rows that survive dropna
        tmp = tcr_table[['TRA', 'TRB']].reset_index(drop=True).dropna()
        tmp['DB'] = key
        tmp['TcellType'] = tcr_table['TcellType'].reset_index(drop=True).loc[tmp.index]
        paired_frames.append(tmp)
# concatenate once rather than once per table, which re-copied `df` on every iteration
df = pd.concat([df, *paired_frames], axis=0)

In [None]:
# find the paired TCRs and the counts
paired_tcrs = df[['TRA', 'TRB']].value_counts().reset_index()
# extend the TRA / TRB repertoires with the chains pooled from the external tables,
# keeping each sequence once. Series.unique() keeps first-appearance order, so the
# saved arrays (and any seeded sampling from them) are identical across runs;
# set() ordering varies with Python's per-process string hashing.
tras = pd.Series(tras.tolist() + tras_).unique()
trbs = pd.Series(trbs.tolist() + trbs_).unique()
# save those values
db_outputs = (
    ('../external_data/db.ags.pkl', ags),
    ('../external_data/db.tras.pkl', tras),
    ('../external_data/db.trbs.pkl', trbs),
    ('../external_data/db.paired_tcrs.pkl', paired_tcrs),
)
for path, obj in db_outputs:
    with open(path, 'wb') as f:
        pkl.dump(obj, f)
df.to_csv('../outs/df.int.clean.extended.csv')

### Embed Sequences

In [None]:
import blosum as bl
# Per-residue encodings used downstream: one-hot ("direct") and biochemical ("BCP").
# (The BLOSUM encoding is built in a separate cell.)
vocab = list('ACDEFGHIKLMNPQRSTVWY')

# direct encoding: one-hot vector over `vocab`
map_direct = {aa: [int(aa == other) for other in vocab] for aa in vocab}

# bcp encoding -- raw per-residue hydrophobicity values
aa_hydrophobicity = dict(
    A=1.8, R=-4.5, N=-3.5, D=-3.5, C=2.5,
    E=-3.5, Q=-3.5, G=-0.4, H=-3.2, I=4.5,
    L=3.8, K=-3.9, M=1.9, F=2.8, P=-1.6,
    S=-0.8, T=-0.7, W=-0.9, Y=-1.3, V=4.2,
)
# raw per-residue volumes, source:
# https://www.imgt.org/IMGTeducation/Aide-memoire/_UK/aminoacids/IMGTclasses.html
aa_volume = dict(
    A=88.6, R=173.4, N=114.1, D=111.1, C=108.5,
    E=138.4, Q=143.8, G=60.1, H=153.2, I=166.7,
    L=166.7, K=168.6, M=162.9, F=189.9, P=112.7,
    S=89.0, T=116.1, W=227.8, Y=193.6, V=140.0,
)
# hydrogen bonding: 1 = donor and acceptor, 0.5 = only donor or acceptor, 0 = neither
aa_hbond = dict(
    A=0, R=0.5, N=1, D=0.5, C=0,
    E=0.5, Q=1, G=0, H=1, I=0,
    L=0, K=0.5, M=0, F=0, P=0,
    S=1, T=1, W=0.5, Y=1, V=0,
)
# residue classes used for the binary BCP features
has_sulfur = ['C', 'M']
is_aromatic = ['F', 'Y', 'W']
is_aliphatic = ['A', 'G', 'I', 'L', 'P', 'V']
is_basic = ['R', 'H', 'K']
is_acidic = ['D', 'E']
has_amide = ['N', 'Q']
# names of the BCP features, in the order bcp_translation emits them
vocab_bcp = [
    'hydrophobicity', 'volume', 'hbond',
    'has_sulfur', 'is_aromatic', 'is_aliphatic',
    'is_basic', 'is_acidic', 'has_amide',
]

# > rescale the continuous features
# volume: min-max scaled onto [0, 1]
vmin = min(aa_volume.values())
vmax = max(aa_volume.values())
aa_volume = {aa: (vol - vmin) / (vmax - vmin) for aa, vol in aa_volume.items()}
# hydrophobicity: divided by the largest absolute value, giving [-1, 1]
# (note that `vmax` is rebound here to that largest absolute hydrophobicity)
vmax = np.abs(np.array(list(aa_hydrophobicity.values()))).max()
aa_hydrophobicity = {aa: hyd / vmax for aa, hyd in aa_hydrophobicity.items()}


# > BCP embedding of a single amino acid
def bcp_translation(aa):
    """Return the 9-element BCP feature list for one residue, ordered as `vocab_bcp`.

    The first three entries are the scaled hydrophobicity, scaled volume and
    hydrogen-bond score; the remaining six are 0/1 class-membership flags.
    Raises KeyError for a residue that is missing from the lookup tables.
    """
    class_flags = [
        int(aa in members)
        for members in (has_sulfur, is_aromatic, is_aliphatic, is_basic, is_acidic, has_amide)
    ]
    return [aa_hydrophobicity[aa], aa_volume[aa], aa_hbond[aa], *class_flags]


map_bcp = dict(zip(vocab, map(bcp_translation, vocab)))

In [None]:
# decide how to normalize the blosum matrix: for each candidate denominator, compute the
# fraction of BLOSUM62 entries (restricted to `vocab`) whose scaled magnitude is below 1
blosum62 = bl.BLOSUM(62)  # build the matrix once instead of once per looked-up entry
blosum_vals = np.array([[blosum62[x][y] for y in vocab] for x in vocab], dtype=float)
xs = range(1, 10+1)
ys = [np.mean(np.abs(blosum_vals / idx) < 1) for idx in xs]
# create the elbow like plot
fig, ax = plt.subplots(); ax.grid(False)
ax.scatter(xs, np.array(ys)*100, edgecolor='dodgerblue', color='skyblue', lw=1.5)
ax.set(xlabel='Denominator', ylabel='% of |values| < 1')
# blosum encoding with five as shown above: each residue maps to its BLOSUM62 row over
# `vocab`, divided by 5
map_blosum = {x: [blosum62[x][y] / 5 for y in vocab] for x in vocab}

In [None]:
# per-residue feature vector: direct (one-hot), BCP and BLOSUM encodings plus a length slot
def embed_aa(aa):
    """Return the concatenated feature list for a single amino acid.

    The list is the residue's `map_direct`, `map_bcp` and `map_blosum` entries in that
    order, followed by one trailing 0 that reserves a position for length information.
    A new list is built on every call, so the lookup tables are never modified.
    """
    return [*map_direct[aa], *map_bcp[aa], *map_blosum[aa], 0]

In [None]:
from tqdm import tqdm
import torch
# how many sequences are sampled for every candidate length
n_samples = 100
# candidate interpolation lengths, and the loss recorded for each of them
targ_lens = range(5, 101, 5)
mses = []
for targ_len in targ_lens:
    # re-seeding before each draw means every length is scored on the same sample
    np.random.seed(0)
    # running sum of squared reconstruction errors
    mse = 0
    for sequence in np.random.choice(tras, size=n_samples, replace=False):
        orig_len = len(sequence)
        # one 50-feature row per residue
        embedding = np.array([embed_aa(aa) for aa in sequence])
        # interpolate works on (batch, channels, length)
        tensor = torch.Tensor(embedding.T.reshape(1, 50, orig_len))
        # stretch / squeeze to the candidate length ...
        resized = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)
        res = resized[0].T
        # ... then map back to the original length and compare with the input
        restored = torch.nn.functional.interpolate(resized, size=orig_len, mode='linear', align_corners=False)
        res_p = restored[0].T
        mse += (res_p - embedding).pow(2).sum()
    # square root of the per-sequence average of the summed squared errors
    mses.append(torch.sqrt(mse / n_samples))
# elbow-style plot of loss against candidate length
fig, ax = plt.subplots()
ax.grid(False)
ax.scatter(targ_lens, mses, edgecolor='dodgerblue', color='skyblue', lw=1.5)
ax.set(xlabel='TRA stretch length', ylabel='Average reconstruction loss')
# keep a copy, since `mses` is reused by the following cells
tra_mses = list(mses)

In [None]:
# candidate interpolation lengths, and the loss recorded for each of them
targ_lens = range(5, 101, 5)
mses = []
for targ_len in targ_lens:
    # re-seeding before each draw means every length is scored on the same sample
    np.random.seed(0)
    # running sum of squared reconstruction errors
    mse = 0
    for sequence in np.random.choice(trbs, size=n_samples, replace=False):
        orig_len = len(sequence)
        # one 50-feature row per residue
        embedding = np.array([embed_aa(aa) for aa in sequence])
        # interpolate works on (batch, channels, length)
        tensor = torch.Tensor(embedding.T.reshape(1, 50, orig_len))
        # stretch / squeeze to the candidate length ...
        resized = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)
        res = resized[0].T
        # ... then map back to the original length and compare with the input
        restored = torch.nn.functional.interpolate(resized, size=orig_len, mode='linear', align_corners=False)
        res_p = restored[0].T
        mse += (res_p - embedding).pow(2).sum()
    # square root of the per-sequence average of the summed squared errors
    mses.append(torch.sqrt(mse / n_samples))
# elbow-style plot of loss against candidate length
fig, ax = plt.subplots()
ax.grid(False)
ax.scatter(targ_lens, mses, edgecolor='dodgerblue', color='skyblue', lw=1.5)
ax.set(xlabel='TRB stretch length', ylabel='Average reconstruction loss')
# keep a copy, since `mses` is reused by the following cell
trb_mses = list(mses)

In [None]:
# candidate interpolation lengths, and the loss recorded for each of them
targ_lens = range(5, 101, 5)
mses = []
for targ_len in targ_lens:
    # re-seeding before each draw means every length is scored on the same sample
    np.random.seed(0)
    # running sum of squared reconstruction errors
    mse = 0
    for sequence in np.random.choice(ags, size=n_samples, replace=False):
        orig_len = len(sequence)
        # one 50-feature row per residue
        embedding = np.array([embed_aa(aa) for aa in sequence])
        # interpolate works on (batch, channels, length)
        tensor = torch.Tensor(embedding.T.reshape(1, 50, orig_len))
        # stretch / squeeze to the candidate length ...
        resized = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)
        res = resized[0].T
        # ... then map back to the original length and compare with the input
        restored = torch.nn.functional.interpolate(resized, size=orig_len, mode='linear', align_corners=False)
        res_p = restored[0].T
        mse += (res_p - embedding).pow(2).sum()
    # square root of the per-sequence average of the summed squared errors
    mses.append(torch.sqrt(mse / n_samples))
# elbow-style plot of loss against candidate length
fig, ax = plt.subplots()
ax.grid(False)
ax.scatter(targ_lens, mses, edgecolor='dodgerblue', color='skyblue', lw=1.5)
ax.set(xlabel='AG stretch length', ylabel='Average reconstruction loss')
# keep a copy under an antigen-specific name
ag_mses = list(mses)

In [None]:
# elbow-style plot of the combined loss: antigen + TRA + TRB at each candidate length
summed_mses = [ag + tra + trb for ag, tra, trb in zip(ag_mses, tra_mses, trb_mses)]
fig, ax = plt.subplots()
ax.grid(False)
ax.scatter(targ_lens, summed_mses, edgecolor='dodgerblue', color='skyblue', lw=1.5)
ax.set(xlabel='Stretch length (ALL)', ylabel='Average reconstruction loss (SUM)')

In [None]:
# we therefore settle on a stretch length
# (the number of positions each sequence embedding is interpolated to)
targ_len = 48

In [None]:
import torch
# define a function to interpolate the protein
def stretch_pep(embedding, targ_len=targ_len):
    """Linearly interpolate a per-residue embedding to a fixed number of positions.

    Parameters
    ----------
    embedding : np.ndarray
        Array of shape (orig_len, n_features), one row per residue.
    targ_len : int
        Number of positions in the output. The default is the value the global
        `targ_len` had when this function was defined; later changes to the global
        do not affect it.

    Returns
    -------
    torch.Tensor
        Tensor of shape (n_features, targ_len). The last feature row does not hold
        interpolated values: it is overwritten with the original sequence length.
    """
    # get the current protein length
    orig_len, n_features = embedding.shape
    # interpolate expects (batch, channels, length), so features become channels
    tensor = torch.Tensor(embedding.T.reshape(1, n_features, orig_len))
    res = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)[0]
    # add the extra length information
    res[-1, :] = orig_len
    return res

In [None]:
from tqdm import tqdm
# build the lookup from every TRA sequence to its fixed-length embedding
tra_to_embed = {}
for sequence in tqdm(tras):
    # stack the per-residue feature vectors, one row per amino acid
    per_residue = np.array([embed_aa(aa) for aa in sequence])
    # interpolate to the common length
    embedding = stretch_pep(per_residue, targ_len=targ_len)
    # store under the sequence itself
    tra_to_embed[sequence] = embedding

In [None]:
# build the lookup from every TRB sequence to its fixed-length embedding
trb_to_embed = {}
for sequence in tqdm(trbs):
    # stack the per-residue feature vectors, one row per amino acid
    per_residue = np.array([embed_aa(aa) for aa in sequence])
    # interpolate to the common length
    embedding = stretch_pep(per_residue, targ_len=targ_len)
    # store under the sequence itself
    trb_to_embed[sequence] = embedding

In [None]:
# build the lookup from every antigen sequence to its fixed-length embedding
ag_to_embed = {}
for sequence in tqdm(ags):
    # stack the per-residue feature vectors, one row per amino acid
    per_residue = np.array([embed_aa(aa) for aa in sequence])
    # interpolate to the common length
    embedding = stretch_pep(per_residue, targ_len=targ_len)
    # store under the sequence itself
    ag_to_embed[sequence] = embedding

In [None]:
import pickle as pkl
# write each sequence -> embedding lookup to its own pickle file
embed_outputs = (
    ('../outs/map.tra_to_embed.extended.pkl', tra_to_embed),
    ('../outs/map.trb_to_embed.extended.pkl', trb_to_embed),
    ('../outs/map.ag_to_embed.extended.pkl', ag_to_embed),
)
for path, mapping in embed_outputs:
    with open(path, 'wb') as f:
        pkl.dump(mapping, f)

### Data Cleaning

In [None]:
import pickle as pkl
# reload the aggregated antigen / TRA / TRB collections and the paired-TCR table
# NOTE(review): unpickling can run arbitrary code, so these files must come from a trusted source
with open('../external_data/db.ags.pkl', 'rb') as f:
    ags = pkl.load(f)
with open('../external_data/db.tras.pkl', 'rb') as f:
    tras = pkl.load(f)
with open('../external_data/db.trbs.pkl', 'rb') as f:
    trbs = pkl.load(f)
with open('../external_data/db.paired_tcrs.pkl', 'rb') as f:
    paired_tcrs = pkl.load(f)

# wrap each collection in a Series and normalise the sequences to uppercase
ags = pd.Series(ags).str.upper()
tras = pd.Series(tras).str.upper()
trbs = pd.Series(trbs).str.upper()
# number of entries per collection (shown as the cell output)
len(ags), len(tras), len(trbs)

In [None]:
# remove any sequences with an invalid letter.
# Keep only sequences made up entirely of the 20 residues the embedding tables cover.
# Rejecting just B/J/O/U/X/Z let through other characters (e.g. '*', '_', digits),
# which the per-residue lookups cannot embed. Missing entries are dropped as well.
valid_pattern = '[ACDEFGHIKLMNPQRSTVWY]+'
ags = ags[ags.str.fullmatch(valid_pattern, na=False)]
tras = tras[tras.str.fullmatch(valid_pattern, na=False)]
trbs = trbs[trbs.str.fullmatch(valid_pattern, na=False)]
# number of entries left per collection (shown as the cell output)
ags.shape[0], tras.shape[0], trbs.shape[0]

In [None]:
# length distribution of each collection: a histogram followed by summary statistics
for xlabel, seqs in (('TRA length', tras), ('TRB length', trbs), ('AG length', ags)):
    lengths = seqs.apply(len)
    fig, ax = plt.subplots(figsize=[4, 4])
    ax.grid(False)
    ax.hist(lengths, edgecolor='dodgerblue', lw=1.5, color='skyblue', bins=20)
    ax.set(xlabel=xlabel, ylabel='Count')
    print(lengths.describe())

In [None]:
# same histograms restricted to the retained length window:
# 8-24 residues for TRA / TRB and 8-12 residues for antigens
length_windows = (
    ('TRA length', tras, 8, 24),
    ('TRB length', trbs, 8, 24),
    ('AG length', ags, 8, 12),
)
for xlabel, seqs, lo, hi in length_windows:
    lengths = seqs.apply(len)
    in_window = lengths[(lengths <= hi) & (lengths >= lo)]
    fig, ax = plt.subplots(figsize=[4, 4])
    ax.grid(False)
    ax.hist(in_window, edgecolor='dodgerblue', lw=1.5, color='skyblue', bins=15)
    ax.set(xlabel=xlabel, ylabel='Count')
    # NOTE(review): the summary covers ALL lengths, not only the plotted window -- confirm intended
    print(lengths.describe())

In [None]:
# keep TRA / TRB chains of 8-24 residues and antigens of 8-12 residues (bounds inclusive)
tras = tras[tras.apply(len).between(8, 24)]
trbs = trbs[trbs.apply(len).between(8, 24)]
ags = ags[ags.apply(len).between(8, 12)]
# a pair is kept only when both of its chains survived the filters
tra_kept = paired_tcrs['TRA'].isin(tras)
trb_kept = paired_tcrs['TRB'].isin(trbs)
paired_tcrs = paired_tcrs[tra_kept & trb_kept]

In [None]:
# check for any dups
# 1) no sequence may be the same string written twice (first half == second half)
for label, seqs in (('tras', tras), ('trbs', trbs), ('ags', ags)):
    doubled = [x for x in seqs if x[:len(x)//2] == x[len(x)//2:]]
    assert not doubled, '%s: %d sequence(s) are a doubled string' % (label, len(doubled))
# 2) no chain may contain more than two 'CAS' or more than two 'CSA' motifs.
# `or` is required here: `a > 2 | b > 2` is parsed as the chained comparison
# `a > (2 | b) > 2`, which for example never fires when b == 0.
for label, seqs in (('tras', tras), ('trbs', trbs)):
    repeated = [x for x in seqs if x.count('CAS') > 2 or x.count('CSA') > 2]
    assert not repeated, '%s: %d sequence(s) repeat CAS/CSA more than twice' % (label, len(repeated))

In [None]:
# write the cleaned collections back to the same pickle files they were loaded from
cleaned_outputs = (
    ('../external_data/db.ags.pkl', ags),
    ('../external_data/db.tras.pkl', tras),
    ('../external_data/db.trbs.pkl', trbs),
    ('../external_data/db.paired_tcrs.pkl', paired_tcrs),
)
for path, obj in cleaned_outputs:
    with open(path, 'wb') as f:
        pkl.dump(obj, f)

### Double Check Interpolation Length

In [None]:
from tqdm import tqdm
import torch
# define the number of samples per case
n_samples = 100
# define the number of lengths to test
targ_lens = range(4, 101, 4)
# One record per (target length, seed); the frame is built once after the loops.
# Growing an empty frame row by row with .loc re-allocates it on every step and
# appears to coerce the integer targ_len / seed values to float along with rmse.
# NOTE: the loop variable rebinds the notebook-level name `targ_len`.
records = []
for targ_len in targ_lens:
    # repeat with several seeds so the spread across samples can be shown
    for seed in range(5):
        np.random.seed(seed)
        # running sum of squared reconstruction errors
        mse = 0
        for sequence in np.random.choice(tras, size=n_samples, replace=False):
            # retrieve the original length
            orig_len = len(sequence)
            # retrieve the embedding (one 50-feature row per residue)
            embedding = np.array([embed_aa(aa) for aa in sequence])
            tensor = torch.Tensor(embedding.T.reshape(1, 50, orig_len))
            # interpolate to the candidate length, then back to the original length
            resized = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)
            res = resized[0].T
            restored = torch.nn.functional.interpolate(resized, size=orig_len, mode='linear', align_corners=False)
            res_p = restored[0].T
            mse += (res_p - embedding).pow(2).sum()
        rmse = torch.sqrt(mse / n_samples)
        records.append({'targ_len': targ_len, 'rmse': rmse.item(), 'seed': seed})
df_mse = pd.DataFrame(records, columns=['targ_len', 'rmse', 'seed'])

In [None]:
# bar plot of RMSE per interpolated length, with error bars across the seeds
fig, ax = plt.subplots(figsize=[8, 4])
ax.grid(False)
# NOTE(review): ci / errwidth / errcolor are reported as deprecated by newer seaborn releases -- confirm against the pinned version
bar_style = dict(
    ci=95, errwidth=1.5, capsize=0.3, saturation=1,
    errcolor='dodgerblue', edgecolor='dodgerblue', linewidth=1.5, color='skyblue',
)
sns.barplot(x='targ_len', y='rmse', data=df_mse, **bar_style)
ax.set_xlim(-1, 25)
ax.tick_params(axis='x', labelrotation=90)
# re-label the ticks as whole numbers (their text may read like '4.0')
tick_text = [tick.get_text() for tick in ax.get_xticklabels()]
ax.set_xticklabels([int(float(text)) for text in tick_text])
ax.set(xlabel='Interpolated Length', ylabel='Root Mean Squared Error (RMSE)', title='CDR3α Amino Acid Identity')

In [None]:
# define the number of lengths to test
targ_lens = range(4, 101, 4)
# One record per (target length, seed); the frame is built once after the loops.
# Growing an empty frame row by row with .loc re-allocates it on every step and
# appears to coerce the integer targ_len / seed values to float along with rmse.
# NOTE: the loop variable rebinds the notebook-level name `targ_len`.
records = []
for targ_len in targ_lens:
    # repeat with several seeds so the spread across samples can be shown
    for seed in range(5):
        np.random.seed(seed)
        # running sum of squared reconstruction errors
        mse = 0
        for sequence in np.random.choice(trbs, size=n_samples, replace=False):
            # retrieve the original length
            orig_len = len(sequence)
            # retrieve the embedding (one 50-feature row per residue)
            embedding = np.array([embed_aa(aa) for aa in sequence])
            tensor = torch.Tensor(embedding.T.reshape(1, 50, orig_len))
            # interpolate to the candidate length, then back to the original length
            resized = torch.nn.functional.interpolate(tensor, size=targ_len, mode='linear', align_corners=False)
            res = resized[0].T
            restored = torch.nn.functional.interpolate(resized, size=orig_len, mode='linear', align_corners=False)
            res_p = restored[0].T
            mse += (res_p - embedding).pow(2).sum()
        rmse = torch.sqrt(mse / n_samples)
        records.append({'targ_len': targ_len, 'rmse': rmse.item(), 'seed': seed})
df_mse = pd.DataFrame(records, columns=['targ_len', 'rmse', 'seed'])

In [None]:
# bar plot of RMSE per interpolated length, with error bars across the seeds
fig, ax = plt.subplots(figsize=[8, 4])
ax.grid(False)
# NOTE(review): ci / errwidth / errcolor are reported as deprecated by newer seaborn releases -- confirm against the pinned version
bar_style = dict(
    ci=95, errwidth=1.5, capsize=0.3, saturation=1,
    errcolor='dodgerblue', edgecolor='dodgerblue', linewidth=1.5, color='skyblue',
)
sns.barplot(x='targ_len', y='rmse', data=df_mse, **bar_style)
ax.set_xlim(-1, 25)
ax.tick_params(axis='x', labelrotation=90)
# re-label the ticks as whole numbers (their text may read like '4.0')
tick_text = [tick.get_text() for tick in ax.get_xticklabels()]
ax.set_xticklabels([int(float(text)) for text in tick_text])
ax.set(xlabel='Interpolated Length', ylabel='Root Mean Squared Error (RMSE)', title='CDR3β Amino Acid Identity')