In [None]:
from glob import glob

from matplotlib.cm import get_cmap
from matplotlib.colors import to_hex
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr
import seaborn as sns
import scanpy as sc
from tqdm import tqdm
sc.settings.set_figure_params(dpi=100)

In [None]:
# bar plot: bar height is the raw count, bar color encodes the share of the total
def plot_bar(counts, cmap, edgecolor, figsize=None):
    """Draw a bar chart of `counts` with each bar shaded by its fraction of the total.

    Parameters
    ----------
    counts : object exposing `.index`, `.sum()` and division (used like a pandas Series)
        Bar heights; `counts.index` supplies the x positions/labels.
    cmap : callable
        Called with each bar's fraction (`counts / counts.sum()`); the result is
        converted to a hex color with `to_hex`.
    edgecolor : color
        Passed to `ax.bar` as the bar outline color.
    figsize : sequence of two numbers, optional
        Figure size; `[6, 4]` is used when omitted.

    Returns
    -------
    The matplotlib Axes the bars were drawn on.
    """
    if figsize is None:
        figsize = [6, 4]
    # fraction of the total per bar, mapped through the colormap
    fractions = counts / counts.sum()
    bar_colors = [to_hex(cmap(fraction)) for fraction in fractions]
    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(False)
    ax.bar(counts.index, counts, edgecolor=edgecolor, lw=1.5, color=bar_colors)
    # labels are rotated vertically
    ax.tick_params(axis='x', labelrotation=90)
    return ax

### Databases

#### VDJDB

In [None]:
# load VDJDB and narrow it down to confident, paired records
df_vdjdb = pd.read_table('../data/VDJDB.human_paired_fixed.tsv')
# > confidence filter: keep rows whose Score is at least 2
df_vdjdb = df_vdjdb[df_vdjdb['Score'].ge(2)]
# > pairing filter: a complex.id must occur on both a TRA row and a TRB row
ids_tra = set(df_vdjdb.loc[df_vdjdb['Gene'].eq('TRA'), 'complex.id'])
ids_trb = set(df_vdjdb.loc[df_vdjdb['Gene'].eq('TRB'), 'complex.id'])
ids = ids_tra.intersection(ids_trb)
df_vdjdb = df_vdjdb[df_vdjdb['complex.id'].isin(ids)]
# > finally keep only the complex.ids that are left with exactly two rows
counts = df_vdjdb['complex.id'].value_counts()
paired_ids = counts.index[counts == 2]
df_vdjdb = df_vdjdb[df_vdjdb['complex.id'].isin(paired_ids)]

In [None]:
# assemble the dataframe: one output row per VDJDB complex.id
# NOTE(review): `df_meth` is not created in any visible cell. This cell reads its
# 'complex.id' column plus columns 0 (joined into METH) and 1 (reduced into FREQ);
# confirm where it comes from so the notebook survives Restart & Run All.
columns = ['DB','REF','METH','FREQ','TRA','TRB','AG','MHC','TYPE']
records = []
for complex_id in tqdm(df_vdjdb['complex.id'].unique()):
    # > derive the masks selecting this complex in both tables
    mask = df_vdjdb['complex.id'] == complex_id
    mask_meth = df_meth['complex.id'] == complex_id
    # > isolate the CDR3 of each chain
    tra = df_vdjdb.loc[mask & (df_vdjdb['Gene'] == 'TRA'), 'CDR3'].iloc[0]
    trb = df_vdjdb.loc[mask & (df_vdjdb['Gene'] == 'TRB'), 'CDR3'].iloc[0]
    # > epitope, reference and MHC must be unambiguous within a complex
    ags = set(df_vdjdb.loc[mask, 'Epitope'].tolist())
    assert len(ags) == 1
    refs = set(df_vdjdb.loc[mask, 'Reference'].tolist())
    assert len(refs) == 1
    mhcs = set(df_vdjdb.loc[mask, 'MHC A'].tolist())
    assert len(mhcs) == 1
    # > several methods are joined with ';' (sorted so the output is reproducible)
    meths = set(df_meth.loc[mask_meth, 0].tolist())
    meth = meths.pop() if len(meths) == 1 else ';'.join(sorted(meths))
    # > FREQ is the smallest non-missing value, or NaN when none is available;
    #   built-in min is required here because np.min() of a set returns the set itself
    freqs = set(df_meth.loc[mask_meth, 1].dropna().tolist())
    freq = min(freqs) if freqs else np.nan
    records.append(('VDJDB', refs.pop(), meth, freq, tra, trb, ags.pop(), mhcs.pop(), 'peptide'))
# build the frame once instead of growing it row by row
df_tmp = pd.DataFrame(records, columns=columns)

In [None]:
# the assembled table takes over the VDJDB name and is written out (index included)
df_vdjdb = df_tmp.copy(deep=True)
df_vdjdb.to_csv('../outs/data.vdjdb.csv', index=True)

#### MCPAS

In [None]:
# load MCPAS and keep only the rows whose Species is 'Human'
df_mcpas = pd.read_csv('../data/MCPAS.csv')
mask = df_mcpas['Species'].eq('Human')
df_mcpas = df_mcpas[mask]

In [None]:
# numeric antigen identification method codes are translated into text
num2str = {
    1: 'peptide-MHC multimers',
    2: 'selection post in-culture activation with Ag',
    2.1: 'selection post in-culture activation with Ag: peptide',
    2.2: 'selection post in-culture activation with Ag: whole protein',
    2.3: 'selection post in-culture activation with Ag: whole pathogen',
    2.4: 'selection post in-culture activation with Ag: tumor cells',
    2.5: 'selection post in-culture activation with Ag: other types of in vitro stimulation',
    3: 'revealed by direct sequencing of ex-vivo labeled T cells',
    4: 'MCPAS #4',  # the fourth does not have any description
}
# rows must have MHC, both CDR3s and the epitope; columns are then renamed to the shared schema
required_cols = ['MHC','CDR3.alpha.aa','CDR3.beta.aa','Epitope.peptide']
df_mcpas = (
    df_mcpas
    .assign(METH=df_mcpas['Antigen.identification.method'].map(num2str))
    .loc[:, required_cols + ['METH','PubMed.ID']]
    .dropna(subset=required_cols)
    .rename(columns={'CDR3.alpha.aa': 'TRA', 'CDR3.beta.aa': 'TRB',
                     'Epitope.peptide': 'AG', 'PubMed.ID': 'REF'})
    .assign(FREQ=np.nan, TYPE='peptide', DB='MCPAS')
    .loc[:, ['DB','REF','METH','FREQ','TRA','TRB','AG','MHC','TYPE']]
)

In [None]:
# persist the processed MCPAS table (index included)
df_mcpas.to_csv('../outs/data.mcpas.csv', index=True)

#### IEDB

In [None]:
# read in the IEDB export of non-peptide epitopes
df_iedb = pd.read_csv('../data/IEDB.nonpeptide.receptor_table_export_1734038752.csv')
cols_tra = ['Chain 1 - CDR3 Curated','Chain 1 - CDR3 Calculated']
cols_trb = ['Chain 2 - CDR3 Curated','Chain 2 - CDR3 Calculated']
# only keep rows with a CDR3 (curated or calculated) for both chains and an epitope name
mask_tra = df_iedb[cols_tra].notna().any(axis=1)
mask_trb = df_iedb[cols_trb].notna().any(axis=1)
mask_epitope = df_iedb['Epitope - Name'].notna()
mask = mask_tra & mask_trb & mask_epitope
# work on an explicit copy so the column assignments below do not target a slice
df_iedb = df_iedb.loc[mask].copy()
# prefer the curated CDR3 and fall back to the calculated one
df_iedb['TRA'] = df_iedb[cols_tra[0]].fillna(df_iedb[cols_tra[1]])
df_iedb['TRB'] = df_iedb[cols_trb[0]].fillna(df_iedb[cols_trb[1]])
# rename the columns to the shared schema
df_iedb = df_iedb[['TRA','TRB','Epitope - Name','Assay - MHC Allele Names','Assay - IEDB IDs','Reference - IEDB IRI']].copy()
df_iedb.columns = ['TRA','TRB','AG','MHC','METH','REF']
df_iedb['DB'] = 'IEDB'
df_iedb['TYPE'] = 'non-peptide'
df_iedb['FREQ'] = np.nan
df_iedb_nonpep = df_iedb[['DB','REF','METH','FREQ','TRA','TRB','AG','MHC','TYPE']].copy()

In [None]:
# read in the IEDB export of paired peptide epitopes
df_iedb = pd.read_csv('../data/IEDB.paired_pep.receptor_table_export_1734038781.csv')
cols_tra = ['Chain 1 - CDR3 Curated','Chain 1 - CDR3 Calculated']
cols_trb = ['Chain 2 - CDR3 Curated','Chain 2 - CDR3 Calculated']
# only keep rows with a CDR3 (curated or calculated) for both chains and an epitope name
mask_tra = df_iedb[cols_tra].notna().any(axis=1)
mask_trb = df_iedb[cols_trb].notna().any(axis=1)
mask_epitope = df_iedb['Epitope - Name'].notna()
mask = mask_tra & mask_trb & mask_epitope
# work on an explicit copy so the column assignments below do not target a slice
df_iedb = df_iedb.loc[mask].copy()
# prefer the curated CDR3 and fall back to the calculated one
df_iedb['TRA'] = df_iedb[cols_tra[0]].fillna(df_iedb[cols_tra[1]])
df_iedb['TRB'] = df_iedb[cols_trb[0]].fillna(df_iedb[cols_trb[1]])
# rename the columns to the shared schema
df_iedb = df_iedb[['TRA','TRB','Epitope - Name','Assay - MHC Allele Names','Assay - IEDB IDs','Reference - IEDB IRI']].copy()
df_iedb.columns = ['TRA','TRB','AG','MHC','METH','REF']
df_iedb['DB'] = 'IEDB'
df_iedb['TYPE'] = 'peptide'
df_iedb['FREQ'] = np.nan
df_iedb_pep = df_iedb[['DB','REF','METH','FREQ','TRA','TRB','AG','MHC','TYPE']].copy()

In [None]:
# stack the non-peptide and peptide IEDB tables and persist the result (index included)
df_iedb = pd.concat((df_iedb_nonpep, df_iedb_pep), axis='index')
df_iedb.to_csv('../outs/data.iedb.csv', index=True)

#### Integration

In [None]:
# stack the three database tables row-wise and report the combined shape
df = pd.concat((df_iedb, df_mcpas, df_vdjdb), axis='index')
print(df.shape)

In [None]:
# de-duplicate: identical rows (compared as strings) collapse into one, NREP holds how often each occurred
df = df.astype(str).value_counts().reset_index()
df.columns = ['DB', 'REF', 'METH', 'FREQ', 'TRA', 'TRB', 'AG', 'MHC', 'TYPE', 'NREP']
# any CDR3 that does not already begin with C gets a C prepended
for chain in ('TRA', 'TRB'):
    mask = df[chain].str.startswith('C')
    df[chain] = df[chain].where(mask, 'C' + df[chain])
# write the current raw version
df.to_csv('../outs/df.int.clean.csv')

### ZEMIN ZHANG PAN-CANCER T CELL ATLAS

In [None]:
# retrieve zemin TCR data (pyreadr is imported with the other dependencies at the top)
# NOTE(review): this is an absolute, user-specific path; consider a configurable data directory
result = pyreadr.read_r('/home/dchen2/LITERATURE/ZEMIN_PANCANCER/data/tcr/byCell/tcr.zhangLab.comb.flt.rds')
# read_r returns a mapping of objects; only the first one is used
result = next(iter(result.values()))
# add only relevant column names
result[['TRA','TRB','TcellType']] = result[['CDR3.A1','CDR3.B1','stype']]

In [None]:
# verification
vocab = list('ACDEFGHIKLMNPQRSTVWY')
def verify(x):
    """Return True when `x` consists solely of the letters in `vocab`.

    A value whose string form is 'nan' is accepted as valid. Otherwise the
    occurrences of every vocabulary letter are counted and must account for
    the full length of `x`.
    """
    if str(x) == 'nan':
        return True
    n_valid = 0
    for aa in vocab:
        n_valid += x.count(aa)
    return n_valid == len(x)
# keep only the rows whose TRA and TRB pass `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy: the assignments below must hit `result` itself, not a slice of the old frame
result = result.loc[va & vb].copy()
# every present CDR3 should begin with C; missing values are left untouched
for chain in ('TRA', 'TRB'):
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results = {'ZHENG_SCIENCE2021_PANCAN': result}

### BARRAS_SCIIMMUN2024_MELA

In [None]:
# locate the TCR tables (glob and tqdm are imported with the other dependencies at the top)
fns = glob('../external_data/BARRAS_SCIIMMUN2024_MELA/GSM*_all_contig_annotations_Patient*_T*.csv.gz')
# process each of the TCR tables
dfs = []
for fn in tqdm(fns):
    # read in the table
    df = pd.read_csv(fn)
    # remove any row without a chain or without a cdr3
    df = df.dropna(subset=['chain','cdr3'])
    # filter for cells
    df = df.loc[df['is_cell'] & df['high_confidence']]
    # remove anything not in a valid chain
    df = df.loc[df['chain'].isin(['TRA','TRB','TRD','TRG'])]
    # when a barcode has several contigs for the same chain keep only the one with the most
    # reads: after a stable ascending sort the last row of each (barcode, chain) is the winner
    best_idxs = (
        df.sort_values('reads', kind='stable')
        .drop_duplicates(subset=['barcode','chain'], keep='last')
        .index
    )
    df = df.loc[df.index.isin(best_idxs)]
    # verify every (barcode, chain) pair is now unique
    counts = df[['barcode','chain']].value_counts()
    assert counts.max() < 2
    # assemble it up: one row per barcode, one '<column>:<chain>' column per value/chain
    # ('sum' is passed by name; the np.sum callable is deprecated here in recent pandas)
    df = df.pivot_table(index='barcode', columns=['chain'], values=df.columns[1:-1].tolist(), aggfunc='sum')
    df.columns = [':'.join(x) for x in df.columns]
    # add on special columns parsed from the file name (..._Patient<pt>_T<tp>.csv.gz)
    pt, tp = fn.split('/')[-1].split('all_contig_annotations_Patient')[-1].split('.csv.gz')[0].split('_T')
    df['patient'] = pt
    df['timepoint'] = tp
    # process the non chain dependent columns
    df[['TRA','TRB']] = df[['cdr3:TRA','cdr3:TRB']]
    df['TcellType'] = 'T'
    dfs.append(df)

In [None]:
# add on the additional CR/PD and RECIST annotations
result = pd.concat(dfs, axis=0)
# the first two sheets of the supplementary workbook are handled identically:
# each is indexed by its first column and every column is mapped onto the patient id
for sheet in (0, 1):
    anno = pd.read_excel('../external_data/BARRAS_SCIIMMUN2024_MELA/Datafile_S1.xlsx', index_col=0, sheet_name=sheet)
    anno.index = anno.index.astype(str)
    for col in anno.columns:
        result[col] = result['patient'].astype(str).map(anno[col])

In [None]:
# verification: keep only the rows whose TRA and TRB pass `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy: the assignments below must hit `result` itself, not a slice of the old frame
result = result.loc[va & vb].copy()
# every present CDR3 should begin with C; missing values are left untouched
for chain in ('TRA', 'TRB'):
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['BARRAS_SCIIMMUN2024_MELA'] = result

### BOLAND_SCIIMMUN2020_IBD

In [None]:
# tissue codes: R = rectum, PBMC = peripheral blood, I = ileum (only R and PBMC are used; it is unclear where the I samples come from)
# patient codes: C = control, U = UC

In [None]:
# take the TCR table and convert
fns = glob('../external_data/BOLAND_SCIIMMUN2020_IBD/GSM*_full-length_productive_TCR_table.tsv.gz')
# drop the '_I_' files; the substring test must run against each path, not against the list
fns = [x for x in fns if '_I_full' not in x]
# process each of the TCR tables
dfs = []
for fn in tqdm(fns):
    # read in the table
    df = pd.read_table(fn)
    # remove any barcodes that have more than two rows, very rare but do exist
    counts = df['barcode'].value_counts()
    valid_bcs = counts.index[counts <= 2]
    df = df.loc[df['barcode'].isin(valid_bcs)].copy()
    # add in an extra column indicative of TRA or TRB nature
    # > this requires the v gene to be present on every row
    assert df['v_gene'].isna().sum() == 0
    df['chain'] = df['v_gene'].str.slice(0, 3)
    # remove anything not in a valid chain
    df = df.loc[df['chain'].isin(['TRA','TRB','TRD','TRG'])].copy()
    # do not allow a barcode to have several rows for the same chain
    # ('tag' must stay the last column: the pivot below takes df.columns[1:-1] as values)
    df['tag'] = df[['barcode','chain']].astype(str).agg(':'.join, axis=1)
    counts = df['tag'].value_counts()
    df = df.loc[df['tag'].isin(counts.index[counts < 2])]
    counts = df[['barcode','chain']].value_counts()
    assert counts.max() < 2
    # assemble it up: one row per barcode, one '<column>:<chain>' column per value/chain
    # ('sum' is passed by name; the np.sum callable is deprecated here in recent pandas)
    df = df.pivot_table(index='barcode', columns=['chain'], values=df.columns[1:-1].tolist(), aggfunc='sum')
    df.columns = [':'.join(x) for x in df.columns]
    # process the non chain dependent columns
    cols = ['batch_id','celltype','disease','patient_id','tissue_type']
    for col in cols:
        # confirm TRA and TRB agree wherever both are present
        assert ((df[f'{col}:TRA'] == df[f'{col}:TRB']) | df[[f'{col}:TRA',f'{col}:TRB']].isna().any(axis=1)).all()
        # take the TRA value and fall back to the TRB value where TRA is missing
        df[col] = df[f'{col}:TRA'].fillna(df[f'{col}:TRB'])
        del df[f'{col}:TRA'], df[f'{col}:TRB']
    df[['TRA','TRB']] = df[['cdr3:TRA','cdr3:TRB']]
    df['TcellType'] = 'T'
    dfs.append(df)
result = pd.concat(dfs, axis=0)

In [None]:
# verification: keep only the rows whose TRA and TRB pass `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy: the assignments below must hit `result` itself, not a slice of the old frame
result = result.loc[va & vb].copy()
# every present CDR3 should begin with C; missing values are left untouched
for chain in ('TRA', 'TRB'):
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['BOLAND_SCIIMMUN2020_IBD'] = result

### CAUSHI_NATURE2021_NSCLC

In [None]:
# resected and PB samples; some patients were on anti-PD1 and some gained resistance
# the libraries examine viral-specific and neoantigen-specific responses, or specific peptides
# HIV is the negative control (I think); H1N1 and H3N2 each have separate matrix and nucleoprotein libraries
# there are, I believe, three pools per library (presumably to increase detection); they are highly correlated
# these are all peripheral blood

In [None]:
# prepare to assemble all of the viruses
dfs = []
group_cols = ['cdr3nt','cdr3aa','v','d','j','VEnd','DStart','DEnd','JStart']
# H1N1 first, then H3N2; each library has three pool files (_1, _2, _3)
for strain in ('H1N1', 'H3N2'):
    fns = sorted(glob(f'../external_data/CAUSHI_NATURE2021_NSCLC/GSM*_{strain}_1.txt.gz'))
    for fn in tqdm(fns):
        # file name fields: <GSM>_<patient>_<epitope>_<virus>_<pool>.txt.gz
        parts = fn.split('/')[-1].split('_')
        stem = '_'.join(parts[1:3])
        # read in pool 1 and locate the matching pools 2 and 3 of the same patient/epitope
        tb1 = pd.read_table(fn)
        tb2 = pd.read_table(glob(f'../external_data/CAUSHI_NATURE2021_NSCLC/*{stem}*_{strain}_2.txt.gz')[0])
        tb3 = pd.read_table(glob(f'../external_data/CAUSHI_NATURE2021_NSCLC/*{stem}*_{strain}_3.txt.gz')[0])
        # concatenate together and average per clonotype
        df = pd.concat([tb1, tb2, tb3], axis=0).groupby(group_cols).mean().reset_index()
        df['patient'] = parts[1]
        df['epitope'] = parts[2]
        df['virus'] = parts[3]
        dfs.append(df)

In [None]:
# create a conversion dataframe
series_sample_id = 'GSM5266261 GSM5266262 GSM5266263 GSM5266264 GSM5266265 GSM5266266 GSM5266267 GSM5266268 GSM5266269 GSM5266270 GSM5266271 GSM5266272 GSM5266273 GSM5266274 GSM5266275 GSM5266276 GSM5266277 GSM5266278 GSM5266279 GSM5266280 GSM5266281 GSM5266282 GSM5266283 GSM5266284 GSM5266285 GSM5266286 GSM5266287 GSM5266288 GSM5266289 GSM5266290 GSM5266291 GSM5266292 GSM5266293 GSM5266294 GSM5266295 GSM5266296 GSM5266297 GSM5266298 GSM5266299 GSM5266300 GSM5266301 GSM5266302 GSM5266303 GSM5266304 GSM5266305 GSM5266306 GSM5266307 GSM5266308 GSM5266309 GSM5266310 GSM5266311 GSM5266312 GSM5266313 GSM5266314 GSM5266315 GSM5266316 GSM5266317 GSM5266318 GSM5266319 GSM5266320 GSM5266321 GSM5266322 GSM5266323 GSM5266324 GSM5266325 GSM5266326 GSM5266327 GSM5266328 GSM5266329 GSM5266330 GSM5266331 GSM5266332 GSM5266333 GSM5266334 GSM5266335 GSM5266336 GSM5266337 GSM5266338 GSM5266339 GSM5266340 GSM5266341 GSM5266342 GSM5266343 GSM5266344 GSM5266345 GSM5266346 GSM5266347 GSM5266348 GSM5266349 GSM5266350 GSM5266351 GSM5266352 GSM5266353 GSM5266354 GSM5266355 GSM5266356 GSM5266357 GSM5266358 GSM5266359 GSM5266360 GSM5266361 GSM5266362'.split(' ')
sample_title = 'MD01-004_HIV"	"MD01-004_MP1_H1N1_1"	"MD01-004_MP1_H1N1_2"	"MD01-004_MP1_H1N1_3"	"MD01-004_MP1_H3N2_1"	"MD01-004_MP1_H3N2_2"	"MD01-004_MP1_H3N2_3"	"MD01-004_NP_H1N1_1"	"MD01-004_NP_H1N1_2"	"MD01-004_NP_H1N1_3"	"MD01-004_NP_H3N2_1"	"MD01-004_NP_H3N2_2"	"MD01-004_NP_H3N2_3"	"MD043-011_HIV"	"MD043-011_MP1_H1N1_1"	"MD043-011_MP1_H1N1_2"	"MD043-011_MP1_H1N1_3"	"MD043-011_MP1_H3N2_1"	"MD043-011_MP1_H3N2_2"	"MD043-011_MP1_H3N2_3"	"MD043-011_NP_H1N1_1"	"MD043-011_NP_H1N1_2"	"MD043-011_NP_H1N1_3"	"MD043-011_NP_H3N2_1"	"MD043-011_NP_H3N2_2"	"MD043-011_NP_H3N2_3"	"NY016-007_1"	"NY016-007_10"	"NY016-007_11"	"NY016-007_12"	"NY016-007_13"	"NY016-007_14"	"NY016-007_15"	"NY016-007_16"	"NY016-007_17"	"NY016-007_2"	"NY016-007_3"	"NY016-007_4"	"NY016-007_5"	"NY016-007_6"	"NY016-007_7"	"NY016-007_8"	"NY016-007_9"	"NY016-007_CEF"	"NY016-007_NoPep"	"NY016-014_1"	"NY016-014_10"	"NY016-014_11"	"NY016-014_12"	"NY016-014_13"	"NY016-014_14"	"NY016-014_15"	"NY016-014_16"	"NY016-014_2"	"NY016-014_21"	"NY016-014_22"	"NY016-014_23"	"NY016-014_24"	"NY016-014_25"	"NY016-014_3"	"NY016-014_31"	"NY016-014_32"	"NY016-014_4"	"NY016-014_5"	"NY016-014_6"	"NY016-014_7"	"NY016-014_8"	"NY016-014_9"	"NY016-014_CEF"	"NY016-014_No_Pep"	"NY016-025_1"	"NY016-025_10"	"NY016-025_11"	"NY016-025_12"	"NY016-025_13"	"NY016-025_14"	"NY016-025_15"	"NY016-025_16"	"NY016-025_17"	"NY016-025_2"	"NY016-025_3"	"NY016-025_4"	"NY016-025_5"	"NY016-025_6"	"NY016-025_7"	"NY016-025_8"	"NY016-025_9"	"NY016-025_CEF"	"NY016-025_NoPep"	"MD01-005_HIV"	"MD01-005_MP1_H1N1_1"	"MD01-005_MP1_H1N1_2"	"MD01-005_MP1_H1N1_3"	"MD01-005_MP1_H3N2_1"	"MD01-005_MP1_H3N2_2"	"MD01-005_MP1_H3N2_3"	"MD01-005_NP_H1N1_1"	"MD01-005_NP_H1N1_2"	"MD01-005_NP_H1N1_3"	"MD01-005_NP_H3N2_1"	"MD01-005_NP_H3N2_2"	"MD01-005_NP_H3N2_3'.split('"	"')
gene_of_origin = 'gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: SON"	"gene epitope derived from: MUC4"	"gene epitope derived from: MUC4"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: CARD14"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: SON"	"gene epitope derived from: SON"	"gene epitope derived from: SON"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: TMPRSS13"	"gene epitope derived from: CARD14"	"gene epitope derived from: MUC4"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: SNX1"	"gene epitope derived from: CCNL1"	"gene epitope derived from: CUL4A"	"gene epitope derived from: CCNL1"	"gene epitope derived from: CCNL1"	"gene epitope derived from: COL11A1"	"gene epitope derived from: CENPE"	"gene epitope derived from: C2orf16"	"gene epitope derived from: SNX1"	"gene epitope derived from: ZNF692"	"gene epitope derived from: ZNF692"	"gene epitope derived from: CMKLR1"	"gene epitope derived from: CMKLR1"	"gene epitope 
derived from: COL11A1"	"gene epitope derived from: ZNF692"	"gene epitope derived from: ZNF692"	"gene epitope derived from: ZNF692"	"gene epitope derived from: CMKLR1"	"gene epitope derived from: CMKLR1"	"gene epitope derived from: CMKLR1"	"gene epitope derived from: CMKLR1"	"gene epitope derived from: CDKN2A"	"gene epitope derived from: C2orf16"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: COL6A3"	"gene epitope derived from: SHPRH"	"gene epitope derived from: EZH1"	"gene epitope derived from: NTF3"	"gene epitope derived from: COL6A3"	"gene epitope derived from: SHPRH"	"gene epitope derived from: SHPRH"	"gene epitope derived from: OR4P4"	"gene epitope derived from: OR4P4"	"gene epitope derived from: NEB"	"gene epitope derived from: NTF3"	"gene epitope derived from: COL6A3"	"gene epitope derived from: COL6A3"	"gene epitope derived from: ARMCX2"	"gene epitope derived from: ZNF219"	"gene epitope derived from: ZNF219"	"gene epitope derived from: SHPRH"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA"	"gene epitope derived from: NA'.split('"	"')
epitope = 'epitope tested: Viral (HIV)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (HIV)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: LTSPIVCF"	"epitope tested: AITSKVSTV"	"epitope tested: SAITSKVSTV"	"epitope tested: ASLARASPA"	"epitope tested: SLARASPA"	"epitope tested: ASLARASPAL"	"epitope tested: QASLARASPA"	"epitope tested: LARASPALA"	"epitope tested: KLRSLTFSLV"	"epitope tested: LARASPALASL"	"epitope tested: LRNGALTSPI"	"epitope tested: VLRNGALTSPI"	"epitope tested: LRNGALTSPIV"	"epitope tested: LARASPAL"	"epitope tested: SLARASPAL"	"epitope tested: LRSLTFSLV"	"epitope tested: SAITSKVSTV"	"epitope tested: Viral pool (CMV, EBV, Influenza A)"	"epitope tested: NA"	"epitope tested: LLADATVEL"	"epitope tested: TMACINLA"	"epitope tested: LSKDIMFHFK"	"epitope tested: VTMACINLASK"	"epitope tested: TMACINLASK"	"epitope tested: MSYDNNLFIK"	"epitope tested: KTWKEKTLK"	"epitope tested: VTLIDVPK"	"epitope tested: LLADATVELSL"	"epitope tested: 
MPLVHMAF"	"epitope tested: MPLVHMAFSPA"	"epitope tested: YPDYLDSIVF"	"epitope tested: YPDYLDSIVFL"	"epitope tested: MSYDNNLF"	"epitope tested: HMAFSPAV"	"epitope tested: MPLVHMAF"	"epitope tested: MAFSPAVDV"	"epitope tested: YLDSIVFL"	"epitope tested: FLEDLSPL"	"epitope tested: YLDSIVFLEDL"	"epitope tested: FLEDLSPLEA"	"epitope tested: LLLHGAEPKL"	"epitope tested: TLIDVPKV"	"epitope tested: Viral pool (CMV, EBV, Influenza A)"	"epitope tested: NA"	"epitope tested: AVQWLRPK"	"epitope tested: LLHEYWMSLR"	"epitope tested: EVKEEDEPF"	"epitope tested: QVNKVMYILFY"	"epitope tested: EVQNAVQWL"	"epitope tested: LHEYWMSL"	"epitope tested: YKLLHEYWMSL"	"epitope tested: MEESNNSTL"	"epitope tested: MEESNNSTLFI"	"epitope tested: HVMPDTPDILK"	"epitope tested: KVMYILFY"	"epitope tested: VQNAVQWLRPK"	"epitope tested: VQNAVQWLR"	"epitope tested: TLFQIIYDNLR"	"epitope tested: CLASLHPR"	"epitope tested: RSLGCLASLH"	"epitope tested: KLLHEYWMSLR"	"epitope tested: Viral pool (CMV, EBV, Influenza A)"	"epitope tested: NA"	"epitope tested: Viral (HIV)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H1N1)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, MP1_H3N2)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H1N1)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)"	"epitope tested: Viral (Influenza, NP_H3N2)'.split('"	"')
# one row per sample id (the index); columns 1-3 hold the title, gene-of-origin and epitope strings
df = pd.DataFrame([series_sample_id, sample_title, gene_of_origin, epitope]).transpose().set_index(0)
# keep only the text that follows the '<label>: ' prefix
df['AG'] = df[3].str.split(': ', expand=True)[1]
df['GENE'] = df[2].str.split(': ', expand=True)[1]
# restrict to the samples whose title starts with 'NY'
df = df[df[1].str.startswith('NY')]

In [None]:
# assemble neoantigens
fns = sorted(glob('../external_data/CAUSHI_NATURE2021_NSCLC/GSM*_NY*.txt.gz'))
for fn in fns:
    # read in the table
    tb = pd.read_table(fn)
    # file name fields: <GSM>_<patient>_...; the GSM id keys the lookup frame `df`
    gsm, patient = fn.split('/')[-1].split('_')[:2]
    # add on the annotations
    tb['patient'] = patient
    tb['epitope'] = df.loc[gsm, 'AG']
    tb['virus'] = df.loc[gsm, 'GENE']
    dfs.append(tb)
# combine with the tables already collected in `dfs`
result = pd.concat(dfs, axis=0)
result['TRB'] = result['cdr3aa']
result['TcellType'] = 'T'

In [None]:
# verification: only TRB (taken from cdr3aa) is set for this dataset, so only TRB is checked
vb = result['TRB'].astype(str).apply(verify)
# explicit copy: the assignment below must hit `result` itself, not a slice of the old frame
result = result.loc[vb].copy()
# every present CDR3 should begin with C; missing values are left untouched
mask = (~result['TRB'].astype(str).str.startswith('C')) & (~result['TRB'].isna())
result.loc[mask, 'TRB'] = 'C' + result.loc[mask, 'TRB']
results['CAUSHI_NATURE2021_NSCLC'] = result

In [None]:
# this is the antigen, we will want to reference it later
# ../external_data/CAUSHI_NATURE2021_NSCLC/manafest_flufest_results.xlsx <-- also good resource
tb1 = pd.read_excel('../external_data/CAUSHI_NATURE2021_NSCLC/si_tables.xlsx', sheet_name=5, skiprows=1, index_col=1)
tb2 = pd.read_excel('../external_data/CAUSHI_NATURE2021_NSCLC/si_tables.xlsx', sheet_name=6, index_col=3, skiprows=1).dropna()
# the two sheets are joined side by side on their index
ag = pd.concat([tb1, tb2], axis=1)
# a row is kept only if all three sequence columns pass `verify`
cols = ['Predicted MANA sequence','WT sequence','MANA tested in culture']
verifications = [ag[col].astype(str).apply(verify) for col in cols]
all_valid = pd.concat(verifications, axis=1).all(axis=1)
ag = ag.loc[all_valid]
results_ag = {'CAUSHI_NATURE2021_NSCLC': ag}

### GAO_NATCOMM2022_TLGLL

In [None]:
# derive annotation
sample = 'GSM5171588 GSM5171589 GSM5171590 GSM5171591 GSM5171592 GSM5171593 GSM5171594 GSM5171595 GSM5171596 GSM5171597 GSM5171598 GSM5171599 GSM5171600 GSM5171601 GSM5171602 GSM5171603 GSM5171604 GSM5171605 GSM5171606 GSM5171607 GSM5171608 GSM5171609 GSM5171610 GSM5171611 GSM5171612 GSM5171613 GSM5171614 GSM5171615 GSM5171616 GSM5171617 GSM5171618 GSM5171619 GSM5171620 GSM5171621 GSM5171622 GSM5171623 GSM5171624 GSM5171625 GSM5171626 GSM5171627 GSM5171628 GSM5171629 GSM5171630 GSM5171631 GSM5171632 GSM5171633 GSM5171634 GSM5171635 GSM5171636 GSM5171637 GSM5171638 GSM5171639 GSM5171640 GSM5171641 GSM5171642 GSM5171643 GSM5171644 GSM5171645 GSM5171646 GSM5171647 GSM5171648 GSM5171649 GSM5171650 GSM5171651'.split(' ')
tx = 'UPN24 pretreatment_geneexpression"	"UPN24 posttreatment_geneexpression"	"UPN10 pretreatment_geneexpression"	"UPN10 posttreatment_geneexpression"	"UPN19 pretreatment_geneexpression"	"UPN19 posttreatment_geneexpression"	"Healthydonor1_geneexpression"	"Healthydonor2_geneexpression"	"UPN18 pretreatment_geneexpression"	"UPN18 posttreatment_geneexpression"	"UPN4 pretreatment_geneexpression"	"UPN4 posttreatment_geneexpression"	"UPN17 pretreatment_geneexpression"	"UPN17 posttreatment_geneexpression"	"Healthydonor3_geneexpression"	"Healthydonor4_geneexpression"	"UPN1 pretreatment_geneexpression"	"UPN1 posttreatment_geneexpression"	"UPN12 pretreatment_geneexpression"	"UPN12 posttreatment_geneexpression"	"UPN8 pretreatment_geneexpression"	"UPN13 pretreatment_geneexpression"	"Healthydonor5_geneexpression"	"Healthydonor6_geneexpression"	"UPN8 posttreatment_geneexpression"	"UPN13 posttreatment_geneexpression"	"UPN14 pretreatment_geneexpression"	"UPN14 posttreatment_geneexpression"	"UPN15 pretreatment_geneexpression"	"UPN15 posttreatment_geneexpression"	"UPN6 pretreatment_geneexpression"	"Healthydonor7_geneexpression"	"UPN24 pretreatment_VDJ"	"UPN24 posttreatment_VDJ"	"UPN10 pretreatment_VDJ"	"UPN10 posttreatment_VDJ"	"UPN19 pretreatment_VDJ"	"UPN19 posttreatment_VDJ"	"Healthydonor1_VDJ"	"Healthydonor2_VDJ"	"UPN18 pretreatment_VDJ"	"UPN18 posttreatment_VDJ"	"UPN4 pretreatment_VDJ"	"UPN4 posttreatment_VDJ"	"UPN17 pretreatment_VDJ"	"UPN17 posttreatment_VDJ"	"Healthydonor3_VDJ"	"Healthydonor4_VDJ"	"UPN1 pretreatment_VDJ"	"UPN1 posttreatment_VDJ"	"UPN12 pretreatment_VDJ"	"UPN12 posttreatment_VDJ"	"UPN8 pretreatment_VDJ"	"UPN13 pretreatment_VDJ"	"Healthydonor5_VDJ"	"Healthydonor6_VDJ"	"UPN8 posttreatment_VDJ"	"UPN13 posttreatment_VDJ"	"UPN14 pretreatment_VDJ"	"UPN14 posttreatment_VDJ"	"UPN15 pretreatment_VDJ"	"UPN15 posttreatment_VDJ"	"UPN6 pretreatment_VDJ"	"Healthydonor7_VDJ'.split('"	"')
# lookup from sample id (index) to the sample title string
s2t = pd.Series(data=tx, index=sample)

In [None]:
# there is tx with anti-CD52, ignore SI tables not relevant
# collect the per-sample TCR contig tables
fns = glob('../external_data/GAO_NATCOMM2022_TLGLL/GSM*_filtered_contig_annotations.csv.gz')
# chains for which contigs are retained
tcr_chains = ['TRA','TRB','TRD','TRG']
dfs = []
for fn in tqdm(fns):
    # load the contig annotations of this sample
    df = pd.read_csv(fn)
    # discard contigs without a chain or CDR3 call, and 'Multi' chains
    df = df.dropna(subset=['chain','cdr3'])
    df = df.loc[df['chain'] != 'Multi']
    # keep high-confidence contigs of called cells on the chains of interest
    df = df.loc[df['is_cell'] & df['high_confidence']]
    df = df.loc[df['chain'].isin(tcr_chains)]
    # when a barcode has several contigs of the same chain, flag all of them
    # except the last one after sorting by ascending read count
    invalid_idxs = []
    for chain in tcr_chains:
        chain_rows = df.loc[df['chain'] == chain]
        n_contigs = chain_rows['barcode'].value_counts()
        for bc in n_contigs.index[n_contigs >= 2]:
            reads = chain_rows.loc[chain_rows['barcode'] == bc, 'reads']
            invalid_idxs.extend(reads.sort_values().index[:-1].tolist())
    df = df.loc[~df.index.isin(invalid_idxs)]
    # no (barcode, chain) pair may appear more than once from here on
    assert df[['barcode','chain']].value_counts().max() < 2
    # one row per barcode, one '<field>:<chain>' column per field and chain
    value_cols = df.columns[1:-1].tolist()
    df = df.pivot_table(index='barcode', columns=['chain'], values=value_cols, aggfunc=np.sum)
    df.columns = [':'.join(levels) for levels in df.columns]
    # look up the sample label by the GSM id at the start of the file name;
    # the part before the first '_' is split on spaces: first token is the
    # patient, last token the timepoint (identical for single-token labels)
    gsm = fn.rsplit('/', 1)[-1].split('_')[0]
    res = s2t[gsm].split('_')[0].split(' ')
    pt, tp = res[0], res[-1]
    df['patient'] = pt
    df['timepoint'] = tp
    # chain-independent columns shared by all datasets
    df['TRA'] = df['cdr3:TRA']
    df['TRB'] = df['cdr3:TRB']
    df['TcellType'] = 'T'
    dfs.append(df)
result = pd.concat(dfs, axis=0)

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['GAO_NATCOMM2022_TLGLL'] = result

### GUO_NATMED2018_NSCLC

In [None]:
# load the TCR table, dropping rows that lack both the alpha and the beta CDR3
df = pd.read_excel('../external_data/GUO_NATMED2018_NSCLC/TCR.xlsx').dropna(
    subset=['CDR3(Alpha1)','CDR3(Beta1)'], how='all')
# attach every patient-level annotation column, looked up through 'Patient'
anno = pd.read_excel('../external_data/GUO_NATMED2018_NSCLC/PT_CT.xlsx', sheet_name=1, index_col=0)
for name, values in anno.items():
    df[name] = df['Patient'].map(values)
# standard columns shared by all datasets
df = df.assign(TRA=df['CDR3(Alpha1)'], TRB=df['CDR3(Beta1)'], TcellType='T')
result = df.copy()

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['GUO_NATMED2018_NSCLC'] = result

### HUANG_NATBIOT2020_MTUB

In [None]:
# load the Mycobacterium tuberculosis TCR table; the beta CDR3 becomes 'TRB'
# and every row is labelled as CD4
df = pd.read_excel('../external_data/HUANG_NATBIOT2020_MTUB/TCR.xlsx')
df = df.assign(TRB=df['CDR3beta'], TcellType='CD4')
result = df.copy()

In [None]:
# verification: only the TRB column is checked for this dataset
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the write below lands on this frame rather than on a slice
result = result.loc[vb].copy()
# prepend 'C' to every non-missing TRB that does not already start with it;
# a single .loc write replaces the chained `result['TRB'][mask] = ...` form,
# which is silently lost when pandas copy-on-write is active
mask = (~result['TRB'].astype(str).str.startswith('C')) & (~result['TRB'].isna())
if mask.any():
    result.loc[mask, 'TRB'] = 'C' + result.loc[mask, 'TRB']
results['HUANG_NATBIOT2020_MTUB'] = result

### LI_CELL2018_MELA

In [None]:
# these are TCR beta sequences, we utilize the first translation as 'TRB';
# rows without it are dropped before the T cell label is added
df = pd.read_table('../external_data/LI_CELL2018_MELA/GSE123139_T_cells_tcrb_v2.txt.gz')
df = (
    df.assign(TRB=df['CDR3_translation_first'])
      .dropna(subset=['TRB'])
      .assign(TcellType='T')
)
result = df.copy()

In [None]:
# verification: only the TRB column is checked for this dataset
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the write below lands on this frame rather than on a slice
result = result.loc[vb].copy()
# prepend 'C' to every non-missing TRB that does not already start with it;
# a single .loc write replaces the chained `result['TRB'][mask] = ...` form,
# which is silently lost when pandas copy-on-write is active
mask = (~result['TRB'].astype(str).str.startswith('C')) & (~result['TRB'].isna())
if mask.any():
    result.loc[mask, 'TRB'] = 'C' + result.loc[mask, 'TRB']
results['LI_CELL2018_MELA'] = result

### LIU_NATCAN2022_NSCLC

In [None]:
# load the single-cell NSCLC TCR table and add the standard columns
df = pd.read_table('../external_data/LIU_NATCAN2022_NSCLC/GSE179994_all.scTCR.tsv.gz')
df = df.assign(TRA=df['CDR3(Alpha1)'], TRB=df['CDR3(Beta1)'], TcellType='T')
result = df.copy()

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['LIU_NATCAN2022_NSCLC'] = result

### MILLER_SCITRANSMED2024_PANCAN

In [None]:
# antigen tables, stacked row-wise, to be utilized later on
tb1 = pd.read_excel('../external_data/MILLER_SCITRANSMED2024_PANCAN/AG_ALL.xlsx')
tb2 = pd.read_excel('../external_data/MILLER_SCITRANSMED2024_PANCAN/AG_PDAC.xlsx')
ag = pd.concat([tb1, tb2], axis=0)
# keep only rows in which every peptide column passes `verify`
cols = ['mut peptide','ref peptide']
verifications = [ag[col].astype(str).apply(verify) for col in cols]
all_valid = pd.concat(verifications, axis=1).all(axis=1)
ag = ag.loc[all_valid]
results_ag['MILLER_SCITRANSMED2024_PANCAN'] = ag

### MINOWA_SCITRANSMED2024_MELA

In [None]:
# read in the patient characteristics for annotation
anno = pd.read_excel('../external_data/MINOWA_SCITRANSMED2024_MELA/PT_CT.xlsx')
for col in anno.columns:
    # render every cell as text and flatten embedded line breaks to spaces
    anno[col] = anno[col].astype(str).str.replace('\n',' ')
    # astype(str) renders missing cells as the literal 'nan'; turn them back
    # into real missing values with a single .loc write (the chained
    # `anno[col][mask] = ...` form is lost when pandas copy-on-write is active)
    anno.loc[anno[col] == 'nan', col] = np.nan

In [None]:
from tqdm import tqdm
from glob import glob
# collect the filtered contig tables
fns = glob('../external_data/MINOWA_SCITRANSMED2024_MELA/*filtered_contig_annotations.csv.gz')
# chains for which contigs are retained
tcr_chains = ['TRA','TRB','TRD','TRG']
dfs = []
for fn in tqdm(fns):
    # load the contig annotations of this sample
    df = pd.read_csv(fn)
    # discard contigs without a chain or CDR3 call, and 'Multi' chains
    df = df.dropna(subset=['chain','cdr3'])
    df = df.loc[df['chain'] != 'Multi']
    # keep high-confidence contigs of called cells on the chains of interest
    df = df.loc[df['is_cell'] & df['high_confidence']]
    df = df.loc[df['chain'].isin(tcr_chains)]
    # when a barcode has several contigs of the same chain, flag all of them
    # except the last one after sorting by ascending read count
    invalid_idxs = []
    for chain in tcr_chains:
        chain_rows = df.loc[df['chain'] == chain]
        n_contigs = chain_rows['barcode'].value_counts()
        for bc in n_contigs.index[n_contigs >= 2]:
            reads = chain_rows.loc[chain_rows['barcode'] == bc, 'reads']
            invalid_idxs.extend(reads.sort_values().index[:-1].tolist())
    df = df.loc[~df.index.isin(invalid_idxs)]
    # no (barcode, chain) pair may appear more than once from here on
    assert df[['barcode','chain']].value_counts().max() < 2
    # one row per barcode, one '<field>:<chain>' column per field and chain
    value_cols = df.columns[1:-1].tolist()
    df = df.pivot_table(index='barcode', columns=['chain'], values=value_cols, aggfunc=np.sum)
    df.columns = [':'.join(levels) for levels in df.columns]
    # patient and tissue are the 3rd and 4th '_'-separated fields of the file name
    pt, tissue_of_origin = fn.rsplit('/', 1)[-1].split('_')[2:4]
    df['patient'] = pt
    df['tissue'] = tissue_of_origin
    # chain-independent columns shared by all datasets
    df['TRA'] = df['cdr3:TRA']
    df['TRB'] = df['cdr3:TRB']
    df['TcellType'] = 'T'
    dfs.append(df)
result = pd.concat(dfs, axis=0)

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['MINOWA_SCITRANSMED2024_MELA'] = result

### NAULAERTS_SCITRANSMED2023_GBM

In [None]:
# take the per-cell annotations (.obs) of the TCR AnnData object
result = sc.read_h5ad('../external_data/NAULAERTS_SCITRANSMED2023_GBM/TCRobject.h5ad').obs
# string copies of the VJ / VDJ junction columns serve as TRA / TRB
result['TRA'] = result['IR_VJ_1_junction_aa'].astype(str)
result['TRB'] = result['IR_VDJ_1_junction_aa'].astype(str)
result['TcellType'] = 'T'

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every present sequence that does not already start with it.
# TRA/TRB were built with astype(str) for this dataset, so a missing value is
# the literal 'nan' and is not caught by isna(); exclude it explicitly, since
# prefixing it would fabricate the sequence 'Cnan'.
# A single .loc write replaces the chained `result[col][mask] = ...` form,
# which is silently lost when pandas copy-on-write is active.
for chain in ['TRA', 'TRB']:
    seqs = result[chain].astype(str)
    mask = (~seqs.str.startswith('C')) & (~result[chain].isna()) & (seqs != 'nan')
    if mask.any():
        result.loc[mask, chain] = 'C' + seqs[mask]
results['NAULAERTS_SCITRANSMED2023_GBM'] = result

### ROJAS_NATURE2023_PDAC

In [None]:
# load the PDAC antigen table
ag = tb1 = pd.read_excel('../external_data/ROJAS_NATURE2023_PDAC/AG.xlsx')
# keep only rows in which every sequence / epitope column passes `verify`
cols = [
    'Mutant Neoantigen Sequence',
    'WT Neoantigen Sequence',
    'MHC-I Mutant Epitope (Best Prediction)',
    'MHC-I WT Epitope',
    'MHC-II Mutant Epitope (Best Prediction)',
    'MHC-II WT Epitope',
]
verifications = [ag[col].astype(str).apply(verify) for col in cols]
all_valid = pd.concat(verifications, axis=1).all(axis=1)
ag = ag.loc[all_valid]
results_ag['ROJAS_NATURE2023_PDAC'] = ag

In [None]:
# collect the per-sample TCR contig tables
fns = glob('../external_data/ROJAS_NATURE2023_PDAC/GSM*_filtered_contig_annotations.csv.gz')
# chains for which contigs are retained
tcr_chains = ['TRA','TRB','TRD','TRG']
dfs = []
for fn in tqdm(fns):
    # load the contig annotations of this sample
    df = pd.read_csv(fn)
    # discard contigs without a chain or CDR3 call, and 'Multi' chains
    df = df.dropna(subset=['chain','cdr3'])
    df = df.loc[df['chain'] != 'Multi']
    # keep high-confidence contigs of called cells on the chains of interest
    df = df.loc[df['is_cell'] & df['high_confidence']]
    df = df.loc[df['chain'].isin(tcr_chains)]
    # when a barcode has several contigs of the same chain, flag all of them
    # except the last one after sorting by ascending read count
    invalid_idxs = []
    for chain in tcr_chains:
        chain_rows = df.loc[df['chain'] == chain]
        n_contigs = chain_rows['barcode'].value_counts()
        for bc in n_contigs.index[n_contigs >= 2]:
            reads = chain_rows.loc[chain_rows['barcode'] == bc, 'reads']
            invalid_idxs.extend(reads.sort_values().index[:-1].tolist())
    df = df.loc[~df.index.isin(invalid_idxs)]
    # no (barcode, chain) pair may appear more than once from here on
    assert df[['barcode','chain']].value_counts().max() < 2
    # one row per barcode, one '<field>:<chain>' column per field and chain
    value_cols = df.columns[1:-1].tolist()
    df = df.pivot_table(index='barcode', columns=['chain'], values=value_cols, aggfunc=np.sum)
    df.columns = [':'.join(levels) for levels in df.columns]
    # strip the fixed suffix and directories from the path; the '_'-separated
    # fields after the first one must be exactly (patient, timepoint)
    stem = fn.split('_TCR_filtered_contig_annotations.csv.gz')[0].split('/')[-1]
    pt, tp = stem.split('_')[1:]
    df['patient'] = pt
    df['timepoint'] = tp
    # chain-independent columns shared by all datasets
    df['TRA'] = df['cdr3:TRA']
    df['TRB'] = df['cdr3:TRB']
    df['TcellType'] = 'T'
    dfs.append(df)
result = pd.concat(dfs, axis=0)

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['ROJAS_NATURE2023_PDAC'] = result

### SALUZZO_IMMUNITY2021_HIV

In [None]:
# process HIV ART dataset: one QC table per sample drives the loop
fns = glob('../external_data/SALUZZO_IMMUNITY2021_HIV/SSHIV*_transcriptome/QC_categories.csv')
# chains for which contigs are retained
tcr_chains = ['TRA','TRB','TRD','TRG']
dfs = []
for fn in tqdm(fns):
    # the contig table path is derived from the QC table path
    fn_tcr = fn.replace('_transcriptome/QC_categories.csv','_VDJ/filtered_contig_annotations.csv')
    df = pd.read_csv(fn_tcr)
    anno = pd.read_csv(fn)
    # keep barcodes demultiplexed as skin or blood that also pass QC
    hto_ok = anno['hto_demux'].isin(['HTO-skin','HTO-blood'])
    anno = anno.loc[hto_ok & anno['pass_QC']]
    df = df.loc[df['barcode'].isin(anno['barcode'])]
    # barcode -> sample lookup used after pivoting
    anno = anno.set_index('barcode')['sample']
    # discard contigs without a chain or CDR3 call, and 'Multi' chains
    df = df.dropna(subset=['chain','cdr3'])
    df = df.loc[df['chain'] != 'Multi']
    # keep high-confidence contigs of called cells on the chains of interest
    df = df.loc[df['is_cell'] & df['high_confidence']]
    df = df.loc[df['chain'].isin(tcr_chains)]
    # when a barcode has several contigs of the same chain, flag all of them
    # except the last one after sorting by ascending read count
    invalid_idxs = []
    for chain in tcr_chains:
        chain_rows = df.loc[df['chain'] == chain]
        n_contigs = chain_rows['barcode'].value_counts()
        for bc in n_contigs.index[n_contigs >= 2]:
            reads = chain_rows.loc[chain_rows['barcode'] == bc, 'reads']
            invalid_idxs.extend(reads.sort_values().index[:-1].tolist())
    df = df.loc[~df.index.isin(invalid_idxs)]
    # no (barcode, chain) pair may appear more than once from here on
    assert df[['barcode','chain']].value_counts().max() < 2
    # one row per barcode, one '<field>:<chain>' column per field and chain
    value_cols = df.columns[1:-1].tolist()
    df = df.pivot_table(index='barcode', columns=['chain'], values=value_cols, aggfunc=np.sum)
    df.columns = [':'.join(levels) for levels in df.columns]
    # annotate each barcode with its sample
    df['sample'] = df.index.map(anno)
    # chain-independent columns shared by all datasets
    df['TRA'] = df['cdr3:TRA']
    df['TRB'] = df['cdr3:TRB']
    df['TcellType'] = 'T'
    dfs.append(df)
result = pd.concat(dfs, axis=0)
# the sample label splits on '_' into cohort, patient and tissue
sample_parts = result['sample'].str.split('_', expand=True)
result[['cohort','patient','tissue']] = sample_parts

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['SALUZZO_IMMUNITY2021_HIV'] = result

### SCHMIDT_NATCOMM2023_PANCAN

In [None]:
# peripheral blood vs. tumor frequency tables: supplementary first, then main
tra_tables = [
    ('../external_data/SCHMIDT_NATCOMM2023_PANCAN/TRA_PBL_TIL_SI.xlsx', 'SI'),
    ('../external_data/SCHMIDT_NATCOMM2023_PANCAN/TRA_PBL_TIL.xlsx', 'MAIN'),
]
dfs = []
for fn, source in tra_tables:
    df = pd.read_excel(fn)
    # drop the first character of the first column and split the rest on '_'
    # into TRAV / TRA / TRAJ; TRAJ then loses its own first character
    parts = df.iloc[:, 0].str[1:].str.split('_', expand=True)
    df[['TRAV','TRA','TRAJ']] = parts
    df['TRAJ'] = df['TRAJ'].str[1:]
    df['source'] = source
    dfs.append(df)
# add on the avidity model data, which carries both chains
df = pd.read_excel('../external_data/SCHMIDT_NATCOMM2023_PANCAN/TRB_MELA.xlsx')
df['TRA'] = df['CDR3a']
df['TRB'] = df['CDR3b']
df['source'] = 'MODEL'
dfs.append(df)
result = pd.concat(dfs, axis=0)
result['TcellType'] = 'T'

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain]
results['SCHMIDT_NATCOMM2023_PANCAN'] = result

### SUO_SCIENCE2022_FETAL

In [None]:
# read in the fetal data
adata = sc.read_h5ad('../external_data/SUO_SCIENCE2022_FETAL/PAN.A01.v01.raw_count.20210429.NKT.embedding.abTCR.h5ad')
# keep cells whose doublet prediction is the string 'False'; copy explicitly so
# the cleanup below edits this frame rather than a slice of `adata.obs`
result = adata.obs.loc[adata.obs['predicted_doublets'] == 'False'].copy()
# turn the string placeholders 'nan' / 'None' into real missing values, using
# one .loc write per column (chained `result[col][mask] = ...` is lost when
# pandas copy-on-write is active); columns without placeholders are untouched
for col in result.columns:
    placeholder = result[col].isin(['nan', 'None'])
    if placeholder.any():
        result.loc[placeholder, col] = np.nan
# require at least one of the two CDR3 columns, then expose them as TRA / TRB
result = result.dropna(subset=['IR_VJ_1_cdr3','IR_VDJ_1_cdr3'], how='all')
result[['TRA','TRB']] = result[['IR_VJ_1_cdr3','IR_VDJ_1_cdr3']]
# drop the excluded cell type annotations
excluded = ['DOUBLET','HIGH_MITO','PRE_PRO_B','CYCLING_MEMP','CMP','DOUBLET_ERY_B','LOW_Q_INCONSISTENT']
result = result.loc[~result['celltype_annotation'].isin(excluded)].copy()
# generic 'T' label, refined to CD4 / CD8 where the annotation says so
result['TcellType'] = 'T'
result.loc[result['celltype_annotation'].isin(['CD4+T','TREG']), 'TcellType'] = 'CD4'
result.loc[result['celltype_annotation'].isin(['CD8+T']), 'TcellType'] = 'CD8'

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain].astype(str)
results['SUO_SCIENCE2022_FETAL'] = result

### YOST_NATMED2019_BSCC

In [None]:
def _last_tra_trb(cdr3s_aa):
    """Extract the TRA and TRB entries from a ';'-separated `cdr3s_aa` string.

    Elements starting with 'TRA' or 'TRB' contribute their text from the fifth
    character onward (the first four characters are dropped). If a chain occurs
    several times the last occurrence wins; a chain that never occurs is NaN.

    Returns a `(tra, trb)` tuple.
    """
    found = {'TRA': np.nan, 'TRB': np.nan}
    for el in cdr3s_aa.split(';'):
        tag = el[:3]
        if tag in found:
            found[tag] = el[4:]
    return found['TRA'], found['TRB']

# (cancer label, metadata table, TCR table) for the BCC and SCC cohorts
yost_tables = [
    ('BCC',
     '../external_data/YOST_NATMED2019_BSCC/GSE123813_bcc_all_metadata.txt.gz',
     '../external_data/YOST_NATMED2019_BSCC/GSE123813_bcc_tcr.txt.gz'),
    ('SCC',
     '../external_data/YOST_NATMED2019_BSCC/GSE123813_scc_metadata.txt.gz',
     '../external_data/YOST_NATMED2019_BSCC/GSE123813_scc_tcr.txt.gz'),
]
dfs = []
for cancer, fn_meta, fn_tcr in yost_tables:
    meta = pd.read_table(fn_meta, index_col=0)
    df = pd.read_table(fn_tcr, index_col=0)
    # parse every row once and assign whole columns; this avoids row-by-row
    # .loc writes of strings into float NaN columns (an incompatible-dtype
    # setitem, deprecated since pandas 2.1)
    parsed = [_last_tra_trb(cdr3s) for cdr3s in tqdm(df['cdr3s_aa'])]
    df['TRA'] = [pair[0] for pair in parsed]
    df['TRB'] = [pair[1] for pair in parsed]
    # attach every metadata column, looked up through the row index
    for col in meta.columns:
        df[col] = df.index.map(meta[col])
    df['cancer'] = cancer
    dfs.append(df)
result = pd.concat(dfs, axis=0)
result['TcellType'] = 'T'

In [None]:
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
# explicit copy so the writes below land on this frame rather than on a slice
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it;
# a single .loc write replaces the chained `result[col][mask] = ...` form, which
# is silently lost when pandas copy-on-write is active
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain].astype(str)
results['YOST_NATMED2019_BSCC'] = result

### COVID-19 Atlas

In [None]:
# ! these sequences were not included in model training
# due to hyperfitting issues, but all sequences were utilized for inference
# verification
# the 20 single-letter amino acid codes accepted in a sequence
vocab = list('ACDEFGHIKLMNPQRSTVWY')
def verify(x):
    """Return True when `x` is made up solely of characters from `vocab`.

    Anything whose string form is 'nan' is accepted as-is, so missing values
    pass the check. Otherwise the occurrences of each vocabulary letter are
    summed and compared with the length of `x`.
    """
    if str(x) == 'nan':
        return True
    n_known = sum(map(x.count, vocab))
    return n_known == len(x)

In [None]:
# read in the COVID-19 dataset information (CD8 T cells)
# NOTE(review): absolute, user-specific path; consider a configurable data directory
a = sc.read_h5ad('/home/dchen2/dchen2/COVID_ISB_STORAGE/upto_v16_P_GE_int_tcr_cd8_t_cells.h5ad')
# work on a copy of the per-cell annotations
result = a.obs.copy()
# turn the string placeholders 'nan' / 'None' into real missing values, using
# one .loc write per column (chained `result[col][mask] = ...` is lost when
# pandas copy-on-write is active); columns without placeholders are untouched
for col in result.columns:
    placeholder = result[col].isin(['nan', 'None'])
    if placeholder.any():
        result.loc[placeholder, col] = np.nan
# require at least one of the two junction columns, then expose them as TRA / TRB
result = result.dropna(subset=['IR_VJ_1_junction_aa','IR_VDJ_1_junction_aa'], how='all')
result[['TRA','TRB']] = result[['IR_VJ_1_junction_aa','IR_VDJ_1_junction_aa']]
result['TcellType'] = 'CD8'
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain].astype(str)
result_cd8 = result.copy()

In [None]:
# read in the COVID-19 dataset information (CD4 T cells)
# NOTE(review): absolute, user-specific path; consider a configurable data directory
a = sc.read_h5ad('/home/dchen2/dchen2/COVID_ISB_STORAGE/upto_v16_P_GE_int_tcr_cd4_t_cells.h5ad')
# work on a copy of the per-cell annotations
result = a.obs.copy()
# turn the string placeholders 'nan' / 'None' into real missing values, using
# one .loc write per column (chained `result[col][mask] = ...` is lost when
# pandas copy-on-write is active); columns without placeholders are untouched
for col in result.columns:
    placeholder = result[col].isin(['nan', 'None'])
    if placeholder.any():
        result.loc[placeholder, col] = np.nan
# require at least one of the two junction columns, then expose them as TRA / TRB
result = result.dropna(subset=['IR_VJ_1_junction_aa','IR_VDJ_1_junction_aa'], how='all')
result[['TRA','TRB']] = result[['IR_VJ_1_junction_aa','IR_VDJ_1_junction_aa']]
result['TcellType'] = 'CD4'
# verification: drop rows whose TRA or TRB fails `verify`
va = result['TRA'].astype(str).apply(verify)
vb = result['TRB'].astype(str).apply(verify)
result = result.loc[va & vb].copy()
# prepend 'C' to every non-missing sequence that does not already start with it
for chain in ['TRA', 'TRB']:
    mask = (~result[chain].astype(str).str.startswith('C')) & (~result[chain].isna())
    if mask.any():
        result.loc[mask, chain] = 'C' + result.loc[mask, chain].astype(str)
result_cd4 = result.copy()

In [None]:
# stack the CD8 and CD4 tables row-wise
result = pd.concat([result_cd8, result_cd4])
# register the combined table under its dataset key
results_tcr['SU_CELL2022_COVID19'] = result

In [None]:
# write the pickled data: one file for the TCR tables, one for the antigen tables
# NOTE(review): the dataset cells above store into `results`, whereas
# `results_tcr` is what gets written here; confirm both names refer to the
# same dict, otherwise those datasets are missing from the pickle
for out_path, payload in (
    ('../external_data/results.tcr.pkl', results_tcr),
    ('../external_data/results.ag.pkl', results_ag),
):
    with open(out_path, 'wb') as f:
        pkl.dump(payload, f)