In [115]:
import numpy as np
import pandas as pd
import scanpy as sc
import scirpy as ir
from matplotlib import pyplot as plt, cm as mpl_cm
from cycler import cycler

sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [116]:
from tcr_processing import *

In [117]:
import datetime
time=datetime.datetime.now().strftime('%Y%m%d_%H%M')

parameter_dict=dict(
    contigs="data/vdj_v1_hs_aggregated_donor2_all_contig_annotations.csv",
    bc_matrix="data/vdj_v1_hs_aggregated_donor2_filtered_feature_bc_matrix.h5",
    binarized_matrix='data/vdj_v1_hs_aggregated_donor2_binarized_matrix.csv',
    preprocess=True,
    min_cells=10,
    min_genes=100,
    max_genes=2500,
    mito_cutoff=5,
    chain_qc=True,
    epitopes_loadfile=None,
    epitopes_outfile="%s_epitopes_donor2.pkl"%(time),
    tcr_outfile="%s_TCRs_donor2.pkl"%(time),
    subj='donor2:healthy',
    organism='homo sapiens',
    cluster_distance='hamming',
)



In [118]:
class TCR:
    '''Create a class object for loading, processing and clustering 10X TCR data with Scirpy'''
    
    def __init__(self, parameters):
        self.params = parameters
    
    def load_data(self):
        contigs, feature_matrix= [self.params[x] for x in ['contigs','bc_matrix']]

        # Load TCR data
        adata_tcr = ir.io.read_10x_vdj(contigs)

        # Load associated transcriptomics data
        adata = sc.read_10x_h5(feature_matrix)

        # Merge
        ir.pp.merge_with_ir(adata, adata_tcr)

        print("Data loaded to Ann Object with shape: ",adata.shape)
        
        self.adata=adata
        return adata


    def drop_dead(self,adata):
        
        max_genes, mito_cutoff= [self.params[x] for x in ['max_genes','mito_cutoff']]
        if mito_cutoff:
            print('Eliminating cells above mitochondrial gene expression threshold')
            adata.var['mt'] = adata.var_names.str.startswith('MT-')  # Generates a boolean mask for genes starting with 'MT'
            sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)  # Returns total_counts_['mt'] for vars fed in via qc_vars
            adata = adata[adata.obs.pct_counts_mt < mito_cutoff, :]

        if max_genes:
            print('Eliminating suspected doublets')
            adata = adata[adata.obs.n_genes_by_counts < max_genes, :]

        return adata

    def preprocess(self):
        
        adata=self.adata

        if self.params['preprocess']==True:

            min_cells, min_genes,chain_qc = [self.params[x] for x in ['min_cells','min_genes','chain_qc']]
        
            # Filter dead and doublet cells

            if min_cells:
                sc.pp.filter_genes(adata, min_cells=min_cells)
            if min_genes:
                sc.pp.filter_cells(adata, min_genes=min_genes)
            
            adata=self.drop_dead(adata)

            # Normalise count data
            sc.pp.normalize_per_cell(adata, counts_per_cell_after=1000)
            sc.pp.log1p(adata)

            # QC of TCR chains
            if chain_qc==True:
                ir.tl.chain_qc(adata)
                print(
                    "Fraction of cells with more than one pair of TCRs: {:.2f}".format(
                        np.sum(adata.obs["chain_pairing"].isin(
                                ["extra VJ", "extra VDJ", "two full chains"])) / adata.n_obs))
                print('Dropping multichain and single chain receptors')

                # Drop multichain instances
                adata = adata[adata.obs["chain_pairing"] != "multichain", :].copy()

                # Drop single chain instances
                adata = adata[~adata.obs["chain_pairing"].isin(["orphan VDJ", "orphan VJ"]), :].copy()
        
        self.adata=adata
        return adata
    
    def get_epitopes(self):

        '''Extract epitope map from binarized matrix file'''
        binarized_matrix, epitopes_loadfile, epitopes_outfile=[self.params[x] for x in ['binarized_matrix',
                                                                                            'epitopes_loadfile',
                                                                                            'epitopes_outfile']]
        if epitopes_loadfile:
            epitopes=load_pickle(epitopes_loadfile)
        
        elif epitopes_outfile and not epitopes_loadfile:

            print('Loading binary matrix')
            binary_matrix=pd.read_csv(binarized_matrix)
            sub=binary_matrix[binary_matrix.columns[68:]]   # Check this for other donors
            sub.set_index(binary_matrix['barcode'].values,inplace=True)
            print('Reading epitope specifity from %s files'%(len(sub)))
            eps=[sub.iloc[i][sub.iloc[i]==True] for i in range(len(sub))]
            assert len(eps)==len(sub)
            epitopes={}
            for i in range(len(eps)):
                try:
                    epitopes[eps[i].name]=eps[i].index[0].strip('_binder')
                except IndexError:
                    epitopes[eps[i].name]='None'
            save_pickle(epitopes,epitopes_outfile)
            
        else:
            raise ValueError("You must specify either a pre-existing epitope file, or an epitopes outfile location")
        
        self.epitopes=epitopes

        return epitopes

    def get_clusters(self):
        dist=self.params['cluster_distance']
        if dist not in ['identity', 'hamming', 'levenshtein']:
            raise ValueError('Enter a distance metric from "identity", "hamming", "levenshtein"')

        ir.pp.ir_dist(
            self.adata,
            metric=dist,
            cutoff=1,
            sequence="aa",
        )
        ir.tl.define_clonotype_clusters(self.adata, sequence='aa', metric=dist, cutoff=1,
                                        receptor_arms='all', dual_ir='primary_only', same_v_gene=False, n_jobs=4)

        ir.tl.clonotype_network(self.adata, min_cells=2)

        idx = {x:k for k in self.adata.uns['cc_aa_identity']['cell_indices'].keys() for x in tcrs.uns['cc_aa_identity']['cell_indices'][k]}
        clusts= []
        tcr=self.adata.uns['TCRs']
        for i in range(len(tcr)):
            barcode = tcr.iloc[i].name
            if barcode in idx.keys():
                clusts.append(idx[barcode])
            else:
                clusts.append('None')
        assert len(clusts)==len(tcr)
        tcr['scirpy_%'%(dist)]=clusts
        self.adata.uns['TCRs']=tcr
            
    def get_TCRs(self):

        '''Extract TCRs from Ann object, and map to epitope specificity where provided'''
        TCR = self.adata.obs[['IR_VJ_1_j_call','IR_VJ_1_junction_aa','IR_VDJ_1_j_call','IR_VDJ_1_junction_aa']].copy()
        TCR.columns=['v.alpha','cdr3.alpha','v.beta','cdr3.beta']
        epitope_dict=self.epitopes
        eps=[]
        for i in range(len(TCR)):
                if TCR.iloc[i].name in epitope_dict.keys():
                    eps.append(epitope_dict[TCR.index[i]])
                else:
                    eps.append('None')
        TCR['Epitope']=eps
        TCR['subject:condition']=self.params['subj']

        self.adata.uns['TCRs']=TCR

        if self.params['cluster_distance']:
            self.get_clusters()
        
        if self.params['tcr_outfile']:
            print('Saving')
            TCR.to_pickle(self.params['tcr_outfile'])

        return self.adata
        


In [119]:
tcr_obj=TCR(parameter_dict)

In [120]:
tcrs=tcr_obj.load_data()



In [None]:
tcrs=tcr_obj.preprocess()

filtered out 15493 genes that are detected in less than 10 cells


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


filtered out 11 cells that have less than 100 genes expressed


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Eliminating cells above mitochondrial gene expression threshold
Eliminating suspected doublets
normalizing by total count per cell


Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


    finished (0:00:01): normalized adata.X and added    'n_counts', counts per cell before normalization (adata.obs)
Fraction of cells with more than one pair of TCRs: 0.07
Dropping multichain and single chain receptors


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [None]:
epitopes=tcr_obj.get_epitopes()

Loading binary matrix
Reading epitope specifity from 77854 files
Saving file to  20220121_1318_epitopes_donor2.pkl


In [None]:
tcrs = tcr_obj.get_TCRs()

Computing sequence x sequence distance matrix for VJ sequences.


100%|██████████| 4753/4753 [00:08<00:00, 563.06it/s] 


Computing sequence x sequence distance matrix for VDJ sequences.


100%|██████████| 4950/4950 [00:08<00:00, 578.00it/s]


Initializing lookup tables. 
Computing clonotype x clonotype distances.
NB: Computation happens in chunks. The progressbar only advances when a chunk has finished. 


100%|██████████| 5639/5639 [00:11<00:00, 486.10it/s]


Stored clonal assignments in `adata.obs["cc_aa_hamming"]`.


ValueError: Connectivity data not found. Did you run `tl.define_clonotypes` or `tl.define_clonotype_clusters`, respectively?

In [None]:
ir_dist_aa_hamming

AnnData object with n_obs × n_vars = 29761 × 18045
    obs: 'is_cell', 'high_confidence', 'multi_chain', 'extra_chains', 'IR_VJ_1_c_call', 'IR_VJ_2_c_call', 'IR_VDJ_1_c_call', 'IR_VDJ_2_c_call', 'IR_VJ_1_consensus_count', 'IR_VJ_2_consensus_count', 'IR_VDJ_1_consensus_count', 'IR_VDJ_2_consensus_count', 'IR_VJ_1_d_call', 'IR_VJ_2_d_call', 'IR_VDJ_1_d_call', 'IR_VDJ_2_d_call', 'IR_VJ_1_duplicate_count', 'IR_VJ_2_duplicate_count', 'IR_VDJ_1_duplicate_count', 'IR_VDJ_2_duplicate_count', 'IR_VJ_1_j_call', 'IR_VJ_2_j_call', 'IR_VDJ_1_j_call', 'IR_VDJ_2_j_call', 'IR_VJ_1_junction', 'IR_VJ_2_junction', 'IR_VDJ_1_junction', 'IR_VDJ_2_junction', 'IR_VJ_1_junction_aa', 'IR_VJ_2_junction_aa', 'IR_VDJ_1_junction_aa', 'IR_VDJ_2_junction_aa', 'IR_VJ_1_locus', 'IR_VJ_2_locus', 'IR_VDJ_1_locus', 'IR_VDJ_2_locus', 'IR_VJ_1_productive', 'IR_VJ_2_productive', 'IR_VDJ_1_productive', 'IR_VDJ_2_productive', 'IR_VJ_1_v_call', 'IR_VJ_2_v_call', 'IR_VDJ_1_v_call', 'IR_VDJ_2_v_call', 'has_ir', 'n_genes', 'n_gen

In [None]:
# Compute scirpy clonotypes based on nucleotide sequence identity
# ir.pp.ir_dist(
#     tcrs,
#     metric="identity",
#     sequence="aa",
# )
# ir.tl.define_clonotype_clusters(tcrs, sequence='aa', metric='identity', receptor_arms='all', dual_ir='any', same_v_gene=False, n_jobs=4)
# ir.tl.clonotype_network(tcrs, min_cells=2)

Computing sequence x sequence distance matrix for VJ sequences.
Computing sequence x sequence distance matrix for VDJ sequences.
Initializing lookup tables. 
Computing clonotype x clonotype distances.
NB: Computation happens in chunks. The progressbar only advances when a chunk has finished. 


100%|██████████| 6132/6132 [00:15<00:00, 407.12it/s]


Stored clonal assignments in `adata.obs["cc_aa_identity"]`.


In [None]:
tcrs.uns['cc_aa_identity']


{'distances': <6132x6132 sparse matrix of type '<class 'numpy.uint8'>'
 	with 12232 stored elements in Compressed Sparse Row format>,
 'cell_indices': {'0': array(['AAAGCAACACCGAATT-1', 'GCTCCTACAGGTGGAT-1', 'AAGCCGCGTGATGTCT-3',
         'ATAACGCGTCGAACAG-3', 'CGTGTCTAGGACGAAA-3', 'TTTATGCGTAAGGATT-3',
         'CGTGAGCGTAGGCATG-4', 'TATGCCCTCTCTAGGA-4', 'TTTGGTTGTAAGTGTA-4',
         'AACGTTGAGAAGGCCT-5', 'CCTATTAAGTCCGGTC-5', 'GGCGTGTGTCCTAGCG-5',
         'TCGTAGAGTTCCACGG-5', 'TGCGGGTAGTATCGAA-5', 'AGCAGCCGTGTTTGGT-6',
         'AGCTCTCAGGTGACCA-6', 'ATCCGAATCGTTGACA-6', 'CCATGTCTCGGAAACG-6',
         'CGTCAGGGTAATTGGA-6', 'GCTGCTTTCGCATGGC-6', 'TCCACACCAGCGTAAG-6',
         'AAAGCAAAGTACGCGA-7', 'AAGCCGCGTGCGGTAA-7', 'GGTGAAGCACCGAAAG-7',
         'TAAACCGAGTTCGCGC-7', 'TGACTAGCACTCTGTC-7', 'GATCAGTAGCTAGCCC-8',
         'TTCGAAGCACCCATGG-8', 'GCCTCTAAGATGAGAG-9', 'TGGCGCATCAACGAAA-9',
         'ACACCAAGTGGTCCGT-10', 'ACCTTTAAGGTAAACT-10',
         'CGGACACTCAACACAC-10', 'GGTGAAG

In [None]:
# adata.write_h5ad('data/anndata_object.h5ad')