In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import scirpy as ir
from matplotlib import pyplot as plt, cm as mpl_cm
from cycler import cycler
from tcr_processing import *
import datetime
import csv

# Notebook-wide scanpy configuration, applied once right after the imports.
# Passes a 4x4 `figsize` to scanpy's figure-parameter setup.
sc.set_figure_params(figsize=(4, 4))
sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)

In [2]:
class TCR:
    '''Load, process and cluster 10X TCR data with Scirpy.

    The object is driven by a single ``parameters`` dict. Keys read by the
    methods of this class:

    * ``contigs``            -- path to the 10X contig annotations (csv)
    * ``bc_matrix``          -- path to the 10X feature/barcode matrix (h5)
    * ``binarized_matrix``   -- path to the binarized binder matrix (csv)
    * ``preprocess``         -- truthy to run gene/cell filtering and normalisation
    * ``min_cells``, ``min_genes``, ``max_genes``, ``mito_cutoff``
                             -- QC thresholds; a falsy value disables that filter
    * ``epitopes_loadfile``  -- existing epitope pickle to load (takes priority)
    * ``epitopes_outfile``   -- where to save a freshly built epitope map
    * ``subj``               -- value written to the ``subject:condition`` column
    * ``cluster_distance``   -- 'identity', 'hamming', 'levenshtein' or falsy to skip
    * ``tcr_outfile``        -- pickle path for the TCR table, or falsy to skip

    Typical call order: ``load_data`` -> ``preprocess`` -> ``get_epitopes``
    -> ``get_TCRs`` -> ``write_record``.
    '''

    # Suffix carried by the binder columns of the binarized matrix.
    _BINDER_SUFFIX = '_binder'
    # Positional index of the first binder column in the binarized matrix.
    # NOTE(review): hard-coded for the donor file used here -- check this for other donors.
    _BINDER_FIRST_COLUMN = 68

    def __init__(self, parameters):
        '''Store the parameter dict.

        Raises:
            AssertionError: if ``parameters`` is not a dict.
        '''
        print('Initialising class object')

        assert isinstance(parameters, dict), 'parameters must be a dict'

        self.params = parameters

    def load_data(self):
        '''Load 10X transcriptomic and cell binding data into ``self.adata``.

        Raises:
            TypeError: if ``contigs`` does not end in 'csv' or ``bc_matrix``
                does not end in 'h5'.
        '''

        print('Loading TCR binding and transcriptomics data')

        contigs = self.params['contigs']
        feature_matrix = self.params['bc_matrix']

        if not contigs.endswith('csv'):
            raise TypeError('Contigs file should be in csv format')

        if not feature_matrix.endswith('h5'):
            # The original message blamed the contigs file here.
            raise TypeError('Feature matrix file should be in h5 format')

        # Load TCR data
        adata_tcr = ir.io.read_10x_vdj(contigs)

        # Load associated transcriptomics data
        adata = sc.read_10x_h5(feature_matrix)

        # Merge the receptor annotation into the transcriptomics object
        ir.pp.merge_with_ir(adata, adata_tcr)

        print("Data loaded to Ann Object with shape: ",adata.shape)

        self.adata = adata

    def drop_dead(self):
        '''Remove cells above a set % of mitochondrial gene expression
            or at/above a gene count threshold.

        ``mito_cutoff`` and ``max_genes`` are read from ``self.params``; a falsy
        value disables the corresponding filter. ``self.adata`` is replaced by
        the filtered object.
        '''

        max_genes = self.params['max_genes']
        mito_cutoff = self.params['mito_cutoff']

        if not (mito_cutoff or max_genes):
            return

        # Both filters rely on the QC columns, so compute them whenever either
        # one is enabled (previously `max_genes` alone failed because
        # `n_genes_by_counts` was only created inside the mito branch).
        self.adata.var['mt'] = self.adata.var_names.str.startswith('MT-')  # boolean mask for genes starting with 'MT-'
        sc.pp.calculate_qc_metrics(self.adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

        keep = pd.Series(True, index=self.adata.obs.index)

        if mito_cutoff:
            print('Eliminating cells above mitochondrial gene expression threshold')
            keep &= self.adata.obs.pct_counts_mt < mito_cutoff

        if max_genes:
            print('Eliminating suspected doublets')
            keep &= self.adata.obs.n_genes_by_counts < max_genes

        # Copy so later in-place steps work on a real object rather than a view.
        self.adata = self.adata[keep.to_numpy(), :].copy()

    def preprocess(self):
        '''Preprocess single cell data and conduct TCR chain quality control.

        When ``params['preprocess']`` is truthy, genes/cells are filtered,
        ``drop_dead`` is applied and counts are normalised and log-transformed.
        Chain QC always runs; multichain and orphan-chain cells are dropped.
        '''

        if self.params['preprocess']:

            min_cells = self.params['min_cells']
            min_genes = self.params['min_genes']

            # Drop rarely detected genes and near-empty cells
            if min_cells:
                sc.pp.filter_genes(self.adata, min_cells=min_cells)
            if min_genes:
                sc.pp.filter_cells(self.adata, min_genes=min_genes)

            # Filter dead and doublet cells
            self.drop_dead()

            # Normalise count data
            # NOTE(review): scanpy documents `normalize_per_cell` as deprecated in
            # favour of `normalize_total`; kept as-is to preserve current results.
            sc.pp.normalize_per_cell(self.adata, counts_per_cell_after=1000)
            sc.pp.log1p(self.adata)

        # QC of TCR chains
        ir.tl.chain_qc(self.adata)

        print(
            "Fraction of cells with more than one pair of TCRs: {:.2f}".format(
                np.sum(self.adata.obs["chain_pairing"].isin(
                        ["extra VJ", "extra VDJ", "two full chains"])) / self.adata.n_obs))
        print('Dropping multichain and single chain receptors')

        # Drop multichain and single (orphan) chain instances with one mask,
        # so the object is only copied once.
        unwanted = self.adata.obs["chain_pairing"].isin(["multichain", "orphan VDJ", "orphan VJ"])
        self.adata = self.adata[~unwanted, :].copy()

    @staticmethod
    def _strip_binder_suffix(column_name, suffix='_binder'):
        '''Return ``column_name`` without a trailing ``suffix`` (unchanged if absent).

        ``str.strip('_binder')`` must not be used for this: it removes any of the
        characters ``_ b i n d e r`` from both ends, e.g. '..._Cancer_binder'
        would become '..._Canc'.
        '''
        if suffix and column_name.endswith(suffix):
            return column_name[:-len(suffix)]
        return column_name

    def _epitopes_from_binary_matrix(self, binary_matrix):
        '''Build a ``{barcode: epitope}`` dict from a binarized matrix DataFrame.

        For every row the first binder column equal to True names the epitope;
        rows with no True binder column map to the string 'None'.
        '''
        binders = binary_matrix.iloc[:, self._BINDER_FIRST_COLUMN:]
        print('Reading epitope specificity from %s cells'%(len(binders)))

        hits = binders.eq(True).to_numpy()
        binder_names = binders.columns

        epitopes = {}
        for barcode, row in zip(binary_matrix['barcode'].values, hits):
            matched = row.nonzero()[0]
            if matched.size:
                epitopes[barcode] = self._strip_binder_suffix(binder_names[matched[0]], self._BINDER_SUFFIX)
            else:
                epitopes[barcode] = 'None'
        return epitopes

    def get_epitopes(self):
        '''Extract epitope map from binarized matrix file.

        Loads ``epitopes_loadfile`` when given; otherwise builds the map from
        ``binarized_matrix`` and saves it to ``epitopes_outfile``. The result is
        stored on ``self.epitopes``.

        Raises:
            ValueError: if neither ``epitopes_loadfile`` nor ``epitopes_outfile``
                is set.
        '''
        binarized_matrix = self.params['binarized_matrix']
        epitopes_loadfile = self.params['epitopes_loadfile']
        epitopes_outfile = self.params['epitopes_outfile']

        if epitopes_loadfile:
            # NOTE(review): `load_pickle` presumably unpickles the file -- only
            # point this at files from a trusted source.
            epitopes = load_pickle(epitopes_loadfile)

        elif epitopes_outfile:
            print('Loading binary matrix')
            binary_matrix = pd.read_csv(binarized_matrix)
            epitopes = self._epitopes_from_binary_matrix(binary_matrix)
            save_pickle(epitopes, epitopes_outfile)

        else:
            raise ValueError("You must specify either a pre-existing epitope file, or an epitopes outfile location")

        self.epitopes = epitopes

    def get_clusters(self):
        '''Generate baseline clusters using scirpy clustering function.

        Adds a ``scirpy_<distance>`` column to ``self.adata.uns['TCRs']`` holding
        the cluster key of each cell, or 'None' for cells absent from the
        stored ``cell_indices``.

        Raises:
            ValueError: if ``cluster_distance`` is not a supported metric.
        '''

        dist = self.params['cluster_distance']
        if dist not in ['identity', 'hamming', 'levenshtein']:
            raise ValueError('Enter a distance metric from "identity", "hamming", "levenshtein"')

        ir.pp.ir_dist(
            self.adata,
            metric=dist,
            cutoff=1,
            sequence="aa",
        )
        ir.tl.define_clonotype_clusters(self.adata, sequence='aa', metric=dist, receptor_arms='all', dual_ir='primary_only', same_v_gene=False, n_jobs=4)
        ir.tl.clonotype_network(self.adata, sequence='aa', metric=dist,min_cells=2)

        # Invert {cluster: [barcodes]} into {barcode: cluster}
        cell_indices = self.adata.uns['cc_aa_%s'%(dist)]['cell_indices']
        cluster_of = {barcode: cluster for cluster, barcodes in cell_indices.items() for barcode in barcodes}

        tcr = self.adata.uns['TCRs']
        clusts = [cluster_of.get(barcode, 'None') for barcode in tcr.index]
        assert len(clusts)==len(tcr)
        tcr['scirpy_%s'%(dist)] = clusts
        self.adata.uns['TCRs'] = tcr

    def get_TCRs(self):
        '''Extract TCRs from Ann object, and map to epitope specificity.

        Builds a table of primary alpha/beta chain CDR3, V and J calls, adds
        ``Epitope`` (from ``self.epitopes``, 'None' when the barcode is unknown)
        and ``subject:condition``, and stores it in ``self.adata.uns['TCRs']``.
        Optionally clusters (``cluster_distance``) and pickles the table
        (``tcr_outfile``).
        '''
        tcr_table = self.adata.obs[['IR_VDJ_1_junction_aa','IR_VDJ_1_v_call','IR_VDJ_1_j_call',
                                    'IR_VJ_1_junction_aa','IR_VJ_1_v_call','IR_VJ_1_j_call']].copy()
        tcr_table.columns = ['cdr3.beta','v.beta','j.beta','cdr3.alpha','v.alpha','j.alpha']

        epitope_dict = self.epitopes
        tcr_table['Epitope'] = [epitope_dict.get(barcode, 'None') for barcode in tcr_table.index]
        tcr_table['subject:condition'] = self.params['subj']

        self.adata.uns['TCRs'] = tcr_table

        if self.params['cluster_distance']:
            self.get_clusters()

        if self.params['tcr_outfile']:
            print('Saving')
            # Save the stored table so that the cluster column is included.
            self.adata.uns['TCRs'].to_pickle(self.params['tcr_outfile'])

    def write_record(self,record):
        '''Append the parameter values as one row to the csv file ``record``.'''
        print('Recording parameters to: ',record)
        # newline='' is what the csv module asks for; without it Windows
        # gets an extra blank line after every row.
        with open(record, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(list(self.params.values()))


In [3]:
def run(param_dict,record):
    '''Run the full TCR pipeline for one parameter set.

    Args:
        param_dict: parameter dict handed to ``TCR`` (see the keys read by
            its methods).
        record: path of the csv file the parameter values are appended to.

    Steps, in order: load data, preprocess / chain QC, build the epitope map,
    extract (and optionally cluster/save) the TCR table, record the parameters.
    '''
    # Use the argument, not the notebook-level `parameter_dict` global, so the
    # function works for whatever dict the caller passes in.
    tcr_obj=TCR(param_dict)
    tcr_obj.load_data()
    tcr_obj.preprocess()
    tcr_obj.get_epitopes()
    tcr_obj.get_TCRs()
    tcr_obj.write_record(record)
    print('Complete')

In [5]:
# Execute

# Timestamp used to prefix this run's output files; the record csv collects
# the parameter values of every run.
time = datetime.datetime.now().strftime('%Y%m%d_%H%M')
record = 'data/record.csv'

# Key order matters: write_record() emits the values in this order.
parameter_dict = {
    'tcr_outfile': "%s_TCRs_donor2.pkl" % (time),
    'contigs': "data/vdj_v1_hs_aggregated_donor2_all_contig_annotations.csv",
    'bc_matrix': "data/vdj_v1_hs_aggregated_donor2_filtered_feature_bc_matrix.h5",
    'binarized_matrix': 'data/vdj_v1_hs_aggregated_donor2_binarized_matrix.csv',
    'preprocess': False,
    'min_cells': 10,
    'min_genes': 100,
    'max_genes': 2500,
    'mito_cutoff': 5,
    'epitopes_loadfile': None,
    'epitopes_outfile': "%s_epitopes_donor2.pkl" % (time),
    'subj': 'donor2:healthy',
    'organism': 'homo sapiens',
    'cluster_distance': 'hamming',
}

run(parameter_dict, record)


Initialising class object
Loading TCR binding and transcriptomics data
reading data/vdj_v1_hs_aggregated_donor2_filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


 (0:00:05)


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'feature_types' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'genome' as categorical


Data loaded to Ann Object with shape:  (91921, 33538)
Fraction of cells with more than one pair of TCRs: 0.07
Dropping multichain and single chain receptors


Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.


Loading binary matrix
Reading epitope specificity from 77854 cells
Saving file to  20220124_0955_epitopes_donor2.pkl
Computing sequence x sequence distance matrix for VJ sequences.


100%|██████████| 22366/22366 [00:10<00:00, 2092.22it/s] 


Computing sequence x sequence distance matrix for VDJ sequences.


100%|██████████| 24976/24976 [00:10<00:00, 2393.52it/s] 


Initializing lookup tables. 
Computing clonotype x clonotype distances.
NB: Computation happens in chunks. The progressbar only advances when a chunk has finished. 


100%|██████████| 12921/12921 [00:15<00:00, 833.92it/s] 


Stored clonal assignments in `adata.obs["cc_aa_hamming"]`.
Saving
Recording parameters to:  data/record.csv
Complete
